├── .gitignore
├── .gitmodules
├── LICENSE.md
├── README.md
├── exps
│   ├── __init__.py
│   ├── bmc.py
│   ├── exp_class.py
│   ├── exp_net.py
│   ├── exp_simple.py
│   ├── exp_tree.py
│   ├── expr_sweep.py
│   ├── fig_jk.py
│   ├── fig_yj.py
│   ├── jsonplotter.py
│   ├── karen.py
│   ├── mcsizeplotter.py
│   ├── mpm.py
│   ├── mpm_play.py
│   ├── priorstrength.py
│   ├── samplesize.py
│   ├── simpleplotter.py
│   ├── tcga.py
│   └── treevbnet.py
├── include
│   └── utils.h
├── lib
│   └── libdai.so
├── samcnet
│   ├── __init__.py
│   ├── bayesnet.pxd
│   ├── bayesnet.pyx
│   ├── bayesnet.so
│   ├── bayesnetcpd.pxd
│   ├── bayesnetcpd.pyx
│   ├── bayesnetcpd.so
│   ├── calibrate.py
│   ├── csnet.pxd
│   ├── dai_bind.pxd
│   ├── data.py
│   ├── generator.py
│   ├── lori.py
│   ├── metropolis.py
│   ├── mh.pyx
│   ├── mh.so
│   ├── mixturepoisson.pyx
│   ├── mixturepoisson.so
│   ├── probability.pxd
│   ├── probability.pyx
│   ├── probability.so
│   ├── pydai.pxd
│   ├── pydai.pyx
│   ├── pydai.so
│   ├── report.py
│   ├── samc.pyx
│   ├── samc.so
│   ├── simple.py
│   ├── tail.py
│   ├── tcga-parser.py
│   ├── treenet.py
│   ├── utils.cpp
│   └── utils.py
├── tests
│   ├── __init__.py
│   ├── all_poisson.py
│   ├── cov_poisson.py
│   ├── dai_test.py
│   ├── ex_data_0.csv
│   ├── ex_data_1.csv
│   ├── ex_data_predict.csv
│   ├── example.py
│   ├── mpm_yousef.py
│   ├── poisson.py
│   ├── poisson_synth.py
│   ├── test_class.py
│   ├── test_net.py
│   ├── test_poisson.py
│   ├── test_simple.py
│   ├── test_tree.py
│   └── treevbnet.py
├── waf
└── wscript

/.gitignore:
--------------------------------------------------------------------------------
 1 | build
 2 | .waf*
 3 | .lock*
 4 | *.pyc
 5 | cde.options
 6 | cde-package
 7 | *.so
 8 | .exps
 9 | .tmp
10 | config.py
11 | env
12 | mon.py
13 | samcnet/wishart.py
14 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "deps/libdai"]
2 | 	path = deps/libdai
3 | 	url = git://github.com/binarybana/libDAI.git
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | Samcnet is licensed under the MIT License:
 2 | 
 3 | > Copyright (c) 2012: Jason Knight and other contributors
 4 | > 
 5 | > Permission is hereby granted, free of charge, to any person obtaining
 6 | > a copy of this software and associated documentation files (the
 7 | > "Software"), to deal in the Software without restriction, including
 8 | > without limitation the rights to use, copy, modify, merge, publish,
 9 | > distribute, sublicense, and/or sell copies of the Software, and to
10 | > permit persons to whom the Software is furnished to do so, subject to
11 | > the following conditions:
12 | > 
13 | > The above copyright notice and this permission notice shall be
14 | > included in all copies or substantial portions of the Software.
15 | > 
16 | > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | > EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | > MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | > NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | > LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | > OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | > WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## SAMCNet
 2 | 
 3 | This package started as a toolkit and demonstration of Bayesian model averaging
 4 | applied to a class of graphical models known as Bayesian networks. I then added
 5 | functionality to perform optimal Bayesian classification for a publication
 6 | [[Knight, Ivanov, Dougherty
 7 | 2014]](http://www.biomedcentral.com/1471-2105/15/401).
 8 | In other words, it can handle classification of RNA-Seq data using the
 9 | published statistical model, which shows superior performance compared to
10 | nonlinear SVM, LDA, and others.
11 | 
12 | Both of these functionalities still work, although for cutting-edge
13 | development, effort has moved over to the Julia ports for classification
14 | [(OBC.jl)](https://github.com/binarybana/OBC.jl), network inference
15 | [(MCBN.jl)](https://github.com/binarybana/MCBN.jl), and a package split off to
16 | contain the MCMC methods at the API resolution I needed
17 | [(SAMC.jl)](https://github.com/binarybana/SAMC.jl).
18 | 
19 | ## Installing
20 | To use the classification component of the library on a recent version of
21 | Ubuntu, you'll need the following:
22 | ```
23 | sudo apt-get install cython python-pandas python-numpy python-scipy
24 | git clone git://github.com/binarybana/samcnet.git
25 | cd samcnet
26 | ./waf configure
27 | ./waf
28 | export LD_LIBRARY_PATH=lib:build
29 | ```
30 | 
31 | Then test with
32 | ```
33 | python -m tests.example
34 | ```
35 | 
36 | ### Usage
37 | 
38 | A video tutorial explaining how to operate the classifier on your RNA-Seq
39 | dataset has been posted at: http://www.youtube.com/watch?v=fPa5qy1tdhY
40 | 
41 | 
42 | ### Network Inference (Deprecated)
43 | 
44 | If you'd like to use the network inference component, I highly recommend using
45 | the (non-abandoned) Julia port
46 | [(MCBN.jl)](https://github.com/binarybana/MCBN.jl), but if you'd like to try
47 | this version, it looks something like the following:
48 | 
49 | ```
50 | sudo apt-get install python-networkx libboost-dev libboost-program-options-dev
51 | libboost-test-dev libjudy-dev libgmp-dev
52 | cd samcnet
53 | git submodule update --init
54 | cd deps/libdai
55 | cp Makefile.LINUX Makefile.conf
56 | make -j
57 | cd ../..
58 | ln -s ../deps/libdai/lib/libdai.so lib/
59 | for f in build/*.so; do ln -s ../$f samcnet/; done
60 | ```
61 | 
62 | ## Building Blocks
63 | 
64 | This software would not be possible without the following components:
65 | - Python for the main driving and glue code
66 | - Cython for C and C++ integration and speed
67 | - [libdai](http://cs.ru.nl/~jorism/libDAI/) for Bayesian network inference.
68 | - [Redis](http://redis.io) for the (optional) distributed job management 69 | - [waf](http://code.google.com/p/waf/) for the build system 70 | - rsyslog for remote logging 71 | 72 | -------------------------------------------------------------------------------- /exps/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binarybana/samcnet/84f3ba8241d416115a8aa9ba5c659a9513175072/exps/__init__.py -------------------------------------------------------------------------------- /exps/bmc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import simplejson as js 8 | import subprocess as sb 9 | from time import time,sleep 10 | from os import path 11 | from scipy.stats.mstats import mquantiles 12 | import scipy.stats.distributions as di 13 | 14 | from sklearn.lda import LDA 15 | from sklearn.feature_selection import SelectKBest, f_classif 16 | from samcnet.data import * 17 | from samcnet.calibrate import * 18 | 19 | params = {} 20 | 21 | seed = setv(params, 'seed', np.random.randint(10**8), int) 22 | rseed = setv(params, 'rseed', np.random.randint(10**8), int) 23 | 24 | # Synthetic Params 25 | Ntrn = setv(params, 'Ntrn', 1000, int) 26 | Ntst = setv(params, 'Ntst', 1000, int) 27 | mu0 = setv(params, 'mu0', np.random.randn()*0.2, float) 28 | mu1 = setv(params, 'mu1', np.random.randn()*0.2, float) 29 | sigma0 = setv(params, 'sigma0', di.invgamma.rvs(3), float) 30 | sigma1 = setv(params, 'sigma1', di.invgamma.rvs(3), float) 31 | 32 | ### For YJ #### 33 | f_glob = setv(params, 'f_glob', 10, int) 34 | subclasses = setv(params, 'subclasses', 2, int) 35 | f_het = setv(params, 'f_het', 20, int) 36 | f_rand = setv(params, 'f_rand', 20, int) 37 | rho = setv(params, 'rho', np.random.rand(), float) 38 | f_tot = setv(params, 'f_tot', f_glob+f_het*subclasses+f_rand, float) 39 | blocksize = setv(params, 'blocksize', 5, int) 40 | ############ 41 | 42 | ### For JK ### 43 | num_gen_feat = setv(params, 'num_gen_feat', 20, int) 44 | lowd = setv(params, 'lowd', 9.0, float) 45 | highd = setv(params, 'highd', 11.0, float) 46 | #kappa = setv(params, 'kappa', 2000, float) 47 | kappa = setv(params, 'kappa', 22.0, float) 48 | ############## 49 | 50 | # Final number of features 51 | num_feat = setv(params, 'num_feat', 4, int) 52 | 53 | output = {} 54 | output['errors'] = {} 55 | errors = output['errors'] 56 | #np.seterr(all='ignore') # Careful with this 57 | 58 | 59 | #################### CLASSIFICATION ################ 60 | def yj(): 61 | params['mu0'] = np.random.randn()*0.2 62 | params['mu1'] = np.random.randn()*0.2 63 | params['sigma0'] = di.invgamma.rvs(3) 64 | params['sigma1'] = di.invgamma.rvs(3) 65 | sel, rawdata, normdata = get_data(data_yj, params) 66 | norm_trn_data = normdata.loc[sel['trn'], sel['feats']] 67 | norm_tst_data = normdata.loc[sel['tst'], sel['feats']] 68 | 69 | sklda = LDA() 70 | sklda.fit(norm_trn_data, sel['trnl']) 71 | error = (1-sklda.score(norm_tst_data, sel['tstl'])) 72 | print("skLDA error: %f" % error) 73 | return error 74 | 75 | def jk(): 76 | params['mu0'] = np.random.randn()*0.2 77 | params['mu1'] = np.random.randn()*0.2 78 | params['sigma0'] = di.invgamma.rvs(3) 79 | params['sigma1'] = di.invgamma.rvs(3) 80 | sel, rawdata, normdata = get_data(data_jk, params) 81 | norm_trn_data = normdata.loc[sel['trn'], sel['feats']] 82 | norm_tst_data = normdata.loc[sel['tst'], 
sel['feats']] 83 | tst_data = rawdata.loc[sel['tst'], sel['feats']] 84 | 85 | sklda = LDA() 86 | sklda.fit(norm_trn_data, sel['trnl']) 87 | error = (1-sklda.score(norm_tst_data, sel['tstl'])) 88 | print("skLDA error: %f" % error) 89 | return error 90 | 91 | yjs = [yj() for i in range(1000)] 92 | print("") 93 | jks = [jk() for i in range(1000)] 94 | -------------------------------------------------------------------------------- /exps/exp_class.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import redis 4 | import random 5 | import numpy as np 6 | import yaml 7 | 8 | try: 9 | from samcnet import samc,lori,utils 10 | from samcnet.lori import * 11 | except ImportError as e: 12 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 13 | " directory is populated by waf.\n\n %s" % str(e)) 14 | 15 | if 'WORKHASH' in os.environ: 16 | try: 17 | server = os.environ['SERVER'] 18 | except: 19 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 20 | 21 | if 'PARAM' in os.environ: 22 | params = yaml.load(os.environ['PARAM']) 23 | iters = int(params['iters']) 24 | 25 | ## First generate true distributions and data 26 | seed = 40767 27 | np.random.seed(seed) 28 | cval, dist0, dist1 = gen_dists() 29 | np.random.seed() 30 | 31 | ## Now test Gaussian Analytic calculation 32 | gc = GaussianCls(dist0, dist1) 33 | 34 | c = GaussianSampler(dist0,dist1) 35 | s = samc.SAMCRun(c, burn=0, stepscale=1000, refden=1, thin=10) 36 | s.sample(iters, temperature=1) 37 | 38 | # Now save extra info to the database 39 | s.db.root.object.objfxn._v_attrs['seed'] = seed 40 | s.db.root.object.objfxn._v_attrs['bayes_error'] = gc.approx_error("bayes", cval) 41 | s.db.root.object.objfxn._v_attrs['posterior_error'] = gc.approx_error("true", cval) 42 | s.db.root.object.objfxn._v_attrs['sample_error_full'] = c.approx_error(s.db, cval) 43 | s.db.root.object.objfxn._v_attrs['sample_error_20'] = c.approx_error(s.db, cval, partial=20) 44 | 45 | if 'WORKHASH' in os.environ: 46 | import zmq,time 47 | ctx = zmq.Context() 48 | socket = ctx.socket(zmq.REQ) 49 | socket.connect('tcp://'+server+':7000') 50 | 51 | data = s.read_db() 52 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 53 | socket.send(data) 54 | socket.recv() 55 | socket.close() 56 | ctx.term() 57 | 58 | s.db.close() 59 | -------------------------------------------------------------------------------- /exps/exp_net.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import zlib, cPickle 3 | ############### SAMC Setup ############### 4 | import numpy as np 5 | import scipy as sp 6 | import networkx as nx 7 | 8 | from samcnet.samc import SAMCRun 9 | from samcnet.bayesnetcpd import BayesNetSampler, BayesNetCPD 10 | from samcnet import utils 11 | from samcnet.generator import * 12 | 13 | if 'WORKHASH' in os.environ: 14 | try: 15 | redis_server = os.environ['REDIS'] 16 | import redis 17 | r = redis.StrictRedis(redis_server) 18 | except: 19 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 20 | ############### /SAMC Setup ############### 21 | 22 | N = 8 23 | iters = 3e5 24 | numdata = 40 25 | priorweight = 0.0 26 | numtemplate = 10 27 | burn = 1000 28 | stepscale = 5000 29 | temperature = 10.0 30 | thin = 50 31 | refden = 0.0 32 | 33 | random.seed(12345) 34 | np.random.seed(12345) 35 | 36 | groundgraph = generateHourGlassGraph(nodes=N) 37 | #joint, states = generateJoint(groundgraph, 
method='dirichlet') 38 | joint, states = generateJoint(groundgraph, method='noisylogic') 39 | data = generateData(groundgraph, joint, numdata) 40 | template = sampleTemplate(groundgraph, numtemplate) 41 | 42 | if 'WORKHASH' in os.environ: 43 | jobhash = os.environ['WORKHASH'] 44 | if not r.hexists('jobs:grounds', jobhash): 45 | r.hset('jobs:grounds', jobhash, zlib.compress(cPickle.dumps(groundgraph))) 46 | 47 | random.seed() 48 | np.random.seed() 49 | 50 | groundbnet = BayesNetCPD(states, data, limparent=3) 51 | groundbnet.set_cpds(joint) 52 | 53 | obj = BayesNetCPD(states, data, limparent=3) 54 | 55 | b = BayesNetSampler(obj, template, groundbnet, priorweight) 56 | s = SAMCRun(b,burn,stepscale,refden,thin) 57 | s.sample(iters, temperature) 58 | 59 | s.compute_means() 60 | #s.compute_means(cummeans=False) 61 | 62 | if 'WORKHASH' in os.environ: 63 | r.lpush('jobs:done:' + jobhash, s.read_db()) 64 | s.db.close() 65 | ############# 66 | -------------------------------------------------------------------------------- /exps/exp_simple.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import redis 4 | import random 5 | import numpy as np 6 | from samcnet import samc,utils,simple 7 | 8 | if 'WORKHASH' in os.environ: 9 | try: 10 | import redis 11 | r = redis.StrictRedis(os.environ['REDIS']) 12 | except: 13 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 14 | 15 | random.seed(123) 16 | np.random.seed(123) 17 | 18 | o = simple.Simple(truemu=0.0, mu0=0.0) 19 | 20 | random.seed() 21 | np.random.seed() 22 | 23 | s = samc.SAMCRun(o, burn=100, 24 | stepscale=10000, 25 | refden=0.0, 26 | thin=100) 27 | s.sample(1e5) 28 | 29 | res = [] 30 | res.append(s.func_mean()) 31 | fm = s.func_cummean().copy() 32 | res.append(fm) 33 | 34 | res_wire = utils.prepare_data([utils.encode_element(x) for x in res]) 35 | if 'WORKHASH' in os.environ: 36 | r.lpush('jobs:done:'+os.environ['WORKHASH'], res_wire) 37 | 38 | -------------------------------------------------------------------------------- /exps/exp_tree.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import numpy as np 3 | import scipy as sp 4 | import networkx as nx 5 | import json as js 6 | import tables as t 7 | import zlib 8 | import cPickle 9 | 10 | from samcnet.samc import SAMCRun 11 | from samcnet.treenet import TreeNet, generateTree, generateData 12 | from samcnet import utils 13 | from samcnet.generator import sampleTemplate 14 | 15 | if 'WORKHASH' in os.environ: 16 | try: 17 | redis_server = os.environ['REDIS'] 18 | import redis 19 | r = redis.StrictRedis(redis_server) 20 | except: 21 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 22 | 23 | N = 10 24 | comps = 2 25 | iters = 4e5 26 | numdata = 30 27 | burn = 1000 28 | stepscale = 200 29 | temperature = 3.0 30 | thin = 10 31 | refden = 0.0 32 | numtemplate = 10 33 | priorweight = 1.0 34 | 35 | random.seed(12345) 36 | np.random.seed(12345) 37 | 38 | groundgraph = generateTree(N, comps) 39 | data = generateData(groundgraph,numdata) 40 | template = sampleTemplate(groundgraph, numtemplate) 41 | 42 | random.seed() 43 | np.random.seed() 44 | 45 | ground = TreeNet(N, data=data, graph=groundgraph) 46 | 47 | if 'WORKHASH' in os.environ: 48 | jobhash = os.environ['WORKHASH'] 49 | if not r.hexists('jobs:grounds', jobhash): 50 | r.hset('jobs:grounds', jobhash, zlib.compress(cPickle.dumps(ground))) 51 | 52 | b = TreeNet(N, data=data, 
ground=ground, priorweight=priorweight, 53 | template=template, verbose=True) 54 | s = SAMCRun(b,burn,stepscale,refden,thin,verbose=True) 55 | s.sample(iters, temperature) 56 | 57 | 58 | s.compute_means() 59 | 60 | if 'WORKHASH' in os.environ: 61 | r.lpush('jobs:done:' + jobhash, s.read_db()) 62 | -------------------------------------------------------------------------------- /exps/expr_sweep.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import simplejson as js 8 | import subprocess as sb 9 | from time import time,sleep 10 | from os import path 11 | from scipy.stats.mstats import mquantiles 12 | 13 | try: 14 | from sklearn.lda import LDA 15 | from sklearn.svm import SVC 16 | from sklearn.neighbors import KNeighborsClassifier as KNN 17 | from sklearn.feature_selection import SelectKBest, f_classif 18 | 19 | import samcnet.mh as mh 20 | from samcnet.mixturepoisson import * 21 | from samcnet.lori import * 22 | from samcnet.data import * 23 | except ImportError as e: 24 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 25 | " directory is populated by waf.\n\n %s" % str(e)) 26 | 27 | if 'WORKHASH' in os.environ: 28 | try: 29 | server = os.environ['SERVER'] 30 | except: 31 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 32 | 33 | if 'PARAM' in os.environ: 34 | params = yaml.load(os.environ['PARAM']) 35 | else: 36 | params = {} 37 | 38 | iters = setv(params, 'iters', int(1e4), int) 39 | 40 | num_feat = setv(params, 'num_feat', 2, int) 41 | seed = setv(params, 'seed', np.random.randint(10**8), int) 42 | rseed = setv(params, 'rseed', np.random.randint(10**8), int) 43 | 44 | Ntrn = setv(params, 'Ntrn', 20, int) 45 | Ntst = setv(params, 'Ntst', 3000, int) 46 | muadd = setv(params, 'muadd', 0.0, float) 47 | mu0 = setv(params, 'mu0', -2.0 + muadd, float) 48 | mu1 = setv(params, 'mu1', -1.0 + muadd, float) 49 | sigma0 = setv(params, 'sigma0', 0.2, float) 50 | sigma1 = setv(params, 'sigma1', 0.6, float) 51 | kappa = setv(params, 'kappa', 10.0, float) 52 | 53 | lowd = setv(params, 'lowd', 9.0, float) 54 | highd = setv(params, 'highd', 11.0, float) 55 | 56 | output = {} 57 | output['errors'] = {} 58 | errors = output['errors'] 59 | np.seterr(all='ignore') # Careful with this 60 | rseed = np.random.randint(10**8) 61 | 62 | t1 = time() 63 | 64 | trn_data, trn_labels, tst_data, tst_labels = data_jk(params) 65 | norm_trn_data, norm_tst_data = norm(trn_data, tst_data) 66 | 67 | norm_trn_data0, norm_trn_data1 = split(norm_trn_data, trn_labels) 68 | norm_tst_data0, norm_tst_data1 = split(norm_tst_data, tst_labels) 69 | trn_data0, trn_data1 = split(trn_data, trn_labels) 70 | tst_data0, tst_data1 = split(tst_data, tst_labels) 71 | 72 | #################### CLASSIFICATION ################ 73 | sklda = LDA() 74 | skknn = KNN(3, warn_on_equidistant=False) 75 | sksvm = SVC() 76 | sklda.fit(norm_trn_data, trn_labels) 77 | skknn.fit(norm_trn_data, trn_labels) 78 | sksvm.fit(norm_trn_data, trn_labels) 79 | errors['lda'] = (1-sklda.score(norm_tst_data, tst_labels)) 80 | errors['knn'] = (1-skknn.score(norm_tst_data, tst_labels)) 81 | errors['svm'] = (1-sksvm.score(norm_tst_data, tst_labels)) 82 | print("skLDA error: %f" % errors['lda']) 83 | print("skKNN error: %f" % errors['knn']) 84 | print("skSVM error: %f" % errors['svm']) 85 | 86 | kappa = 10 87 | bayes0 = GaussianBayes(np.zeros(num_feat), 1, kappa, 
np.eye(num_feat)*(kappa-1-num_feat), norm_trn_data0) 88 | bayes1 = GaussianBayes(np.zeros(num_feat), 1, kappa, np.eye(num_feat)*(kappa-1-num_feat), norm_trn_data1) 89 | 90 | # Gaussian Analytic 91 | gc = GaussianCls(bayes0, bayes1) 92 | errors['gauss'] = gc.approx_error_data(norm_tst_data, tst_labels) 93 | print("Gaussian Analytic error: %f" % errors['gauss']) 94 | 95 | # MPM Model 96 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08) 97 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08) 98 | mpm = MPMCls(dist0, dist1) 99 | mhmc = mh.MHRun(mpm, burn=1000, thin=50) 100 | mhmc.sample(iters,verbose=False) 101 | errors['mpm'] = mpm.approx_error_data(mhmc.db, tst_data, tst_labels,numlam=50) 102 | print("MPM Sampler error: %f" % errors['mpm']) 103 | 104 | output['acceptance'] = float(mhmc.accept_loc)/mhmc.total_loc 105 | mhmc.clean_db() 106 | 107 | # MPM Model 108 | priorsigma = np.ones(4)*0.1 109 | pm0 = np.ones(4) * mu0 110 | pm1 = np.ones(4) * mu1 111 | ud = True 112 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=200,lammove=0.01,mumove=0.08,#S=S0,kappa=kappa, 113 | priormu=pm0,priorsigma=priorsigma, usedata=ud) 114 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=200,lammove=0.01,mumove=0.08,#S=S1,kappa=kappa, 115 | priormu=pm1, priorsigma=priorsigma, usedata=ud) 116 | mpm = MPMCls(dist0, dist1) 117 | mhmc = mh.MHRun(mpm, burn=1000, thin=50) 118 | mhmc.sample(iters,verbose=False) 119 | errors['mpm_prior'] = mpm.approx_error_data(mhmc.db, tst_data, tst_labels,numlam=50) 120 | print("MPM prior Sampler error: %f" % errors['mpm_prior']) 121 | output['acceptance_prior'] = float(mhmc.accept_loc)/mhmc.total_loc 122 | 123 | output['seed'] = seed 124 | output['time'] = time()-t1 125 | 126 | def jitter(x): 127 | rand = np.random.randn 128 | return x + rand(*x.shape)*0.0 129 | 130 | def myplot(ax,g,data0,data1,gext): 131 | ax.plot(data0[:,0], data0[:,1], 'g.',label='0', alpha=0.5) 132 | ax.plot(data1[:,0], data1[:,1], 'r.',label='1', alpha=0.5) 133 | ax.legend(fontsize=8, loc='best') 134 | 135 | im = ax.imshow(g, extent=gext, aspect=1.0, origin='lower') 136 | #p.colorbar(im,ax=ax) 137 | ax.contour(g, extent=gext, aspect=1.0, origin='lower') 138 | #ax.contour(g, [0.0], extent=gext, aspect=1.0, origin='lower', cmap = p.cm.gray) 139 | 140 | def plot_all(n, gext, grid, data0, data1, g0, g1, gavg): 141 | Z = np.exp(g0)+np.exp(g1) 142 | eg0 = np.exp(g0)/Z 143 | eg1 = np.exp(g1)/Z 144 | err = np.minimum(eg0,eg1) 145 | err = err.reshape(-1,n) 146 | 147 | lx,hx,ly,hy = gext 148 | asp = float(hx-lx) / (hy-ly) 149 | alp = 1.0 150 | ms = 8 151 | 152 | p.figure() 153 | p.subplot(2,2,1) 154 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 155 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 156 | p.legend(fontsize=8, loc='best') 157 | #p.contour(gavg, extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 158 | #p.contour(gavg, [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 159 | #p.imshow(gavg, extent=gext, aspect=1, origin='lower') 160 | #p.imshow(g0.reshape(-1,n), extent=gext, aspect=asp, origin='lower') 161 | #p.colorbar() 162 | p.contour(g0.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Greens) 163 | 164 | p.subplot(2,2,2) 165 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 166 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 167 | p.legend(fontsize=8, loc='best') 168 | #p.contour(g0.reshape(-1,n), extent=gext, aspect=1, 
origin='lower', cmap = p.cm.Greens) 169 | #p.contour(g1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 170 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 171 | #p.imshow((g1-g0).reshape(-1,n), extent=gext, aspect=1, origin='lower') 172 | #p.imshow(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower') 173 | #p.colorbar() 174 | p.contour(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Reds) 175 | 176 | p.subplot(2,2,3) 177 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 178 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 179 | p.legend(fontsize=8, loc='best') 180 | #p.imshow(err, extent=gext, origin='lower', aspect=asp) 181 | #p.colorbar() 182 | p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 183 | #p.contour(eg0.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Greens) 184 | #p.contour(eg1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 185 | 186 | p.subplot(2,2,4) 187 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 188 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 189 | p.legend(fontsize=8, loc='best') 190 | p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 191 | CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 192 | p.clabel(CS, inline=1, fontsize=10, aspect=asp) 193 | p.show() 194 | 195 | def plot_concise(n, gext, grid, data0, data1, g0, g1, gavg): 196 | p.figure() 197 | Z = np.exp(g0)+np.exp(g1) 198 | eg0 = np.exp(g0)/Z 199 | eg1 = np.exp(g1)/Z 200 | err = np.minimum(eg0,eg1) 201 | err = err.reshape(-1,n) 202 | ms=8 203 | 204 | lx,hx,ly,hy = gext 205 | asp = float(hx-lx) / (hy-ly) 206 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 207 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 208 | p.legend(fontsize=8, loc='best') 209 | 210 | cont = (g0.max() + g1.max()) / 2.0 - 0.6 211 | #print("g0.max() = %f" % g0.max()) 212 | #print("g1.max() = %f" % g1.max()) 213 | #print("cont = %f" % cont) 214 | p.contour(g0.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 215 | p.contour(g1.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 216 | p.imshow(err, extent=gext, origin='lower', aspect=asp, alpha=0.4, cmap = p.cm.Reds) 217 | p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray, linewidth=15.0) 218 | CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 219 | p.clabel(CS, inline=1, fontsize=10, aspect=asp) 220 | p.show() 221 | 222 | n,gext,grid = get_grid_data(np.vstack(( trn_data0, trn_data1 )), positive=True) 223 | gavg = mpm.calc_gavg(mhmc.db, grid, numlam=20).reshape(-1,n) 224 | myplot(p.subplot(1,1,1),gavg,trn_data0,trn_data1,gext) 225 | 226 | #g0 = mpm.dist0.calc_db_g(mhmc.db, mhmc.db.root.object.dist0, grid) 227 | #g1 = mpm.dist1.calc_db_g(mhmc.db, mhmc.db.root.object.dist1, grid) 228 | 229 | #myplot(p.subplot(3,1,3),err.reshape(-1,n),jitter(tst_data0),jitter(tst_data1),gext) 230 | 231 | #plot_all(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 232 | #plot_concise(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 233 | 234 | p.figure() 235 | n,gext,grid = get_grid_data(np.vstack(( norm_trn_data0, norm_trn_data1 )), positive=False) 236 | 
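# The plotting below follows one pattern used throughout these scripts: build a
# regular grid of feature vectors covering the data (get_grid_data returns the
# grid resolution n, the plot extent gext, and the grid points), evaluate a
# classifier's discriminant at every grid point, and reshape the result to n
# columns for imshow/contour. A minimal sketch of an equivalent grid builder
# for 2-D data (grid_data_sketch is a hypothetical stand-in; the real
# get_grid_data imported via samcnet may differ in padding and resolution):
def grid_data_sketch(data, n=30, pad=1.0):
    # data: (N, 2) array; returns resolution, extent tuple, and (n*n, 2) points
    lx, ly = data.min(axis=0) - pad
    hx, hy = data.max(axis=0) + pad
    X, Y = np.meshgrid(np.linspace(lx, hx, n), np.linspace(ly, hy, n))
    return n, (lx, hx, ly, hy), np.column_stack((X.ravel(), Y.ravel()))
# With such a grid, sksvm.decision_function(grid).reshape(-1, n) is a 2-D array
# of decision values aligned with the extent gext, which is what myplot draws.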
myplot(p.gca(),sksvm.decision_function(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 237 | p.figure() 238 | myplot(p.gca(),gc.calc_gavg(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 239 | p.show() 240 | 241 | #Plot data 242 | # 243 | p.figure() 244 | n,gext,grid = get_grid_data(np.vstack(( trn_data0, trn_data1 )), positive=True) 245 | p.plot(jitter(trn_data0[:,0]), jitter(trn_data0[:,1]), 'go') 246 | p.plot(jitter(trn_data1[:,0]), jitter(trn_data1[:,1]), 'ro') 247 | p.figure() 248 | p.plot(jitter(tst_data0[:,0]), jitter(tst_data0[:,1]), 'go') 249 | p.plot(jitter(tst_data1[:,0]), jitter(tst_data1[:,1]), 'ro') 250 | p.show() 251 | 252 | #p.figure() 253 | #myplot(p.subplot(1,1,1),gavg,jitter(tst_data0),jitter(tst_data1),gext) 254 | #p.axis(gext) 255 | #p.figure() 256 | #mpm.dist0.plot_traces(mhmc.db, '/object/dist0', ['mu','lam','sigma']) 257 | #mpm.dist1.plot_traces(mhmc.db, '/object/dist1', ['mu','lam']) 258 | #p.show() 259 | 260 | if 'WORKHASH' in os.environ: 261 | import zmq 262 | ctx = zmq.Context() 263 | socket = ctx.socket(zmq.REQ) 264 | socket.connect('tcp://'+server+':7000') 265 | 266 | wiredata = zlib.compress(js.dumps(output)) 267 | #wiredata = s.read_db() 268 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 269 | socket.send(wiredata) 270 | socket.recv() 271 | socket.close() 272 | ctx.term() 273 | 274 | mhmc.clean_db() 275 | 276 | -------------------------------------------------------------------------------- /exps/fig_jk.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import simplejson as js 8 | import subprocess as sb 9 | from time import time,sleep 10 | from os import path 11 | from scipy.stats.mstats import mquantiles 12 | import scipy.stats.distributions as di 13 | 14 | try: 15 | from sklearn.lda import LDA 16 | from sklearn.svm import SVC 17 | from sklearn.neighbors import KNeighborsClassifier as KNN 18 | from sklearn.feature_selection import SelectKBest, f_classif 19 | 20 | import samcnet.mh as mh 21 | from samcnet.mixturepoisson import * 22 | from samcnet.lori import * 23 | from samcnet.data import * 24 | from samcnet.calibrate import * 25 | except ImportError as e: 26 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 27 | " directory is populated by waf.\n\n %s" % str(e)) 28 | 29 | if 'WORKHASH' in os.environ: 30 | try: 31 | server = os.environ['SERVER'] 32 | except: 33 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 34 | 35 | if 'PARAM' in os.environ: 36 | params = yaml.load(os.environ['PARAM']) 37 | else: 38 | params = {} 39 | 40 | iters = setv(params, 'iters', int(1e4), int) 41 | 42 | seed = setv(params, 'seed', np.random.randint(10**8), int) 43 | rseed = setv(params, 'rseed', np.random.randint(10**8), int) 44 | 45 | # Synthetic Params 46 | Ntrn = setv(params, 'Ntrn', 20, int) 47 | Ntst = setv(params, 'Ntst', 3000, int) 48 | mu0 = setv(params, 'mu0', np.random.randn()*0.2, float) 49 | mu1 = setv(params, 'mu1', np.random.randn()*0.2, float) 50 | sigma0 = setv(params, 'sigma0', di.invgamma.rvs(3), float) 51 | sigma1 = setv(params, 'sigma1', di.invgamma.rvs(3), float) 52 | 53 | ### For YJ #### 54 | f_glob = setv(params, 'f_glob', 10, int) 55 | subclasses = setv(params, 'subclasses', 2, int) 56 | f_het = setv(params, 'f_het', 20, int) 57 | f_rand = setv(params, 'f_rand', 20, int) 58 | rho = setv(params, 'rho', np.random.rand(), float) 59 | f_tot = 
setv(params, 'f_tot', f_glob+f_het*subclasses+f_rand, float) 60 | blocksize = setv(params, 'blocksize', 5, int) 61 | ############ 62 | 63 | ### For JK ### 64 | num_gen_feat = setv(params, 'num_gen_feat', 20, int) 65 | lowd = setv(params, 'lowd', 9.0, float) 66 | highd = setv(params, 'highd', 11.0, float) 67 | #kappa = setv(params, 'kappa', 2000, float) 68 | #kappa = setv(params, 'kappa', 22.0, float) 69 | ############## 70 | 71 | # Final number of features 72 | num_feat = setv(params, 'num_feat', 4, int) 73 | 74 | # MCMC 75 | mumove = setv(params, 'mumove', 0.08, float) 76 | lammove = setv(params, 'lammove', 0.01, float) 77 | priorkappa = setv(params, 'priorkappa', 150, int) 78 | burn = setv(params, 'burn', 3000, int) 79 | thin = setv(params, 'thin', 40, int) 80 | numlam = setv(params, 'numlam', 40, int) 81 | 82 | output = {} 83 | output['errors'] = {} 84 | errors = output['errors'] 85 | np.seterr(all='ignore') # Careful with this 86 | 87 | sel, rawdata, normdata = get_data(data_yj, params) 88 | 89 | norm_trn_data = normdata.loc[sel['trn'], sel['feats']] 90 | norm_tst_data = normdata.loc[sel['tst'], sel['feats']] 91 | tst_data = rawdata.loc[sel['tst'], sel['feats']] 92 | 93 | t1 = time() 94 | #################### CLASSIFICATION ################ 95 | sklda = LDA() 96 | skknn = KNN(3, warn_on_equidistant=False) 97 | sksvm = SVC() 98 | sklda.fit(norm_trn_data, sel['trnl']) 99 | skknn.fit(norm_trn_data, sel['trnl']) 100 | sksvm.fit(norm_trn_data, sel['trnl']) 101 | errors['lda'] = (1-sklda.score(norm_tst_data, sel['tstl'])) 102 | errors['knn'] = (1-skknn.score(norm_tst_data, sel['tstl'])) 103 | errors['svm'] = (1-sksvm.score(norm_tst_data, sel['tstl'])) 104 | print("skLDA error: %f" % errors['lda']) 105 | print("skKNN error: %f" % errors['knn']) 106 | print("skSVM error: %f" % errors['svm']) 107 | 108 | lorikappa = 10 109 | bayes0 = GaussianBayes(np.zeros(num_feat), 1, lorikappa, 110 | np.eye(num_feat)*(lorikappa-1-num_feat), 111 | normdata.loc[sel['trn0'], sel['feats']]) 112 | bayes1 = GaussianBayes(np.zeros(num_feat), 1, lorikappa, 113 | np.eye(num_feat)*(lorikappa-1-num_feat), 114 | normdata.loc[sel['trn1'], sel['feats']]) 115 | 116 | # Gaussian Analytic 117 | gc = GaussianCls(bayes0, bayes1) 118 | errors['gauss'] = gc.approx_error_data(norm_tst_data, sel['tstl']) 119 | print("Gaussian Analytic error: %f" % errors['gauss']) 120 | 121 | # MPM Model 122 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 123 | lammove=lammove,mumove=mumove) 124 | dist1 = MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 125 | lammove=lammove,mumove=mumove) 126 | mpm = MPMCls(dist0, dist1) 127 | mhmc = mh.MHRun(mpm, burn=burn, thin=thin) 128 | mhmc.sample(iters,verbose=False) 129 | errors['mpm'] = mpm.approx_error_data(mhmc.db, tst_data, sel['tstl'],numlam=numlam) 130 | print("MPM Sampler error: %f" % errors['mpm']) 131 | 132 | output['acceptance'] = float(mhmc.accept_loc)/mhmc.total_loc 133 | mhmc.clean_db() 134 | ######################################## 135 | ######################################## 136 | ######################################## 137 | ######################################## 138 | ######################################## 139 | # Calibrated MPM Model 140 | p0, p1 = calibrate(rawdata, sel, params) 141 | 142 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 143 | lammove=lammove,mumove=mumove,**p0) 144 | dist1 = MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 145 | lammove=lammove,mumove=mumove,**p1) 
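# Each MPM variant in this script repeats the same five steps: wrap the two
# per-class MPMDist objects in an MPMCls, sample the posterior with
# Metropolis-Hastings, average the holdout error over numlam posterior draws,
# record the acceptance rate, and remove the sample database. A small helper
# capturing that pattern (a refactoring sketch built from exactly the calls
# used inline above and below; run_mpm_sketch itself is not part of samcnet):
def run_mpm_sketch(d0, d1, tst_data, tst_labels):
    cls = MPMCls(d0, d1)                  # two-class optimal Bayesian classifier
    sampler = mh.MHRun(cls, burn=burn, thin=thin)
    sampler.sample(iters, verbose=False)  # MCMC over the model parameters
    err = cls.approx_error_data(sampler.db, tst_data, tst_labels, numlam=numlam)
    accept = float(sampler.accept_loc) / sampler.total_loc
    sampler.clean_db()
    return err, accept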
146 | mpmc = MPMCls(dist0, dist1) 147 | mhmcc = mh.MHRun(mpmc, burn=burn, thin=thin) 148 | mhmcc.sample(iters,verbose=False) 149 | errors['mpmc_calib'] = mpmc.approx_error_data(mhmcc.db, tst_data, sel['tstl'],numlam=numlam) 150 | print("mpmc Calibrated error: %f" % errors['mpmc_calib']) 151 | 152 | output['acceptance_calib'] = float(mhmcc.accept_loc)/mhmcc.total_loc 153 | mhmcc.clean_db() 154 | ######################################## 155 | ######################################## 156 | ######################################## 157 | ######################################## 158 | ######################################## 159 | priorsigma = np.ones(4)*0.1 160 | pm0 = np.ones(4) * mu0 161 | pm1 = np.ones(4) * mu1 162 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 163 | lammove=lammove,mumove=mumove, 164 | priormu=pm0,priorsigma=priorsigma) 165 | dist1 = MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 166 | lammove=lammove,mumove=mumove, 167 | priormu=pm1,priorsigma=priorsigma) 168 | mpmp = MPMCls(dist0, dist1) 169 | mhmcp = mh.MHRun(mpmp, burn=burn, thin=thin) 170 | mhmcp.sample(iters,verbose=False) 171 | errors['mpm_prior'] = mpmp.approx_error_data(mhmcp.db, tst_data, sel['tstl'],numlam=numlam) 172 | print("MPM prior Sampler error: %f" % errors['mpm_prior']) 173 | output['acceptance_prior'] = float(mhmcp.accept_loc)/mhmcp.total_loc 174 | mhmcp.clean_db() 175 | ######################################## 176 | ######################################## 177 | ######################################## 178 | ######################################## 179 | 180 | output['seed'] = seed 181 | output['time'] = time()-t1 182 | 183 | if 'WORKHASH' in os.environ: 184 | import zmq 185 | ctx = zmq.Context() 186 | socket = ctx.socket(zmq.REQ) 187 | socket.connect('tcp://'+server+':7000') 188 | 189 | wiredata = zlib.compress(js.dumps(output)) 190 | #wiredata = s.read_db() 191 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 192 | socket.send(wiredata) 193 | socket.recv() 194 | socket.close() 195 | ctx.term() 196 | 197 | 198 | -------------------------------------------------------------------------------- /exps/fig_yj.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import simplejson as js 8 | import subprocess as sb 9 | from time import time,sleep 10 | from os import path 11 | from scipy.stats.mstats import mquantiles 12 | 13 | try: 14 | from sklearn.lda import LDA 15 | from sklearn.svm import SVC 16 | from sklearn.neighbors import KNeighborsClassifier as KNN 17 | from sklearn.feature_selection import SelectKBest, f_classif 18 | 19 | import samcnet.mh as mh 20 | from samcnet.mixturepoisson import * 21 | from samcnet.lori import * 22 | from samcnet.data import * 23 | except ImportError as e: 24 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 25 | " directory is populated by waf.\n\n %s" % str(e)) 26 | 27 | if 'WORKHASH' in os.environ: 28 | try: 29 | server = os.environ['SERVER'] 30 | except: 31 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 32 | 33 | if 'PARAM' in os.environ: 34 | params = yaml.load(os.environ['PARAM']) 35 | else: 36 | params = {} 37 | 38 | iters = setv(params, 'iters', int(1e4), int) 39 | 40 | num_feat = setv(params, 'num_feat', 4, int) 41 | seed = setv(params, 'seed', np.random.randint(10**8), int) 42 | rseed = setv(params, 'rseed', 
np.random.randint(10**8), int) 43 | 44 | Ntrn = setv(params, 'Ntrn', 10, int) 45 | Ntst = setv(params, 'Ntst', 3000, int) 46 | f_glob = setv(params, 'f_glob', 5, int) 47 | subclasses = setv(params, 'subclasses', 0, int) 48 | f_het = setv(params, 'f_het', 0, int) 49 | f_rand = setv(params, 'f_rand', 10, int) 50 | rho = setv(params, 'rho', 0.6, float) 51 | f_tot = setv(params, 'f_tot', f_glob+f_het*subclasses+f_rand, float) 52 | blocksize = setv(params, 'blocksize', 5, int) 53 | mu0 = setv(params, 'mu0', -2.0, float) 54 | mu1 = setv(params, 'mu1', -1.0, float) 55 | sigma0 = setv(params, 'sigma0', 0.2, float) 56 | sigma1 = setv(params, 'sigma1', 0.6, float) 57 | 58 | lowd = setv(params, 'lowd', 9.0, float) 59 | highd = setv(params, 'highd', 11.0, float) 60 | 61 | output = {} 62 | output['errors'] = {} 63 | errors = output['errors'] 64 | np.seterr(all='ignore') # Careful with this 65 | rseed = np.random.randint(10**8) 66 | 67 | t1 = time() 68 | 69 | trn_data, trn_labels, tst_data, tst_labels = data_yj(params) 70 | norm_trn_data, norm_tst_data = norm(trn_data, tst_data) 71 | 72 | norm_trn_data0, norm_trn_data1 = split(norm_trn_data, trn_labels) 73 | norm_tst_data0, norm_tst_data1 = split(norm_tst_data, tst_labels) 74 | trn_data0, trn_data1 = split(trn_data, trn_labels) 75 | tst_data0, tst_data1 = split(tst_data, tst_labels) 76 | 77 | #################### CLASSIFICATION ################ 78 | sklda = LDA() 79 | skknn = KNN(3, warn_on_equidistant=False) 80 | sksvm = SVC() 81 | sklda.fit(norm_trn_data, trn_labels) 82 | skknn.fit(norm_trn_data, trn_labels) 83 | sksvm.fit(norm_trn_data, trn_labels) 84 | errors['lda'] = (1-sklda.score(norm_tst_data, tst_labels)) 85 | errors['knn'] = (1-skknn.score(norm_tst_data, tst_labels)) 86 | errors['svm'] = (1-sksvm.score(norm_tst_data, tst_labels)) 87 | print("skLDA error: %f" % errors['lda']) 88 | print("skKNN error: %f" % errors['knn']) 89 | print("skSVM error: %f" % errors['svm']) 90 | 91 | kappa = 10 92 | bayes0 = GaussianBayes(np.zeros(num_feat), 1, kappa, np.eye(num_feat)*(kappa-1-num_feat), norm_trn_data0) 93 | bayes1 = GaussianBayes(np.zeros(num_feat), 1, kappa, np.eye(num_feat)*(kappa-1-num_feat), norm_trn_data1) 94 | 95 | # Gaussian Analytic 96 | gc = GaussianCls(bayes0, bayes1) 97 | errors['gauss'] = gc.approx_error_data(norm_tst_data, tst_labels) 98 | print("Gaussian Analytic error: %f" % errors['gauss']) 99 | 100 | # MPM Model 101 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=180,lammove=0.002,mumove=0.08) 102 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=180,lammove=0.002,mumove=0.08) 103 | mpm = MPMCls(dist0, dist1) 104 | mhmc = mh.MHRun(mpm, burn=1000, thin=50) 105 | mhmc.sample(iters,verbose=False) 106 | errors['mpm'] = mpm.approx_error_data(mhmc.db, tst_data, tst_labels,numlam=50) 107 | print("MPM Sampler error: %f" % errors['mpm']) 108 | output['acceptance'] = float(mhmc.accept_loc)/mhmc.total_loc 109 | mhmc.clean_db() 110 | 111 | kappa = 200 112 | S0 = (np.ones(4) + (np.eye(4)-1)*0.4) * (kappa - 4 - 1) *0.2 113 | S1 = (np.ones(4) + (np.eye(4)-1)*0.4) * (kappa - 4 - 1) *0.6 114 | priormu1 = np.ones(4)*0.6 115 | priorsigma = np.ones(4) * 0.1 116 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=280,lammove=0.002,mumove=0.08,S=S0,kappa=kappa, 117 | priorsigma=priorsigma) 118 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=280,lammove=0.002,mumove=0.08,S=S1,kappa=kappa, 119 | priormu=priormu1, priorsigma=priorsigma) 120 | mpm = MPMCls(dist0, dist1) 121 | mhmc = mh.MHRun(mpm, burn=1000, thin=50) 122 | mhmc.sample(iters,verbose=False) 123 | 
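# A note on the S0/S1 prior matrices constructed above: (np.ones(4) +
# (np.eye(4)-1)*0.4) is a correlation-like matrix with 1.0 on the diagonal and
# 0.6 off it. If S and kappa parameterize an inverse-Wishart prior on the
# covariance, whose mean is S/(kappa - p - 1) for p features (the usual
# convention; MPMDist's internals define the exact one), then the
# (kappa - 4 - 1) factor makes that prior mean land on the target scales 0.2
# and 0.6. A quick check of the arithmetic:
R = np.ones(4) + (np.eye(4) - 1) * 0.4
assert np.allclose(S0 / (kappa - 4 - 1), 0.2 * R)
assert np.allclose(S1 / (kappa - 4 - 1), 0.6 * R)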
errors['mpm_prior'] = mpm.approx_error_data(mhmc.db, tst_data, tst_labels,numlam=50) 124 | print("MPM prior Sampler error: %f" % errors['mpm_prior']) 125 | output['acceptance_prior'] = float(mhmc.accept_loc)/mhmc.total_loc 126 | mhmc.clean_db() 127 | 128 | output['seed'] = seed 129 | output['time'] = time()-t1 130 | 131 | if 'WORKHASH' in os.environ: 132 | import zmq 133 | ctx = zmq.Context() 134 | socket = ctx.socket(zmq.REQ) 135 | socket.connect('tcp://'+server+':7000') 136 | 137 | wiredata = zlib.compress(js.dumps(output)) 138 | #wiredata = s.read_db() 139 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 140 | socket.send(wiredata) 141 | socket.recv() 142 | socket.close() 143 | ctx.term() 144 | 145 | 146 | -------------------------------------------------------------------------------- /exps/jsonplotter.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | import simplejson as js 4 | import pandas as pa 5 | import yaml 6 | 7 | import matplotlib as mpl 8 | mpl.use('Agg') 9 | import pylab as p 10 | 11 | from collections import defaultdict 12 | 13 | from jobmon import redisbackend as rb 14 | 15 | p.rc('font', size=11) 16 | 17 | db = rb.RedisDataStore('localhost') 18 | jobhash = db.select_jobfile() 19 | name = raw_input("Name: ") 20 | 21 | resdir = os.path.join('/home/bana/largeresearch/results', jobhash) 22 | dependent_var = [] 23 | alldata = {} 24 | 25 | for i,paramdir in enumerate(os.listdir(resdir)): 26 | output = defaultdict(list) 27 | diffs = defaultdict(list) 28 | other = defaultdict(list) 29 | params = yaml.load(db.get_params(paramdir)) 30 | if len(params.values()) != 1: # ie: make sure this is a univariate sweep 31 | continue 32 | val = params.values()[0] 33 | dependent_var.append(val) 34 | for fname in os.listdir(os.path.join(resdir,paramdir)): 35 | data = js.loads(open(os.path.join(resdir,paramdir,fname)).read()) 36 | for k,v in data.iteritems(): 37 | if k == 'errors': 38 | for kk,vv in data['errors'].iteritems(): 39 | output[kk].append(vv) 40 | else: 41 | other[k].append(v) 42 | 43 | df = pa.DataFrame(output) 44 | alldata[val] = df 45 | otherdf = pa.DataFrame(other) 46 | print(otherdf.describe()) 47 | 48 | dependent_var = np.array(dependent_var) 49 | ind = np.argsort(dependent_var)#[1:] 50 | markers = list('xD*o>s^<+') 51 | colors = list('bgrcmy') 52 | offset = 0 53 | key = {'gauss':'Normal OBC', 'svm':'RBF SVM', 'knn':'3NN', 'lda':'LDA', 'mpm':'MP OBC', 54 | 'mpm_prior':'MP OBC Prior', 'mpmc_calib':'MP OBC Calibrated', 55 | 'nngauss': 'NN Normal OBC', 'nnsvm': 'NN RBF SVM', 'nnlda': 'NN LDA', 'nnknn': 'NN 3NN'} 56 | colorkey = {'gauss':'b', 'svm':'y', 'knn':'g', 'lda':'r', 'mpm':'c', 57 | 'mpm_prior':'k', 'mpmc_calib':'m', 58 | 'nngauss':'b', 'nnsvm': 'y', 'nnlda': 'r', 'nnknn': 'g'} 59 | symkey = {'gauss':'x', 'svm':'^', 'knn':'D', 'lda':'*', 'mpm':'o', 60 | 'mpm_prior':'+', 'mpmc_calib':'s', 61 | 'nngauss':'o', 'nnsvm': 'x', 'nnlda': '^', 'nnknn': '+'} 62 | 63 | def adjust_plot(): 64 | lx,ux,ly,uy = p.axis() 65 | xl = ux - lx 66 | yl = uy - ly 67 | p.axis((lx-xl*0.1, ux+xl*0.1, ly-yl*0.1, uy+yl*0.1)) 68 | 69 | #p.close('all') 70 | df.sort_index(axis=1) 71 | 72 | offset = 0 73 | p.figure() 74 | for i in range(len(df.columns)): 75 | if df.columns[i] == 'mpm_prior':# or df.columns[i] == 'gauss': 76 | continue 77 | means = [alldata[j].mean()[i] for j in dependent_var[ind]] 78 | stds = [alldata[j].std()[i] for j in dependent_var[ind]] 79 | p.plot(dependent_var[ind], means, marker=symkey[df.columns[i]], 
color=colorkey[df.columns[i]], 80 | markersize=7, label=key[df.columns[i]]) 81 | #p.errorbar(dependent_var[ind]+offset, means, yerr=stds, marker='o', 82 | #label=key[df.columns[i]]) 83 | #offset += 0.1 84 | p.legend() 85 | #p.ylim(0.30, 0.40) # For TCGA 86 | #p.ylim(0.40, 0.46) # For HC 87 | p.ylabel('Mean separate sampling holdout error') 88 | #p.ylabel('Mean estimated true error') 89 | p.xlabel('Training samples per class') 90 | #p.xlabel('Total training samples') 91 | #p.xlabel(r'$\mu_1$') 92 | #p.xlabel('Gene expression \"strength\"') 93 | p.grid(True) 94 | adjust_plot() 95 | 96 | savename = '../class-paper/pdf/{}-{}.pdf'.format(name, jobhash[:4]) 97 | print('Saving to {}'.format(savename)) 98 | p.savefig(savename, bbox_inches='tight') 99 | 100 | #p.show() 101 | sys.exit() 102 | 103 | p.figure() 104 | for i in range(len(df.columns)): 105 | if df.columns[i] == 'mpm' or df.columns[i] == 'mpm_prior': 106 | continue 107 | diffs = [alldata[j].iloc[:,i]-alldata[j].loc[:,'mpm'] for j in dependent_var[ind]] 108 | diffdf = pa.concat(diffs, keys=dependent_var[ind], axis=1) 109 | p.errorbar(dependent_var[ind]+offset, diffdf.mean(), yerr=diffdf.std(), marker='o', 110 | label=key[df.columns[i]]) 111 | offset += 0.01 112 | p.legend() 113 | p.ylabel('Holdout error - Holdout error for MP OBC') 114 | p.xlabel('Total training samples per class') 115 | p.grid(True) 116 | adjust_plot() 117 | 118 | #p.title(jobhash[:6] + ' ' + db.get_description(jobhash)) 119 | #p.xlabel('Number of final features') 120 | #p.xlabel('mu1') 121 | 122 | 123 | p.show() 124 | -------------------------------------------------------------------------------- /exps/karen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import pandas as pa 8 | import simplejson as js 9 | import subprocess as sb 10 | from time import time,sleep 11 | from os import path 12 | from scipy.stats.mstats import mquantiles 13 | from sklearn.feature_selection import SelectKBest, f_classif 14 | 15 | from sklearn.lda import LDA 16 | from sklearn.svm import SVC 17 | from sklearn.neighbors import KNeighborsClassifier as KNN 18 | from sklearn.feature_selection import SelectKBest, f_classif 19 | 20 | import samcnet.mh as mh 21 | from samcnet.mixturepoisson import * 22 | from samcnet.lori import * 23 | from samcnet.data import * 24 | 25 | if 'WORKHASH' in os.environ: 26 | try: 27 | server = os.environ['SERVER'] 28 | except: 29 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 30 | 31 | if 'PARAM' in os.environ: 32 | params = yaml.load(os.environ['PARAM']) 33 | else: 34 | params = {} 35 | 36 | iters = setv(params, 'iters', int(1e4), int) 37 | num_feat = setv(params, 'num_feat', 4, int) 38 | rseed = setv(params, 'rseed', np.random.randint(10**8), int) 39 | seed = setv(params, 'seed', np.random.randint(10**8), int) 40 | Ntrn = setv(params, 'Ntrn', 8, int) 41 | low = setv(params, 'low_filter', 10, int) 42 | high = setv(params, 'high_filter', 30, int) 43 | num_candidates = setv(params, 'num_candidates', 50, int) 44 | 45 | np.random.seed(seed) 46 | 47 | output = {} 48 | output['errors'] = {} 49 | errors = output['errors'] 50 | np.seterr(all='ignore') # Careful with this 51 | 52 | t1 = time() 53 | 54 | trn_data, trn_labels, tst_data, tst_labels = data_karen(params) 55 | norm_trn_data, norm_tst_data = norm(trn_data, tst_data) 56 | 57 | norm_trn_data0, norm_trn_data1 = split(norm_trn_data, trn_labels) 58 | 
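# norm() and split() come from samcnet.data: norm() returns normalized copies
# of the training and test matrices (these feed the Gaussian-based classifiers
# below), and split() partitions the rows of a data matrix by binary class
# label. A minimal equivalent of split() (split_sketch is an illustrative
# stand-in, assuming 0/1 integer labels; the real helper may also accept
# pandas objects):
def split_sketch(data, labels):
    labels = np.asarray(labels)
    return data[labels == 0], data[labels == 1]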
trn_data0, trn_data1 = split(trn_data, trn_labels) 59 | tst_data0, tst_data1 = split(tst_data, tst_labels) 60 | 61 | #p.close("all") 62 | #p.figure() 63 | #p.plot(trn_data0[:,0], trn_data0[:,1], 'g.',label='0', alpha=0.5) 64 | #p.plot(trn_data1[:,0], trn_data1[:,1], 'r.',label='1', alpha=0.5) 65 | #p.legend(fontsize=8, loc='best') 66 | 67 | #p.figure() 68 | #p.plot(tst_data0[:,0], tst_data0[:,1], 'g.',label='0', alpha=0.5) 69 | #p.plot(tst_data1[:,0], tst_data1[:,1], 'r.',label='1', alpha=0.5) 70 | #p.legend(fontsize=8, loc='best') 71 | 72 | #p.show() 73 | #################### CLASSIFICATION ################ 74 | sklda = LDA() 75 | skknn = KNN(3, warn_on_equidistant=False) 76 | sksvm = SVC() 77 | sklda.fit(norm_trn_data, trn_labels) 78 | skknn.fit(norm_trn_data, trn_labels) 79 | sksvm.fit(norm_trn_data, trn_labels) 80 | errors['lda'] = (1-sklda.score(norm_tst_data, tst_labels)) 81 | errors['knn'] = (1-skknn.score(norm_tst_data, tst_labels)) 82 | errors['svm'] = (1-sksvm.score(norm_tst_data, tst_labels)) 83 | print("skLDA error: %f" % errors['lda']) 84 | print("skKNN error: %f" % errors['knn']) 85 | print("skSVM error: %f" % errors['svm']) 86 | 87 | kappa = 10 88 | bayes0 = GaussianBayes(np.zeros(num_feat), 1, kappa, np.eye(num_feat)*(kappa-1-num_feat), norm_trn_data0) 89 | bayes1 = GaussianBayes(np.zeros(num_feat), 1, kappa, np.eye(num_feat)*(kappa-1-num_feat), norm_trn_data1) 90 | 91 | # Gaussian Analytic 92 | gc = GaussianCls(bayes0, bayes1) 93 | errors['gauss'] = gc.approx_error_data(norm_tst_data, tst_labels) 94 | print("Gaussian Analytic error: %f" % errors['gauss']) 95 | 96 | # MPM Model 97 | up = True 98 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=90,lammove=0.01,mumove=0.18,d=10.0,usepriors=up) 99 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=90,lammove=0.01,mumove=0.18,d=10.0,usepriors=up) 100 | mpm = MPMCls(dist0, dist1) 101 | mhmc = mh.MHRun(mpm, burn=3000, thin=20) 102 | mhmc.sample(iters,verbose=False) 103 | errors['mpm'] = mpm.approx_error_data(mhmc.db, tst_data, tst_labels,numlam=40) 104 | print("MPM Sampler error: %f" % errors['mpm']) 105 | 106 | #p.close('all') 107 | 108 | #def jitter(x): 109 | #rand = np.random.randn 110 | #return x + rand(*x.shape)*0.0 111 | 112 | #def myplot(ax,g,data0,data1,gext): 113 | #ax.plot(data0[:,0], data0[:,1], 'g.',label='0', alpha=0.5) 114 | #ax.plot(data1[:,0], data1[:,1], 'r.',label='1', alpha=0.5) 115 | #ax.legend(fontsize=8, loc='best') 116 | 117 | #im = ax.imshow(g, extent=gext, aspect=1.0, origin='lower') 118 | ##p.colorbar(im,ax=ax) 119 | #ax.contour(g, extent=gext, aspect=1.0, origin='lower') 120 | ##ax.contour(g, [0.0], extent=gext, aspect=1.0, origin='lower', cmap = p.cm.gray) 121 | 122 | #def plot_all(n, gext, grid, data0, data1, g0, g1, gavg): 123 | #Z = np.exp(g0)+np.exp(g1) 124 | #eg0 = np.exp(g0)/Z 125 | #eg1 = np.exp(g1)/Z 126 | #err = np.minimum(eg0,eg1) 127 | #err = err.reshape(-1,n) 128 | 129 | #lx,hx,ly,hy = gext 130 | #asp = float(hx-lx) / (hy-ly) 131 | #alp = 1.0 132 | #ms = 8 133 | 134 | #p.figure() 135 | #p.subplot(2,2,1) 136 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 137 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 138 | #p.legend(fontsize=8, loc='best') 139 | ##p.contour(gavg, extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 140 | ##p.contour(gavg, [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 141 | ##p.imshow(gavg, extent=gext, aspect=1, origin='lower') 142 | ##p.imshow(g0.reshape(-1,n), extent=gext, aspect=asp, 
origin='lower') 143 | ##p.colorbar() 144 | #p.contour(g0.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Greens) 145 | 146 | #p.subplot(2,2,2) 147 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 148 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 149 | #p.legend(fontsize=8, loc='best') 150 | ##p.contour(g0.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Greens) 151 | ##p.contour(g1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 152 | ##p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 153 | ##p.imshow((g1-g0).reshape(-1,n), extent=gext, aspect=1, origin='lower') 154 | ##p.imshow(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower') 155 | ##p.colorbar() 156 | #p.contour(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Reds) 157 | 158 | #p.subplot(2,2,3) 159 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 160 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 161 | #p.legend(fontsize=8, loc='best') 162 | ##p.imshow(err, extent=gext, origin='lower', aspect=asp) 163 | ##p.colorbar() 164 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 165 | ##p.contour(eg0.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Greens) 166 | ##p.contour(eg1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 167 | 168 | #p.subplot(2,2,4) 169 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 170 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 171 | #p.legend(fontsize=8, loc='best') 172 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 173 | #CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 174 | #p.clabel(CS, inline=1, fontsize=10, aspect=asp) 175 | #p.show() 176 | 177 | #def plot_concise(n, gext, grid, data0, data1, g0, g1, gavg): 178 | #p.figure() 179 | #Z = np.exp(g0)+np.exp(g1) 180 | #eg0 = np.exp(g0)/Z 181 | #eg1 = np.exp(g1)/Z 182 | #err = np.minimum(eg0,eg1) 183 | #err = err.reshape(-1,n) 184 | #ms=8 185 | 186 | #lx,hx,ly,hy = gext 187 | #asp = float(hx-lx) / (hy-ly) 188 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 189 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 190 | #p.legend(fontsize=8, loc='best') 191 | 192 | #cont = (g0.max() + g1.max()) / 2.0 - 0.6 193 | ##print("g0.max() = %f" % g0.max()) 194 | ##print("g1.max() = %f" % g1.max()) 195 | ##print("cont = %f" % cont) 196 | #p.contour(g0.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 197 | #p.contour(g1.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 198 | #p.imshow(err, extent=gext, origin='lower', aspect=asp, alpha=0.4, cmap = p.cm.Reds) 199 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray, linewidth=15.0) 200 | #CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 201 | #p.clabel(CS, inline=1, fontsize=10, aspect=asp) 202 | #p.show() 203 | 204 | #n,gext,grid = get_grid_data(np.vstack(( trn_data0, trn_data1 )), positive=True) 205 | #gavg = mpm.calc_gavg(mhmc.db, grid, numlam=20).reshape(-1,n) 206 | #myplot(p.subplot(3,1,1),gavg,trn_data0,trn_data1,gext) 207 | 208 | #g0 = mpm.dist0.calc_db_g(mhmc.db, 
mhmc.db.root.object.dist0, grid) 209 | #g1 = mpm.dist1.calc_db_g(mhmc.db, mhmc.db.root.object.dist1, grid) 210 | 211 | #myplot(p.subplot(3,1,3),err.reshape(-1,n),jitter(tst_data0),jitter(tst_data1),gext) 212 | 213 | #plot_all(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 214 | #plot_concise(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 215 | 216 | #p.figure() 217 | #n,gext,grid = get_grid_data(np.vstack(( norm_trn_data0, norm_trn_data1 )), positive=False) 218 | #myplot(p.gca(),sksvm.decision_function(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 219 | #p.figure() 220 | #myplot(p.gca(),gc.calc_gavg(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 221 | #p.show() 222 | 223 | #Plot data 224 | # 225 | #p.figure() 226 | #n,gext,grid = get_grid_data(np.vstack(( trn_data0, trn_data1 )), positive=True) 227 | #p.plot(jitter(trn_data0[:,0]), jitter(trn_data0[:,1]), 'go') 228 | #p.plot(jitter(trn_data1[:,0]), jitter(trn_data1[:,1]), 'ro') 229 | #p.figure() 230 | #p.plot(jitter(tst_data0[:,0]), jitter(tst_data0[:,1]), 'go') 231 | #p.plot(jitter(tst_data1[:,0]), jitter(tst_data1[:,1]), 'ro') 232 | #p.show() 233 | 234 | #p.figure() 235 | #myplot(p.subplot(1,1,1),gavg,jitter(tst_data0),jitter(tst_data1),gext) 236 | #p.axis(gext) 237 | #p.figure() 238 | #mpm.dist0.plot_traces(mhmc.db, '/object/dist0', ['mu','lam','sigma']) 239 | #mpm.dist1.plot_traces(mhmc.db, '/object/dist1', ['mu','lam']) 240 | #p.show() 241 | 242 | output['acceptance'] = float(mhmc.accept_loc)/mhmc.total_loc 243 | 244 | output['seed'] = seed 245 | output['time'] = time()-t1 246 | mhmc.clean_db() 247 | 248 | if 'WORKHASH' in os.environ: 249 | import zmq 250 | ctx = zmq.Context() 251 | socket = ctx.socket(zmq.REQ) 252 | socket.connect('tcp://'+server+':7000') 253 | 254 | wiredata = zlib.compress(js.dumps(output)) 255 | #wiredata = s.read_db() 256 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 257 | socket.send(wiredata) 258 | socket.recv() 259 | socket.close() 260 | ctx.term() 261 | 262 | -------------------------------------------------------------------------------- /exps/mcsizeplotter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | import tables as t 4 | import numpy as np 5 | import pylab as p 6 | import pandas as pa 7 | 8 | from jobmon import redisbackend as rb 9 | 10 | def sumvals(x): 11 | vec = np.array(x) 12 | return vec.mean(), vec.std() 13 | 14 | db = rb.RedisDataStore('localhost') 15 | jobhash = db.select_jobfile() 16 | 17 | resdir = os.path.join('/home/bana/largeresearch/results', jobhash) 18 | 19 | res = pa.DataFrame() 20 | #res = pa.DataFrame(columns=('iters', 'bayes', 'post', 'sampler_full', 'sampler_20')) 21 | 22 | for exp in os.listdir(resdir): 23 | # Get param value 24 | iters = yaml.load(db.get_params(exp))['iters'] 25 | for fname in os.listdir(os.path.join(resdir,exp)): 26 | fid = t.openFile(os.path.join(resdir,exp,fname)) 27 | v = {} 28 | for item in fid.root.object.objfxn._v_attrs._f_list(): 29 | v[item] = fid.root.object.objfxn._v_attrs[item] # Ugly 30 | v['iters'] = iters 31 | row = pa.DataFrame(v, index=[1]) 32 | res = res.append(row, ignore_index=True) 33 | fid.close() 34 | 35 | groups = res.groupby('iters') 36 | means = groups.mean() 37 | std = groups.std() 38 | 39 | print means.columns 40 | p.errorbar(means.index, means['bayes_error'], yerr=std['bayes_error']) 41 | p.errorbar(means.index, means['posterior_error'], yerr=std['posterior_error']) 42 | p.errorbar(means.index, means['sample_error_full'], 
yerr=std['sample_error_full'], label='Sampler full') 43 | p.errorbar(means.index, means['sample_error_20'], yerr=std['sample_error_20'], label='Sampler20') 44 | p.legend() 45 | p.show() 46 | 47 | -------------------------------------------------------------------------------- /exps/mpm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import simplejson as js 8 | import subprocess as sb 9 | from time import time,sleep 10 | from os import path 11 | from scipy.stats.mstats import mquantiles 12 | 13 | try: 14 | from sklearn.lda import LDA 15 | from sklearn.svm import SVC 16 | from sklearn.neighbors import KNeighborsClassifier as KNN 17 | from sklearn.feature_selection import SelectKBest, f_classif 18 | 19 | import samcnet.mh as mh 20 | import samcnet.samc as samc 21 | from samcnet.mixturepoisson import * 22 | from samcnet.lori import * 23 | except ImportError as e: 24 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 25 | " directory is populated by waf.\n\n %s" % str(e)) 26 | 27 | if 'WORKHASH' in os.environ: 28 | try: 29 | server = os.environ['SERVER'] 30 | except: 31 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 32 | 33 | if 'PARAM' in os.environ: 34 | params = yaml.load(os.environ['PARAM']) 35 | else: 36 | params = {} 37 | 38 | def setv(p,s,d,conv=None): 39 | if s not in p: 40 | p[s] = str(d) 41 | return d 42 | elif conv is not None: 43 | return conv(p[s]) 44 | else: 45 | p[s] 46 | 47 | iters = setv(params, 'iters', int(1e4), int) 48 | 49 | num_feat = setv(params, 'num_feat', 4, int) 50 | #seed = setv(params, 'seed', 1234, int) 51 | seed = setv(params, 'seed', np.random.randint(10**8), int) 52 | rseed = setv(params, 'rseed', np.random.randint(10**8), int) 53 | 54 | Ntrn = setv(params, 'Ntrn', 3, int) 55 | Ntst = setv(params, 'Ntst', 3000, int) 56 | f_glob = setv(params, 'f_glob', 2, int) 57 | subclasses = setv(params, 'subclasses', 2, int) 58 | f_het = setv(params, 'f_het', 1, int) 59 | f_rand = setv(params, 'f_rand', 0, int) 60 | rho = setv(params, 'rho', 0.6, float) 61 | f_tot = setv(params, 'f_tot', f_glob+f_het*subclasses+f_rand, int) 62 | blocksize = setv(params, 'blocksize', 1, int) 63 | mu0 = setv(params, 'mu0', -1.2, float) 64 | mu1 = setv(params, 'mu1', -0.2, float) 65 | sigma0 = setv(params, 'sigma0', 0.5, float) 66 | sigma1 = setv(params, 'sigma1', 0.2, float) 67 | S = setv(params, 'S', 10.0, float) 68 | 69 | lowd = setv(params, 'lowd', 9.0, float) 70 | highd = setv(params, 'highd', 11.0, float) 71 | 72 | numlam = setv(params, 'numlam', 20, int) 73 | 74 | output = {} 75 | output['errors'] = {} 76 | errors = output['errors'] 77 | np.seterr(all='ignore') # Careful with this 78 | 79 | t1 = time() 80 | 81 | #trn_data, trn_labels, tst_data, tst_labels = data_yj(params) 82 | trn_data, trn_labels, tst_data, tst_labels = data_jason(params) 83 | 84 | norm_trn_data = norm(trn_data) 85 | norm_tst_data = norm(tst_data) 86 | 87 | norm_trn_data0, norm_trn_data1 = split(norm_trn_data) 88 | norm_tst_data0, norm_tst_data1 = split(norm_tst_data) 89 | trn_data0, trn_data1 = split(trn_data) 90 | tst_data0, tst_data1 = split(tst_data) 91 | 92 | #################### CLASSIFICATION ################ 93 | sklda = LDA() 94 | skknn = KNN(3) 95 | sksvm = SVC() 96 | sklda.fit(norm_trn_data, trn_labels) 97 | skknn.fit(norm_trn_data, trn_labels) 98 | sksvm.fit(norm_trn_data, trn_labels) 99 | errors['lda'] = 
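# score() returns held-out accuracy, so each entry recorded below is the
# corresponding test-set error rate, 1 - accuracy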
(1-sklda.score(norm_tst_data, tst_labels)) 100 | errors['knn'] = (1-skknn.score(norm_tst_data, tst_labels)) 101 | errors['svm'] = (1-sksvm.score(norm_tst_data, tst_labels)) 102 | 103 | bayes0 = GaussianBayes(np.zeros(num_feat), 1, 8, np.eye(num_feat)*3, norm_trn_data0) 104 | bayes1 = GaussianBayes(np.zeros(num_feat), 1, 8, np.eye(num_feat)*3, norm_trn_data1) 105 | 106 | # Gaussian Analytic 107 | gc = GaussianCls(bayes0, bayes1) 108 | errors['gauss'] = gc.approx_error_data(norm_tst_data, tst_labels) 109 | 110 | # MPM Model 111 | #d0 = np.asarray(mquantiles(trn_data0, 0.75, axis=1)).reshape(-1) 112 | #d1 = np.asarray(mquantiles(trn_data1, 0.75, axis=1)).reshape(-1) 113 | #dist0 = MPMDist(trn_data0,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08,d=d0) 114 | #dist1 = MPMDist(trn_data1,kmax=1,priorkappa=150,lammove=0.01,mumove=0.08,d=d1) 115 | 116 | up = True 117 | kappa = 10.0 118 | S = np.eye(4) * 0.4 * (kappa - 1 - 4) 119 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=200,lammove=0.05,mumove=0.08,usepriors=up, 120 | kappa=kappa, S=S) 121 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=200,lammove=0.05,mumove=0.08,usepriors=up, 122 | kappa=kappa, S=S) 123 | mpm1 = MPMCls(dist0, dist1) 124 | mhmc1 = mh.MHRun(mpm1, burn=2000, thin=50) 125 | mhmc1.sample(iters,verbose=False) 126 | errors['mpm'] = mpm1.approx_error_data(mhmc1.db, tst_data, tst_labels,numlam=numlam) 127 | print("") 128 | print("skLDA error: %f" % errors['lda']) 129 | print("skKNN error: %f" % errors['knn']) 130 | print("skSVM error: %f" % errors['svm']) 131 | print("gauss error: %f" % errors['gauss']) 132 | print("my MP error: %f" % errors['mpm']) 133 | 134 | #n,gext,grid = get_grid_data(np.vstack(( trn_data0, trn_data1 )), positive=True) 135 | 136 | #def myplot(ax,g,data0,data1,gext): 137 | #ax.plot(data0[:,0], data0[:,1], 'g.',label='0', alpha=0.5) 138 | #ax.plot(data1[:,0], data1[:,1], 'r.',label='1', alpha=0.5) 139 | #ax.legend(fontsize=8, loc='best') 140 | 141 | ##im = ax.imshow(g, extent=gext, aspect=1.0, origin='lower') 142 | ##p.colorbar(im,ax=ax) 143 | #ax.contour(g, [0.0], extent=gext, aspect=1.0, origin='lower', cmap = p.cm.gray) 144 | 145 | #p.close("all") 146 | #gavg = mpm1.calc_gavg(mhmc1.db, grid, numlam=numlam).reshape(-1,n) 147 | ##myplot(p.subplot(3,1,1),gavg,trn_data0,trn_data1,gext) 148 | 149 | #g0 = mpm1.dist0.calc_db_g(mhmc1.db, mhmc1.db.root.object.dist0, grid) 150 | #g1 = mpm1.dist1.calc_db_g(mhmc1.db, mhmc1.db.root.object.dist1, grid) 151 | 152 | ##def jitter(x): 153 | ##rand = np.random.rand 154 | ##n = x.shape[0] 155 | ##return (x.T + rand(n)).T 156 | #def jitter(x): 157 | #rand = np.random.rand 158 | #return x + rand(*x.shape)-0.5 159 | 160 | ##myplot(p.subplot(3,1,3),err.reshape(-1,n),jitter(tst_data0),jitter(tst_data1),gext) 161 | 162 | #def plot_all(n, gext, grid, data0, data1, g0, g1, gavg): 163 | #Z = np.exp(g0)+np.exp(g1) 164 | #eg0 = np.exp(g0)/Z 165 | #eg1 = np.exp(g1)/Z 166 | #err = np.minimum(eg0,eg1) 167 | #err = err.reshape(-1,n) 168 | 169 | #lx,hx,ly,hy = gext 170 | #asp = float(hx-lx) / (hy-ly) 171 | #alp = 1.0 172 | #ms = 8 173 | 174 | #p.figure() 175 | #p.subplot(2,2,1) 176 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 177 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 178 | #p.legend(fontsize=8, loc='best') 179 | ##p.contour(gavg, extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 180 | ##p.contour(gavg, [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 181 | ##p.imshow(gavg, extent=gext, aspect=1, 
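## (gavg is the posterior-averaged discriminant from the MH run; the four
## panels of plot_all overlay the training points on g0 (Greens), g1 (Reds),
## the g1-g0 zero boundary, and the conditional-error contours)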
origin='lower') 182 | ##p.imshow(g0.reshape(-1,n), extent=gext, aspect=asp, origin='lower') 183 | ##p.colorbar() 184 | #p.contour(g0.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Greens) 185 | 186 | #p.subplot(2,2,2) 187 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 188 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 189 | #p.legend(fontsize=8, loc='best') 190 | ##p.contour(g0.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Greens) 191 | ##p.contour(g1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 192 | ##p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 193 | ##p.imshow((g1-g0).reshape(-1,n), extent=gext, aspect=1, origin='lower') 194 | ##p.imshow(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower') 195 | ##p.colorbar() 196 | #p.contour(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Reds) 197 | 198 | #p.subplot(2,2,3) 199 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 200 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 201 | #p.legend(fontsize=8, loc='best') 202 | ##p.imshow(err, extent=gext, origin='lower', aspect=asp) 203 | ##p.colorbar() 204 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 205 | ##p.contour(eg0.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Greens) 206 | ##p.contour(eg1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 207 | 208 | #p.subplot(2,2,4) 209 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 210 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 211 | #p.legend(fontsize=8, loc='best') 212 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 213 | #CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 214 | #p.clabel(CS, inline=1, fontsize=10, aspect=asp) 215 | #p.show() 216 | 217 | #def plot_concise(n, gext, grid, data0, data1, g0, g1, gavg): 218 | #p.figure() 219 | #Z = np.exp(g0)+np.exp(g1) 220 | #eg0 = np.exp(g0)/Z 221 | #eg1 = np.exp(g1)/Z 222 | #err = np.minimum(eg0,eg1) 223 | #err = err.reshape(-1,n) 224 | #ms=8 225 | 226 | #lx,hx,ly,hy = gext 227 | #asp = float(hx-lx) / (hy-ly) 228 | #p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 229 | #p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 230 | #p.legend(fontsize=8, loc='best') 231 | 232 | #cont = (g0.max() + g1.max()) / 2.0 - 0.6 233 | #p.contour(g0.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 234 | #p.contour(g1.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 235 | #p.imshow(err, extent=gext, origin='lower', aspect=asp, alpha=0.4, cmap = p.cm.Reds) 236 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray, linewidth=15.0) 237 | #CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 238 | #p.clabel(CS, inline=1, fontsize=10, aspect=asp) 239 | #p.show() 240 | 241 | #plot_all(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 242 | #plot_concise(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 243 | 244 | ##n,gext,grid = get_grid_data(np.vstack(( norm_trn_data0, norm_trn_data1 )), positive=False) 245 | 
##myplot(p.subplot(3,1,3),sksvm.decision_function(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 246 | 247 | #p.figure() 248 | #myplot(p.subplot(1,1,1),gavg,jitter(tst_data0),jitter(tst_data1),gext) 249 | #p.axis(gext) 250 | #mpm1.dist0.plot_traces(mhmc1.db, '/object/dist0', ['sigma']) 251 | #p.show() 252 | 253 | output['seed'] = seed 254 | output['time'] = time()-t1 255 | output['acceptance'] = float(mhmc1.accept_loc)/mhmc1.total_loc 256 | 257 | if 'WORKHASH' in os.environ: 258 | import zmq 259 | ctx = zmq.Context() 260 | socket = ctx.socket(zmq.REQ) 261 | socket.connect('tcp://'+server+':7000') 262 | 263 | wiredata = zlib.compress(js.dumps(output)) 264 | #wiredata = s.read_db() 265 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 266 | socket.send(wiredata) 267 | socket.recv() 268 | socket.close() 269 | ctx.term() 270 | 271 | mhmc1.db.close() 272 | -------------------------------------------------------------------------------- /exps/mpm_play.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import simplejson as js 8 | import subprocess as sb 9 | from time import time,sleep 10 | from os import path 11 | from scipy.stats.mstats import mquantiles 12 | 13 | try: 14 | from sklearn.lda import LDA 15 | from sklearn.svm import SVC 16 | from sklearn.neighbors import KNeighborsClassifier as KNN 17 | from sklearn.feature_selection import SelectKBest, f_classif 18 | 19 | import samcnet.mh as mh 20 | from samcnet.mixturepoisson import * 21 | from samcnet.lori import * 22 | from samcnet.data import * 23 | from samcnet.calibrate import * 24 | except ImportError as e: 25 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 26 | " directory is populated by waf.\n\n %s" % str(e)) 27 | 28 | if 'WORKHASH' in os.environ: 29 | try: 30 | server = os.environ['SERVER'] 31 | except: 32 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 33 | 34 | if 'PARAM' in os.environ: 35 | params = yaml.load(os.environ['PARAM']) 36 | else: 37 | params = {} 38 | 39 | iters = setv(params, 'iters', int(1e4), int) 40 | 41 | num_feat = setv(params, 'num_feat', 2, int) 42 | seed = setv(params, 'seed', np.random.randint(10**8), int) 43 | rseed = setv(params, 'rseed', np.random.randint(10**8), int) 44 | 45 | Ntrn = setv(params, 'Ntrn', 20, int) 46 | Ntst = setv(params, 'Ntst', 3000, int) 47 | mu0 = setv(params, 'mu0', 0.0, float) 48 | mu1 = setv(params, 'mu1', 0.6, float) 49 | sigma0 = setv(params, 'sigma0', 0.2, float) 50 | sigma1 = setv(params, 'sigma1', 0.6, float) 51 | kappa = setv(params, 'kappa', 30.0, float) 52 | 53 | lowd = setv(params, 'lowd', 9.0, float) 54 | highd = setv(params, 'highd', 11.0, float) 55 | 56 | num_gen_feat = setv(params, 'num_gen_feat', 20, int) 57 | mumove = setv(params, 'mumove', 0.08, float) 58 | lammove = setv(params, 'lammove', 0.01, float) 59 | priorkappa = setv(params, 'priorkappa', 150, int) 60 | burn = setv(params, 'burn', 3000, int) 61 | thin = setv(params, 'thin', 40, int) 62 | numlam = setv(params, 'numlam', 40, int) 63 | 64 | output = {} 65 | output['errors'] = {} 66 | errors = output['errors'] 67 | np.seterr(all='ignore') # Careful with this 68 | rseed = np.random.randint(10**8) 69 | 70 | sel, rawdata, normdata = get_data(data_jk, params) 71 | norm_trn_data = normdata.loc[sel['trn'], sel['feats']] 72 | norm_tst_data = normdata.loc[sel['tst'], sel['feats']] 73 | tst_data = rawdata.loc[sel['tst'], 
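# sel carries the train/test row indices and the selected feature columns;
# the normalized copy feeds LDA/KNN/SVM and the Gaussian model, while the
# raw counts go to the mixture-of-Poisson (MPM) classifiers below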
sel['feats']] 74 | 75 | t1 = time() 76 | #################### CLASSIFICATION ################ 77 | ######################################## 78 | ######################################## 79 | ######################################## 80 | sklda = LDA() 81 | skknn = KNN(3, warn_on_equidistant=False) 82 | sksvm = SVC() 83 | sklda.fit(norm_trn_data, sel['trnl']) 84 | skknn.fit(norm_trn_data, sel['trnl']) 85 | sksvm.fit(norm_trn_data, sel['trnl']) 86 | errors['lda'] = (1-sklda.score(norm_tst_data, sel['tstl'])) 87 | errors['knn'] = (1-skknn.score(norm_tst_data, sel['tstl'])) 88 | errors['svm'] = (1-sksvm.score(norm_tst_data, sel['tstl'])) 89 | print("skLDA error: %f" % errors['lda']) 90 | print("skKNN error: %f" % errors['knn']) 91 | print("skSVM error: %f" % errors['svm']) 92 | 93 | bayes0 = GaussianBayes(np.zeros(num_feat), 1, kappa, 94 | np.eye(num_feat)*(kappa-1-num_feat), 95 | normdata.loc[sel['trn0'], sel['feats']]) 96 | bayes1 = GaussianBayes(np.zeros(num_feat), 1, kappa, 97 | np.eye(num_feat)*(kappa-1-num_feat), 98 | normdata.loc[sel['trn1'], sel['feats']]) 99 | 100 | # Gaussian Analytic 101 | gc = GaussianCls(bayes0, bayes1) 102 | errors['gauss'] = gc.approx_error_data(norm_tst_data, sel['tstl']) 103 | print("Gaussian Analytic error: %f" % errors['gauss']) 104 | 105 | ######################################## 106 | ######################################## 107 | ######################################## 108 | ######################################## 109 | ######################################## 110 | # MPM Model 111 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 112 | lammove=lammove,mumove=mumove) 113 | dist1 = MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 114 | lammove=lammove,mumove=mumove) 115 | mpm = MPMCls(dist0, dist1) 116 | mhmc = mh.MHRun(mpm, burn=burn, thin=thin) 117 | mhmc.sample(iters,verbose=False) 118 | errors['mpm'] = mpm.approx_error_data(mhmc.db, tst_data, sel['tstl'],numlam=numlam) 119 | print("MPM Sampler error: %f" % errors['mpm']) 120 | 121 | output['acceptance'] = float(mhmc.accept_loc)/mhmc.total_loc 122 | ######################################## 123 | ######################################## 124 | ######################################## 125 | ######################################## 126 | ######################################## 127 | # Calibrated MPM Model 128 | p0, p1 = calibrate(rawdata, sel, params) 129 | 130 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 131 | lammove=lammove,mumove=mumove,**p0) 132 | dist1 = MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 133 | lammove=lammove,mumove=mumove,**p1) 134 | mpmc = MPMCls(dist0, dist1) 135 | mhmcc = mh.MHRun(mpmc, burn=burn, thin=thin) 136 | mhmcc.sample(iters,verbose=False) 137 | errors['mpmc_calib'] = mpmc.approx_error_data(mhmcc.db, tst_data, sel['tstl'],numlam=numlam) 138 | print("mpmc Calibrated error: %f" % errors['mpmc_calib']) 139 | 140 | output['acceptance_calib'] = float(mhmcc.accept_loc)/mhmcc.total_loc 141 | ######################################## 142 | ######################################## 143 | ######################################## 144 | ######################################## 145 | ######################################## 146 | priorsigma = np.ones(4)*0.1 147 | pm0 = np.ones(4) * mu0 148 | pm1 = np.ones(4) * mu1 149 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 150 | lammove=lammove,mumove=mumove, 151 | priormu=pm0,priorsigma=priorsigma) 152 | dist1 = 
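# prior-informed variant: same settings as dist0 above, but centered at the
# class-1 prior mean pm1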
MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 153 | lammove=lammove,mumove=mumove, 154 | priormu=pm1,priorsigma=priorsigma) 155 | #dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],kmax=1,priorkappa=200, 156 | #lammove=0.01,mumove=0.08,#S=S0,kappa=kappa, 157 | #priormu=pm0,priorsigma=priorsigma, usedata=ud) 158 | #dist1 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],kmax=1,priorkappa=200, 159 | #lammove=0.01,mumove=0.08,#S=S1,kappa=kappa, 160 | #priormu=pm1, priorsigma=priorsigma, usedata=ud) 161 | mpmp = MPMCls(dist0, dist1) 162 | mhmcp = mh.MHRun(mpmp, burn=burn, thin=thin) 163 | mhmcp.sample(iters,verbose=False) 164 | errors['mpm_prior'] = mpmp.approx_error_data(mhmcp.db, tst_data, sel['tstl'],numlam=numlam) 165 | print("MPM prior Sampler error: %f" % errors['mpm_prior']) 166 | output['acceptance_prior'] = float(mhmcp.accept_loc)/mhmcp.total_loc 167 | ######################################## 168 | ######################################## 169 | ######################################## 170 | ######################################## 171 | import pylab as p 172 | n,gext,grid = get_grid_data(np.vstack(( rawdata.loc[sel['trn0'],sel['feats']], 173 | rawdata.loc[sel['trn1'],sel['feats']])), positive=True) 174 | 175 | def myplot(ax,g,data,sel,gext): 176 | data0 = data.loc[sel['trn0'], sel['feats']] 177 | data1 = data.loc[sel['trn1'], sel['feats']] 178 | ax.plot(data0.iloc[:,0], data0.iloc[:,1], 'g.',label='0', alpha=0.5) 179 | ax.plot(data1.iloc[:,0], data1.iloc[:,1], 'r.',label='1', alpha=0.5) 180 | ax.legend(fontsize=8, loc='best') 181 | 182 | im = ax.imshow(g, extent=gext, aspect=1.0, origin='lower') 183 | p.colorbar(im,ax=ax) 184 | ax.contour(g, [0.0], extent=gext, aspect=1.0, origin='lower', cmap = p.cm.gray) 185 | 186 | def plot_all(n, gext, grid, data0, data1, g0, g1, gavg): 187 | Z = np.exp(g0)+np.exp(g1) 188 | eg0 = np.exp(g0)/Z 189 | eg1 = np.exp(g1)/Z 190 | err = np.minimum(eg0,eg1) 191 | err = err.reshape(-1,n) 192 | 193 | lx,hx,ly,hy = gext 194 | asp = float(hx-lx) / (hy-ly) 195 | alp = 1.0 196 | ms = 8 197 | 198 | p.figure() 199 | p.subplot(2,2,1) 200 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 201 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 202 | p.legend(fontsize=8, loc='best') 203 | #p.contour(gavg, extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 204 | #p.contour(gavg, [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 205 | #p.imshow(gavg, extent=gext, aspect=1, origin='lower') 206 | #p.imshow(g0.reshape(-1,n), extent=gext, aspect=asp, origin='lower') 207 | #p.colorbar() 208 | p.contour(g0.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Greens) 209 | 210 | p.subplot(2,2,2) 211 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 212 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 213 | p.legend(fontsize=8, loc='best') 214 | #p.contour(g0.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Greens) 215 | #p.contour(g1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 216 | #p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 217 | #p.imshow((g1-g0).reshape(-1,n), extent=gext, aspect=1, origin='lower') 218 | #p.imshow(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower') 219 | #p.colorbar() 220 | p.contour(g1.reshape(-1,n), extent=gext, aspect=asp, origin='lower', cmap = p.cm.Reds) 221 | 222 | p.subplot(2,2,3) 223 | 
p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms, alpha=alp) 224 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms, alpha=alp) 225 | p.legend(fontsize=8, loc='best') 226 | #p.imshow(err, extent=gext, origin='lower', aspect=asp) 227 | #p.colorbar() 228 | p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 229 | #p.contour(eg0.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Greens) 230 | #p.contour(eg1.reshape(-1,n), extent=gext, aspect=1, origin='lower', cmap = p.cm.Reds) 231 | 232 | p.subplot(2,2,4) 233 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 234 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 235 | p.legend(fontsize=8, loc='best') 236 | p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 237 | CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 238 | p.clabel(CS, inline=1, fontsize=10, aspect=asp) 239 | p.show() 240 | 241 | def plot_concise(n, gext, grid, data0, data1, g0, g1, gavg): 242 | p.figure() 243 | Z = np.exp(g0)+np.exp(g1) 244 | eg0 = np.exp(g0)/Z 245 | eg1 = np.exp(g1)/Z 246 | err = np.minimum(eg0,eg1) 247 | err = err.reshape(-1,n) 248 | ms=8 249 | 250 | lx,hx,ly,hy = gext 251 | asp = float(hx-lx) / (hy-ly) 252 | p.plot(data0[:,0], data0[:,1], 'g^',label='0', markersize=ms) 253 | p.plot(data1[:,0], data1[:,1], 'ro',label='1', markersize=ms) 254 | p.legend(fontsize=8, loc='best') 255 | 256 | cont = (g0.max() + g1.max()) / 2.0 - 0.6 257 | p.contour(g0.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 258 | p.contour(g1.reshape(-1,n), [cont], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray) 259 | p.imshow(err, extent=gext, origin='lower', aspect=asp, alpha=0.4, cmap = p.cm.Reds) 260 | p.contour((g1-g0).reshape(-1,n), [0.0], extent=gext, aspect=asp, origin='lower', cmap = p.cm.gray, linewidth=15.0) 261 | CS = p.contour(err, [0.4, 0.3, 0.2, 0.1, 0.05], extent=gext, aspect=asp, origin='lower') 262 | p.clabel(CS, inline=1, fontsize=10, aspect=asp) 263 | p.show() 264 | 265 | ##def jitter(x): 266 | ##rand = np.random.rand 267 | ##n = x.shape[0] 268 | ##return (x.T + rand(n)).T 269 | #def jitter(x): 270 | #rand = np.random.rand 271 | #return x + rand(*x.shape)-0.5 272 | 273 | p.close("all") 274 | gavg = mpm.calc_gavg(mhmc.db, grid, numlam=numlam).reshape(-1,n) 275 | myplot(p.subplot(3,1,1),gavg,rawdata,sel,gext) 276 | gavgc = mpmc.calc_gavg(mhmcc.db, grid, numlam=numlam).reshape(-1,n) 277 | myplot(p.subplot(3,1,2),gavgc,rawdata,sel,gext) 278 | gavgp = mpmp.calc_gavg(mhmcp.db, grid, numlam=numlam).reshape(-1,n) 279 | myplot(p.subplot(3,1,3),gavgp,rawdata,sel,gext) 280 | 281 | p.show() 282 | 283 | #g0 = mpm1.dist0.calc_db_g(mhmc1.db, mhmc1.db.root.object.dist0, grid) 284 | #g1 = mpm1.dist1.calc_db_g(mhmc1.db, mhmc1.db.root.object.dist1, grid) 285 | 286 | ##myplot(p.subplot(3,1,3),err.reshape(-1,n),jitter(tst_data0),jitter(tst_data1),gext) 287 | 288 | #plot_all(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 289 | #plot_concise(n, gext, grid, trn_data0, trn_data1, g0,g1,gavg) 290 | 291 | ##n,gext,grid = get_grid_data(np.vstack(( norm_trn_data0, norm_trn_data1 )), positive=False) 292 | ##myplot(p.subplot(3,1,3),sksvm.decision_function(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 293 | 294 | #p.figure() 295 | #myplot(p.subplot(1,1,1),gavg,jitter(tst_data0),jitter(tst_data1),gext) 296 | #p.axis(gext) 297 | 
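# (the active plotting block above stacks the posterior-average surfaces of
# the default, calibrated, and prior-informed samplers for visual comparison)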
#mpm1.dist0.plot_traces(mhmc1.db, '/object/dist0', ['sigma']) 298 | 299 | output['seed'] = seed 300 | output['time'] = time()-t1 301 | 302 | if 'WORKHASH' in os.environ: 303 | import zmq 304 | ctx = zmq.Context() 305 | socket = ctx.socket(zmq.REQ) 306 | socket.connect('tcp://'+server+':7000') 307 | 308 | wiredata = zlib.compress(js.dumps(output)) 309 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 310 | socket.send(wiredata) 311 | socket.recv() 312 | socket.close() 313 | ctx.term() 314 | 315 | #mhmc.clean_db() 316 | 317 | -------------------------------------------------------------------------------- /exps/priorstrength.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import zlib, cPickle 3 | ############### SAMC Setup ############### 4 | import numpy as np 5 | import scipy as sp 6 | import networkx as nx 7 | 8 | from samcnet.samc import SAMCRun 9 | from samcnet.bayesnetcpd import BayesNetSampler, BayesNetCPD 10 | from samcnet import utils 11 | from samcnet.generator import * 12 | 13 | if 'WORKHASH' in os.environ: 14 | try: 15 | redis_server = os.environ['REDIS'] 16 | import redis 17 | r = redis.StrictRedis(redis_server) 18 | except: 19 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 20 | ############### /SAMC Setup ############### 21 | 22 | N = 9 23 | iters = 3e5 24 | numdata = 0 #NEED TO ADD NOISE FIRST 25 | 26 | temperature = 1.0 27 | burn = 1000 28 | stepscale = 10000 29 | thin = 10 30 | refden = 0.0 31 | 32 | random.seed(12345) 33 | np.random.seed(12345) 34 | 35 | groundgraph = generateHourGlassGraph(nodes=N) 36 | #joint, states = generateJoint(groundgraph, method='dirichlet') 37 | joint, states = generateJoint(groundgraph, method='noisylogic') 38 | data = generateData(groundgraph, joint, numdata) 39 | groundbnet = BayesNetCPD(states, data, limparent=3) 40 | groundbnet.set_cpds(joint) 41 | 42 | if 'WORKHASH' in os.environ: 43 | jobhash = os.environ['WORKHASH'] 44 | if not r.hexists('jobs:grounds', jobhash): 45 | r.hset('jobs:grounds', jobhash, zlib.compress(cPickle.dumps(groundgraph))) 46 | 47 | random.seed() 48 | np.random.seed() 49 | 50 | #p_struct = float(sys.argv[1]) 51 | p_struct = 30.0 52 | for numtemplate in [4,8]: 53 | for cpd in [True, False]: 54 | if cpd: 55 | p_cpd = p_struct 56 | else: 57 | p_cpd = 0.0 58 | 59 | random.seed(12345) 60 | np.random.seed(12345) 61 | 62 | obj = BayesNetCPD(states, data, limparent=3) 63 | template = sampleTemplate(groundgraph, numtemplate) 64 | 65 | random.seed() 66 | np.random.seed() 67 | 68 | b = BayesNetSampler(obj, 69 | template, 70 | groundbnet, 71 | p_struct=p_struct, 72 | p_cpd=p_cpd) 73 | s = SAMCRun(b,burn,stepscale,refden,thin) 74 | s.sample(iters, temperature) 75 | s.compute_means(cummeans=False) 76 | 77 | if 'WORKHASH' in os.environ: 78 | r.lpush('jobs:done:' + jobhash, s.read_db()) 79 | r.lpush('custom:%s:p_struct=%d:ntemplate=%d:p_cpd=%d' % 80 | (jobhash, int(p_struct*10), numtemplate, int(p_cpd*10)), 81 | s.db.root.computed.means._v_attrs['kld'] ) 82 | s.db.close() 83 | 84 | -------------------------------------------------------------------------------- /exps/samplesize.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import zlib, cPickle 3 | ############### SAMC Setup ############### 4 | import numpy as np 5 | import scipy as sp 6 | import networkx as nx 7 | 8 | from samcnet.samc import SAMCRun 9 | from samcnet.bayesnetcpd import BayesNetSampler, BayesNetCPD 10 | from 
samcnet import utils 11 | from samcnet.generator import * 12 | 13 | if 'WORKHASH' in os.environ: 14 | try: 15 | redis_server = os.environ['REDIS'] 16 | import redis 17 | r = redis.StrictRedis(redis_server) 18 | except: 19 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 20 | ############### /SAMC Setup ############### 21 | 22 | N = 8 23 | iters = 3e5 24 | priorweight = 0.0 25 | numtemplate = 0 26 | burn = 1000 27 | stepscale = 10000 28 | thin = 10 29 | refden = 0.0 30 | 31 | random.seed(12345) 32 | np.random.seed(12345) 33 | 34 | groundgraph = generateHourGlassGraph(nodes=N) 35 | #joint, states = generateJoint(groundgraph, method='dirichlet') 36 | joint, states = generateJoint(groundgraph, method='noisylogic') 37 | template = sampleTemplate(groundgraph, numtemplate) 38 | 39 | if 'WORKHASH' in os.environ: 40 | jobhash = os.environ['WORKHASH'] 41 | if not r.hexists('jobs:grounds', jobhash): 42 | r.hset('jobs:grounds', jobhash, zlib.compress(cPickle.dumps(groundgraph))) 43 | 44 | random.seed() 45 | np.random.seed() 46 | 47 | datasizes = [4, 16, 32, 64, 128, 256] 48 | temps = [1.0, 1.0, 2.0, 2.0, 5.0, 5.0] 49 | 50 | for temperature, numdata in zip(temps, datasizes): 51 | data = generateData(groundgraph, joint, numdata) 52 | groundbnet = BayesNetCPD(states, data, limparent=3) 53 | groundbnet.set_cpds(joint) 54 | obj = BayesNetCPD(states, data, limparent=3) 55 | b = BayesNetSampler(obj, template, groundbnet, priorweight) 56 | s = SAMCRun(b,burn,stepscale,refden,thin) 57 | s.sample(iters, temperature) 58 | s.compute_means() 59 | 60 | if 'WORKHASH' in os.environ: 61 | r.lpush('jobs:done:' + jobhash, s.read_db()) 62 | r.lpush('custom:%s:samplesize=%d' % 63 | (jobhash, numdata), 64 | s.db.root.computed.means._v_attrs['kld'] ) 65 | 66 | s.db.close() 67 | 68 | -------------------------------------------------------------------------------- /exps/simpleplotter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pylab as p 4 | import simplejson as js 5 | import pandas as pa 6 | import yaml 7 | 8 | from collections import defaultdict 9 | 10 | from jobmon import redisbackend as rb 11 | 12 | db = rb.RedisDataStore('localhost') 13 | jobhash = db.select_jobfile() 14 | 15 | resdir = os.path.join('/home/bana/largeresearch/results', jobhash) 16 | 17 | p.close('all') 18 | 19 | output = defaultdict(list) 20 | other = defaultdict(list) 21 | diffs = defaultdict(list) 22 | 23 | for mydir in os.listdir(resdir): 24 | params = yaml.load(db.get_params(mydir)) 25 | if len(params.values()) != 0: 26 | continue 27 | for fname in os.listdir(os.path.join(resdir,mydir)): 28 | data = js.loads(open(os.path.join(resdir,mydir,fname)).read()) 29 | print(data['errors']) 30 | for k,v in data.iteritems(): 31 | if k == 'errors': 32 | mpmerr = data['errors']['mpm'] 33 | for kk,vv in data['errors'].iteritems(): 34 | output[kk].append(vv) 35 | if kk != 'mpm': 36 | diffs[kk].append((vv-mpmerr)) 37 | else: 38 | other[k].append(v) 39 | 40 | df = pa.DataFrame(output) 41 | otherdf = pa.DataFrame(other) 42 | diffdf = pa.DataFrame(diffs) 43 | 44 | print(otherdf.describe()) 45 | df.boxplot() 46 | p.figure() 47 | #diffdf.boxplot() 48 | key = {'gauss':'Normal OBC', 'svm':'SVM', 'knn':'3NN', 'lda':'LDA', 'mpm':'MP OBC', 49 | 'mpm_prior':'MP OBC Prior'} 50 | 51 | ind = np.arange(len(df.columns)) 52 | width = 0.7 53 | p.grid(True) 54 | p.bar(ind, df.mean(), width=width) 55 | meanmin, meanmax = np.min(df.mean()), np.max(df.mean()) 56 | spread = 
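# pad the y-axis by 80% of the spread between the smallest and largest mean
# errors so the bars do not sit against the plot edges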
meanmax-meanmin 57 | p.ylim(meanmin-0.8*spread, meanmax+0.8*spread) 58 | #p.bar(ind, df.mean(), width=width, yerr=df.std()) 59 | p.gca().set_xticks(ind+width/2.) 60 | p.gca().set_xticklabels([key[x] for x in df.columns]) 61 | p.ylabel('Mean True error') 62 | #p.title(jobhash[:6] + ' ' + db.get_description(jobhash) + ' ' + str(params)) 63 | 64 | p.show() 65 | -------------------------------------------------------------------------------- /exps/tcga.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import sha 7 | import numpy as np 8 | import pandas as pa 9 | import simplejson as js 10 | import subprocess as sb 11 | from time import time,sleep 12 | from os import path 13 | from scipy.stats.mstats import mquantiles 14 | from sklearn.feature_selection import SelectKBest, f_classif 15 | 16 | from sklearn.lda import LDA 17 | from sklearn.svm import SVC 18 | from sklearn.neighbors import KNeighborsClassifier as KNN 19 | from sklearn.feature_selection import SelectKBest, f_classif 20 | 21 | import samcnet.mh as mh 22 | from samcnet.mixturepoisson import * 23 | from samcnet.lori import * 24 | from samcnet.data import * 25 | from samcnet.calibrate import * 26 | 27 | if 'WORKHASH' in os.environ: 28 | try: 29 | server = os.environ['SERVER'] 30 | except: 31 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 32 | 33 | if 'PARAM' in os.environ: 34 | params = yaml.load(os.environ['PARAM']) 35 | else: 36 | params = {} 37 | 38 | np.seterr(all='ignore') # Careful with this 39 | num_feat = setv(params, 'num_feat', 4, int) 40 | rseed = setv(params, 'rseed', np.random.randint(10**8), int) 41 | seed = setv(params, 'seed', np.random.randint(10**8), int) 42 | 43 | low = setv(params, 'low_filter', 1, int) 44 | high = setv(params, 'high_filter', 10, int) 45 | 46 | # MCMC 47 | mumove = setv(params, 'mumove', 0.08, float) 48 | lammove = setv(params, 'lammove', 0.01, float) 49 | priorkappa = setv(params, 'priorkappa', 150, int) 50 | iters = setv(params, 'iters', int(1e4), int) 51 | burn = setv(params, 'burn', 3000, int) 52 | thin = setv(params, 'thin', 40, int) 53 | numlam = setv(params, 'numlam', 40, int) 54 | d = setv(params, 'd', 10, int) 55 | 56 | np.random.seed(seed) 57 | 58 | Ntrn = setv(params, 'Ntrn', 40, int) 59 | assert Ntrn >= 40 60 | 61 | sel, rawdata, normdata = get_data(data_tcga, params) 62 | 63 | for Ntrn in [40, 35, 30, 25, 20, 15, 10, 5]: 64 | output = {} 65 | output['errors'] = {} 66 | errors = output['errors'] 67 | 68 | ### Select Ntrn number of training samples 69 | numsub = sel['trn0'].sum() - Ntrn 70 | sel = subsample(sel, numsub) 71 | 72 | norm_trn_data = normdata.loc[sel['trn'], sel['feats']] 73 | norm_tst_data = normdata.loc[sel['tst'], sel['feats']] 74 | tst_data = rawdata.loc[sel['tst'], sel['feats']] 75 | 76 | t1 = time() 77 | #################### CLASSIFICATION ################ 78 | sklda = LDA() 79 | skknn = KNN(3, warn_on_equidistant=False) 80 | sksvm = SVC() 81 | sklda.fit(norm_trn_data, sel['trnl']) 82 | skknn.fit(norm_trn_data, sel['trnl']) 83 | sksvm.fit(norm_trn_data, sel['trnl']) 84 | errors['lda'] = (1-sklda.score(norm_tst_data, sel['tstl'])) 85 | errors['knn'] = (1-skknn.score(norm_tst_data, sel['tstl'])) 86 | errors['svm'] = (1-sksvm.score(norm_tst_data, sel['tstl'])) 87 | print("skLDA error: %f" % errors['lda']) 88 | print("skKNN error: %f" % errors['knn']) 89 | print("skSVM error: %f" % errors['svm']) 90 | 91 | lorikappa = 10 92 | 
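# With scale matrix eye(p)*(lorikappa-1-p) and lorikappa degrees of freedom,
# the prior expectation of the covariance is the identity (assuming the
# usual normal-inverse-Wishart parameterization), so lorikappa only controls
# how concentrated the prior is around it.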
bayes0 = GaussianBayes(np.zeros(num_feat), 1, lorikappa, 93 | np.eye(num_feat)*(lorikappa-1-num_feat), 94 | normdata.loc[sel['trn0'], sel['feats']]) 95 | bayes1 = GaussianBayes(np.zeros(num_feat), 1, lorikappa, 96 | np.eye(num_feat)*(lorikappa-1-num_feat), 97 | normdata.loc[sel['trn1'], sel['feats']]) 98 | 99 | # Gaussian Analytic 100 | gc = GaussianCls(bayes0, bayes1) 101 | errors['gauss'] = gc.approx_error_data(norm_tst_data, sel['tstl']) 102 | print("Gaussian Analytic error: %f" % errors['gauss']) 103 | 104 | # MPM Model 105 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 106 | lammove=lammove,mumove=mumove,d=d) 107 | dist1 = MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 108 | lammove=lammove,mumove=mumove,d=d) 109 | mpm = MPMCls(dist0, dist1) 110 | mhmc = mh.MHRun(mpm, burn=burn, thin=thin) 111 | mhmc.sample(iters,verbose=False) 112 | errors['mpm'] = mpm.approx_error_data(mhmc.db, tst_data, sel['tstl'],numlam=numlam) 113 | print("MPM Sampler error: %f" % errors['mpm']) 114 | 115 | output['acceptance'] = float(mhmc.accept_loc)/mhmc.total_loc 116 | mhmc.clean_db() 117 | ######################################## 118 | ######################################## 119 | ######################################## 120 | ######################################## 121 | ######################################## 122 | # Calibrated MPM Model 123 | p0, p1 = calibrate(rawdata, sel, params) 124 | record_hypers(output, p0, p1) 125 | 126 | dist0 = MPMDist(rawdata.loc[sel['trn0'],sel['feats']],priorkappa=priorkappa, 127 | lammove=lammove,mumove=mumove,d=d,**p0) 128 | dist1 = MPMDist(rawdata.loc[sel['trn1'],sel['feats']],priorkappa=priorkappa, 129 | lammove=lammove,mumove=mumove,d=d,**p1) 130 | mpmc = MPMCls(dist0, dist1) 131 | mhmcc = mh.MHRun(mpmc, burn=burn, thin=thin) 132 | mhmcc.sample(iters,verbose=False) 133 | errors['mpmc_calib'] = mpmc.approx_error_data(mhmcc.db, tst_data, sel['tstl'],numlam=numlam) 134 | print("mpmc Calibrated error: %f" % errors['mpmc_calib']) 135 | 136 | output['acceptance_calib'] = float(mhmcc.accept_loc)/mhmcc.total_loc 137 | mhmcc.clean_db() 138 | 139 | output['seed'] = seed 140 | output['time'] = time()-t1 141 | 142 | if 'WORKHASH' in os.environ: 143 | jobhash, paramhash = os.environ['WORKHASH'].split('|') 144 | param = yaml.dump({'Ntrn':Ntrn}).strip() 145 | paramhash = sha.sha(param).hexdigest() 146 | # submit paramhash 147 | import redis 148 | r = redis.StrictRedis(server) 149 | r.hset('params:sources', paramhash, param) 150 | 151 | os.environ['WORKHASH'] = jobhash.strip() + '|' + paramhash 152 | 153 | import zmq 154 | ctx = zmq.Context() 155 | socket = ctx.socket(zmq.REQ) 156 | socket.connect('tcp://'+server+':7000') 157 | 158 | wiredata = zlib.compress(js.dumps(output)) 159 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 160 | socket.send(wiredata) 161 | socket.recv() 162 | socket.close() 163 | ctx.term() 164 | -------------------------------------------------------------------------------- /exps/treevbnet.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import numpy as np 3 | import scipy as sp 4 | import networkx as nx 5 | import json as js 6 | import tables as t 7 | import zlib 8 | import cPickle 9 | import time as gtime 10 | import pylab as p 11 | 12 | from samcnet.samc import SAMCRun 13 | from samcnet.treenet import TreeNet, generateTree, generateData 14 | from samcnet.bayesnetcpd import BayesNetSampler, BayesNetCPD 15 | from samcnet import utils 16 
| from samcnet.generator import sampleTemplate 17 | import samcnet.generator as gen 18 | 19 | start = None 20 | def time(): 21 | global start 22 | if start is None: 23 | start = gtime.time() 24 | else: 25 | t = gtime.time() 26 | print("Time taken: {} seconds".format(t-start)) 27 | start = None 28 | 29 | if 'WORKHASH' in os.environ: 30 | try: 31 | redis_server = os.environ['REDIS'] 32 | import redis 33 | r = redis.StrictRedis(redis_server) 34 | except: 35 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 36 | 37 | N = 10 38 | comps = 3 39 | iters = 3e5 40 | numdata = 30 41 | burn = 1000 42 | stepscale = 30000 43 | temperature = 1.0 44 | thin = 50 45 | refden = 0.0 46 | numtemplate = 10 47 | priorweight = 0.0 48 | 49 | random.seed(12345) 50 | np.random.seed(12345) 51 | 52 | groundgraph = generateTree(N, comps) 53 | data = generateData(groundgraph,numdata) 54 | template = sampleTemplate(groundgraph, numtemplate) 55 | 56 | if 'WORKHASH' in os.environ: 57 | jobhash = os.environ['WORKHASH'] 58 | if not r.hexists('jobs:grounds', jobhash): 59 | r.hset('jobs:grounds', jobhash, zlib.compress(cPickle.dumps(groundgraph))) 60 | 61 | random.seed() 62 | np.random.seed() 63 | 64 | ############### TreeNet ############## 65 | 66 | #groundtree = TreeNet(N, data=data, graph=groundgraph) 67 | #b1 = TreeNet(N, data, template, priorweight, groundtree) 68 | #s1 = SAMCRun(b1,burn,stepscale,refden,thin) 69 | #time() 70 | #s1.sample(iters, temperature) 71 | #time() 72 | 73 | #s1.compute_means() 74 | #if 'WORKHASH' in os.environ: 75 | #r.lpush('jobs:done:' + jobhash, s1.read_db()) 76 | #s1.db.close() 77 | 78 | ############# bayesnetcpd ############ 79 | 80 | #import pstats, cProfile 81 | 82 | joint = utils.graph_to_joint(groundgraph) 83 | states = np.ones(len(joint.dists),dtype=np.int32)*2 84 | groundbnet = BayesNetCPD(states, data) 85 | groundbnet.set_cpds(joint) 86 | 87 | obj = BayesNetCPD(states, data) 88 | b2 = BayesNetSampler(obj, template, groundbnet, priorweight) 89 | s2 = SAMCRun(b2,burn,stepscale,refden,thin) 90 | time() 91 | #cProfile.runctx("s2.sample(iters, temperature)", globals(), locals(), "prof.prof") 92 | s2.sample(iters, temperature) 93 | time() 94 | s2.compute_means() 95 | #s2.compute_means(cummeans=False) 96 | if 'WORKHASH' in os.environ: 97 | r.lpush('jobs:done:' + jobhash, s2.read_db()) 98 | s2.db.close() 99 | ####################################### 100 | ### 101 | 102 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #ifndef __utils__h 7 | #define __utils__h 8 | 9 | std::string crepr(const dai::FactorGraph &x); 10 | std::string crepr(const dai::Factor &x); 11 | std::string crepr(const dai::VarSet &x); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /lib/libdai.so: -------------------------------------------------------------------------------- 1 | ../deps/libdai/lib/libdai.so -------------------------------------------------------------------------------- /samcnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binarybana/samcnet/84f3ba8241d416115a8aa9ba5c659a9513175072/samcnet/__init__.py -------------------------------------------------------------------------------- /samcnet/bayesnet.pxd: 
-------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | 3 | cdef class BayesNet: 4 | cdef public: 5 | object nodes,states,data,graph,x,mat,fvalue,changelist 6 | object oldmat, oldx, oldfvalue 7 | object gtemplate, ntemplate, ground 8 | object verbose 9 | int limparent, data_num, node_num, changelength 10 | double prior_alpha, prior_gamma 11 | cdef: 12 | int **cmat, **cdata 13 | double **ctemplate 14 | 15 | cdef double **npy2c_double(np.ndarray a) 16 | cdef np.ndarray c2npy_double(double **a, int n, int m) 17 | cdef int **npy2c_int(np.ndarray a) 18 | cdef np.ndarray c2npy_int(int **a, int n, int m) 19 | -------------------------------------------------------------------------------- /samcnet/bayesnet.pyx: -------------------------------------------------------------------------------- 1 | cimport csnet 2 | cimport cython 3 | from libc.stdlib cimport malloc, free, rand, RAND_MAX 4 | from libc.string cimport memcpy 5 | from libc.math cimport exp 6 | from math import ceil, floor 7 | import tables as tb 8 | 9 | import networkx as nx 10 | import sys 11 | import os 12 | 13 | import numpy as np 14 | cimport numpy as np 15 | 16 | cdef class BayesNet: 17 | def __cinit__(self, *args, **kwargs): 18 | pass 19 | 20 | def __init__(self, states, data, template=None, ground=None, priorweight=1.0, 21 | verbose=False): 22 | """ 23 | nodes: a list of strings for the nodes 24 | states: a list of number of states for each node 25 | data: a matrix with each row being a draw from the Bayesian network 26 | with each entry being [0..n_i-1] 27 | template: A networkx graph of the prior information 28 | ground: A (networkx graph, joint distribution) tuple of the ground truth 29 | Initializes the BayesNet as a set of independent nodes 30 | """ 31 | self.verbose = verbose 32 | 33 | self.states = np.asarray(states,dtype=np.int32) 34 | self.nodes = np.arange(self.states.shape[0], dtype=np.int) 35 | self.data = np.asarray(data,dtype=np.int32) 36 | 37 | self.graph = nx.DiGraph() 38 | self.graph.add_nodes_from(self.nodes) 39 | 40 | self.limparent = 3 41 | self.prior_alpha = 1.0 42 | self.prior_gamma = priorweight 43 | 44 | self.data_num = self.data.shape[0] 45 | self.node_num = self.states.shape[0] 46 | 47 | # Template and Ground truth networks 48 | self.ground = ground 49 | 50 | if template == None: 51 | self.gtemplate = None 52 | self.ntemplate = np.zeros((self.node_num, self.node_num), dtype=np.double) 53 | else: 54 | self.gtemplate = template.copy() 55 | self.ntemplate = np.asarray(nx.to_numpy_matrix(template), dtype=np.double) 56 | np.fill_diagonal(self.ntemplate, 1.0) 57 | self.ctemplate = npy2c_double(self.ntemplate) 58 | 59 | self.x = np.arange(self.node_num, dtype=np.int32) 60 | np.random.shuffle(self.x) # We're going to make this a 0-9 permutation 61 | 62 | cdef int cols = self.node_num 63 | 64 | self.mat = np.eye(cols, dtype=np.int32) 65 | self.fvalue = np.zeros((cols,), dtype=np.double) 66 | self.changelist = self.x.copy() 67 | 68 | self.changelength = self.node_num 69 | self.cmat = npy2c_int(self.mat) 70 | self.cdata = npy2c_int(self.data) 71 | 72 | def global_edge_presence(self): 73 | if self.ground == None: 74 | return np.nan 75 | else: 76 | s = self.x.argsort() 77 | sg = self.ground.x.argsort() 78 | ordmat = self.mat[s].T[s].T 79 | return float(np.abs(self.ground.mat[sg].T[sg].T - ordmat).sum()) / self.x.shape[0]**2 80 | 81 | def save_to_db(self): 82 | return self.global_edge_presence() 83 | 84 | def update_graph(self, matx=None): 85 | """ 86 | Update the 
networkx graph from either the current state, or pass 87 | in a 2-tuple of (matrix,vector) with the adjacency matrix and the 88 | node values. 89 | 90 | See self.update_matrix as well. 91 | """ 92 | if matx: 93 | assert len(matx) == 2 94 | assert matx[0].shape == (self.node_num,self.node_num) 95 | assert matx[1].shape == (self.node_num,) 96 | assert matx[0].dtype == np.int32 97 | assert matx[1].dtype == np.int32 98 | self.mat = matx[0].copy() 99 | self.x = matx[1].copy() 100 | 101 | self.cmat = npy2c_int(self.mat) 102 | self.fvalue = np.zeros_like(self.fvalue) 103 | self.changelength = self.node_num 104 | self.changelist = self.x.copy() 105 | 106 | self.graph.clear() 107 | s = self.x.argsort() 108 | ordered = self.mat[s].T[s].T 109 | self.graph = nx.from_numpy_matrix(ordered - np.eye(self.node_num), create_using=nx.DiGraph()) 110 | 111 | def update_matrix(self, graph): 112 | """ 113 | From a networkx graph, update the internal representation of the graph 114 | (an adjacency matrix and node list). 115 | 116 | Also see self.update_graph 117 | """ 118 | assert graph.number_of_nodes() == self.node_num 119 | mat = np.array(nx.to_numpy_matrix(graph),dtype=np.int32) 120 | np.fill_diagonal(mat, 1) 121 | 122 | self.mat = mat.copy() 123 | self.x = np.arange(self.node_num, dtype=np.int32) 124 | self.cmat = npy2c_int(self.mat) 125 | self.fvalue = np.zeros_like(self.fvalue) 126 | self.changelength = self.node_num 127 | self.changelist = self.x.copy() 128 | self.graph = graph.copy() 129 | 130 | def to_dot(self): 131 | self.update_graph() 132 | nx.write_dot(self.graph, '/tmp/graph.dot') 133 | 134 | def to_adjacency(self): 135 | return nx.to_numpy_matrix(self.graph) 136 | 137 | def copy(self): 138 | return (self.mat.copy(), self.x.copy()) 139 | 140 | def energy(self): 141 | """ 142 | Calculate the -log probability. 143 | """ 144 | cdef double prior 145 | cdef np.ndarray[np.int32_t, ndim=1, mode="c"] x = \ 146 | self.x 147 | cdef np.ndarray[np.int32_t, ndim=1, mode="c"] states = \ 148 | self.states 149 | cdef np.ndarray[np.double_t, ndim=1, mode="c"] fvalue = \ 150 | self.fvalue 151 | cdef np.ndarray[np.int32_t, ndim=1, mode="c"] changelist = \ 152 | self.changelist 153 | self.cmat = npy2c_int(self.mat) 154 | energy = csnet.cost( 155 | self.node_num, 156 | self.data_num, 157 | self.limparent, 158 | states.data, 159 | self.cdata, 160 | self.prior_alpha, 161 | self.prior_gamma, 162 | self.ctemplate, 163 | x.data, 164 | self.cmat, 165 | fvalue.data, 166 | changelist.data, 167 | self.changelength) 168 | 169 | return energy 170 | 171 | def reject(self): 172 | """ Revert graph, mat, x, fvalue, changelist, and changelength. """ 173 | self.mat = self.oldmat 174 | self.x = self.oldx 175 | self.fvalue = self.oldfvalue 176 | # IF I really wanted to be safe I would set changelngeth=10 and maybe 177 | # changelist 178 | 179 | def propose(self): 180 | """ 'Propose' a new network structure by backing up the old one and then 181 | changing the current one. 
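Three move types are drawn uniformly: (1) swap two adjacent nodes in the
temporal order, (2) toggle a single edge consistent with that order, or
(3) toggle two edges at once. changelist/changelength record which
columns of the adjacency matrix need their local scores recomputed.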
""" 182 | 183 | cdef int i,j,i1,j1,i2,j2 184 | self.oldmat = self.mat.copy() 185 | self.oldx = self.x.copy() 186 | self.oldfvalue = self.fvalue.copy() 187 | 188 | scheme = np.random.randint(1,4) 189 | 190 | if scheme==1: # temporal order change 191 | k = np.random.randint(self.node_num-1) 192 | self.x[k], self.x[k+1] = self.x[k+1], self.x[k] 193 | self.changelist[0], self.changelist[1] = k, k+1 194 | self.changelength = 2 195 | 196 | for j in range(k+2, self.node_num): 197 | if self.mat[k,j]==1 or self.mat[k+1,j]==1: 198 | self.changelength += 1 199 | self.changelist[self.changelength-1] = j 200 | 201 | if scheme==2: # skeletal change 202 | 203 | i = np.random.randint(self.node_num) 204 | j = np.random.randint(self.node_num) 205 | while i==j: 206 | j = np.random.randint(self.node_num) 207 | if iself.node_num-1): 222 | i1=floor(rand()*1.0/RAND_MAX*self.node_num)+1 223 | j1=i1 224 | while(j1<0 or j1>self.node_num-1 or j1==i1): 225 | j1=floor(rand()*1.0/RAND_MAX*self.node_num)+1 226 | if(i1>j1): 227 | k=i1 228 | i1=j1 229 | j1=k 230 | i2=0 231 | while(i2<0 or i2>self.node_num-1): 232 | i2=floor(rand()*1.0/RAND_MAX*self.node_num)+1 233 | j2=i2 234 | while(j2<0 or j2>self.node_num-1 or j2==i2): 235 | j2=floor(rand()*1.0/RAND_MAX*self.node_num)+1 236 | if(i2>j2): 237 | k=i2 238 | i2=j2 239 | j2=k 240 | 241 | if(j1==j2): 242 | self.changelength=1 243 | self.changelist[1]=j1 244 | else: 245 | self.changelength=2 246 | self.changelist[1]=j1 247 | self.changelist[2]=j2 248 | 249 | self.mat[i1,j1]=1-self.mat[i1,j1] 250 | self.mat[i2,j2]=1-self.mat[i2,j2] 251 | 252 | cdef double **npy2c_double(np.ndarray a): 253 | cdef int m = a.shape[0] 254 | cdef int n = a.shape[1] 255 | cdef int i 256 | cdef double **data 257 | data = malloc(m*sizeof(double*)) 258 | for i in range(m): 259 | data[i] = &(a.data)[i*n] 260 | return data 261 | 262 | cdef np.ndarray c2npy_double(double **a, int n, int m): 263 | cdef np.ndarray[np.double_t,ndim=2]result = np.zeros((m,n),dtype=np.double) 264 | cdef double *dest 265 | cdef int i 266 | dest = malloc(m*n*sizeof(double*)) 267 | for i in range(m): 268 | memcpy(dest + i*n,a[i],m*sizeof(double*)) 269 | free(a[i]) 270 | memcpy(result.data,dest,m*n*sizeof(double*)) 271 | free(dest) 272 | free(a) 273 | return result 274 | 275 | cdef int **npy2c_int(np.ndarray a): 276 | cdef int m = a.shape[0] 277 | cdef int n = a.shape[1] 278 | cdef int i 279 | cdef int **data 280 | data = malloc(m*sizeof(int*)) 281 | for i in range(m): 282 | data[i] = &(a.data)[i*n] 283 | return data 284 | 285 | cdef np.ndarray c2npy_int(int **a, int n, int m): 286 | cdef np.ndarray[np.int32_t,ndim=2]result = np.zeros((m,n),dtype=np.int32) 287 | cdef int *dest 288 | cdef int i 289 | dest = malloc(m*n*sizeof(int*)) 290 | for i in range(m): 291 | memcpy(dest + i*n,a[i],m*sizeof(int*)) 292 | free(a[i]) 293 | memcpy(result.data,dest,m*n*sizeof(int*)) 294 | free(dest) 295 | free(a) 296 | return result 297 | 298 | -------------------------------------------------------------------------------- /samcnet/bayesnet.so: -------------------------------------------------------------------------------- 1 | ../build/bayesnet.so -------------------------------------------------------------------------------- /samcnet/bayesnetcpd.pxd: -------------------------------------------------------------------------------- 1 | from bayesnet cimport BayesNet 2 | from dai_bind cimport FactorGraph, Var, VarSet, JTree 3 | from libcpp.vector cimport vector 4 | 5 | cdef class BayesNetCPD: 6 | """To initialize a BayesNetCPD we need: 7 | 8 | 
==Required== 9 | nodes: Node names. 10 | states: Arities. 11 | data: For -logP calculations. 12 | 13 | ==Optional prior information== 14 | priorweight: A float used in the -logP calculations. 15 | template: An adjacency matrix over the nodes with float values from [0,1] 16 | 17 | With all of these parameters, the network will be initialized with no 18 | interconnections and all nodes assigned a uniform probability over their 19 | arities. 20 | 21 | """ 22 | cdef public: 23 | # From Bayesnet 24 | object states,data,x,mat,changelist,fvalue 25 | object oldmat, oldx, oldfvalue 26 | int limparent, node_num, changelength 27 | 28 | double logqfactor # For RJMCMC weighting of the acceptance probability 29 | double memo_entropy 30 | object dirty 31 | cdef: 32 | vector[Var] pnodes 33 | vector[vector[ulong]] pdata 34 | FactorGraph fg 35 | JTree jtree 36 | int convert(BayesNetCPD self, int node, vector[ulong] state) 37 | int convert_separate(BayesNetCPD self, int node, int state, int parstate) 38 | -------------------------------------------------------------------------------- /samcnet/bayesnetcpd.so: -------------------------------------------------------------------------------- 1 | ../build/bayesnetcpd.so -------------------------------------------------------------------------------- /samcnet/calibrate.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import samcnet.mh as mh 3 | from samcnet.mixturepoisson import * 4 | 5 | def rho_matrix(p, diag, offdiag): 6 | assert np.abs(diag) >= np.abs(offdiag) 7 | return np.diag(np.ones(p) * diag) + (np.ones((p,p)) - np.eye(p)) * offdiag 8 | 9 | def calc_avgs(db): 10 | D = db.mu.read()[0].size 11 | mumean = db.mu.read().mean() 12 | sigmean = db.sigma.read().mean(axis=0) 13 | return np.r_[mumean, sigmean[0,0], sigmean[1,1], sigmean[0,1]] 14 | 15 | def get_calibration_params(params, D): 16 | meanp = params.mean(axis=0) 17 | mumean = meanp[0] 18 | muvar = params[:,0].var(ddof=1) 19 | 20 | diags = params[:,[1,2]] 21 | sigdiagmean = diags.mean() 22 | sigoffmean = params[:,3].mean() 23 | 24 | sigdiagvar = 1./(diags.size - 1) * ((sigdiagmean - diags.flatten())**2).sum() 25 | sigma2 = 2 * sigdiagmean * (sigdiagmean**2/sigdiagvar + 1) 26 | rho = sigoffmean/sigdiagmean 27 | kappa = 2*sigdiagmean**2 / sigdiagvar + D + 3 28 | 29 | S=rho_matrix(D, sigma2, sigma2*rho) 30 | try: 31 | np.linalg.cholesky(S) 32 | except np.linalg.LinAlgError: 33 | S = rho_matrix(D, sigma2, 0.0) 34 | return dict( 35 | priormu=np.ones(D) * mumean, 36 | priorsigma=np.ones(D) * muvar, 37 | kappa=int(kappa), 38 | S=S) 39 | 40 | def record_hypers(output, p0, p1): 41 | for k in p0.keys(): 42 | if type(p0[k]) == np.ndarray: 43 | output['p0_'+k] = list(p0[k].flat) 44 | output['p1_'+k] = list(p1[k].flat) 45 | else: 46 | output['p0_'+k] = p0[k] 47 | output['p1_'+k] = p1[k] 48 | 49 | def calibrate(rawdata, sel, params): 50 | iters = params['iters'] 51 | num_feat = params['num_feat'] 52 | burn = params['burn'] 53 | thin = params['thin'] 54 | c = params['c'] 55 | d = params.get('d', 10) 56 | 57 | paramlog0 = np.empty((0,4), dtype=float) 58 | paramlog1 = np.empty((0,4), dtype=float) 59 | for feats in sel['subcalibs']: 60 | dist0 = MPMDist(rawdata.loc[sel['trn0'],feats], 61 | priorkappa=params['priorkappa'], 62 | lammove=params['lammove'], 63 | mumove=params['mumove'], 64 | d=d, 65 | usepriors=False) 66 | dist1 = MPMDist(rawdata.loc[sel['trn1'],feats], 67 | priorkappa=params['priorkappa'], 68 | lammove=params['lammove'], 69 | 
mumove=params['mumove'], 70 | d=d, 71 | usepriors=False) 72 | mpm = MPMCls(dist0, dist1, c=c) 73 | mhmc = mh.MHRun(mpm, burn=burn, thin=thin, verbose=False) 74 | mhmc.sample(iters,verbose=False) 75 | paramlog0 = np.vstack(( paramlog0, calc_avgs(mhmc.db.root.object.dist0) )) 76 | paramlog1 = np.vstack(( paramlog1, calc_avgs(mhmc.db.root.object.dist1) )) 77 | mhmc.clean_db() 78 | 79 | p0 = get_calibration_params(paramlog0, num_feat) 80 | p1 = get_calibration_params(paramlog1, num_feat) 81 | return p0, p1 82 | -------------------------------------------------------------------------------- /samcnet/csnet.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from "cost.h": 2 | double cost(int node_num, 3 | int data_num, 4 | int limparent, 5 | int *state, 6 | int **datax, 7 | double prior_alpha, 8 | double prior_gamma, 9 | double **priormat, 10 | int *x, 11 | int **mat, 12 | double *fvalue, 13 | int *changelist, 14 | int changelength) 15 | 16 | -------------------------------------------------------------------------------- /samcnet/dai_bind.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | from libcpp.string cimport string 3 | from libcpp.map cimport map 4 | 5 | cdef extern from "dai/var.h" namespace "dai": 6 | cdef cppclass Var: 7 | Var(size_t, size_t) 8 | size_t states() 9 | size_t label() 10 | 11 | #cdef extern from "dai/smallset.h" namespace "dai": 12 | #cdef cppclass SmallSet[T]: 13 | #SmallSet() 14 | #SmallSet(T) 15 | 16 | cdef extern from "dai/util.h" namespace "dai": 17 | ctypedef void* BigInt 18 | size_t BigInt_size_t(BigInt &) 19 | 20 | cdef extern from "dai/varset.h" namespace "dai": 21 | size_t calcLinearState( VarSet &, map[Var, size_t] &) 22 | map[Var, size_t] calcState( VarSet &, size_t) 23 | cdef cppclass VarSet: 24 | VarSet() 25 | #VarSet(SmallSet[Var] &) 26 | VarSet(Var &) 27 | VarSet(Var &, Var &) 28 | VarSet(vector[Var].iterator, vector[Var].iterator, size_t) 29 | BigInt nrStates() 30 | #SmallSet[Var] & operator|(SmallSet[Var] &) 31 | size_t size() 32 | vector[Var] & elements() 33 | vector[Var].iterator begin() 34 | vector[Var].iterator end() 35 | VarSet& insert(Var &) 36 | VarSet& erase(Var &) 37 | VarSet operator/(VarSet &) 38 | VarSet operator|(VarSet &) 39 | #VarSet& remove(VarSet &) 40 | #VarSet& add(VarSet &) 41 | bint operator==(VarSet&, VarSet&) 42 | 43 | cdef extern from "dai/properties.h" namespace "dai": 44 | cdef cppclass PropertySet: 45 | PropertySet() 46 | PropertySet(string) 47 | 48 | cdef extern from "dai/factor.h" namespace "dai": 49 | cdef cppclass TFactor[T]: 50 | TFactor() 51 | TFactor(Var &) 52 | TFactor(VarSet &) 53 | void set(size_t, T) 54 | T get(size_t) 55 | VarSet & vars() 56 | size_t nrStates() 57 | T entropy() 58 | TFactor[T] marginal(VarSet &) 59 | TFactor[T] embed(VarSet &) 60 | T operator[](size_t) 61 | T normalize() 62 | 63 | ctypedef TFactor[double] Factor 64 | 65 | cdef extern from "dai/factorgraph.h" namespace "dai": 66 | cdef cppclass FactorGraph: 67 | FactorGraph() 68 | FactorGraph(vector[Factor] &) 69 | Var & var(size_t) 70 | FactorGraph* clone() 71 | vector[Var] & vars() 72 | size_t nrVars() 73 | size_t nrFactors() 74 | size_t nrEdges() 75 | double logScore(vector[long unsigned int] &) 76 | Factor factor(int) 77 | void setFactor(int, Factor, bool) except + 78 | void setFactor(int, Factor) except + 79 | void clearBackups() except + 80 | void restoreFactors() except + 81 | void ReadFromFile(char*) 82 | 83 | cdef 
extern from "dai/jtree.h" namespace "dai": 84 | cdef cppclass JTree: 85 | JTree() 86 | JTree(FactorGraph &, PropertySet &) 87 | void init() 88 | void run() 89 | size_t Iterations() 90 | string printProperties() 91 | Factor calcMarginal(VarSet &) 92 | Factor belief(VarSet &) 93 | -------------------------------------------------------------------------------- /samcnet/generator.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | import numpy as np 3 | import random as ra 4 | from collections import defaultdict 5 | from functools import partial 6 | from math import ceil,floor 7 | from probability import CPD,fast_space_iterator,JointDistribution 8 | 9 | def generateSubGraphs(numgraphs = 2, nodespersub = 5, interconnects = 0): 10 | kernel = lambda x: x**0.0001 11 | x = temp = nx.gn_graph(nodespersub, kernel) 12 | 13 | for i in range(numgraphs-1): 14 | temp = nx.gn_graph(nodespersub, kernel) 15 | x = nx.disjoint_union(x, temp) 16 | #Now add some crosstalk 17 | workinglen = len(x) - len(temp) 18 | for i in range(interconnects): 19 | firstnode = ra.choice(range(workinglen)) 20 | secondnode = ra.choice(range(workinglen,len(x))) 21 | 22 | newedge = [firstnode, secondnode] 23 | ra.shuffle(newedge) 24 | x.add_edge(newedge[0], newedge[1]) 25 | return x 26 | 27 | def generateHourGlassGraph(nodes=10, interconnects = 0): 28 | def flipGraph(g): 29 | e = g.edges() 30 | g.remove_edges_from(e) 31 | g.add_edges_from(zip(*zip(*e)[::-1])) 32 | 33 | kernel = lambda x: x**0.0001 34 | if nodes < 4: 35 | return nx.gn_graph(nodes,kernel) 36 | n1 , n2 = int(floor(nodes/2.)), int(ceil(nodes/2.)) 37 | x1 = nx.gn_graph(n1, kernel) 38 | x2 = nx.gn_graph(n2, kernel) 39 | flipGraph(x2) 40 | x = nx.disjoint_union(x1,x2) 41 | x.add_edge(0,n1+1) 42 | 43 | for i in range(interconnects): 44 | firstnode = ra.choice(range(n1)) 45 | secondnode = ra.choice(range(n1,n1+n2)) 46 | #newedge = [firstnode, secondnode] 47 | #ra.shuffle(newedge) 48 | #x.add_edge(newedge[0], newedge[1]) 49 | x.add_edge(firstnode,secondnode) 50 | return x 51 | 52 | def noisylogic(name, arity, pdomain): 53 | """ 54 | Generate a CPD where each value is the cumulative distribution of 55 | the categorical random variable given the parents configuration as the key. 56 | 57 | In this case, we want to pick a random logic function, and then return that. 58 | 59 | Assuming binary valued nodes for now. 60 | """ 61 | eps = 0.1 62 | if pdomain == {}: 63 | if np.random.rand() < 0.5: 64 | params = {(): np.array([ra.choice([0+eps, 1-eps])])} 65 | else: 66 | params = {(): np.array([0.5])} 67 | else: 68 | params = {} 69 | for key in fast_space_iterator(pdomain): 70 | params[key] = np.array([ra.choice([0+eps, 1-eps])]) 71 | return CPD(name, arity, params, pdomain) 72 | 73 | def dirichlet(name, arity, pdomain): 74 | """ 75 | Generate a CPD where each value is the cumulative distribution of 76 | the categorical random variable given the parents configuration as the key. 77 | 78 | In this case, we will use a random distribution for each state of the 79 | parents variables. 
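Only the first arity-1 probabilities are stored for each parent configuration; the final entry is implied, since each conditional distribution must sum to one.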
80 | """ 81 | params = {} 82 | for key in fast_space_iterator(pdomain): 83 | params[key] = np.random.dirichlet([1.]*(arity))[:-1] 84 | return CPD(name, arity, params, pdomain) 85 | 86 | def generateJoint(graph, method='dirichlet'): 87 | numnodes = graph.number_of_nodes() 88 | adj = np.array(nx.to_numpy_matrix(graph),dtype=np.int) 89 | states = np.ones(numnodes)*2 90 | names = graph.nodes() #or maybe np.arange(numnodes) 91 | 92 | if method == 'dirichlet': 93 | func = dirichlet 94 | elif method == 'noisylogic': 95 | assert np.all(states == 2) # we can generalize this later 96 | func = noisylogic 97 | cpds = [func(nd,st,{k:int(states[k]) 98 | for k in graph.predecessors(nd)}) for nd,st in zip(names,states)] 99 | joint = JointDistribution() 100 | for cpd in cpds: 101 | joint.add_distribution(cpd) 102 | return joint, states 103 | 104 | def generateData(graph, joint, numpoints=50, noise=0.0): 105 | """ 106 | Generate random draws from graph, with 107 | randomly assigned CPDs and additive zero mean Gaussian 108 | noise with std_dev=noise on the observations. 109 | """ 110 | numnodes = graph.number_of_nodes() 111 | order = nx.topological_sort(graph) 112 | adj = np.array(nx.to_numpy_matrix(graph),dtype=np.int) 113 | states = np.ones(numnodes)*2 # FIXME this is hardcoded atm 114 | cpds = zip(*sorted((k,v) for k,v in joint.dists.iteritems()))[1] 115 | draws = np.empty((numpoints, numnodes), dtype=np.int) 116 | for i in range(numpoints): 117 | for node in order: 118 | parents = adj[:,node] 119 | parstate = tuple(draws[i,parents==1]) 120 | if np.random.rand() < noise: 121 | draws[i,node] = np.random.randint(0, states[node]) 122 | else: 123 | draws[i,node] = np.searchsorted(np.cumsum(cpds[node].params[parstate]), np.random.random()) 124 | return draws 125 | 126 | def sampleTemplate(graph, numEdges=3): 127 | edges = graph.edges() 128 | ra.shuffle(edges) 129 | new = graph.copy() 130 | new.remove_edges_from(new.edges()) 131 | new.add_edges_from(edges[:numEdges]) 132 | return new 133 | 134 | -------------------------------------------------------------------------------- /samcnet/metropolis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | class MHRun(): 3 | def __init__(self, obj, burn, thin=1): 4 | self.obj = obj 5 | self.burn = burn 6 | self.db = None 7 | 8 | self.mapvalue = None 9 | self.mapenergy = None 10 | 11 | self.thin = thin 12 | 13 | self.iteration = 0 14 | 15 | self.propose = 0 16 | self.accept = 0 17 | 18 | def sample(self, num): 19 | num = int(num) 20 | self.db = self.obj.init_db(self.db, (self.iteration + num - self.burn)//self.thin, 'mh') 21 | minenergy = np.infty 22 | 23 | oldenergy = self.obj.energy() 24 | for i in range(int(num)): 25 | self.iteration += 1 26 | 27 | self.obj.propose() 28 | self.propose+=1 29 | 30 | newenergy = self.obj.energy() 31 | 32 | r = oldenergy - newenergy # ignoring non-symmetric proposals for now 33 | if r > 0.0 or np.random.rand() < np.exp(r): 34 | # Accept 35 | oldenergy = newenergy 36 | self.accept += 1 37 | else: 38 | self.obj.reject() 39 | 40 | if self.iteration>self.burn and i%self.thin == 0: 41 | self.obj.save_to_db(self.db, 0, oldenergy, (self.iteration-self.burn-1)//self.thin) 42 | 43 | if oldenergy < minenergy: 44 | minenergy = oldenergy 45 | self.mapvalue = self.obj.copy() 46 | self.mapenergy = oldenergy 47 | 48 | if self.iteration%1e3 == 0: 49 | print "Iteration: %9d, best energy: %7f, current energy: %7f" \ 50 | % (self.iteration, minenergy, oldenergy) 51 | 52 | print "Sampling done, acceptance: %d/%d = %f" \
53 | % (self.accept, self.propose, float(self.accept)/float(self.propose)) 54 | 55 | -------------------------------------------------------------------------------- /samcnet/mh.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | cimport cython 3 | from libc.math cimport exp, ceil, floor 4 | 5 | import sys 6 | import os 7 | import tempfile 8 | import zlib 9 | import tables as t 10 | from collections import Counter 11 | 12 | import numpy as np 13 | cimport numpy as np 14 | 15 | cdef class MHRun: 16 | cdef public: 17 | object obj, db, hist, mapvalue, verbose, scheme_accept, scheme_propose 18 | int accept_loc, total_loc, iteration, burn, thin 19 | double mapenergy 20 | def __init__(self, obj, burn=100000, thin=1, verbose=False): 21 | self.verbose = verbose 22 | self.obj = obj 23 | self.clear() 24 | 25 | self.burn = burn 26 | self.thin = thin 27 | self.db = None 28 | 29 | self.scheme_accept = Counter() 30 | self.scheme_propose = Counter() 31 | 32 | def clear(self): 33 | self.db = None 34 | self.mapenergy = np.inf 35 | self.mapvalue = None 36 | self.iteration = 0 37 | self.accept_loc = 0 38 | self.total_loc = 0 39 | 40 | def init_db(self, size): 41 | if self.db == None: 42 | filt = t.Filters(complib='bzip2', complevel=7, fletcher32=True) 43 | if not os.path.exists('.tmp'): 44 | print("Creating temp directory: .tmp") 45 | os.mkdir('.tmp') 46 | name = tempfile.mktemp(prefix='mh', dir='.tmp') 47 | self.db = t.openFile(name, mode = 'w', title='Metropolis Hastings Run Data', filters=filt) 48 | self.db.root._v_attrs["mcmc_type"] = 'mh' 49 | 50 | self.db.createGroup('/', 'mh', 'Metropolis Hastings info', filters=filt) 51 | self.db.createEArray('/mh', 'energy_trace', t.Float64Atom(), (0,), expectedrows=size) 52 | 53 | objdb = self.db.createGroup('/', 'object', 'Object info', filters=filt) 54 | samples = self.db.createGroup(objdb, 'samples', 'Samples') 55 | objfxn = self.db.createGroup(objdb, 'objfxn', 'Objective function outputs') 56 | 57 | self.obj.init_db(self.db, self.db.root.object, size) 58 | 59 | def read_db(self): 60 | assert self.db.isopen == 1, "DB not open!" 61 | fname = self.db.filename 62 | self.db.close() 63 | fid = open(fname, 'r') 64 | data = zlib.compress(fid.read()) 65 | fid.close() 66 | self.db = t.openFile(fname, 'r+') 67 | return data 68 | 69 | def close_db(self): 70 | self.db.close() 71 | 72 | def clean_db(self): 73 | fname = self.db.filename 74 | self.close_db() 75 | os.remove(fname) 76 | 77 | def save_iter_db(self, double energy, int iteration): 78 | self.db.root.mh.energy_trace.append((energy,)) 79 | self.obj.save_iter_db(self.db, self.db.root.object) 80 | 81 | def save_state_db(self): 82 | mhroot = self.db.root.mh 83 | 84 | mhroot._v_attrs.prop_accept = self.accept_loc 85 | mhroot._v_attrs.prop_total = self.total_loc 86 | 87 | mhroot._v_attrs.burnin = self.burn 88 | mhroot._v_attrs.thin = self.thin 89 | mhroot._v_attrs.curr_iteration = self.iteration 90 | 91 | def compute_means(self, cummeans=True): 92 | """ 93 | Using the currently saved samples from the object in the pytables db, 94 | compute the cumulative mean of the function on the random weighted samples. 95 | And save the results to the /computed/cummeans region of the db. 96 | """ 97 | assert self.db != None, 'db not initialized' 98 | #assert len(self.db) != 0, 'Length of db is zero! 
Perhaps you have not "\ 99 | #"proceeded beyond the burn-in period' 100 | 101 | if not 'computed' in self.db.root: 102 | self.db.createGroup('/', 'computed', 'Computed quantities') 103 | if cummeans and not 'cummeans' in self.db.root.computed: 104 | cumgroup = self.db.createGroup('/computed', 'cummeans', 'Cumulative means') 105 | elif cummeans and 'cummeans' in self.db.root.computed: 106 | cumgroup = self.db.root.computed.cummeans 107 | if 'means' in self.db.root.computed: 108 | meangroup = self.db.root.computed.means 109 | else: 110 | meangroup = self.db.createGroup('/computed', 'means', 'Means') 111 | for item in self.db.walkNodes('/object'): 112 | if isinstance(item, t.array.Array): 113 | funcs = item.read().astype(np.float) 114 | if cummeans: 115 | numerator = funcs.cumsum(axis=0) 116 | if item.name in cumgroup: 117 | raise Exception("Not implemented yet: multiple calls to func_cummean") 118 | arr = self.db.createCArray(cumgroup, item.name, 119 | t.Float64Atom(shape=funcs[-1].shape), 120 | (funcs.size,)) 121 | denom = np.ones(funcs.size).cumsum() 122 | arr[:] = (numerator.T / denom).T 123 | meangroup._v_attrs[item.name] = arr[-1] 124 | else: 125 | denom = funcs.size() 126 | numerator = funcs.sum(axis=0) 127 | meangroup._v_attrs[item.name] = (numerator/denom).astype(np.float) 128 | 129 | #@cython.boundscheck(False) # turn off bounds-checking for entire function 130 | def sample(self, int iters, object verbose = False): 131 | cdef int current_iter, accept, i, dbsize, scheme 132 | cdef double oldenergy, newenergy, r 133 | oldenergy = self.obj.energy() 134 | 135 | dbsize = (self.iteration + int(iters) - self.burn)//self.thin 136 | if dbsize < 0: 137 | dbsize = 0 138 | self.init_db(dbsize) 139 | 140 | if self.verbose: 141 | print("Initial Energy: %g" % oldenergy) 142 | 143 | for current_iter in range(self.iteration, self.iteration + int(iters)): 144 | self.iteration += 1 145 | 146 | scheme = self.obj.propose() 147 | newenergy = self.obj.energy() 148 | 149 | if newenergy < self.mapenergy: # NB: Even if not accepted 150 | self.mapenergy = newenergy 151 | self.mapvalue = self.obj.copy() 152 | 153 | ####### acceptance of new moves ######### 154 | 155 | r = oldenergy-newenergy 156 | 157 | self.scheme_propose[scheme] += 1 158 | if np.random.rand() < exp(r): 159 | accept=1 160 | self.scheme_accept[scheme] += 1 161 | else: 162 | accept=0; 163 | if verbose:# and self.iteration % 10 == 0: 164 | print("old: %8.2f, new: %8.2f, r: %5.2f, accept: %d" % (oldenergy, newenergy, r, accept)) 165 | 166 | if accept == 0: 167 | self.obj.reject() 168 | self.total_loc += 1 169 | elif accept == 1: 170 | self.accept_loc += 1 171 | self.total_loc += 1 172 | oldenergy = newenergy 173 | 174 | if current_iter >= self.burn and current_iter % self.thin == 0: 175 | self.save_iter_db(oldenergy, 176 | (current_iter-self.burn)//self.thin) 177 | 178 | if self.iteration % 1000 == 0 and self.verbose: 179 | print("Iteration: %8d, best energy: %7g, current energy: %7g" % \ 180 | (self.iteration, self.mapenergy, newenergy)) 181 | 182 | self.save_state_db() 183 | 184 | if self.verbose: 185 | ###### Calculate summary statistics ####### 186 | print("Accept_loc: %d" % self.accept_loc) 187 | print("Total_loc: %d" % self.total_loc) 188 | print("Acceptance: %f" % (float(self.accept_loc)/float(self.total_loc))) 189 | 190 | -------------------------------------------------------------------------------- /samcnet/mh.so: -------------------------------------------------------------------------------- 1 | ../build/mh.so 
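A note on the `MHRun` interface compiled above: the sampled object supplies the proposal, energy, rollback, and snapshot methods, plus two pytables hooks. The following is a minimal sketch against that interface; the standard-normal target, proposal scale, and no-op persistence hooks are illustrative assumptions (`samcnet/simple.py` further down is a fuller reference object).
```
# Minimal sketch of the object interface samcnet.mh.MHRun drives; the
# standard-normal target and proposal scale here are illustrative.
import numpy as np
import scipy.stats as st

class UnitGaussian(object):
    def __init__(self):
        self.x = 0.0
        self.old_x = 0.0

    def propose(self):
        self.old_x = self.x                   # stash state for reject()
        self.x += np.random.normal(scale=0.5) # random-walk move
        return 0                              # proposal "scheme" id

    def reject(self):
        self.x = self.old_x                   # roll back a rejected move

    def copy(self):
        return self.x                         # snapshot kept as the MAP value

    def energy(self):
        return -st.norm.logpdf(self.x)        # negative log density

    # pytables hooks called by mh.pyx; no-ops suffice for a smoke test
    def init_db(self, db, node, size):
        pass
    def save_iter_db(self, db, node):
        pass
```
Something like `samcnet.mh.MHRun(UnitGaussian(), burn=100, thin=10).sample(1e4)` should then exercise the accept/reject loop end to end.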
-------------------------------------------------------------------------------- /samcnet/mixturepoisson.so: -------------------------------------------------------------------------------- 1 | ../build/mixturepoisson.so -------------------------------------------------------------------------------- /samcnet/probability.pxd: -------------------------------------------------------------------------------- 1 | from pydai cimport PyJTree 2 | 3 | cpdef object fast_space_iterator(object domain) 4 | 5 | cdef class GroundNet: 6 | cdef public: 7 | object joint 8 | object mymarginal 9 | 10 | #object factors 11 | PyJTree jtree 12 | double entropy 13 | cpdef int mux(self, int state, int pastate, int pos, int numpars) 14 | 15 | cdef class JointDistribution: 16 | cdef public: 17 | object domain, parent_domain, dists 18 | 19 | cdef class CPD: 20 | cdef public: 21 | object name, parent_domain, sorted_parent_names, params 22 | int arity, parent_arity 23 | 24 | -------------------------------------------------------------------------------- /samcnet/probability.so: -------------------------------------------------------------------------------- 1 | ../build/probability.so -------------------------------------------------------------------------------- /samcnet/pydai.pxd: -------------------------------------------------------------------------------- 1 | from dai_bind cimport Var, VarSet, Factor, FactorGraph, JTree 2 | cimport numpy as np 3 | 4 | cdef np.ndarray [np.double_t, ndim=1, mode="c"] factor_to_array(Factor f) 5 | 6 | cdef class PyVar: 7 | cdef Var *thisptr 8 | cdef class PyVarSet: 9 | cdef VarSet *thisptr 10 | cdef class PyFactor: 11 | cdef Factor *thisptr 12 | cdef class PyJTree: 13 | cdef JTree *thisptr 14 | cdef FactorGraph *fg 15 | -------------------------------------------------------------------------------- /samcnet/pydai.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=False 2 | cimport cython 3 | from dai_bind cimport Var, VarSet, Factor, FactorGraph, JTree, PropertySet 4 | from libcpp.vector cimport vector 5 | from libcpp.string cimport string 6 | from cython.operator cimport dereference as deref 7 | 8 | cimport numpy as np 9 | import numpy as np 10 | 11 | cdef class PyVar: 12 | """A variable, initialized with (int, int) being the label 13 | and arity""" 14 | def __cinit__(self, *args):#int label, int states): 15 | if len(args) == 0: 16 | self.thisptr = NULL 17 | return 18 | assert len(args) == 2 19 | #print "ALLOCATE PyVar" 20 | self.thisptr = new Var(args[0], args[1]) 21 | def __dealloc__(self): 22 | if self.thisptr is not NULL: 23 | #print "DELETING PyVar" 24 | del self.thisptr 25 | else: 26 | print "Possible memory leak PyVar" 27 | def states(self): 28 | if self.thisptr is not NULL: 29 | return self.thisptr.states() 30 | else: 31 | raise MemoryError() 32 | def label(self): 33 | if self.thisptr is not NULL: 34 | return self.thisptr.label() 35 | else: 36 | raise MemoryError() 37 | 38 | cdef class PyVarSet: 39 | """A set of variables initialized with a variable 40 | number of PyVars.""" 41 | def __cinit__(self, *args): 42 | cdef int i 43 | cdef vector[Var] vargs 44 | if len(args) == 0: 45 | self.thisptr = NULL 46 | return 47 | for i in range(len(args)): 48 | assert isinstance(args[i], PyVar) 49 | vargs.push_back(Var(args[i].label(), args[i].states())) 50 | self.thisptr = new VarSet(vargs.begin(), vargs.end(), vargs.size()) 51 | #print "ALLOCATE PyVarSet" 52 | if self.thisptr is NULL: 53 | raise MemoryError() 54 | def 
__dealloc__(self): 55 | if self.thisptr is not NULL: 56 | #print "DELETING PyVarSet" 57 | del self.thisptr 58 | else: 59 | print "Possible memory leak PyVarSet" 60 | def __len__(self): 61 | if self.thisptr is not NULL: 62 | return self.thisptr.size() 63 | else: 64 | raise MemoryError() 65 | #cdef copy(self): 66 | #if self.thisptr is not NULL: 67 | 68 | #return self.thisptr.size() 69 | #else: 70 | #raise MemoryError() 71 | 72 | cdef class PyFactor: 73 | """A factor initialized with either a PyVar or a PyVarSet.""" 74 | def __cinit__(self, *arg): 75 | if len(arg) == 0: 76 | self.thisptr = NULL 77 | #print "Created Empty PyFactor" 78 | return 79 | assert len(arg) == 1 80 | arg = arg[0] 81 | if isinstance(arg,PyVar): 82 | self.thisptr = new Factor(deref((arg).thisptr)) 83 | elif isinstance(arg,PyVarSet): 84 | self.thisptr = new Factor(deref((arg).thisptr)) 85 | #print "ALLOCATED new PyFactor" 86 | if self.thisptr is NULL: 87 | raise MemoryError() 88 | def __dealloc__(self): 89 | if self.thisptr is not NULL: 90 | #print "DELETING PyFactor" 91 | del self.thisptr 92 | else: 93 | print "Possible memory leak PyFactor" 94 | 95 | def nrStates(self): 96 | if self.thisptr is not NULL: 97 | return self.thisptr.nrStates() 98 | else: 99 | raise MemoryError() 100 | def entropy(self): 101 | if self.thisptr is not NULL: 102 | return self.thisptr.entropy() 103 | else: 104 | raise MemoryError() 105 | def get(self, int i): 106 | if self.thisptr is not NULL: 107 | return self.thisptr.get(i) 108 | else: 109 | raise MemoryError() 110 | def set(self, int i, double value): 111 | if self.thisptr is not NULL: 112 | self.thisptr.set(i,value) 113 | else: 114 | raise MemoryError() 115 | def vars(self): 116 | cdef VarSet *copy = new VarSet() 117 | if self.thisptr is not NULL: 118 | copy[0] = self.thisptr.vars() 119 | 120 | pv = PyVarSet() 121 | pv.thisptr = copy 122 | 123 | return pv 124 | else: 125 | raise MemoryError() 126 | def marginal(self, PyVarSet vs): 127 | cdef Factor *copy = new Factor() 128 | if self.thisptr is not NULL: 129 | copy[0] = self.thisptr.marginal(deref(vs.thisptr)) 130 | 131 | pv = PyFactor() 132 | pv.thisptr = copy 133 | return pv 134 | else: 135 | raise MemoryError() 136 | def normalize(self): 137 | if self.thisptr is not NULL: 138 | self.thisptr.normalize() 139 | else: 140 | raise MemoryError() 141 | def __repr__(self): 142 | if self.thisptr is NULL: 143 | raise MemoryError() 144 | 145 | cdef int states = self.thisptr.nrStates() 146 | cdef VarSet vs = self.thisptr.vars() 147 | cdef vector[Var] celements = vs.elements() 148 | s = '' 149 | elements = [] 150 | for i in range(vs.size()): 151 | elements.append((celements[i].label(), celements[i].states())) 152 | s += '[(State, Arity)]:\n' 153 | s += str(elements) 154 | s += '\nState: value\n' 155 | for i in range(states): 156 | s += ' %3d: %3f\n' % (i,self.thisptr.get(i)) 157 | return s 158 | def __array__(self): 159 | cdef int nr = self.thisptr.nrStates() 160 | cdef np.ndarray[np.double_t, ndim=1, mode="c"] x = np.empty(nr, dtype=np.double) 161 | cdef int i 162 | for i in range(nr): 163 | x[i] = self.thisptr.get(i) 164 | return x 165 | 166 | cdef np.ndarray [np.double_t, ndim=1, mode="c"] factor_to_array(Factor f): 167 | cdef int nr = f.nrStates() 168 | cdef np.ndarray[np.double_t, ndim=1, mode="c"] x = np.empty(nr, dtype=np.double) 169 | cdef int i 170 | for i in range(nr): 171 | x[i] = f.get(i) 172 | return x 173 | 174 | cdef class PyJTree: 175 | """A junction tree for efficient inference, initialized with a varargs number of 176 | PyFactors.""" 
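# Typical flow (see tests/dai_test.py): jt = PyJTree(*factors); jt.init();
# jt.run(); then jt.marginal(PyVarSet(u, v)) computes an exact marginal and
# jt.belief(...) reads a belief off the calibrated tree.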
177 | def __cinit__(self, *facs): 178 | cdef int i 179 | cdef vector[Factor] vargs 180 | for i in range(len(facs)): 181 | assert isinstance(facs[i], PyFactor) 182 | vargs.push_back(deref(((facs[i])).thisptr)) # Should we be copying here instead?? 183 | self.fg = new FactorGraph(vargs) 184 | if self.fg is NULL: 185 | raise MemoryError() 186 | cdef PropertySet ps = PropertySet('[updates=HUGIN]') 187 | self.thisptr = new JTree(deref(self.fg), ps) 188 | #print "ALLOCATED Jtree" 189 | if self.thisptr is NULL: 190 | raise MemoryError() 191 | 192 | def __dealloc__(self): 193 | if self.thisptr is not NULL and self.fg is not NULL: 194 | #print "DELETING PyJtree" 195 | del self.thisptr 196 | del self.fg 197 | else: 198 | print "Possible memory leak PyFactor" 199 | 200 | def init(self): 201 | if self.thisptr is not NULL: 202 | self.thisptr.init() 203 | else: 204 | raise MemoryError() 205 | def run(self): 206 | if self.thisptr is not NULL: 207 | self.thisptr.run() 208 | else: 209 | raise MemoryError() 210 | def iterations(self): 211 | if self.thisptr is not NULL: 212 | return self.thisptr.Iterations() 213 | else: 214 | raise MemoryError() 215 | 216 | def belief_array(self, *vs): 217 | cdef vector[Var] vec 218 | if self.fg is NULL or self.thisptr is NULL: 219 | raise MemoryError() 220 | if len(vs)==1: 221 | return factor_to_array(self.thisptr.belief(VarSet(self.fg.var(vs[0])))) 222 | elif len(vs)==2: 223 | return factor_to_array(self.thisptr.belief(VarSet(self.fg.var(vs[0]), self.fg.var(vs[1])))) 224 | else: 225 | for i in vs: 226 | vec.push_back(self.fg.var(i)) 227 | return factor_to_array(self.thisptr.belief(VarSet(vec.begin(), vec.end(), vec.size()))) 228 | 229 | def belief(self, PyVarSet vs): 230 | cdef Factor *copy = new Factor() 231 | if self.thisptr is not NULL: 232 | copy[0] = self.thisptr.belief(deref(vs.thisptr)) 233 | 234 | pv = PyFactor() 235 | pv.thisptr = copy 236 | return pv 237 | else: 238 | raise MemoryError() 239 | 240 | def marginal_array(self, *vs): 241 | cdef vector[Var] vec 242 | if self.fg is NULL or self.thisptr is NULL: 243 | raise MemoryError() 244 | if len(vs)==1: 245 | return factor_to_array(self.thisptr.calcMarginal(VarSet(self.fg.var(vs[0])))) 246 | elif len(vs)==2: 247 | return factor_to_array(self.thisptr.calcMarginal(VarSet(self.fg.var(vs[0]), self.fg.var(vs[1])))) 248 | else: 249 | for i in vs: 250 | vec.push_back(self.fg.var(i)) 251 | return factor_to_array(self.thisptr.calcMarginal(VarSet(vec.begin(), vec.end(), vec.size()))) 252 | 253 | def marginal(self, PyVarSet vs): 254 | cdef Factor *copy = new Factor() 255 | if self.thisptr is not NULL: 256 | copy[0] = self.thisptr.calcMarginal(deref(vs.thisptr)) 257 | 258 | pv = PyFactor() 259 | pv.thisptr = copy 260 | return pv 261 | else: 262 | raise MemoryError() 263 | 264 | if __name__ == '__main__': 265 | import doctest 266 | doctest.testmod() 267 | 268 | -------------------------------------------------------------------------------- /samcnet/pydai.so: -------------------------------------------------------------------------------- 1 | ../build/pydai.so -------------------------------------------------------------------------------- /samcnet/report.py: -------------------------------------------------------------------------------- 1 | import os, sys, zlib, redis 2 | import pylab as p 3 | from matplotlib import rc 4 | import numpy as np 5 | import tables as t 6 | 7 | rc('text', usetex=True) 8 | 9 | #custom:3dabbb5ffad97a8d205a6b22bd9543f5c5a0e1b7:p_struct=15:ntemplate=4:p_cpd=0 10 | 11 | def sweep_plot(r, jobhashes): 12 
| def transform(s): 13 | return dict([x.split('=') for x in s.split(':')]) 14 | def filt(crit, params, datasets): 15 | for i,p in enumerate(params): 16 | cp = dict([(k,int(v)) for k,v in p.iteritems()]) 17 | if cp == crit: 18 | return i 19 | params = [] 20 | datasets = [] 21 | for jobhash in jobhashes: 22 | keystart = 'custom:%s:' % jobhash 23 | cutlen = len(keystart) 24 | keys = r.keys(keystart + '*') 25 | 26 | params.extend([transform(x[cutlen:]) for x in keys]) 27 | datasets.extend([map(float,r.lrange(x,0,-1)) for x in keys]) 28 | 29 | pen = ['g','r','m','c'] 30 | x = [0.5,1.5,3.0,5.0,8.0,15.0,30.0] 31 | for pristine_cpd in [0, 1]: 32 | for numtemplate in [4,8]: 33 | res = [] 34 | err = [] 35 | for p_struct in map(lambda t: int(t*10), x): 36 | cpd = pristine_cpd * p_struct 37 | ind = filt({'ntemplate':numtemplate, 38 | 'p_cpd':cpd, 39 | 'p_struct':p_struct}, 40 | params, datasets) 41 | res.append(np.median(datasets[ind])) 42 | err.append(np.std(datasets[ind])) 43 | p.errorbar(x,res,yerr=err, 44 | color=pen.pop(), 45 | label='%s; $\,$ %d edges' % 46 | (r'$\gamma_{\textrm{cpd}}=\gamma_{\textrm{structural}}$' if pristine_cpd else 47 | r'$\gamma_{\textrm{cpd}}=0$', 48 | numtemplate), 49 | linewidth=2) 50 | 51 | #p.boxplot(datasets, positions=[int(x['p_struct'])/10. for x in params]) 52 | 53 | p.grid(True) 54 | p.legend(bbox_to_anchor=(.5,1), loc=2, fontsize=12) 55 | #legend(bbox_to_anchor=(0, 0, 1, 1), bbox_transform=gcf().transFigure) 56 | p.xlim(0,31) 57 | p.xlabel(r'$\gamma_{\textrm{structural}}$')#,fontsize=20) 58 | p.ylabel('Posterior average of KLD') 59 | #p.ylim(0,5) 60 | 61 | def cummeans_plot(ax, filelist, node, ylabel=None): 62 | first = True 63 | for d in filelist: 64 | fid = t.openFile(d, 'r') 65 | obj = fid.getNode(node) 66 | label = obj.name 67 | if label == 'freq_hist': 68 | x = np.linspace(fid.root.samc._v_attrs['lowEnergy'], 69 | fid.root.samc._v_attrs['highEnergy'], 70 | fid.root.samc._v_attrs['grid']) 71 | else: 72 | x = np.arange(obj.read().size) 73 | if first: 74 | ax.plot(x, obj.read(), 'b', alpha=0.4, label=label) 75 | else: 76 | ax.plot(x, obj.read(), 'b', alpha=0.4) 77 | first = False 78 | if 'descs' in fid.root.object._v_attrs and label in fid.root.object._v_attrs.descs: 79 | if ylabel: 80 | p.ylabel(ylabel) 81 | else: 82 | p.ylabel(fid.root.object._v_attrs.descs[label]) 83 | fid.close() 84 | ax.grid(True) 85 | ax.legend() 86 | 87 | def prompt_for_dataset(r, n=[]): 88 | # Get all job hashes with results and sort by time submitted 89 | done_hashes = sorted(r.keys('jobs:done:*'), key=lambda x: int(r.hget('jobs:times', x[10:]) or '0')) 90 | if n: 91 | return [done_hashes[x][10:] for x in n] 92 | # Print results 93 | for i, d in enumerate(done_hashes): 94 | desc = r.hget('jobs:descs', d[10:]) or '' 95 | num = r.llen(d) 96 | print "%4d. 
(%3s) %s %s" % (i, num, d[10:15], desc) 97 | 98 | sel = raw_input("Choose a dataset or 'q' to exit: ") 99 | if not sel.isdigit() or int(sel) not in range(i+1): 100 | sys.exit() 101 | return done_hashes[int(sel)][10:] 102 | 103 | def pull_data(r, basedir, jobhash): 104 | if not os.path.exists(basedir): 105 | os.mkdir(basedir) 106 | # Test if we already have pulled this data down 107 | TMPDIR = os.path.join(basedir, jobhash) 108 | if not os.path.isdir(TMPDIR): 109 | os.mkdir(TMPDIR) 110 | filelist = os.listdir(TMPDIR) 111 | if len(filelist) != r.llen('jobs:done:'+jobhash): 112 | # Grab datasets 113 | datastrings = r.lrange('jobs:done:'+jobhash, 0, -1) 114 | print "Persisting %d datasets from hash %s" % (len(datastrings), jobhash[:5]) 115 | for i,data in enumerate(datastrings): 116 | with open(os.path.join(TMPDIR, str(i)), 'w') as fid: 117 | fid.write(zlib.decompress(data)) 118 | filelist = os.listdir(TMPDIR) 119 | else: 120 | print "Found %d datasets from hash %s in cache" % (len(os.listdir(TMPDIR)), jobhash[:5]) 121 | return [os.path.join(TMPDIR, x) for x in filelist] 122 | 123 | def print_h5_info(loc): 124 | # Grab info that should be identical for all samples 125 | fid = t.openFile(loc, 'r') 126 | print("###### SAMC ######") 127 | for name in fid.root.samc._v_attrs._f_list('user'): 128 | print("%30s:\t%s" % (name, str(fid.root.samc._v_attrs[name]))) 129 | print("###### Object ######") 130 | for name in fid.root.object._v_attrs._f_list('user'): 131 | print("%30s:\t%s" % (name, str(fid.root.object._v_attrs[name]))) 132 | fid.close() 133 | 134 | def show_ground(r, jobhash): 135 | import cPickle as cp 136 | import networkx as nx 137 | import subprocess as sb 138 | x = r.hget('jobs:grounds', jobhash) 139 | z = cp.loads(zlib.decompress(x)) 140 | nx.write_dot(z, '/tmp/tmp.dot') 141 | sb.call('dot /tmp/tmp.dot -Tpng -o /tmp/tmp.png'.split()) 142 | sb.call('xdg-open /tmp/tmp.png'.split()) 143 | 144 | if __name__ == "__main__": 145 | 146 | r = redis.StrictRedis('localhost') 147 | #jobhash = prompt_for_dataset(r) 148 | 149 | #basedir = '/tmp/samcfiles' 150 | #filelist = pull_data(r, basedir, jobhash) 151 | 152 | #plot_list = [ 153 | ##'/samc/freq_hist', 154 | #'/computed/cummeans/entropy', 155 | #'/computed/cummeans/kld', 156 | #'/computed/cummeans/edge_distance'] 157 | #label_list = [ 158 | ##'Samples from energy', 159 | #'Entropy in bits', 160 | #'KLD in bits', 161 | #'Incorrect edge proportion'] 162 | #p.figure() 163 | #for i,node in enumerate(plot_list): 164 | #cummeans_plot(p.subplot(len(plot_list), 1, i+1), filelist, node, label_list[i]) 165 | #if i==0: 166 | #p.title(r.hget('jobs:descs', jobhash) + "\n" + \ 167 | #'Experiment version: ' + jobhash[:5] + '\n' + \ 168 | #'Code version: ' + r.hget('jobs:githashes', jobhash)[:5]) 169 | 170 | #p.xlabel('Samples obtained after burnin (after thinning)') 171 | 172 | #print_h5_info(filelist[0]) 173 | 174 | jobhashes = prompt_for_dataset(r, [25,26,27,28,29,30,31]) 175 | show_ground(r, jobhashes[0]) 176 | sweep_plot(r, jobhashes) 177 | 178 | p.show() 179 | -------------------------------------------------------------------------------- /samcnet/samc.so: -------------------------------------------------------------------------------- 1 | ../build/samc.so -------------------------------------------------------------------------------- /samcnet/simple.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats as st 3 | 4 | class Simple(): 5 | def __init__(self, truemu=2.0, sigma=1.0, 
N=20, mu0=0.0, tau=5.0): 6 | self.x = 0.0 7 | self.old_x = 0.0 8 | 9 | self.truemu = truemu 10 | self.sigma = sigma 11 | self.N = N 12 | 13 | self.mu0 = mu0 14 | self.tau = tau 15 | 16 | self.data = st.norm.rvs(loc=self.truemu, scale=self.sigma, size=self.N) 17 | 18 | self.postmode = (self.mu0/self.tau**2 + self.data.mean()/self.sigma**2) \ 19 | / (1/self.tau**2 + 1/self.sigma**2) 20 | 21 | print "Postmode: %f" % self.postmode 22 | 23 | def propose(self): 24 | self.old_x = self.x 25 | self.x += np.random.normal(scale=3.0) 26 | 27 | def copy(self): 28 | return self.x 29 | 30 | def save_to_db(self): 31 | return self.x 32 | 33 | def reject(self): 34 | self.x = self.old_x 35 | 36 | def energy(self): 37 | sum = 0.0 38 | # prior 39 | sum -= st.norm.logpdf(self.x, loc=self.mu0, scale=self.tau) 40 | 41 | # likelihood 42 | sum -= st.norm.logpdf(self.data, loc=self.x, scale=self.sigma).sum() 43 | return sum 44 | -------------------------------------------------------------------------------- /samcnet/tail.py: -------------------------------------------------------------------------------- 1 | # A simple example of calculating predictive posteriors in a normal unknown 2 | # mean unknown variance case. 3 | 4 | from __future__ import division 5 | from lori import MHRun 6 | 7 | import numpy as np 8 | import pylab as p 9 | import matplotlib as mpl 10 | import random 11 | from math import log, exp, pi, lgamma 12 | 13 | import scipy.stats as st 14 | import scipy 15 | from scipy.special import betaln 16 | import scipy.stats.distributions as di 17 | 18 | from statsmodels.sandbox.distributions.mv_normal import MVT 19 | #from sklearn.qda import QDA 20 | 21 | import sys 22 | sys.path.append('/home/bana/GSP/research/samc/code') 23 | sys.path.append('/home/bana/GSP/research/samc/code/build') 24 | 25 | mydb = [] 26 | 27 | class Classification(): 28 | def __init__(self): 29 | np.random.seed(1234) 30 | 31 | self.n = 4 # Data points 32 | 33 | self.true_mu = 0.0 34 | self.true_sigma = 1 #di.invgamma.rvs(3) 35 | 36 | # For G function calculation and averaging 37 | self.grid_n = 100 38 | low,high = -4, 4 39 | self.gextent = (low,high) 40 | self.grid = np.linspace(low,high,self.grid_n) 41 | self.gavg = np.zeros(self.grid_n) 42 | self.numgavg = 0 43 | 44 | #self.data = di.norm.rvs(size=self.n) 45 | self.data = np.array([0.0, -0.0, 0.5, -0.5]) 46 | assert self.data.size == self.n 47 | 48 | ######## Starting point of MCMC Run ####### 49 | self.mu = 0.0 50 | self.sigma = 2.0 51 | 52 | ###### Bookeeping ###### 53 | self.oldmu = None 54 | self.oldsigma = None 55 | 56 | ##### Prior Values and Confidences ###### 57 | self.priorsigma = 2 58 | self.kappa = 1 59 | self.priormu = 0 60 | self.nu = 8.0 61 | #### Calculating the Analytic solution given on page 15 of Lori's 62 | #### Optimal Classification eq 34. 
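# The next lines implement the standard conjugate posterior update:
# nu* = nu + n, mu* = (nu*mu0 + n*xbar)/(nu + n), kappa* = kappa + n, and
# S* = S0 + (n-1)*s^2 + nu*n/(nu + n)*(xbar - mu0)^2.
# NB: the denominator (self.nu+self.nu) in Sstar below looks like a typo
# for (self.nu+self.n) under that update.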
63 | self.nustar = self.nu + self.n 64 | 65 | samplemean = self.data.mean() 66 | samplevar = np.cov(self.data) 67 | 68 | self.mustar = (self.nu*self.priormu + self.n * samplemean) \ 69 | / (self.nu + self.n) 70 | self.kappastar = self.kappa + self.n 71 | self.Sstar = self.priorsigma + (self.n-1)*samplevar + self.nu*self.n/(self.nu+self.nu)\ 72 | * (samplemean - self.priormu)**2 73 | 74 | #### Now calculate effective class conditional densities from eq 55 75 | #### page 21 76 | 77 | #self.fx = MVT( 78 | #self.mu0star, 79 | #(self.nu0star+1)/(self.kappa0star-self.D+1)/self.nu0star * self.S0star, 80 | #self.kappa0star - self.D + 1) 81 | # So I'm pretty sure this is incorrect below, off by some scaling 82 | # parameters 83 | self.fx = MVT( 84 | [self.mustar], 85 | [(self.nustar+1)/(self.kappastar)/self.nustar * self.Sstar / 2], 86 | self.kappastar /2 ) 87 | 88 | self.analyticfx = self.fx.logpdf(self.grid.reshape(-1,1)) 89 | 90 | 91 | def propose(self): 92 | self.oldmu = self.mu 93 | self.oldsigma = self.sigma 94 | 95 | self.mu += np.random.randn()*0.1 96 | #self.mu = np.random.randn() 97 | self.sigma = di.invgamma.rvs(1) 98 | return 0 99 | 100 | def copy(self): 101 | return (self.mu, self.sigma, di.norm.rvs(loc=self.mu, scale=self.sigma)) 102 | 103 | def reject(self): 104 | self.mu = self.oldmu 105 | self.sigma = self.oldsigma 106 | 107 | def energy(self): 108 | sum = 0.0 109 | sum -= di.norm.logpdf(self.data, loc=self.mu, scale=self.sigma).sum() 110 | #Now add in the priors... 111 | sum -= log(self.sigma)*(-0.5) - self.nu/2 * (self.mu-self.priormu)**2/self.sigma 112 | sum -= log(self.sigma)*(self.kappa+2)/(-2) - 0.5*self.priorsigma/self.sigma 113 | return sum 114 | 115 | def calc_gfunc(self): 116 | return di.norm.pdf(self.grid, loc=self.mu, scale=self.sigma) 117 | 118 | def init_db(self, db, dbsize): 119 | pass 120 | #dtype = [('thetas',np.double), 121 | #('energies',np.double), 122 | #('funcs',np.double)] 123 | #if db == None: 124 | #return np.zeros(dbsize, dtype=dtype) 125 | #elif db.shape[0] != dbsize: 126 | #return np.resize(db, dbsize) 127 | #else: 128 | #raise Exception("DB Not inited") 129 | 130 | def save_to_db(self, db, theta, energy, iteration): 131 | #func = 0.0 132 | #db[iteration] = np.array([theta, energy, func]) 133 | global mydb 134 | mydb.append(self.copy()) 135 | 136 | # Update G function average 137 | self.numgavg += 1 138 | self.gavg += (self.calc_gfunc() - self.gavg) / self.numgavg 139 | 140 | def pnorm(loc,scale): 141 | p.figure() 142 | x = np.linspace(-20, 20, 400) 143 | p.plot(x, di.norm.pdf(x,loc=loc, scale=scale)) 144 | p.show() 145 | 146 | if __name__ == '__main__': 147 | import matplotlib.pyplot as plt 148 | import matplotlib.animation as animation 149 | 150 | import samc 151 | from samcnet.utils import * 152 | c = Classification() 153 | 154 | #p.close('all') 155 | s = MHRun(c, burn=0) 156 | #s = samc.SAMCRun(c, burn=0, stepscale=1000, refden=0) 157 | s.sample(1e3) 158 | #plotHist(s) 159 | 160 | ################################## 161 | ################################## 162 | #p.subplot(4,1,1) 163 | #p.plot(c.grid, c.gavg, 'r') 164 | #p.plot(c.data, np.ones_like(c.data), 'ko') 165 | #p.grid(True) 166 | #x = np.linspace(0.01,2,50) 167 | #p.subplot(4,1,2) 168 | #p.hist(np.vstack(mydb)[:,1],bins=x) 169 | 170 | #p.subplot(4,1,3) 171 | #mus = np.vstack(mydb)[:,0] 172 | #counts,bins,_ = p.hist(mus,bins=80) 173 | 174 | #xx = np.linspace(bins[0], bins[-1], 300) 175 | #ty = di.t.pdf(xx, *di.t.fit(mus)) 176 | #ny = di.norm.pdf(xx, *di.norm.fit(mus)) 177 | 178 | 
#p.plot(xx,ty*counts.max()/ty.max(),'g', label='t fit') 179 | #p.plot(xx,ny*counts.max()/ny.max(),'b--', label='normal fit') 180 | #p.legend() 181 | 182 | #p.subplot(4,1,4) 183 | #ys = np.vstack(mydb)[:,2] 184 | #counts,bins,_ = p.hist(ys,bins=80) 185 | 186 | #xx = np.linspace(bins[0], bins[-1], 300) 187 | #ty = di.t.pdf(xx, *di.t.fit(ys)) 188 | #ny = di.norm.pdf(xx, *di.norm.fit(ys)) 189 | #ay = c.fx.pdf(xx.reshape(-1,1)) 190 | 191 | #p.title("sampled y's") 192 | #p.plot(xx,ty*counts.max()/ty.max(),'g', label='t fit') 193 | #p.plot(xx,ny*counts.max()/ny.max(),'b--', label='normal fit') 194 | 195 | #p.plot(xx,ay*counts.max()/ay.max(),'k--', label='t analytic') 196 | 197 | #p.legend() 198 | 199 | ############################## 200 | fig1 = plt.figure() 201 | #xx = np.linspace(bins[0], bins[-1], 300) 202 | #ty = di.t.logpdf(xx, *di.t.fit(ys)) 203 | #p.plot(xx,ty,'g', label='t empirical') 204 | 205 | plt.title("predictive posteriors") 206 | plt.ylabel('logpdfs') 207 | plt.grid(True) 208 | plt.hold(True) 209 | 210 | plt.plot(c.data, np.ones_like(c.data), 'ko', label='data') 211 | plt.plot(c.grid, np.exp(c.analyticfx), 'k--', label='student t') 212 | if True: 213 | s.sample(3e3) 214 | plt.plot(c.grid, c.gavg, 'r', label='gavg') 215 | ys = np.vstack(mydb)[:,2] 216 | counts,bins,_ = p.hist(ys,bins=80,normed=True) 217 | 218 | elif False: # Animation 219 | l, = plt.plot(c.grid, c.gavg, 'r', label='gavg') 220 | def update_line(num, data, line): 221 | global c 222 | line.set_data(c.grid,data[num]) 223 | return line, 224 | data = [c.gavg.copy()] 225 | N=int(sys.argv[1]) 226 | for x in range(N): 227 | s.sample(1e3) 228 | data.append(c.gavg.copy()) 229 | 230 | line_ani = animation.FuncAnimation(fig1, update_line, N, fargs=(data, l), 231 | interval=50, blit=True) 232 | 233 | plt.show() 234 | 235 | 236 | -------------------------------------------------------------------------------- /samcnet/tcga-parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import urllib2 5 | 6 | import pandas as pa 7 | import numpy as np 8 | import simplejson as js 9 | 10 | from sklearn.feature_selection import SelectKBest, f_classif 11 | 12 | def getuuid(fname): 13 | return os.path.basename(fname).split('.')[2] 14 | 15 | def istumor(legend, fname): 16 | return int(legend[getuuid(fname)].split('-')[3][:-1]) < 10 17 | 18 | def build_legend(path, globstring): 19 | uuids = [] 20 | for fname in glob.glob(os.path.join(path,globstring)): 21 | uuids.append(os.path.basename(fname).split('.')[2]) 22 | url = 'https://tcga-data.nci.nih.gov/uuid/uuidws/mapping/json/uuid/batch' 23 | req = urllib2.Request(url=url, headers={'Content-Type': 'text/plain'},data=','.join(uuids)) 24 | data = js.loads(urllib2.urlopen(req).read()) 25 | legend = {} 26 | for d in data['uuidMapping']: 27 | legend[d['uuid']] = d['barcode'] 28 | return legend 29 | 30 | def load_df(path, dtype): 31 | if dtype == 'norm': 32 | colname = 'normalized_count' 33 | globstring = '*_results' 34 | elif dtype == 'raw': 35 | colname = 'raw_count' 36 | globstring = '*.results' 37 | else: 38 | raise Exception("Invalid data type requested") 39 | legend = build_legend(path,globstring) 40 | accum = [] 41 | for fname in glob.glob(os.path.join(path,globstring)): 42 | if istumor(legend, fname): 43 | df = pa.read_csv(fname, sep='\t', index_col=0, usecols=['gene_id',colname]) 44 | df.rename(columns={colname: getuuid(fname)}, inplace=True) 45 | accum.append(df) 46 | return pa.concat(accum, axis=1) 47 | 48 | 
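# Note on istumor() above: the fourth dash-delimited field of a TCGA
# barcode (e.g. TCGA-XX-XXXX-01A) is the sample-type code, where 01-09
# mark tumor samples and 10-19 matched normals, hence the `< 10` test.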
49 | #### WRITE #### 50 | #store = pa.HDFStore('store.h5', complib='blosc', complevel=6) 51 | 52 | #store['lusc_norm'] = load_df('tcga-lusc','norm') 53 | #store['lusc_raw'] = load_df('tcga-lusc','raw') 54 | 55 | #store['luad_norm'] = load_df('tcga-luad','norm') 56 | #store['luad_raw'] = load_df('tcga-luad','raw') 57 | 58 | #store.close() 59 | #sys.exit() 60 | #### WRITE #### 61 | 62 | 63 | #### READ #### 64 | store = pa.HDFStore('store.h5') 65 | #brca = store['brca_norm'] 66 | #paad = store['paad_norm'] 67 | luad = store['luad_norm'] 68 | lusc = store['lusc_norm'] 69 | 70 | #alldata = np.hstack(( paad_res, brca_all )).T 71 | #alllabels = np.hstack(( np.ones(paad_res.shape[1]), np.zeros(brca_all.shape[1]) )) 72 | 73 | #somedata = np.hstack(( paad_res, brca_some )).T 74 | #somelabels = np.hstack(( np.ones(paad_res.shape[1]), np.zeros(brca_some.shape[1]) )) 75 | 76 | #selector = SelectKBest(f_classif, k=4) 77 | #selector.fit(somedata,somelabels) 78 | 79 | ##td = selector.transform(somedata) 80 | #inds = selector.pvalues_.argsort() 81 | #start = 8004 + np.random.randint(0,1000) 82 | #td = alldata[:, inds[start:start+2]] 83 | 84 | #import pylab as p 85 | #p.figure() 86 | 87 | #p.plot(td[40:,0], td[40:,1], 'r.') 88 | #p.plot(td[:40,0], td[:40,1], 'g.') 89 | #p.show() 90 | #### READ #### 91 | 92 | -------------------------------------------------------------------------------- /samcnet/utils.cpp: -------------------------------------------------------------------------------- 1 | #include <string> 2 | #include <sstream> 3 | #include <iostream> 4 | #include <dai/factor.h> 5 | #include <dai/factorgraph.h> 6 | #include <dai/varset.h> 7 | 8 | using namespace std; 9 | 10 | string crepr(const dai::FactorGraph &x) { 11 | ostringstream s; 12 | s << x; 13 | return s.str(); 14 | } 15 | 16 | string crepr(const dai::Factor &x) { 17 | ostringstream s; 18 | s << x; 19 | return s.str(); 20 | } 21 | 22 | string crepr(const dai::VarSet &x) { 23 | ostringstream s; 24 | s << x; 25 | return s.str(); 26 | } 27 | -------------------------------------------------------------------------------- /samcnet/utils.py: -------------------------------------------------------------------------------- 1 | import pylab as p 2 | import os 3 | #import networkx as nx 4 | import numpy as np 5 | #import pandas as pa 6 | #import pebl as pb 7 | #import StringIO as si 8 | #import tempfile 9 | import tables as t 10 | 11 | #from probability import CPD,fast_space_iterator,JointDistribution 12 | 13 | def graph_to_joint(graph): 14 | joint = JointDistribution() 15 | cpds = [] 16 | for node in graph.nodes(): 17 | marg = graph.node[node]['marginal'] 18 | eta = graph.node[node]['eta'] 19 | delta = graph.node[node]['delta'] 20 | if np.isnan(marg): # yes parents 21 | params = {(0,):np.r_[1-eta], (1,):np.r_[delta]} 22 | pars = {graph.predecessors(node)[0]:2} 23 | else: 24 | params = {():np.r_[1-marg]} 25 | pars = {} 26 | joint.add_distribution(CPD(node,2,params,pars)) 27 | 28 | return joint 29 | 30 | def getHost(): 31 | return os.uname()[1].split('.')[0] 32 | 33 | def plot_h5(loc): 34 | fid = t.openFile(loc, 'r') 35 | samcattrs = fid.root.samc._v_attrs 36 | energy = np.linspace(samcattrs['lowEnergy'], samcattrs['highEnergy'], samcattrs['grid']) 37 | theta = fid.root.samc.theta_hist.read() 38 | counts = fid.root.samc.freq_hist.read() 39 | theta_trace = fid.root.samc.theta_trace.read() 40 | energy_trace = fid.root.samc.energy_trace.read() 41 | burn = samcattrs['burnin'] 42 | _plot_SAMC(energy, theta, counts, energy_trace, theta_trace, burn) 43 | 44 | def plotHist(s): 45 | _plot_SAMC(s.hist[0], s.hist[1], s.hist[2], 46 |
s.db.root.samc.energy_trace.read(), 47 | s.db.root.samc.theta_trace.read(), s.burn) 48 | 49 | def _plot_SAMC(energy, theta, counts, energy_trace, theta_trace, burn): 50 | rows = 3 51 | cols = 2 52 | 53 | p.figure() 54 | p.subplot(rows, cols, 1) 55 | p.plot(energy, theta, 'k.') 56 | p.title("Region's theta values") 57 | p.ylabel('Theta') 58 | p.xlabel('Energy') 59 | 60 | p.subplot(rows, cols, 2) 61 | p.plot(energy, counts, 'k.') 62 | p.title("Region's Sample Counts") 63 | p.ylabel('Count') 64 | p.xlabel('Energy') 65 | 66 | p.subplot(rows, cols, 3) 67 | p.plot(np.arange(burn, energy_trace.shape[0]+burn), energy_trace, 'k.') 68 | p.title("Energy Trace") 69 | p.ylabel('Energy') 70 | p.xlabel('Iteration') 71 | 72 | p.subplot(rows, cols, 4) 73 | p.plot(np.arange(burn, theta_trace.shape[0]+burn), theta_trace, 'k.') 74 | p.ylabel('Theta Trace') 75 | p.xlabel('Iteration') 76 | 77 | p.subplot(rows, cols, 5) 78 | part = np.exp(theta_trace - theta_trace.max()) 79 | p.hist(part, log=True, bins=100) 80 | p.xlabel('exp(theta - theta_max)') 81 | p.ylabel('Number of samples at this value') 82 | p.title('Histogram of normalized sample thetas from %d iterations' % theta_trace.shape[0]) 83 | 84 | p.subplot(rows, cols, 6) 85 | p.hist(part, weights=part, bins=50) 86 | p.xlabel('exp(theta - theta_max)') 87 | p.ylabel('Amount of weight at this value') 88 | 89 | def stan_vec(**vecs): 90 | s = '' 91 | for k,v in vecs.iteritems(): 92 | if type(v) == np.ndarray: 93 | if v.ndim == 1: 94 | s += k + ' <- c' + str(tuple(v)) + '\n' 95 | else: 96 | s += k + ' <- structure(c'+str(tuple(v.T.flatten())) + ', .Dim = c' + str(v.shape) + ')\n' 97 | else: 98 | s += k + ' <- ' + str(v) + '\n' 99 | return s 100 | 101 | if __name__ == '__main__': 102 | print(stan_vec(a=[1,2,3], b=np.arange(16).reshape(4,4), c=np.arange(10), d=4.5 )) 103 | 104 | #def plot_nodes(loc, node, parts=[0.0, 0.1, 0.2]): 105 | #filelist = os.listdir(loc) 106 | #n = len(filelist) 107 | #avgs = [np.empty(n) for i in parts] 108 | #for i,d in enumerate(filelist): 109 | #fid = t.openFile(os.path.join(loc,d), 'r') 110 | #theta_trace = fid.root.samc.theta_trace.read() 111 | #n = theta_trace.shape[0] 112 | #array = fid.getNode(node).read() 113 | #inds = theta_trace.argsort() 114 | #theta_sort = theta_trace[inds] 115 | #array_sort = array[inds] 116 | 117 | #for j,frac in enumerate(parts): 118 | #last = int(n*(1-frac)) 119 | #part = np.exp(theta_sort[:last] - theta_sort[:last].max()) 120 | #denom = part.sum() 121 | #numerator = (part * array_sort[:last]).sum() 122 | #avgs[j][i] = numerator / denom 123 | ## Plotting 124 | #rows = len(parts) 125 | #cols = 1 126 | #agg = np.hstack(avgs) 127 | #bins = np.linspace(agg.min()-0.3, agg.max()+0.3, 20) 128 | #p.figure() 129 | #for i,frac in enumerate(parts): 130 | #p.subplot(rows, cols, i+1) 131 | #p.hist(avgs[i], bins=bins) 132 | #p.title('%s at %.3f fraction' % (node, frac)) 133 | 134 | #def plot_thetas(loc, parts=[0.1, 0.2]): 135 | #fid = t.openFile(loc, 'r') 136 | #theta_trace = fid.root.samc.theta_trace.read() 137 | #_plot_thetas(theta_trace, parts) 138 | 139 | #def _plot_thetas(theta_trace, parts): 140 | #rows = len(parts) + 1 141 | #cols = 2 142 | 143 | #theta_trace.sort() 144 | #n = theta_trace.shape[0] 145 | 146 | #def plot_theta_hist(i, frac): 147 | #p.subplot(rows, cols, 2*i+1) 148 | #last = int(n*(1-frac)) 149 | #part = np.exp(theta_trace[:last] - theta_trace[:last].max()) 150 | #p.hist(part, log=True, bins=100) 151 | #p.xlabel('exp(theta - theta_max)') 152 | #p.ylabel('Number of samples at this value') 153 | 
#p.title('Normalized sample thetas at %.3f' % frac) 154 | 155 | #p.subplot(rows, cols, 2*i+2) 156 | #p.hist(part, weights=part, bins=50) 157 | #p.xlabel('exp(theta - theta_max)') 158 | #p.ylabel('Amount of weight at this value') 159 | 160 | #p.figure() 161 | #for i,part in enumerate([0.0] + parts): 162 | #plot_theta_hist(i,part) 163 | 164 | #def plotScatter(s): 165 | #energies = s.db.root.samc.energy_trace.read() 166 | #thetas = s.db.root.samc.theta_trace.read() 167 | 168 | #p.figure() 169 | #p.plot(energies, thetas, 'k.', alpha=0.7) 170 | #p.xlabel('Energy') 171 | #p.ylabel('Theta') 172 | 173 | #def drawGraph(graph, show=False): 174 | #fname = os.tempnam() 175 | #nx.write_dot(graph, fname+'.dot') 176 | #os.popen('dot -Tsvg -o %s.svg %s.dot' % (fname,fname)) 177 | #if show: 178 | #os.popen('xdg-open %s.svg > /dev/null' % fname) 179 | #return fname 180 | 181 | #def drawGraphs(*args, **kwargs): 182 | #agraphs = [nx.to_agraph(graph) for graph in args] 183 | 184 | #files = [tempfile.mkstemp(suffix='.svg') for x in agraphs] 185 | #for f in files: 186 | #os.close(f[0]) 187 | 188 | #agraphs[0].layout(prog='dot') 189 | #agraphs[0].draw(files[0][1]) 190 | #agraphs[0].remove_edges_from(agraphs[0].edges()) 191 | 192 | #for fname,g in zip(files[1:],agraphs[1:]): 193 | #agraphs[0].add_edges_from(g.edges()) 194 | #agraphs[0].draw(fname[1]) 195 | #agraphs[0].remove_edges_from(g.edges()) 196 | 197 | #combo = tempfile.mkstemp(suffix='.png') 198 | #os.close(combo[0]) 199 | #os.popen('convert %s +append -quality 75 %s' % (' '.join(zip(*files)[1]), combo[1])) 200 | #if 'show' in kwargs and not kwargs['show']: 201 | #pass 202 | #else: 203 | #os.popen('xdg-open %s > /dev/null' % combo[1]) 204 | 205 | #for f in files: 206 | #os.unlink(f[1]) 207 | 208 | #def best_to_graph(mapvalue): 209 | #mat = mapvalue[0] 210 | #x = mapvalue[1] 211 | #s = x.argsort() 212 | #mat = mat[s].T[s].T 213 | #np.fill_diagonal(mat, 0) 214 | #return nx.from_numpy_matrix(mat) 215 | 216 | #def to_pebl(states, data): 217 | #header = ['%d,discrete(%d)' %(i,a) for i,a in enumerate(states)] 218 | #df = pa.DataFrame(data, columns=header) 219 | #x = si.StringIO() 220 | #x.write('\t'.join(header) + '\n') 221 | #df.to_csv(x, header=False, index=True, sep='\t') 222 | #x.seek(0) 223 | #return pb.data.fromstring(x.read()) 224 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binarybana/samcnet/84f3ba8241d416115a8aa9ba5c659a9513175072/tests/__init__.py -------------------------------------------------------------------------------- /tests/all_poisson.py: -------------------------------------------------------------------------------- 1 | 2 | from samcnet.mixturepoisson import * 3 | import numpy as np 4 | import samcnet.mh as mh 5 | from samcnet.lori import * 6 | import scipy.stats as st 7 | import scipy.stats.distributions as di 8 | import scipy 9 | 10 | from sklearn.lda import LDA 11 | from sklearn.svm import SVC 12 | from sklearn.neighbors import KNeighborsClassifier as KNN 13 | 14 | np.seterr(all='ignore') # Careful with this 15 | 16 | ######## Generate Data ######## 17 | def gen_data(mu, cov, n): 18 | lams = MVNormal(mu, cov).rvs(n) 19 | ps = np.empty_like(lams) 20 | for i in xrange(lams.shape[0]): 21 | for j in xrange(lams.shape[1]): 22 | ps[i,j] = di.poisson.rvs(10* np.exp(lams[i,j])) 23 | return ps 24 | 25 | mu0 = np.zeros(2) #- 0.5 26 | mu1 = np.zeros(2) #+ 0.5 27 | rho0 = -0.4 
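# Each class c draws latent rates lam ~ MVN(mu_c, cov_c) and then counts
# ~ Poisson(10*exp(lam)); with identical means, the opposite-signed latent
# correlations rho0/rho1 are what separate the two classes.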
28 | rho1 = 0.4 29 | cov0 = np.array([[1, rho0],[rho0, 1]]) 30 | cov1 = np.array([[1, rho1],[rho1, 1]]) 31 | 32 | rseed = np.random.randint(10**6) 33 | dseed = 1 34 | #dseed = np.random.randint(1000) 35 | 36 | print("rseed: %d" % rseed) 37 | print("dseed: %d" % dseed) 38 | np.random.seed(dseed) 39 | ps0 = gen_data(mu0,cov0,30) 40 | ps1 = gen_data(mu1,cov1,30) 41 | superps0 = gen_data(mu0,cov0,3000) 42 | superps1 = gen_data(mu1,cov1,3000) 43 | np.random.seed(rseed) 44 | ps = np.vstack(( ps0, ps1 )) 45 | superps = np.vstack(( superps0, superps1 )) 46 | 47 | n,gext,grid = get_grid_data(ps, positive=True) 48 | ######## /Generate Data ######## 49 | 50 | ########## Comparison ############# 51 | p.close('all') 52 | # Run Yousef/Jianping RNA Synthetic 53 | currdir = path.abspath('.') 54 | synloc = path.expanduser('~/GSP/research/samc/synthetic/rnaseq') 55 | 56 | write_data(ps0, ps1, path.join(synloc, 'out', 'trn.txt')) 57 | write_data(superps0, superps1, path.join(synloc, 'out', 'tst.txt')) 58 | 59 | try: 60 | os.chdir(synloc) 61 | #sb.check_call(path.join(synloc, 62 | #'gen -i params/easyparams -sr 0.05 -lr 9 -hr 10').split()) 63 | sb.check_call(path.join(synloc, 64 | 'cls -t out/trn.txt -s out/tst.txt').split()) 65 | finally: 66 | os.chdir(currdir) 67 | # Grab some info from the run 68 | data = np.loadtxt(path.join(synloc,'out','out')) 69 | lda,knn,svm,num_feats = data[0:4] 70 | print("LDA error: %f" % lda) 71 | print("KNN error: %f" % knn) 72 | print("SVM error: %f" % svm) 73 | feat_inds = data[4:].astype(int) 74 | 75 | rawdata = np.loadtxt(path.join(synloc, 'out','trn.txt'), 76 | delimiter=',', skiprows=1) 77 | data = rawdata[:,feat_inds] 78 | Ntrn = data.shape[0] 79 | data0 = data[:Ntrn/2,:] 80 | data1 = data[Ntrn/2:,:] 81 | norm_data = (data - data.mean(axis=0)) / np.sqrt(data.var(axis=0,ddof=1)) 82 | norm_data0 = norm_data[:Ntrn/2,:] 83 | norm_data1 = norm_data[Ntrn/2:,:] 84 | rawtest = np.loadtxt(path.join(synloc, 'out','tst.txt'), 85 | delimiter=',', skiprows=1) 86 | test = rawtest[:,feat_inds] 87 | norm_test = (test - test.mean(axis=0)) / np.sqrt(test.var(axis=0,ddof=1)) 88 | N = test.shape[0] 89 | D = data.shape[1] 90 | #sys.exit() 91 | 92 | trn_labels = np.hstack(( np.zeros(Ntrn/2), np.ones(Ntrn/2) )) 93 | tst_labels = np.hstack(( np.zeros(N/2), np.ones(N/2) )) 94 | sklda = LDA() 95 | skknn = KNN(3, warn_on_equidistant=False) 96 | sksvm = SVC() 97 | sklda.fit(norm_data, trn_labels) 98 | skknn.fit(norm_data, trn_labels) 99 | sksvm.fit(norm_data, trn_labels) 100 | print("skLDA error: %f" % (1-sklda.score(norm_test, tst_labels))) 101 | print("skKNN error: %f" % (1-skknn.score(norm_test, tst_labels))) 102 | print("skSVM error: %f" % (1-sksvm.score(norm_test, tst_labels))) 103 | 104 | labels = np.hstack((np.zeros(N/2), np.ones(N/2))) 105 | n,gext,grid = get_grid_data(np.vstack(( norm_data0, norm_data1 ))) 106 | 107 | bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data0) 108 | bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data1) 109 | 110 | # Gaussian Analytic 111 | gc = GaussianCls(bayes0, bayes1) 112 | print("Gaussian Analytic error: %f" % gc.approx_error_data(norm_test, labels)) 113 | gavg = gc.calc_gavg(grid).reshape(-1,n) 114 | myplot(p.subplot(2,3,1),gavg,norm_data0, norm_data1) 115 | 116 | #Gaussian Sampler 117 | #c = GaussianSampler(bayes0,bayes1,norm_data0,norm_data1) 118 | #s1 = samc.SAMCRun(c, burn=0, stepscale=1000, refden=1, thin=10, lim_iters=200) 119 | #s1.sample(1e3, temperature=1) 120 | #print("Gaussian Sampler error: %f" % 
c.approx_error_data(s1.db, norm_test, labels)) 121 | #gavg = c.calc_gavg(s1.db, grid, 50).reshape(-1,n) 122 | #myplot(p.subplot(2,3,2),gavg) 123 | 124 | # MPM Model 125 | n,gext,grid = get_grid_data(np.vstack(( data0, data1 )), positive=True) 126 | 127 | dist0 = MPMDist(data0,kmax=1) 128 | dist1 = MPMDist(data1,kmax=1) 129 | mpm = MPMCls(dist0, dist1) 130 | #s2 = samc.SAMCRun(mpm, burn=0, stepscale=1000, refden=1, thin=10, 131 | #lim_iters=100, low_margin=0.2, high_margin=-0.5) 132 | #s2.sample(2e5, temperature=2) 133 | mh = mh.MHRun(mpm, burn=100, thin=20) 134 | mh.sample(3e3,verbose=False) 135 | print("MPM Sampler error: %f" % mpm.approx_error_data(mh.db, test, labels)) 136 | 137 | gavg = mpm.calc_gavg(mh.db, grid, numlam=100).reshape(-1,n) 138 | #g = mpm.calc_curr_g(grid).reshape(-1,n) 139 | ga1 = mpm.dist0.calc_db_g(mh.db, mh.db.root.object.dist0, grid).reshape(-1,n) 140 | ga2 = mpm.dist1.calc_db_g(mh.db, mh.db.root.object.dist1, grid).reshape(-1,n) 141 | 142 | myplot(p.subplot(2,3,2),gavg,data0,data1) 143 | myplot(p.subplot(2,3,3),ga1,data0,data1) 144 | myplot(p.subplot(2,3,4),ga2,data0,data1) 145 | myplot(p.subplot(2,3,5),gavg,test[:500,:],test[500:,:]) 146 | p.subplot(2,3,6) 147 | p.plot(test[500:,0], test[500:,1],'m.',alpha=0.5) 148 | p.plot(test[:500,0], test[:500,1],'g.',alpha=0.5) 149 | -------------------------------------------------------------------------------- /tests/cov_poisson.py: -------------------------------------------------------------------------------- 1 | from samcnet.mixturepoisson import * 2 | import numpy as np 3 | import pylab as p 4 | import tables as t 5 | import samcnet.samc as samc 6 | import samcnet.mh as mh 7 | 8 | from samcnet.lori import * 9 | from math import exp,log 10 | 11 | import scipy.stats as st 12 | import scipy.stats.distributions as di 13 | import scipy 14 | import subprocess as sb 15 | import os 16 | import os.path as path 17 | from scipy.special import gammaln 18 | from time import time 19 | 20 | p.close('all') 21 | 22 | def log_poisson(k,lam): 23 | return log(lam) * k - gammaln(k+1) - lam 24 | 25 | ######## PARAMS ######## 26 | numlam = 10 27 | kappa = 5 28 | priorkappa = 80 29 | S = np.eye(2) * (kappa-2-1) * 0.1 30 | #S = np.array([[1,-.9],[-.9,1]]) * kappa 31 | prior_mu = np.zeros(2) + 0 32 | prior_sigma = np.zeros(2) + 10 33 | ######## /PARAMS ######## 34 | 35 | ######## Generate Data ######## 36 | def gen_data(mu, cov, n): 37 | lams = MVNormal(mu, cov).rvs(n) 38 | ps = np.empty_like(lams) 39 | for i in xrange(lams.shape[0]): 40 | for j in xrange(lams.shape[1]): 41 | ps[i,j] = di.poisson.rvs(10 * np.exp(lams[i,j])) 42 | return ps 43 | 44 | rho = -0.0 45 | cov = np.array([[1, rho],[rho, 1]]) * 0.01 46 | mu1 = np.array([log(2), log(4)]) 47 | mu2 = np.array([log(4), log(2)]) 48 | mu3 = np.array([log(5), log(5)]) 49 | 50 | rseed = np.random.randint(1000) 51 | #rseed = 875 52 | dseed = 36 53 | #dseed = np.random.randint(1000) 54 | 55 | print("rseed: %d" % rseed) 56 | print("dseed: %d" % dseed) 57 | np.random.seed(dseed) 58 | ps = np.vstack(( gen_data(mu1,cov,10), gen_data(mu2,cov,10), gen_data(mu3,cov,10) )) 59 | superps = np.vstack(( gen_data(mu1,cov,1000), gen_data(mu2,cov,1000) )) 60 | np.random.seed(rseed) 61 | 62 | n,gext,grid = get_grid_data(ps, positive=True) 63 | #p.plot(superps[:,0], superps[:,1], 'k.', alpha=0.1) 64 | #p.show() 65 | #sys.exit() 66 | ######## /Generate Data ######## 67 | 68 | ######## MH Samples ######## 69 | #startmu = np.array([[log(8),log(8)],[log(2),log(2)],[log(2),log(2)]]).T 70 | startmu = 
np.array([[log(2),log(4)],[log(4),log(2)],[log(5),log(5)]]).T 71 | #startmu = np.array([[log(2),log(4)],[log(4),log(2)]]).T 72 | #startmu = np.array([[log(3),log(3)],[log(3),log(3)]]).T 73 | #startmu = np.array([[log(3),log(3)]]).T 74 | dist = MPMDist(ps,kappa=kappa,S=S,priormu=prior_mu,priorsigma=prior_sigma, 75 | priorkappa=priorkappa,kmax=3, mumove=0.2, lammove=0.0, 76 | startk=3,startmu=startmu,wmove=0.2,birthmove=0.5) 77 | print("Initial energy: %f" % dist.energy()) 78 | #mymc = mh.MHRun(dist, burn=0, thin=50) 79 | mymc = samc.SAMCRun(dist, burn=0, thin=100, stepscale=1000, refden=2.0, low_margin=0.1, high_margin=-0.2) 80 | iters = 1e4 81 | t1=time() 82 | mymc.sample(iters,verbose=False) 83 | print "%d SAMC iters took %f seconds" % (iters, time()-t1) 84 | 85 | t1=time() 86 | gavg = dist.calc_db_g(mymc.db, mymc.db.root.object, grid, numlam=200, partial=10).reshape(-1,n) 87 | #gavg = dist.calc_db_g(mymc.db, mymc.db.root.object, grid, numlam=numlam).reshape(-1,n) 88 | print "Generating gavg using numlam %d took %f seconds" % (numlam, time()-t1) 89 | #gavg = dist.calc_curr_g(grid, numlam=3).reshape(-1,n) 90 | 91 | p.subplot(2,1,1) 92 | p.imshow(gavg, extent=gext, aspect=1, origin='lower') 93 | p.colorbar() 94 | p.plot(ps[:,0], ps[:,1], 'k.') 95 | 96 | p.subplot(2,1,2) 97 | p.imshow(gavg, extent=gext, aspect=1, origin='lower') 98 | p.colorbar() 99 | p.plot(superps[:,0], superps[:,1], 'k.', alpha=0.1) 100 | 101 | dist.plot_traces(mymc.db, mymc.db.root.object, names=('w','k','mu','lam','sigma')) 102 | from samcnet.utils import * 103 | plotHist(mymc) 104 | 105 | p.show() 106 | -------------------------------------------------------------------------------- /tests/dai_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('./build') 3 | sys.path.append('../lib') 4 | from samcnet.probability import * 5 | from samcnet.pydai import PyVar, PyVarSet, PyFactor, PyJTree 6 | 7 | u = PyVar(0,2) 8 | v = PyVar(1,2) 9 | 10 | x = PyVarSet(u,v) 11 | y = PyVarSet(u) 12 | 13 | print len(x) 14 | print len(y) 15 | 16 | f1 = PyFactor(x) 17 | f2 = PyFactor(u) 18 | 19 | f1.set(0,0.01) 20 | 21 | print f1 22 | f1.normalize() 23 | print f1 24 | 25 | jt = PyJTree(f1) 26 | 27 | print jt 28 | 29 | ######################################### 30 | vars = [] 31 | facs = [] 32 | for i in range(50): 33 | vars.append(PyVar(i,2)) 34 | facs.append(PyFactor(vars[i])) 35 | 36 | #vs1 = PyVarSet(*vars[:3]) 37 | #fac1 = PyFactor(vs) 38 | print facs[0].get(1) 39 | jt = PyJTree(*facs) 40 | 41 | jt.init() 42 | jt.run() 43 | print jt.iterations() 44 | 45 | print jt.marginal(PyVarSet(vars[0], vars[1])) 46 | 47 | node1 = CPD(0, 2, {0:0.25, 1:0.9}, {1:2}) 48 | node2 = CPD(1, 2, {(): 0.25}) 49 | j = node1*node2 50 | 51 | g = GroundNet(j) 52 | 53 | g.kld(j) 54 | -------------------------------------------------------------------------------- /tests/ex_data_0.csv: -------------------------------------------------------------------------------- 1 | 6,2,1 2 | 4,2,0 3 | 5,1,1 4 | 7,2,2 5 | -------------------------------------------------------------------------------- /tests/ex_data_1.csv: -------------------------------------------------------------------------------- 1 | 0,3,0 2 | 1,3,1 3 | 2,1,2 4 | 1,2,1 5 | -------------------------------------------------------------------------------- /tests/ex_data_predict.csv: -------------------------------------------------------------------------------- 1 | 0,3,1 2 | 1,3,2 3 | 7,1,1 4 | 
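(The three ex_data_*.csv files above are the toy inputs for tests/example.py just below: each row is one sample of three count features, ex_data_0.csv and ex_data_1.csv hold the class-0 and class-1 training points, and ex_data_predict.csv holds the unlabeled points to classify. They are read with header=None, so no header row is expected.)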
-------------------------------------------------------------------------------- /tests/example.py: -------------------------------------------------------------------------------- 1 | import pandas as pa 2 | 3 | import samcnet.mh as mh 4 | from samcnet.mixturepoisson import * 5 | 6 | trn_data0 = pa.read_csv('tests/ex_data_0.csv', header=None) 7 | trn_data1 = pa.read_csv('tests/ex_data_1.csv', header=None) 8 | predict_samples = pa.read_csv('tests/ex_data_predict.csv', header=None) 9 | 10 | dist0 = MPMDist(trn_data0) 11 | dist1 = MPMDist(trn_data1) 12 | mpm = MPMCls(dist0, dist1) 13 | mh = mh.MHRun(mpm, burn=1000, thin=50, verbose=True) 14 | mh.sample(1e4) 15 | 16 | print(mpm.predict(mh.db, predict_samples)) 17 | mh.db.close() 18 | -------------------------------------------------------------------------------- /tests/mpm_yousef.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tempfile 4 | import yaml 5 | import zlib 6 | import numpy as np 7 | import simplejson as js 8 | import subprocess as sb 9 | from time import time,sleep 10 | from os import path 11 | 12 | try: 13 | from sklearn.lda import LDA 14 | from sklearn.svm import SVC 15 | from sklearn.neighbors import KNeighborsClassifier as KNN 16 | from sklearn.feature_selection import SelectKBest, f_classif 17 | 18 | import samcnet.mh as mh 19 | from samcnet.mixturepoisson import * 20 | from samcnet.lori import * 21 | except ImportError as e: 22 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 23 | " directory is populated by waf.\n\n %s" % str(e)) 24 | 25 | if 'WORKHASH' in os.environ: 26 | try: 27 | server = os.environ['SERVER'] 28 | except: 29 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 30 | 31 | if 'PARAM' in os.environ: 32 | params = yaml.load(os.environ['PARAM']) 33 | #samples = int(params['samples']) 34 | #assert samples < 50, "Need to modify mcmcparams" 35 | 36 | output = {} 37 | output['errors'] = {} 38 | errors = output['errors'] 39 | np.seterr(all='ignore') # Careful with this 40 | 41 | # Run Yousef/Jianping RNA Synthetic 42 | currdir = path.abspath('.') 43 | synloc = path.expanduser('~/GSP/research/samc/synthetic/rnaseq') 44 | 45 | params=""".d NoOfTrainSamples0 20 46 | .d NoOfTrainSamples1 20 47 | .d NoOfTestSamples0 300 48 | .d NoOfTestSamples1 300 49 | .d TotalFeatures 104 50 | .d GlobalFeatures 0 51 | .d HeteroSubTypes 2 52 | .d HeteroFeaturesPerSubType 2 53 | .d RandomFeatures 100 54 | .d CorrBlockSize 2 55 | .d CorrType 1 56 | .f Rho 0.5 57 | .d ScrambleFlag 0 58 | .f Mu_0 0.000000 59 | .f Mu_1 1.000000 60 | .f Sigma_0 0.200000 61 | .f Sigma_1 0.600000 62 | """ 63 | 64 | seed = np.random.randint(10**10) 65 | try: 66 | os.chdir(synloc) 67 | fid,fname = tempfile.mkstemp(dir='params') 68 | fname = path.basename(fname) 69 | fid = os.fdopen(fid,'w') 70 | fid.write(params) 71 | fid.close() 72 | inspec = 'gen -i params/%s -sr 0.05 -lr 9 -hr 11 -seed %d' % \ 73 | (fname, seed) 74 | spec = path.join(synloc, inspec).split() 75 | sb.check_call(spec) 76 | finally: 77 | os.chdir(currdir) 78 | 79 | sleep(3) 80 | 81 | rawdata = np.loadtxt(path.join(synloc, 'out','%s_trn.txt'%fname), 82 | delimiter=',', skiprows=1) 83 | 84 | Ntrn = rawdata.shape[0] 85 | trn_labels = np.hstack(( np.zeros(Ntrn/2), np.ones(Ntrn/2) )) 86 | 87 | selector = SelectKBest(f_classif, k=2) 88 | selector.fit(rawdata, trn_labels) 89 | trn_data = selector.transform(rawdata) 90 | D = trn_data.shape[1] 91 | 92 | trn_data0 = 
trn_data[:Ntrn/2,:] 93 | trn_data1 = trn_data[Ntrn/2:,:] 94 | 95 | norm_trn_data = (trn_data - trn_data.mean(axis=0)) / np.sqrt(trn_data.var(axis=0,ddof=1)) 96 | norm_trn_data0 = norm_trn_data[:Ntrn/2,:] 97 | norm_trn_data1 = norm_trn_data[Ntrn/2:,:] 98 | 99 | raw_tst_data = np.loadtxt(path.join(synloc, 'out','%s_tst.txt'%fname), 100 | delimiter=',', skiprows=1) 101 | tst_data = selector.transform(raw_tst_data) 102 | N = tst_data.shape[0] 103 | tst_data0 = tst_data[:N/2,:] 104 | tst_data1 = tst_data[N/2:,:] 105 | 106 | norm_tst_data = (tst_data - tst_data.mean(axis=0)) / np.sqrt(tst_data.var(axis=0,ddof=1)) 107 | norm_tst_data0 = norm_tst_data[:N/2,:] 108 | norm_tst_data1 = norm_tst_data[N/2:,:] 109 | 110 | tst_labels = np.hstack(( np.zeros(N/2), np.ones(N/2) )) 111 | 112 | sklda = LDA() 113 | skknn = KNN(3, warn_on_equidistant=False) 114 | sksvm = SVC() 115 | sklda.fit(norm_trn_data, trn_labels) 116 | skknn.fit(norm_trn_data, trn_labels) 117 | sksvm.fit(norm_trn_data, trn_labels) 118 | errors['lda'] = (1-sklda.score(norm_tst_data, tst_labels)) 119 | errors['knn'] = (1-skknn.score(norm_tst_data, tst_labels)) 120 | errors['svm'] = (1-sksvm.score(norm_tst_data, tst_labels)) 121 | print("skLDA error: %f" % errors['lda']) 122 | print("skKNN error: %f" % errors['knn']) 123 | print("skSVM error: %f" % errors['svm']) 124 | 125 | labels = np.hstack((np.zeros(N/2), np.ones(N/2))) 126 | bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_trn_data0) 127 | bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_trn_data1) 128 | 129 | # Gaussian Analytic 130 | gc = GaussianCls(bayes0, bayes1) 131 | errors['gauss'] = gc.approx_error_data(norm_tst_data, labels) 132 | print("Gaussian Analytic error: %f" % errors['gauss']) 133 | 134 | # MPM Model 135 | numlam = 100 136 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=80,lammove=0.02,mumove=0.1) 137 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=80,lammove=0.02,mumove=0.1) 138 | mpm = MPMCls(dist0, dist1) 139 | mh = mh.MHRun(mpm, burn=100, thin=20) 140 | mh.sample(2e3,verbose=False) 141 | errors['mpm'] = mpm.approx_error_data(mh.db, tst_data, labels,numlam=numlam) 142 | print("MPM Sampler error: %f" % errors['mpm']) 143 | 144 | output['acceptance'] = float(mh.accept_loc)/mh.total_loc 145 | output['seed'] = seed 146 | 147 | p.figure() 148 | def myplot(ax,g,data0,data1,gext): 149 | ax.plot(data0[:,0], data0[:,1], 'g.',label='0', alpha=0.3) 150 | ax.plot(data1[:,0], data1[:,1], 'r.',label='1', alpha=0.3) 151 | ax.legend(fontsize=8, loc='best') 152 | 153 | im = ax.imshow(g, extent=gext, aspect='equal', origin='lower') 154 | p.colorbar(im,ax=ax) 155 | ax.contour(g, [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 156 | 157 | n,gext,grid = get_grid_data(np.vstack(( trn_data0, trn_data1 )), positive=True) 158 | gavg = mpm.calc_gavg(mh.db, grid, numlam=numlam).reshape(-1,n) 159 | myplot(p.subplot(3,1,1),gavg,tst_data0,tst_data1,gext) 160 | #myplot(p.subplot(3,1,1),gavg,trn_data0,trn_data1,gext) 161 | 162 | n,gext,grid = get_grid_data(np.vstack(( norm_trn_data0, norm_trn_data1 )), positive=False) 163 | myplot(p.subplot(3,1,2),sksvm.decision_function(grid).reshape(-1,n),norm_tst_data0,norm_tst_data1,gext) 164 | #myplot(p.subplot(3,1,2),sksvm.decision_function(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 165 | p.subplot(3,1,3) 166 | def jit(x): 167 | return x+np.random.randn(*x.shape)/4.0 168 | p.plot(jit(tst_data0[:,0]), jit(tst_data0[:,1]), 'g.', alpha=0.5) 169 | p.plot(jit(tst_data1[:,0]), jit(tst_data1[:,1]), 'r.', alpha=0.5) 170 
| 171 | p.show() 172 | 173 | if 'WORKHASH' in os.environ: 174 | import zmq,time 175 | ctx = zmq.Context() 176 | socket = ctx.socket(zmq.REQ) 177 | socket.connect('tcp://'+server+':7000') 178 | 179 | wiredata = zlib.compress(js.dumps(output)) 180 | #wiredata = s.read_db() 181 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 182 | socket.send(wiredata) 183 | socket.recv() 184 | socket.close() 185 | ctx.term() 186 | 187 | mh.db.close() 188 | -------------------------------------------------------------------------------- /tests/poisson.py: -------------------------------------------------------------------------------- 1 | from samcnet.mixturepoisson import * 2 | import numpy as np 3 | import pylab as p 4 | import tables as t 5 | import samcnet.samc as samc 6 | import samcnet.mh as mh 7 | 8 | from samcnet.lori import * 9 | from math import exp,log 10 | 11 | import scipy.stats as st 12 | import scipy.stats.distributions as di 13 | import scipy 14 | #import nlopt 15 | import subprocess as sb 16 | import os 17 | import os.path as path 18 | 19 | from sklearn.lda import LDA 20 | from sklearn.svm import SVC 21 | from sklearn.neighbors import KNeighborsClassifier as KNN 22 | 23 | np.seterr(all='ignore') # Careful with this 24 | 25 | seedr = np.random.randint(10**6) 26 | seedd = 40767 27 | #seed = 32 28 | print "Seed is %d" % seedr 29 | #np.random.seed(seedd) 30 | np.random.seed(seedr) 31 | 32 | def myplot(ax,g,data0,data1): 33 | ax.plot(data0[:,0], data0[:,1], 'g.',label='0', alpha=0.3) 34 | ax.plot(data1[:,0], data1[:,1], 'r.',label='1', alpha=0.3) 35 | ax.legend(fontsize=8, loc='best') 36 | 37 | im = ax.imshow(g, extent=gext, aspect='equal', origin='lower') 38 | p.colorbar(im,ax=ax) 39 | ax.contour(g, [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 40 | 41 | ########## Profiling ############# 42 | #N = 20 43 | #data0 = np.vstack(( 44 | #np.hstack(( di.poisson.rvs(10*exp(1), size=(N,1)), di.poisson.rvs(10*exp(2), size=(N,1)) )), 45 | #np.hstack(( di.poisson.rvs(10*exp(2.2), size=(N,1)), di.poisson.rvs(10*exp(2), size=(N,1)) )) )) 46 | #data1 = np.hstack(( di.poisson.rvs(10*exp(2), size=(N,1)), di.poisson.rvs(10*exp(1), size=(N,1)) )) 47 | #dist0 = MPMDist(data0) 48 | #dist1 = MPMDist(data1) 49 | #mpm = MPMCls(dist0, dist1) # TODO allow params input here (or maybe per class) 50 | #s2 = samc.SAMCRun(mpm, burn=0, stepscale=100, refden=1, thin=10, lim_iters=200) 51 | #s2.sample(1e4, temperature=20) 52 | #import pstats, cProfile 53 | #cProfile.runctx("samc.SAMCRun(mpm, burn=0, stepscale=1000, thin=10)", globals(), locals(), "prof.prof") 54 | #cProfile.runctx("[mpm.energy() for i in xrange(10000)]", globals(), locals(), "prof.prof") 55 | #cProfile.runctx("mpm.propose()", globals(), locals(), "prof.prof") 56 | 57 | #s = pstats.Stats("prof.prof") 58 | #s.strip_dirs().sort_stats("time").print_stats() 59 | #s.strip_dirs().sort_stats("cumtime").print_stats() 60 | 61 | #sys.exit() 62 | ########## /Profiling ############# 63 | 64 | ######## Generate Data ######## 65 | def gen_data(mu, cov, n): 66 | lams = MVNormal(mu, cov).rvs(n) 67 | ps = np.empty_like(lams) 68 | for i in xrange(lams.shape[0]): 69 | for j in xrange(lams.shape[1]): 70 | ps[i,j] = di.poisson.rvs(10* np.exp(lams[i,j])) 71 | return ps 72 | 73 | def write_data(data0, data1, loc): 74 | assert data0.shape[0] == data1.shape[0] 75 | data = np.vstack(( data0, data1 )) 76 | np.savetxt(loc, data.astype(np.int), header="%d %d %d" % (data.shape+(2,)), 77 | delimiter=',', comments='', fmt='%d') 78 | 79 | mu0 = np.zeros(2) #- 0.5 80 | mu1 = 
np.zeros(2) #+ 0.5 81 | rho0 = -0.4 82 | rho1 = 0.4 83 | cov0 = np.array([[1, rho0],[rho0, 1]]) 84 | cov1 = np.array([[1, rho1],[rho1, 1]]) 85 | 86 | rseed = np.random.randint(10000) 87 | dseed = 10000 88 | #dseed = np.random.randint(1000) 89 | 90 | print("rseed: %d" % rseed) 91 | print("dseed: %d" % dseed) 92 | np.random.seed(dseed) 93 | ps0 = gen_data(mu0,cov0,30) 94 | ps1 = gen_data(mu1,cov1,30) 95 | superps0 = gen_data(mu0,cov0,3000) 96 | superps1 = gen_data(mu1,cov1,3000) 97 | np.random.seed(rseed) 98 | ps = np.vstack(( ps0, ps1 )) 99 | superps = np.vstack(( superps0, superps1 )) 100 | 101 | n,gext,grid = get_grid_data(ps, positive=True) 102 | ######## /Generate Data ######## 103 | 104 | ########## Comparison ############# 105 | p.close('all') 106 | # Run Yousef/Jianping RNA Synthetic 107 | currdir = path.abspath('.') 108 | synloc = path.expanduser('~/GSP/research/samc/synthetic/rnaseq') 109 | 110 | write_data(ps0, ps1, path.join(synloc, 'out', 'trn.txt')) 111 | write_data(superps0, superps1, path.join(synloc, 'out', 'tst.txt')) 112 | 113 | try: 114 | os.chdir(synloc) 115 | #sb.check_call(path.join(synloc, 116 | #'gen -i params/easyparams -sr 0.05 -lr 9 -hr 10').split()) 117 | sb.check_call(path.join(synloc, 118 | 'cls -t out/trn.txt -s out/tst.txt').split()) 119 | finally: 120 | os.chdir(currdir) 121 | # Grab some info from the run 122 | data = np.loadtxt(path.join(synloc,'out','out')) 123 | lda,knn,svm,num_feats = data[0:4] 124 | print("LDA error: %f" % lda) 125 | print("KNN error: %f" % knn) 126 | print("SVM error: %f" % svm) 127 | feat_inds = data[4:].astype(int) 128 | 129 | rawdata = np.loadtxt(path.join(synloc, 'out','trn.txt'), 130 | delimiter=',', skiprows=1) 131 | data = rawdata[:,feat_inds] 132 | Ntrn = data.shape[0] 133 | data0 = data[:Ntrn/2,:] 134 | data1 = data[Ntrn/2:,:] 135 | norm_data = (data - data.mean(axis=0)) / np.sqrt(data.var(axis=0,ddof=1)) 136 | norm_data0 = norm_data[:Ntrn/2,:] 137 | norm_data1 = norm_data[Ntrn/2:,:] 138 | rawtest = np.loadtxt(path.join(synloc, 'out','tst.txt'), 139 | delimiter=',', skiprows=1) 140 | test = rawtest[:,feat_inds] 141 | norm_test = (test - test.mean(axis=0)) / np.sqrt(test.var(axis=0,ddof=1)) 142 | N = test.shape[0] 143 | D = data.shape[1] 144 | #sys.exit() 145 | 146 | trn_labels = np.hstack(( np.zeros(Ntrn/2), np.ones(Ntrn/2) )) 147 | tst_labels = np.hstack(( np.zeros(N/2), np.ones(N/2) )) 148 | sklda = LDA() 149 | skknn = KNN(3, warn_on_equidistant=False) 150 | sksvm = SVC() 151 | sklda.fit(norm_data, trn_labels) 152 | skknn.fit(norm_data, trn_labels) 153 | sksvm.fit(norm_data, trn_labels) 154 | print("skLDA error: %f" % (1-sklda.score(norm_test, tst_labels))) 155 | print("skKNN error: %f" % (1-skknn.score(norm_test, tst_labels))) 156 | print("skSVM error: %f" % (1-sksvm.score(norm_test, tst_labels))) 157 | 158 | labels = np.hstack((np.zeros(N/2), np.ones(N/2))) 159 | n,gext,grid = get_grid_data(np.vstack(( norm_data0, norm_data1 ))) 160 | 161 | bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data0) 162 | bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_data1) 163 | 164 | # Gaussian Analytic 165 | gc = GaussianCls(bayes0, bayes1) 166 | print("Gaussian Analytic error: %f" % gc.approx_error_data(norm_test, labels)) 167 | gavg = gc.calc_gavg(grid).reshape(-1,n) 168 | myplot(p.subplot(2,3,1),gavg,norm_data0, norm_data1) 169 | 170 | #Gaussian Sampler 171 | #c = GaussianSampler(bayes0,bayes1,norm_data0,norm_data1) 172 | #s1 = samc.SAMCRun(c, burn=0, stepscale=1000, refden=1, thin=10, lim_iters=200) 173 | 
#s1.sample(1e3, temperature=1) 174 | #print("Gaussian Sampler error: %f" % c.approx_error_data(s1.db, norm_test, labels)) 175 | #gavg = c.calc_gavg(s1.db, grid, 50).reshape(-1,n) 176 | #myplot(p.subplot(2,3,2),gavg) 177 | 178 | # MPM Model 179 | n,gext,grid = get_grid_data(np.vstack(( data0, data1 )), positive=True) 180 | 181 | dist0 = MPMDist(data0,kmax=1) 182 | dist1 = MPMDist(data1,kmax=1) 183 | mpm = MPMCls(dist0, dist1) 184 | #s2 = samc.SAMCRun(mpm, burn=0, stepscale=1000, refden=1, thin=10, 185 | #lim_iters=100, low_margin=0.2, high_margin=-0.5) 186 | #s2.sample(2e5, temperature=2) 187 | mh = mh.MHRun(mpm, burn=100, thin=20) 188 | mh.sample(5e3,verbose=False) 189 | print("MPM Sampler error: %f" % mpm.approx_error_data(mh.db, test, labels)) 190 | 191 | numlam = 200 192 | 193 | gavg = mpm.calc_gavg(mh.db, grid, numlam=numlam).reshape(-1,n) 194 | #g = mpm.calc_curr_g(grid).reshape(-1,n) 195 | ga1 = mpm.dist0.calc_db_g(mh.db, mh.db.root.object.dist0, grid, numlam=numlam).reshape(-1,n) 196 | ga2 = mpm.dist1.calc_db_g(mh.db, mh.db.root.object.dist1, grid, numlam=numlam).reshape(-1,n) 197 | 198 | myplot(p.subplot(2,3,2),gavg,data0,data1) 199 | myplot(p.subplot(2,3,3),ga1,data0,data1) 200 | myplot(p.subplot(2,3,4),ga2,data0,data1) 201 | myplot(p.subplot(2,3,5),gavg,test[:500,:],test[500:,:]) 202 | p.subplot(2,3,6) 203 | p.plot(test[500:,0], test[500:,1],'m.',alpha=0.5) 204 | p.plot(test[:500,0], test[:500,1],'g.',alpha=0.5) 205 | 206 | p.show() 207 | sys.exit() 208 | ########## /Comparison ############# 209 | 210 | ########## SAMC ############# 211 | #p.close('all') 212 | #N = 20 213 | #data0 = np.vstack(( 214 | #np.hstack(( di.poisson.rvs(10*exp(1), size=(N,1)), di.poisson.rvs(10*exp(2), size=(N,1)) )), 215 | #np.hstack(( di.poisson.rvs(10*exp(2.2), size=(N,1)), di.poisson.rvs(10*exp(2), size=(N,1)) )) )) 216 | #data1 = np.hstack(( di.poisson.rvs(10*exp(2), size=(N,1)), di.poisson.rvs(10*exp(1), size=(N,1)) )) 217 | 218 | #dist0 = MPMDist(data0) 219 | #dist1 = MPMDist(data1) 220 | #mpm = MPMCls(dist0, dist1) 221 | #np.random.seed(seedr) 222 | #n,gext,grid = get_grid_data(np.vstack(( data0, data1 )) ) 223 | 224 | 225 | #g = mpm.calc_curr_g(grid).reshape(-1,n) 226 | #s = samc.SAMCRun(mpm, burn=0, stepscale=1000, refden=1, thin=10, lim_iters=200) 227 | #s.sample(1e3, temperature=1) 228 | #gavg = mpm.calc_gavg(s.db, grid, 50).reshape(-1,n) 229 | 230 | #ga1 = mpm.dist0.calc_db_g(s.db, s.db.root.object.dist0, grid, 50).reshape(-1,n) 231 | #ga2 = mpm.dist1.calc_db_g(s.db, s.db.root.object.dist1, grid, 50).reshape(-1,n) 232 | 233 | #myplot(p.subplot(2,2,1),gavg) 234 | ##myplot(p.subplot(2,2,2),g) 235 | #myplot(p.subplot(2,2,3),ga1) 236 | #myplot(p.subplot(2,2,4),ga2) 237 | 238 | #p.show() 239 | #sys.exit() 240 | ########## /SAMC ############# 241 | 242 | ########### NLOpt ############# 243 | #print mpm.energy() 244 | #x = mpm.get_params() 245 | #print x 246 | #print mpm.optim(x,None) 247 | 248 | #def pvec(x): 249 | #s = "[ " 250 | #for i in x: 251 | #s += "%5.1f," % i 252 | #s = s[:-1] 253 | #s+= " ]" 254 | #return s 255 | #def f(x,grad): 256 | #e = mpm.optim(x,grad) 257 | #print "Trying: %8.2f %s" % (e,pvec(x)) 258 | #return e 259 | 260 | ##opt = nlopt.opt(nlopt.LN_BOBYQA, mpm.get_dof()) 261 | #opt = nlopt.opt(nlopt.GN_DIRECT_L, mpm.get_dof()) 262 | ##opt = nlopt.opt(nlopt.G_MLSL_LDS, mpm.get_dof()) 263 | ##lopt = nlopt.opt(nlopt.LN_NELDERMEAD, mpm.get_dof()) 264 | ##lopt.set_ftol_abs(5) 265 | ##opt.set_local_optimizer(lopt) 266 | ##opt.set_min_objective(mpm.optim) 267 | 
##opt.set_initial_step(0.5) 268 | #opt.set_min_objective(f) 269 | #opt.set_maxtime(10) 270 | ##opt.set_maxeval(100) 271 | ##opt.set_ftol_rel(1e-6) 272 | ##opt.set_lower_bounds(-10) 273 | ##opt.set_upper_bounds(10) 274 | #opt.set_lower_bounds(x-3.0) 275 | #opt.set_upper_bounds(x+3.0) 276 | 277 | 278 | #xopt = opt.optimize(x) 279 | #xopt_val = opt.last_optimum_value() 280 | #ret = opt.last_optimize_result() 281 | 282 | #print "Starting: %9.3f %s" % (mpm.optim(x,None), pvec(x)) 283 | #print "Final : %9.3f %s" % (xopt_val, pvec(xopt)) 284 | #print "Return: %d" %ret 285 | 286 | #sys.exit() 287 | ########## /NLOpt ############# 288 | 289 | ######## 3D Plot ######## 290 | #from mpl_toolkits.mplot3d import Axes3D 291 | #from matplotlib import cm 292 | #from matplotlib.ticker import LinearLocator, FormatStrFormatter 293 | 294 | #fig = p.figure() 295 | #ax = fig.gca(projection='3d') 296 | #X = np.linspace(gext[0], gext[1], n) 297 | #Y = np.linspace(gext[0], gext[1], n) 298 | #X, Y = np.meshgrid(X, Y) 299 | #surf = ax.plot_surface(X, Y, avglg, rstride=1, cstride=1, cmap=cm.coolwarm, 300 | #linewidth=0, antialiased=False) 301 | #fig = p.figure() 302 | #ax = fig.gca(projection='3d') 303 | #surf = ax.plot_surface(X, Y, avgavg, rstride=1, cstride=1, cmap=cm.coolwarm, 304 | #linewidth=0, antialiased=False) 305 | ######## /3D Plot ######## 306 | 307 | -------------------------------------------------------------------------------- /tests/poisson_synth.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from time import time 4 | 5 | try: 6 | from samcnet.mixturepoisson import * 7 | import samcnet.mh as mh 8 | from samcnet.lori import * 9 | except ImportError as e: 10 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 11 | " directory is populated by waf.\n\n %s" % str(e)) 12 | 13 | if 'WORKHASH' in os.environ: 14 | try: 15 | server = os.environ['SERVER'] 16 | except: 17 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 18 | 19 | 20 | import numpy as np 21 | import scipy.stats as st 22 | import scipy.stats.distributions as di 23 | import scipy 24 | 25 | from sklearn.lda import LDA 26 | from sklearn.svm import SVC 27 | from sklearn.neighbors import KNeighborsClassifier as KNN 28 | 29 | #np.seterr(all='ignore') # Careful with this 30 | 31 | ######## Generate Data ######## 32 | def gen_data(mu, cov, n): 33 | lams = MVNormal(mu, cov).rvs(n) 34 | ps = np.empty_like(lams) 35 | for i in xrange(lams.shape[0]): 36 | for j in xrange(lams.shape[1]): 37 | ps[i,j] = di.poisson.rvs(10* np.exp(lams[i,j])) 38 | return ps 39 | 40 | D = 2 41 | mu0 = np.zeros(D) - 0.5 42 | mu1 = np.zeros(D) + 0.5 43 | #rho0 = -0.4 44 | #rho1 = 0.4 45 | #cov0 = np.array([[1, rho0],[rho0, 1]]) 46 | #cov1 = np.array([[1, rho1],[rho1, 1]]) 47 | #cov0 = np.eye(D) 48 | #cov1 = np.eye(D) 49 | cov0 = sample_invwishart(np.eye(D)*3, 10) 50 | cov1 = sample_invwishart(np.eye(D)*3, 10) 51 | 52 | rseed = np.random.randint(10**6) 53 | #dseed = 1 54 | dseed = np.random.randint(1000) 55 | 56 | print("rseed: %d" % rseed) 57 | print("dseed: %d" % dseed) 58 | 59 | #np.random.seed(dseed) 60 | 61 | trn_data0 = gen_data(mu0,cov0,30) 62 | trn_data1 = gen_data(mu1,cov1,30) 63 | 64 | tst_data0 = gen_data(mu0,cov0,3000) 65 | tst_data1 = gen_data(mu1,cov1,3000) 66 | 67 | #np.random.seed(rseed) 68 | 69 | trn_data = np.vstack(( trn_data0, trn_data1 )) 70 | tst_data = np.vstack(( tst_data0, tst_data1 )) 71 | 72 | ######## /Generate Data ######## 73 | 74 | 
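A reading aid for gen_data above: the synthetic counts follow a log-normal mixed Poisson model, with log-rates lambda ~ MVN(mu, cov) and counts k ~ Poisson(10 * exp(lambda)). A minimal standalone sketch of one draw, using numpy.random.multivariate_normal in place of the MVNormal helper from samcnet.lori:

import numpy as np
import scipy.stats.distributions as di

# one draw from the model used by gen_data():
#   lams[i] ~ MVN(mu, cov)                     (log-scale rates)
#   counts[i,j] ~ Poisson(10 * exp(lams[i,j]))
mu, cov, n = np.zeros(2), np.eye(2) * 0.1, 5
lams = np.random.multivariate_normal(mu, cov, size=n)  # shape (n, 2)
counts = di.poisson.rvs(10 * np.exp(lams))             # shape (n, 2)
print(counts)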
########## Comparison ############# 75 | Ntrn = trn_data.shape[0] 76 | norm_trn_data = (trn_data - trn_data.mean(axis=0)) / np.sqrt(trn_data.var(axis=0,ddof=1)) 77 | norm_trn_data0 = norm_trn_data[:Ntrn/2,:] 78 | norm_trn_data1 = norm_trn_data[Ntrn/2:,:] 79 | norm_tst_data = (tst_data - tst_data.mean(axis=0)) / np.sqrt(tst_data.var(axis=0,ddof=1)) 80 | N = tst_data.shape[0] 81 | D = trn_data.shape[1] 82 | norm_tst_data0 = norm_tst_data[:N/2,:] 83 | norm_tst_data1 = norm_tst_data[N/2:,:] 84 | 85 | trn_labels = np.hstack(( np.zeros(Ntrn/2), np.ones(Ntrn/2) )) 86 | tst_labels = np.hstack(( np.zeros(N/2), np.ones(N/2) )) 87 | sklda = LDA() 88 | skknn = KNN(3, warn_on_equidistant=False) 89 | sksvm = SVC() 90 | sklda.fit(norm_trn_data, trn_labels) 91 | skknn.fit(norm_trn_data, trn_labels) 92 | sksvm.fit(norm_trn_data, trn_labels) 93 | 94 | output = {} 95 | output['ldaerr'] = (1-sklda.score(norm_tst_data, tst_labels)) 96 | output['knnerr'] = (1-skknn.score(norm_tst_data, tst_labels)) 97 | output['svmerr'] = (1-sksvm.score(norm_tst_data, tst_labels)) 98 | 99 | print("skLDA error: %f" % output['ldaerr']) 100 | print("skKNN error: %f" % output['knnerr']) 101 | print("skSVM error: %f" % output['svmerr']) 102 | 103 | # Gaussian Analytic 104 | bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_trn_data0) 105 | bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_trn_data1) 106 | gc = GaussianCls(bayes0, bayes1) 107 | 108 | output['gausserr'] = gc.approx_error_data(norm_tst_data, tst_labels) 109 | print("Gaussian Analytic error: %f" % output['gausserr']) 110 | 111 | # MPM Model 112 | dist0 = MPMDist(trn_data0,kmax=1,priorkappa=120,mumove=0.1,lammove=0.05) 113 | dist1 = MPMDist(trn_data1,kmax=1,priorkappa=120,mumove=0.1,lammove=0.05) 114 | mpm = MPMCls(dist0, dist1) 115 | mh = mh.MHRun(mpm, burn=100, thin=20) 116 | t1=time() 117 | iters = 2e3 118 | numlam = 50 119 | mh.sample(iters,verbose=False) 120 | output['mpmerr'] = mpm.approx_error_data(mh.db, tst_data, tst_labels,numlam=numlam) 121 | print("MPM Sampler error: %f" % output['mpmerr']) 122 | print "Whole run with %d iters and %d numlam took %f seconds" % (iters, numlam, time()-t1) 123 | 124 | p.figure() 125 | def myplot(ax,g,data0,data1,gext): 126 | ax.plot(data0[:,0], data0[:,1], 'g.',label='0', alpha=0.3) 127 | ax.plot(data1[:,0], data1[:,1], 'r.',label='1', alpha=0.3) 128 | ax.legend(fontsize=8, loc='best') 129 | 130 | im = ax.imshow(g, extent=gext, aspect='equal', origin='lower') 131 | p.colorbar(im,ax=ax) 132 | ax.contour(g, [0.0], extent=gext, aspect=1, origin='lower', cmap = p.cm.gray) 133 | def jit(x): 134 | return x+np.random.randn(*x.shape)/4.0 135 | 136 | n,gext,grid = get_grid_data(np.vstack(( trn_data0, trn_data1 )), positive=True) 137 | gavg = mpm.calc_gavg(mh.db, grid, numlam=numlam).reshape(-1,n) 138 | myplot(p.subplot(2,1,1),gavg,jit(tst_data0),jit(tst_data1),gext) 139 | #myplot(p.subplot(2,1,1),gavg,trn_data0,trn_data1,gext) 140 | 141 | n,gext,grid = get_grid_data(np.vstack(( norm_trn_data0, norm_trn_data1 )), positive=False) 142 | myplot(p.subplot(2,1,2),sksvm.decision_function(grid).reshape(-1,n),jit(norm_tst_data0),jit(norm_tst_data1),gext) 143 | #myplot(p.subplot(2,1,2),sksvm.decision_function(grid).reshape(-1,n),norm_trn_data0,norm_trn_data1,gext) 144 | 145 | p.show() 146 | 147 | if 'WORKHASH' in os.environ: 148 | import zmq,time,zlib 149 | import simplejson as js 150 | ctx = zmq.Context() 151 | socket = ctx.socket(zmq.REQ) 152 | socket.connect('tcp://'+server+':7000') 153 | 154 | #data = mh.read_db() 155 | data 
= zlib.compress(js.dumps(output)) 156 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 157 | socket.send(data) 158 | socket.recv() 159 | socket.close() 160 | ctx.term() 161 | 162 | mh.db.close() 163 | -------------------------------------------------------------------------------- /tests/test_class.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | try: 5 | from samcnet import samc,lori,utils 6 | from samcnet.lori import * 7 | except ImportError as e: 8 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 9 | " directory is populated by waf.\n\n %s" % str(e)) 10 | 11 | if 'WORKHASH' in os.environ: 12 | try: 13 | server = os.environ['SERVER'] 14 | except: 15 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 16 | 17 | ## First generate true distributions and data 18 | cval, dist0, dist1 = gen_dists() 19 | 20 | ## Now test Gaussian Analytic calculation 21 | gc = GaussianCls(dist0, dist1) 22 | 23 | c = GaussianSampler(dist0,dist1) 24 | s = samc.SAMCRun(c, burn=0, stepscale=1000, refden=1, thin=10) 25 | s.sample(1e2, temperature=1) 26 | 27 | if 'WORKHASH' in os.environ: 28 | import zmq,time 29 | ctx = zmq.Context() 30 | socket = ctx.socket(zmq.REQ) 31 | socket.connect('tcp://'+server+':7000') 32 | 33 | data = s.read_db() 34 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 35 | socket.send(data) 36 | socket.recv() 37 | socket.close() 38 | ctx.term() 39 | 40 | s.db.close() 41 | -------------------------------------------------------------------------------- /tests/test_net.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import numpy as np 3 | import scipy as sp 4 | import networkx as nx 5 | import tables as t 6 | import zlib 7 | 8 | from samcnet.samc import SAMCRun 9 | from samcnet.bayesnet import BayesNet 10 | from samcnet.bayesnetcpd import BayesNetCPD, BayesNetSampler 11 | from samcnet.generator import * 12 | from samcnet import utils 13 | 14 | if 'WORKHASH' in os.environ: 15 | try: 16 | redis_server = os.environ['REDIS'] 17 | import redis 18 | r = redis.StrictRedis(redis_server) 19 | except: 20 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 21 | 22 | start = None 23 | import time as gtime 24 | def time(): 25 | global start 26 | if start is None: 27 | start = gtime.time() 28 | else: 29 | t = gtime.time() 30 | print("Time taken: {} seconds".format(t-start)) 31 | start = None 32 | 33 | N = 4 34 | iters = 1e3 35 | numdata = 20 36 | priorweight = 5.0 37 | numtemplate = 10 38 | burn = 10 39 | stepscale=1000 40 | temperature = 1.0 41 | thin = 2 42 | refden = 0.0 43 | 44 | random.seed(123456) 45 | np.random.seed(123456) 46 | 47 | groundgraph = generateHourGlassGraph(nodes=N) 48 | #joint, states = generateJoint(groundgraph, method='dirichlet') 49 | joint, states = generateJoint(groundgraph, method='noisylogic') 50 | data = generateData(groundgraph, joint, numdata) 51 | template = sampleTemplate(groundgraph, numtemplate) 52 | 53 | print "Joint:" 54 | print joint 55 | 56 | random.seed() 57 | np.random.seed() 58 | 59 | ground = BayesNetCPD(states, data) 60 | ground.set_cpds(joint) 61 | 62 | obj = BayesNetCPD(states, data) 63 | 64 | b = BayesNetSampler(obj, template, ground, priorweight) 65 | s = SAMCRun(b,burn,stepscale,refden,thin) 66 | time() 67 | s.sample(iters, temperature) 68 | time() 69 | 70 | s.compute_means() 71 | 72 | #fname = '/tmp/test.h5' 73 | #fid = open(fname, 'w') 74 | 
#fid.write(zlib.decompress(s.read_db())) 75 | #fid.close() 76 | 77 | #db = t.openFile(fname, 'r') 78 | 79 | #if 'WORKHASH' in os.environ: 80 | #r.lpush('jobs:done:'+os.environ['WORKHASH'], s.read_db()) 81 | 82 | #db.close() 83 | s.db.close() 84 | -------------------------------------------------------------------------------- /tests/test_poisson.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | try: 5 | from samcnet.mixturepoisson import * 6 | import samcnet.mh as mh 7 | from samcnet.lori import * 8 | except ImportError as e: 9 | sys.exit("Make sure LD_LIBRARY_PATH is set correctly and that the build"+\ 10 | " directory is populated by waf.\n\n %s" % str(e)) 11 | 12 | if 'WORKHASH' in os.environ: 13 | try: 14 | server = os.environ['SERVER'] 15 | except: 16 | sys.exit("ERROR in worker: Need SERVER environment variable defined.") 17 | 18 | 19 | import numpy as np 20 | import scipy.stats as st 21 | import scipy.stats.distributions as di 22 | import scipy 23 | 24 | from sklearn.lda import LDA 25 | from sklearn.svm import SVC 26 | from sklearn.neighbors import KNeighborsClassifier as KNN 27 | 28 | #np.seterr(all='ignore') # Careful with this 29 | 30 | ######## Generate Data ######## 31 | def gen_data(mu, cov, n): 32 | lams = MVNormal(mu, cov).rvs(n) 33 | ps = np.empty_like(lams) 34 | for i in xrange(lams.shape[0]): 35 | for j in xrange(lams.shape[1]): 36 | ps[i,j] = di.poisson.rvs(10* np.exp(lams[i,j])) 37 | return ps 38 | 39 | D = 4 40 | mu0 = np.zeros(D) - 0.5 41 | mu1 = np.zeros(D) + 0.5 42 | #rho0 = -0.4 43 | #rho1 = 0.4 44 | #cov0 = np.array([[1, rho0],[rho0, 1]]) 45 | #cov1 = np.array([[1, rho1],[rho1, 1]]) 46 | #cov0 = np.eye(D) 47 | #cov1 = np.eye(D) 48 | cov0 = sample_invwishart(np.eye(D)*10, 10) 49 | cov1 = sample_invwishart(np.eye(D)*10, 10) 50 | 51 | rseed = np.random.randint(10**6) 52 | dseed = 1 53 | #dseed = np.random.randint(1000) 54 | 55 | print("rseed: %d" % rseed) 56 | print("dseed: %d" % dseed) 57 | 58 | #np.random.seed(dseed) 59 | 60 | trn_data0 = gen_data(mu0,cov0,30) 61 | trn_data1 = gen_data(mu1,cov1,30) 62 | 63 | tst_data0 = gen_data(mu0,cov0,300) 64 | tst_data1 = gen_data(mu1,cov1,300) 65 | 66 | #np.random.seed(rseed) 67 | 68 | trn_data = np.vstack(( trn_data0, trn_data1 )) 69 | tst_data = np.vstack(( tst_data0, tst_data1 )) 70 | 71 | ######## /Generate Data ######## 72 | 73 | ########## Comparison ############# 74 | Ntrn = trn_data.shape[0] 75 | norm_trn_data = (trn_data - trn_data.mean(axis=0)) / np.sqrt(trn_data.var(axis=0,ddof=1)) 76 | norm_trn_data0 = norm_trn_data[:Ntrn/2,:] 77 | norm_trn_data1 = norm_trn_data[Ntrn/2:,:] 78 | norm_tst_data = (tst_data - tst_data.mean(axis=0)) / np.sqrt(tst_data.var(axis=0,ddof=1)) 79 | N = tst_data.shape[0] 80 | D = trn_data.shape[1] 81 | 82 | trn_labels = np.hstack(( np.zeros(Ntrn/2), np.ones(Ntrn/2) )) 83 | tst_labels = np.hstack(( np.zeros(N/2), np.ones(N/2) )) 84 | sklda = LDA() 85 | skknn = KNN(3, warn_on_equidistant=False) 86 | sksvm = SVC() 87 | sklda.fit(norm_trn_data, trn_labels) 88 | skknn.fit(norm_trn_data, trn_labels) 89 | sksvm.fit(norm_trn_data, trn_labels) 90 | 91 | output = {} 92 | output['ldaerr'] = (1-sklda.score(norm_tst_data, tst_labels)) 93 | output['knnerr'] = (1-skknn.score(norm_tst_data, tst_labels)) 94 | output['svmerr'] = (1-sksvm.score(norm_tst_data, tst_labels)) 95 | 96 | print("skLDA error: %f" % output['ldaerr']) 97 | print("skKNN error: %f" % output['knnerr']) 98 | print("skSVM error: %f" % output['svmerr']) 99 | 100 | # 
Gaussian Analytic 101 | bayes0 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_trn_data0) 102 | bayes1 = GaussianBayes(np.zeros(D), 1, 8, np.eye(D)*3, norm_trn_data1) 103 | gc = GaussianCls(bayes0, bayes1) 104 | 105 | output['gausserr'] = gc.approx_error_data(norm_tst_data, tst_labels) 106 | print("Gaussian Analytic error: %f" % output['gausserr']) 107 | 108 | # MPM Model 109 | dist0 = MPMDist(trn_data0,kmax=1) 110 | dist1 = MPMDist(trn_data1,kmax=1) 111 | mpm = MPMCls(dist0, dist1) 112 | mh = mh.MHRun(mpm, burn=1, thin=2) 113 | mh.sample(40,verbose=False) 114 | output['mpmerr'] = mpm.approx_error_data(mh.db, tst_data, tst_labels,numlam=200) 115 | print("MPM Sampler error: %f" % output['mpmerr']) 116 | 117 | if 'WORKHASH' in os.environ: 118 | import zmq,time,zlib 119 | import simplejson as js 120 | ctx = zmq.Context() 121 | socket = ctx.socket(zmq.REQ) 122 | socket.connect('tcp://'+server+':7000') 123 | 124 | #data = mh.read_db() 125 | data = zlib.compress(js.dumps(output)) 126 | socket.send(os.environ['WORKHASH'], zmq.SNDMORE) 127 | socket.send(data) 128 | socket.recv() 129 | socket.close() 130 | ctx.term() 131 | 132 | mh.db.close() 133 | -------------------------------------------------------------------------------- /tests/test_simple.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import redis 4 | import random 5 | import numpy as np 6 | from samcnet import samc,utils,simple 7 | 8 | if 'WORKHASH' in os.environ: 9 | try: 10 | import redis 11 | r = redis.StrictRedis(os.environ['REDIS']) 12 | except: 13 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 14 | 15 | random.seed(123456) 16 | np.random.seed(123456) 17 | 18 | o = simple.Simple() 19 | 20 | random.seed() 21 | np.random.seed() 22 | 23 | s = samc.SAMCRun(o, burn=10, stepscale=10, refden=0, thin=1) 24 | s.sample(100) 25 | 26 | res = [] 27 | res.append(s.func_mean()) 28 | res.append(s.func_cummean()) 29 | 30 | 31 | res_wire = utils.prepare_data([utils.encode_element(x) for x in res]) 32 | 33 | sys.stderr.write("Writing res_wire, first 5 bytes: %s\n" % res_wire[:5]) 34 | 35 | if 'WORKHASH' in os.environ: 36 | r.lpush('jobs:done:'+os.environ['WORKHASH'], res_wire) 37 | 38 | -------------------------------------------------------------------------------- /tests/test_tree.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import numpy as np 3 | import scipy as sp 4 | import networkx as nx 5 | import tables as t 6 | import zlib 7 | 8 | from samcnet.samc import SAMCRun 9 | from samcnet.treenet import TreeNet, generateTree, generateData 10 | from samcnet import utils 11 | 12 | if 'WORKHASH' in os.environ: 13 | try: 14 | redis_server = os.environ['REDIS'] 15 | import redis 16 | r = redis.StrictRedis(redis_server) 17 | except: 18 | sys.exit("ERROR in worker: Need REDIS environment variable defined.") 19 | 20 | N = 5 21 | comps = 2 22 | iters = 1e3 23 | numdata = 20 24 | #priorweight = 5 25 | #numtemplate = 5 26 | burn = 1 27 | stepscale=10 28 | temperature = 1.0 29 | thin = 1 30 | refden = 0.0 31 | 32 | random.seed(123456) 33 | np.random.seed(123456) 34 | 35 | groundgraph = generateTree(N, comps) 36 | data = generateData(groundgraph,numdata) 37 | #template = sampleTemplate(groundgraph, numtemplate) 38 | 39 | random.seed() 40 | np.random.seed() 41 | 42 | ground = TreeNet(N, graph=groundgraph) 43 | b = TreeNet(N, data=data, ground=ground) 44 | s = 
SAMCRun(b,burn,stepscale,refden,thin,verbose=True) 45 | s.sample(iters, temperature) 46 | 47 | s.compute_means() 48 | 49 | # All to exercise cde deps 50 | tmp = s.read_db() 51 | import cPickle 52 | txt = zlib.compress(cPickle.dumps([1,2,3])) 53 | 54 | if 'WORKHASH' in os.environ: 55 | r.lpush('jobs:done:'+os.environ['WORKHASH'], s.read_db()) 56 | -------------------------------------------------------------------------------- /tests/treevbnet.py: -------------------------------------------------------------------------------- 1 | import sys, os, random 2 | import numpy as np 3 | import scipy as sp 4 | import networkx as nx 5 | import json as js 6 | import tables as t 7 | import zlib 8 | import cPickle 9 | import time as gtime 10 | import pylab as p 11 | 12 | from samcnet.samc import SAMCRun 13 | from samcnet.treenet import TreeNet, generateTree, generateData 14 | from samcnet.bayesnetcpd import BayesNetSampler, BayesNetCPD 15 | from samcnet import utils 16 | from samcnet.generator import sampleTemplate 17 | import samcnet.generator as gen 18 | 19 | N = 5 20 | comps = 2 21 | iters = 3e5 22 | numdata = 30 23 | burn = 1000 24 | stepscale = 30000 25 | temperature = 1.0 26 | thin = 50 27 | refden = 0.0 28 | numtemplate = 10 29 | priorweight = 0.0 30 | 31 | random.seed(12345) 32 | np.random.seed(12345) 33 | 34 | groundgraph = generateTree(N, comps) 35 | data = generateData(groundgraph,numdata) 36 | template = sampleTemplate(groundgraph, numtemplate) 37 | 38 | random.seed() 39 | np.random.seed() 40 | 41 | ############### TreeNet ############## 42 | 43 | groundtree = TreeNet(N, data=data, graph=groundgraph) 44 | b1 = TreeNet(N, data, template, priorweight, groundtree) 45 | #s1 = SAMCRun(b1,burn,stepscale,refden,thin) 46 | #s1.sample(iters, temperature) 47 | 48 | ############## bayesnetcpd ############ 49 | 50 | joint = utils.graph_to_joint(groundgraph) 51 | states = np.ones(len(joint.dists),dtype=np.int32)*2 52 | groundbnet = BayesNetCPD(states, data) 53 | groundbnet.set_cpds(joint) 54 | 55 | obj = BayesNetCPD(states, data) 56 | b2 = BayesNetSampler(obj, template, groundbnet, priorweight) 57 | #s2 = SAMCRun(b2,burn,stepscale,refden,thin) 58 | #s2.sample(iters, temperature) 59 | 60 | ####################################### 61 | 62 | def test(): 63 | def close_enough(n1,n2): 64 | # absolute-tolerance comparison of two floats 65 | return abs(n1-n2) < 1e-5 66 | 67 | assert close_enough(groundtree.kld(b1), groundbnet.kld(b2.bayesnet)) 68 | 69 | b2.bayesnet.adjust_factor(1,[2],[]) 70 | b2.bayesnet.set_factor(1,[0.9,0.1,0.9,0.1]) 71 | 72 | b1.add_edge(2,1,0.9,0.1) 73 | 74 | assert close_enough(groundtree.entropy(), groundbnet.entropy()) 75 | assert close_enough(groundtree.kld(b1), groundbnet.kld(b2.bayesnet)) 76 | 77 | # KLD is nonnegative, so this loop stress-tests propose/reject until interrupted 78 | count = 0 79 | while groundbnet.kld(b2.bayesnet) >= 0.0: 80 | b2.propose() 81 | count += 1 82 | if b2.energy() > 10000: 83 | b2.reject() 84 | 85 | -------------------------------------------------------------------------------- /waf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binarybana/samcnet/84f3ba8241d416115a8aa9ba5c659a9513175072/waf -------------------------------------------------------------------------------- /wscript: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # encoding: utf-8 3 | 4 | APPNAME = 'samcsynthetic' 5 | VERSION = '0.4' 6 | 7 | top = '.'
8 | out = 'build' 9 | 10 | def options(opt): 11 | opt.load('compiler_c compiler_cxx python cython') 12 | opt.add_option('-d', '--debug', action='store_true', default=False, help='Debug flag.') 13 | opt.add_option('-p', '--prof', action='store_true', default=False, help='Profiling flag.') 14 | 15 | def configure(conf): 16 | conf.load('compiler_c compiler_cxx python cython') 17 | conf.check_python_headers() 18 | conf.check_python_module('numpy') 19 | conf.check_python_module('scipy') 20 | #conf.check_python_module('networkx') 21 | conf.check_python_module('pandas') 22 | conf.env.append_value('LINKFLAGS', '-L%s/lib' % conf.path.abspath()) 23 | conf.env.append_value('LINKFLAGS', '-L/share/apps/lib') 24 | #conf.check(compiler='cc', lib='Judy', uselib_store='JUDY') 25 | conf.check(compiler='cc', lib='m', uselib_store='MATH') 26 | #conf.check(compiler='cc', lib='profiler', uselib_store='PROF') 27 | #conf.check_cxx(lib='gmp', uselib_store='GMP') 28 | #conf.check_cxx(lib='gmpxx', uselib_store='GMPXX') 29 | #conf.env.append_value('LINKFLAGS', '-lgmpxx') 30 | #conf.check_cxx(lib='dai', uselib_store='DAI') 31 | 32 | def build(bld): 33 | libs = 'MATH'.split() 34 | #libs = 'JUDY MATH'.split() 35 | includes = ['-I/share/apps/include'] 36 | 37 | CFLAGS = ['-Wall','-std=c99'] + includes 38 | CXXFLAGS = ['-fPIC'] + includes 39 | LDFLAGS = [] 40 | CYTHONFLAGS = [] 41 | if bld.options.debug: 42 | print('Beginning debug build') 43 | CFLAGS += ['-g','-DDEBUG'] 44 | CXXFLAGS += ['-g','-DDEBUG'] 45 | CYTHONFLAGS += ['--gdb'] 46 | LDFLAGS += ['-g','-DDEBUG'] 47 | if bld.options.prof: 48 | print('Adding profiling flag build') 49 | CFLAGS += ['-pg'] 50 | CXXFLAGS += ['-pg'] 51 | LDFLAGS += ['-pg'] 52 | #libs += ['PROF'] 53 | if not bld.options.prof and not bld.options.debug: 54 | CFLAGS += ['-O2', '-g'] 55 | CXXFLAGS += ['-O2', '-g'] 56 | 57 | #bld.env.CYTHONFLAGS = CYTHONFLAGS 58 | 59 | bld.env['PREFIX'] = '.' 
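# Note: only the 'mh' and 'mixturepoisson' extension targets below are built;
# the commented-out blocks (samc, bayesnet, probability, bayesnetcpd, pydai)
# are the deprecated network-inference builds, which also depend on libdai and
# the other library checks commented out in configure() above.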
60 | 61 | #bld.shlib(source = bld.path.ant_glob('samcnet/netcost/*.c'), 62 | #target='cost', 63 | #cflags=CFLAGS, 64 | #linkflags=LDFLAGS, 65 | #use=libs) 66 | 67 | #bld(features='c cshlib pyext', 68 | #source=['samcnet/samc.pyx'], 69 | #includes=[], 70 | #libpath=['.','./build'], 71 | #cflags=CFLAGS, 72 | #ldflags=LDFLAGS, 73 | #target='samc') 74 | 75 | bld(features='c cshlib pyext', 76 | source=['samcnet/mh.pyx'], 77 | includes=[], 78 | libpath=['.','./build'], 79 | cflags=CFLAGS, 80 | ldflags=LDFLAGS, 81 | target='mh') 82 | 83 | #bld(features='c cshlib pyext', 84 | #source=['samcnet/bayesnet.pyx'], 85 | #includes=['samcnet/netcost'], 86 | #use='cost', 87 | #libpath=['.','./build'], 88 | #cflags=CFLAGS, 89 | #ldflags=LDFLAGS, 90 | #target='bayesnet') 91 | 92 | bld(features='c cshlib pyext', 93 | source=['samcnet/mixturepoisson.pyx'], 94 | #includes=['samcnet/netcost'], 95 | #use='cost', 96 | #libpath=['.','./build'], 97 | cflags=CFLAGS, 98 | ldflags=LDFLAGS, 99 | target='mixturepoisson') 100 | 101 | #bld(features='c cshlib cxx pyext', 102 | #source=['samcnet/probability.pyx'], 103 | ##libpath=['.','./build'], 104 | #includes=['deps/libdai/include'], 105 | #cxxflags=CXXFLAGS, 106 | #ldflags=LDFLAGS, 107 | #target='probability') 108 | 109 | #libs = ['MATH', 'DAI', 'GMP', 'GMPXX'] 110 | 111 | #bld(features='c cshlib cxx pyext', 112 | #source=['samcnet/bayesnetcpd.pyx', 'samcnet/utils.cpp'], 113 | #includes=['deps/libdai/include','include'], 114 | #libpath=['lib','.','./build'], 115 | #use=libs, 116 | #cxxflags=CXXFLAGS, 117 | #ldflags=LDFLAGS, 118 | #target='bayesnetcpd') 119 | 120 | #CFLAGS.remove('-Wall') 121 | 122 | #bld(features='c cshlib cxx pyext', 123 | #source=['samcnet/pydai.pyx'], 124 | #libpath=['lib','.','build'], 125 | #includes=['deps/libdai/include'], 126 | #use=libs, 127 | #cxxflags=CXXFLAGS, 128 | #ldflags=LDFLAGS, 129 | #target='pydai') 130 | 131 | #bld.env['PREFIX'] = '.' 132 | #for x in 'dai.so probability.so bayesnetcpd.so bayesnet.so samc.so cost.so'.split(): 133 | #bld.symlink_as('${PREFIX}/lib/%s' % x, 'build/%s' % x) 134 | 135 | def dist(ctx): 136 | ctx.excl = '**/*.zip **/*.bz2 **/.waf-1* **/*~ **/*.pyc **/*.swp **/.lock-w*' 137 | #ctx.files = ctx.path.ant_glob('**/wscript **/*.h **/*.cpp waf') 138 | 139 | --------------------------------------------------------------------------------
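On the log_poisson helper defined in tests/cov_poisson.py: it computes the Poisson log-pmf, log P(k | lam) = k*log(lam) - log(k!) - lam, with log(k!) evaluated as gammaln(k+1). A quick standalone check against scipy (a sketch assuming only numpy/scipy are installed):

from math import log
from scipy.special import gammaln
import scipy.stats.distributions as di

def log_poisson(k, lam):
    return log(lam) * k - gammaln(k + 1) - lam

# agrees with scipy's reference log-pmf
assert abs(log_poisson(3, 2.5) - di.poisson.logpmf(3, 2.5)) < 1e-9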