├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── ipynb
    ├── .gitignore
    ├── demo_kgof.ipynb
    ├── demo_kgof_mltrain_workshop.ipynb
    ├── ex1_results.ipynb
    ├── ex2_results.ipynb
    ├── ex3_results.ipynb
    ├── fssd_locs_surface.ipynb
    ├── gof_kernel_stein.ipynb
    ├── gof_linear_kernel_stein.ipynb
    ├── gof_me_test.ipynb
    ├── gof_mmd_test.ipynb
    └── preliminary.ipynb
├── kgof
    ├── __init__.py
    ├── config.py
    ├── data.py
    ├── density.py
    ├── ex
    │   ├── __init__.py
    │   ├── ex1_vary_n.py
    │   ├── ex2_prob_params.py
    │   ├── ex3_vary_nlocs.py
    │   ├── run_ex1.sh
    │   ├── run_ex2.sh
    │   └── run_ex3.sh
    ├── glo.py
    ├── goftest.py
    ├── intertst.py
    ├── kernel.py
    ├── mmd.py
    ├── plot.py
    ├── test
    │   ├── __init__.py
    │   ├── test_density.py
    │   ├── test_goftest.py
    │   └── test_kernel.py
    └── util.py
├── run_unittest.sh
├── setup.cfg
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | KSD_goodness_of_fit/
 2 | *.bkp
 3 | *.swp
 4 | .dropbox.attr
 5 | *.pyc
 6 | *.npz
 7 | *.bak
 8 | *.log
 9 | *~
10 | *.egg-info
11 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |     - 2.7
 4 | 
 5 | before_install:
 6 |     - sudo apt-get update
 7 |     - sudo apt-get install  libblas-dev liblapack-dev libatlas-base-dev gfortran
 8 |     #- sudo apt-get install  python-numpy python-scipy python-matplotlib
 9 | 
10 |     - wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
11 |     - bash miniconda.sh -b -p $HOME/miniconda
12 |     - export PATH="$HOME/miniconda/bin:$PATH"
13 |     - conda update --yes conda
14 |     - conda config --add channels conda-forge
15 | install:
16 |     - conda install --yes python=$TRAVIS_PYTHON_VERSION pip numpy scipy nose future autograd matplotlib
17 |     #- pip install -r requirements.txt
18 | 
19 | # command to run tests
20 | script:
21 |     - if [[ $TRAVIS_PYTHON_VERSION == '2.7' ]]; then python -m unittest discover; fi
22 | 
23 | notifications:
24 |     email:
25 |         recipients:
26 |             - wittawatj@gmail.com 
27 |         on_success: change 
28 |         on_failure: change
29 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Set the base image
 2 | 
 3 | # Dockerfile at https://github.com/ContinuumIO/docker-images/blob/master/miniconda/Dockerfile
 4 | FROM continuumio/miniconda
 5 | 
 6 | RUN apt-get update --fix-missing
 7 | RUN apt-get install -y gcc
 8 | RUN pip install jupyter
 9 | 
10 | # install kgof from https://github.com/wittawatj/kernel-gof
11 | RUN pip install git+https://github.com/wittawatj/kernel-gof.git
12 | 
13 | MAINTAINER Wittawat Jitkrittum <wittawatj@gmail.com>
14 | 
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Wittawat Jitkrittum and Wenkai Xu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | # include the LICENSE
 2 | include LICENSE
 3 | 
 4 | # include the test script
 5 | include run_unittest.sh
 6 | 
 7 | # remove some files
 8 | #recursive-exclude data *
 9 | #recursive-exclude kgof/ex *
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # kgof
  2 | 
  3 | [![Build Status](https://travis-ci.org/wittawatj/kernel-gof.svg?branch=master)](https://travis-ci.org/wittawatj/kernel-gof)
  4 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://github.com/wittawatj/kernel-gof/blob/master/LICENSE)
  5 | 
  6 | **11th July 2019**: For an implementation of our test in Julia, see [this
  7 | repository](https://github.com/torfjelde/KernelGoodnessOfFit.jl) by [Tor Erlend Fjelde](http://retiredparkingguard.com/).
  8 | 
  9 | **UPDATE**: On 8th Mar 2018, we updated the code to support Python 3 (with
 10 | `futurize`). If you find any problems, please let us know. Thanks.
 11 | 
 12 | This repository contains a Python 2.7/3 implementation of the nonparametric
 13 | linear-time goodness-of-fit test described in  [our paper](https://arxiv.org/abs/1705.07673)
 14 | 
 15 |     A Linear-Time Kernel Goodness-of-Fit Test
 16 |     Wittawat Jitkrittum, Wenkai Xu, Zoltan Szabo, Kenji Fukumizu, Arthur Gretton
 17 |     NIPS 2017 (Best paper)
 18 |     https://arxiv.org/abs/1705.07673
 19 | 
 20 | ## How to install?
 21 | 
 22 | The package can be installed with the `pip` command.
 23 | 
 24 |     pip install git+https://github.com/wittawatj/kernel-gof.git
 25 | 
 26 | Once installed, you should be able to do `import kgof` without any error.
 27 | `pip` will also resolve the following dependencies automatically.
 28 | 
 29 | ## Dependency
 30 | 
 31 | The following Python packages were used during development. Ideally, the
 32 | following packages with the specified version numbers or newer should be used.
 33 | However, older versions may work as well. We did not specifically rely on
 34 | the newest features of these specified versions.
 35 | 
 36 |     autograd == 1.1.7
 37 |     matplotlib == 2.0.0
 38 |     numpy == 1.11.3
 39 |     scipy == 0.19.0
 40 | 
 41 | ## Demo
 42 | 
 43 | To get started, check
 44 | [demo_kgof.ipynb](https://github.com/wittawatj/kernel-gof/blob/master/ipynb/demo_kgof.ipynb).
 45 | This is a Jupyter notebook which will guide you through from the beginning. It
 46 | can also be viewed on the web. There are many Jupyter notebooks in `ipynb`
 47 | folder demonstrating other implemented tests. Be sure to check them if you
 48 | would like to explore.
 49 | 
 50 | ## Reproduce experimental results
 51 | 
 52 | Each experiment is defined in its own Python file with a name starting with
 53 | `exXX` where `XX` is a number. All the experiment files are in `kgof/ex`
 54 | folder. Each file is runnable with a command line argument. For example in
 55 | `ex1_vary_n.py`, we aim to check the test power of each testing algorithm
 56 | as a function of the sample size `n`. The script `ex1_vary_n.py` takes a
 57 | dataset name as its argument. See `run_ex1.sh`, a standalone Bash
 58 | script that shows how to execute `ex1_vary_n.py`.
 59 | 
 60 | We used [independent-jobs](https://github.com/karlnapf/independent-jobs)
 61 | package to parallelize our experiments over a
 62 | [Slurm](http://slurm.schedmd.com/) cluster (the package is not needed if you
 63 | just need to use our developed tests). For example, for
 64 | `ex1_vary_n.py`, a job is created for each combination of 
 65 | 
 66 |     (dataset, test algorithm, n, trial)
 67 | 
 68 | If you do not use Slurm, you can change the line 
 69 | 
 70 |     engine = SlurmComputationEngine(batch_parameters)
 71 | 
 72 | to 
 73 | 
 74 |     engine = SerialComputationEngine()
 75 | 
 76 | which will instruct the computation engine to just use a normal for-loop on a
 77 | single machine (this will take a lot of time). Other computation engines
 78 | may also be supported. See  [independent-jobs's repository
 79 | page](https://github.com/karlnapf/independent-jobs).  Running a simulation will
 80 | create a lot of result files (one for each tuple above) saved as pickle files. Also,
 81 | the `independent-jobs` package requires a scratch folder to save temporary
 82 | files for communication among computing nodes. Path to the folder containing
 83 | the saved results can be specified in `kgof/config.py` by changing the value of
 84 | `expr_results_path`:
 85 | 
 86 |     # Full path to the directory to store experimental results.
 87 |     'expr_results_path': '/full/path/to/where/you/want/to/save/results/',
 88 | 
 89 | The scratch folder needed by the `independent-jobs` package can be specified in
 90 | the same file by changing the value of `scratch_path`
 91 | 
 92 |     # Full path to the directory to store temporary files when running experiments
 93 |     'scratch_path': '/full/path/to/a/temporary/folder/',
 94 | 
 95 | To plot the results, see the experiment's corresponding Jupyter notebook in the
 96 | `ipynb/` folder. For example, for `ex1_vary_n.py` see `ipynb/ex1_results.ipynb`
 97 | to plot the results.
 98 | 
 99 | 
100 | ## Some notes
101 | 
102 | * When adding a new `Kernel` or new `UnnormalizedDensity`, use `np.dot(X, Y)`
103 |   instead of `X.dot(Y)`. `autograd` cannot differentiate the latter. Also, do
104 |   not use `x += ...`.  Use `x = x + ...` instead.
105 | 
106 | * The sub-module `kgof.intertst` depends on the linear-time two-sample test of
107 |   [Jitkrittum et al., 2016 (NIPS
108 |   2016)](http://papers.nips.cc/paper/6148-interpretable-distribution-features-with-maximum-testing-power)
109 |   implemented in  the `freqopttest` Python package which can be found
110 |   [here](https://github.com/wittawatj/interpretable-test).
111 | 
112 | 
113 | ---------------
114 | 
115 | If you have questions or comments about anything related to this work, please
116 | do not hesitate to contact [Wittawat Jitkrittum](http://wittawat.com).
117 | 


--------------------------------------------------------------------------------
/ipynb/.gitignore:
--------------------------------------------------------------------------------
1 | *.pdf
2 | 


--------------------------------------------------------------------------------
/ipynb/ex1_results.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "A notebook to process experimental results of ex1_vary_n.py. p(reject) as problem sample size increases."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {
 16 |     "collapsed": false
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "%load_ext autoreload\n",
 21 |     "%autoreload 2\n",
 22 |     "%matplotlib inline\n",
 23 |     "#%config InlineBackend.figure_format = 'svg'\n",
 24 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 25 |     "\n",
 26 |     "import numpy as np\n",
 27 |     "\n",
 28 |     "import matplotlib\n",
 29 |     "import matplotlib.pyplot as plt\n",
 30 |     "import kgof.data as data\n",
 31 |     "import kgof.glo as glo\n",
 32 |     "import kgof.goftest as gof\n",
 33 |     "import kgof.kernel as kernel\n",
 34 |     "import kgof.plot as plot\n",
 35 |     "import kgof.util as util\n",
 36 |     "\n",
 37 |     "import scipy.stats as stats"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "import kgof.plot\n",
 49 |     "kgof.plot.set_default_matplotlib_options()"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {
 56 |     "collapsed": true
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "def load_plot_vs_ns(fname, show_legend=True, xscale='linear', yscale='linear'):\n",
 61 |     "    \"\"\"\n",
 62 |     "    ns = sample sizes\n",
 63 |     "    \"\"\"\n",
 64 |     "    func_xvalues = lambda agg_results: agg_results['ns']\n",
 65 |     "    ex = 1\n",
 66 |     "    def func_title(agg_results):\n",
 67 |     "        repeats, _, n_methods = agg_results['job_results'].shape\n",
 68 |     "        alpha = agg_results['alpha']\n",
 69 |     "        title = '%s. %d trials. $\\\\alpha$ = %.2g.'%\\\n",
 70 |     "            ( agg_results['prob_label'], repeats, alpha)\n",
 71 |     "        return title\n",
 72 |     "    #plt.figure(figsize=(10,5))\n",
 73 |     "    results = plot.plot_prob_reject(\n",
 74 |     "        ex, fname, func_xvalues, '', func_title=func_title)\n",
 75 |     "    \n",
 76 |     "    plt.title('')\n",
 77 |     "    \n",
 78 |     "    if xscale is not None:\n",
 79 |     "        plt.xscale(xscale)\n",
 80 |     "    if yscale is not None:\n",
 81 |     "        plt.yscale(yscale)\n",
 82 |     "    \n",
 83 |     "    plt.xlabel('Sample size $n$')\n",
 84 |     "    plt.gca().legend(loc='best').set_visible(show_legend)\n",
 85 |     "    if show_legend:\n",
 86 |     "        plt.legend(bbox_to_anchor=(1.0, 1.05))\n",
 87 |     "        \n",
 88 |     "    plt.grid(False)\n",
 89 |     "    return results\n",
 90 |     "\n",
 91 |     "\n",
 92 |     "def load_runtime_vs_ns(fname, xlabel='Sample size $n$', \n",
 93 |     "                      show_legend=True, xscale='linear', yscale='linear'):\n",
 94 |     "    func_xvalues = lambda agg_results: agg_results['ns']\n",
 95 |     "    ex = 1\n",
 96 |     "    def func_title(agg_results):\n",
 97 |     "        repeats, _, n_methods = agg_results['job_results'].shape\n",
 98 |     "        alpha = agg_results['alpha']\n",
 99 |     "        title = '%s. %d trials. $\\\\alpha$ = %.2g.'%\\\n",
100 |     "            ( agg_results['prob_label'], repeats, alpha)\n",
101 |     "        return title\n",
102 |     "    #plt.figure(figsize=(10,6))\n",
103 |     "    \n",
104 |     "    results = plot.plot_runtime(ex, fname,  \n",
105 |     "                                func_xvalues, xlabel=xlabel, func_title=func_title)\n",
106 |     "    \n",
107 |     "    plt.title('')\n",
108 |     "    plt.gca().legend(loc='best').set_visible(show_legend)\n",
109 |     "    if show_legend:\n",
110 |     "        plt.legend(bbox_to_anchor=(1.0, 1.05))\n",
111 |     "    \n",
112 |     "    #plt.grid(True)\n",
113 |     "    if xscale is not None:\n",
114 |     "        plt.xscale(xscale)\n",
115 |     "    if yscale is not None:\n",
116 |     "        plt.yscale(yscale)\n",
117 |     "    return results\n"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "metadata": {
124 |     "collapsed": false
125 |    },
126 |    "outputs": [],
127 |    "source": [
128 |     "# # GMD\n",
129 |     "# # gmd_fname = 'ex1-gmd_p05_d10_ns-me4_rs50_nmi1000_nma4000_a0.050_trp0.50.p'\n",
130 |     "# gmd_fname = 'ex1-gmd_p03_d10_ns-me4_rs30_nmi1000_nma7000_a0.050_trp0.50.p'\n",
131 |     "# gmd_results = load_plot_vs_ns(gmd_fname, show_legend=True)\n",
132 |     "# plt.figure()\n",
133 |     "# load_runtime_vs_ns(gmd_fname);"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "$$p(x)=\\mathcal{N}([0, 0,\\ldots, 0], I) \\\\\n",
141 |     "q(x)=\\mathcal{N}([c,0,\\ldots, 0], I)$$"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {
148 |     "collapsed": false,
149 |     "scrolled": false
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "gbrbm_fname = 'ex1-gbrbm_dx50_dh10_vp1-me6_rs200_nmi1000_nma4000_a0.050_trp0.20.p'\n",
154 |     "# plt.ylim([0, 0.1])\n",
155 |     "gbrbm_results = load_plot_vs_ns(gbrbm_fname, show_legend=False)\n",
156 |     "plt.savefig(gbrbm_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": null,
162 |    "metadata": {
163 |     "collapsed": false
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "load_runtime_vs_ns(gbrbm_fname, show_legend=False);\n",
168 |     "plt.yticks([0, 100, 200, 300])\n",
169 |     "# savefig() takes no show_legend option; the legend is already hidden by the call above.\n",
170 |     "plt.savefig(gbrbm_fname.replace('.p', '_time.pdf', 1), bbox_inches='tight')"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "metadata": {
177 |     "collapsed": false,
178 |     "scrolled": false
179 |    },
180 |    "outputs": [],
181 |    "source": [
182 |     "gbrbm_h0_fname = 'ex1-gbrbm_dx50_dh10_h0-me6_rs200_nmi1000_nma4000_a0.050_trp0.20.p'\n",
183 |     "gbrbm_h0_results = load_plot_vs_ns(gbrbm_h0_fname, show_legend=False)\n",
184 |     "# plt.ylim([0.01, 0.08])\n",
185 |     "# savefig() takes no show_legend option; the legend is already hidden by the call above.\n",
186 |     "plt.savefig(gbrbm_h0_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {
193 |     "collapsed": false
194 |    },
195 |    "outputs": [],
196 |    "source": [
197 |     "load_runtime_vs_ns(gbrbm_h0_fname, show_legend=False);\n",
198 |     "plt.yticks([0, 100, 200, 300])\n",
199 |     "# savefig() takes no show_legend option; the legend is already hidden by the call above.\n",
200 |     "plt.savefig(gbrbm_h0_fname.replace('.p', '_time.pdf', 1), bbox_inches='tight')"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {
207 |     "collapsed": false
208 |    },
209 |    "outputs": [],
210 |    "source": [
211 |     "# gbrbm_highd_fname = 'ex1-gbrbm_dx50_dh40_vp1-me6_rs200_nmi1000_nma4000_a0.050_trp0.20.p'\n",
212 |     "# gbrbm_highd_fname = 'ex1-gbrbm_dx50_dh40_vp1-me1_rs200_nmi1000_nma4000_a0.050_trp0.20.p'\n",
213 |     "gbrbm_highd_fname = 'ex1-gbrbm_dx50_dh40_vp1-me2_rs200_nmi1000_nma4000_a0.050_trp0.20.p'\n",
214 |     "gbrbm_highd_results = load_plot_vs_ns(gbrbm_highd_fname, show_legend=False)\n",
215 |     "plt.yticks([0, 0.25, 0.5, 0.75])\n",
216 |     "plt.ylim([0, 0.75])\n",
217 |     "plt.ylabel('P(detect difference)', fontsize=26)\n",
218 |     "\n",
219 |     "# savefig() takes no show_legend option; the legend is already hidden by the call above.\n",
220 |     "plt.savefig(gbrbm_highd_fname.replace('.p', '.pdf', 1), bbox_inches='tight')\n",
221 |     "\n"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {
228 |     "collapsed": false
229 |    },
230 |    "outputs": [],
231 |    "source": [
232 |     "load_runtime_vs_ns(gbrbm_highd_fname, show_legend=False);\n",
233 |     "plt.yticks([0, 100, 200, 300])\n",
234 |     "plt.ylim([0, 300])\n",
235 |     "# savefig() takes no show_legend option; the legend is already hidden by the call above.\n",
236 |     "plt.savefig(gbrbm_highd_fname.replace('.p', '_time.pdf', 1), bbox_inches='tight')"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {
243 |     "collapsed": false
244 |    },
245 |    "outputs": [],
246 |    "source": [
247 |     "gbrbm_highd_h0_fname = 'ex1-gbrbm_dx50_dh40_h0-me6_rs200_nmi1000_nma4000_a0.050_trp0.20.p'\n",
248 |     "gbrbm_highd_h0_results = load_plot_vs_ns(gbrbm_highd_h0_fname, show_legend=False)\n",
249 |     "# Save under the high-dimensional H0 file name so the gbrbm_h0 figure is not overwritten.\n",
250 |     "plt.savefig(gbrbm_highd_h0_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": null,
256 |    "metadata": {
257 |     "collapsed": false
258 |    },
259 |    "outputs": [],
260 |    "source": [
261 |     "# gbrbm_fname = 'ex1-gbrbm_dx50_dh10_vp1-me4_rs30_nmi1000_nma5000_a0.050_trp0.50.p'\n",
262 |     "# gbrbm_results = load_plot_vs_ns(gbrbm_fname)\n",
263 |     "# plt.figure()\n",
264 |     "# load_runtime_vs_ns(gbrbm_fname);"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {
271 |     "collapsed": true
272 |    },
273 |    "outputs": [],
274 |    "source": []
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": null,
279 |    "metadata": {
280 |     "collapsed": true
281 |    },
282 |    "outputs": [],
283 |    "source": []
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": null,
288 |    "metadata": {
289 |     "collapsed": true
290 |    },
291 |    "outputs": [],
292 |    "source": []
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": null,
297 |    "metadata": {
298 |     "collapsed": true
299 |    },
300 |    "outputs": [],
301 |    "source": []
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": null,
306 |    "metadata": {
307 |     "collapsed": true
308 |    },
309 |    "outputs": [],
310 |    "source": []
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "metadata": {
316 |     "collapsed": true
317 |    },
318 |    "outputs": [],
319 |    "source": []
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": null,
324 |    "metadata": {
325 |     "collapsed": true
326 |    },
327 |    "outputs": [],
328 |    "source": []
329 |   }
330 |  ],
331 |  "metadata": {
332 |   "anaconda-cloud": {},
333 |   "kernelspec": {
334 |    "display_name": "Python [Root]",
335 |    "language": "python",
336 |    "name": "Python [Root]"
337 |   },
338 |   "language_info": {
339 |    "codemirror_mode": {
340 |     "name": "ipython",
341 |     "version": 2
342 |    },
343 |    "file_extension": ".py",
344 |    "mimetype": "text/x-python",
345 |    "name": "python",
346 |    "nbconvert_exporter": "python",
347 |    "pygments_lexer": "ipython2",
348 |    "version": "2.7.12"
349 |   }
350 |  },
351 |  "nbformat": 4,
352 |  "nbformat_minor": 0
353 | }
354 | 


--------------------------------------------------------------------------------
/ipynb/ex2_results.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "A notebook to process experimental results of ex2_prob_params.py. p(reject) as problem parameters are varied."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {
 16 |     "collapsed": false
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "%load_ext autoreload\n",
 21 |     "%autoreload 2\n",
 22 |     "%matplotlib inline\n",
 23 |     "#%config InlineBackend.figure_format = 'svg'\n",
 24 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 25 |     "\n",
 26 |     "import numpy as np\n",
 27 |     "\n",
 28 |     "import matplotlib\n",
 29 |     "import matplotlib.pyplot as plt\n",
 30 |     "import kgof.data as data\n",
 31 |     "import kgof.glo as glo\n",
 32 |     "import kgof.goftest as gof\n",
 33 |     "import kgof.kernel as kernel\n",
 34 |     "import kgof.plot as plot\n",
 35 |     "import kgof.util as util\n",
 36 |     "\n",
 37 |     "import scipy.stats as stats"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {
 44 |     "collapsed": false
 45 |    },
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "import kgof.plot\n",
 49 |     "kgof.plot.set_default_matplotlib_options()"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {
 56 |     "collapsed": true
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "def load_plot_vs_params(fname, xlabel='Problem parameter', show_legend=True):\n",
 61 |     "    func_xvalues = lambda agg_results: agg_results['prob_params']\n",
 62 |     "    ex = 2\n",
 63 |     "    def func_title(agg_results):\n",
 64 |     "        repeats, _, n_methods = agg_results['job_results'].shape\n",
 65 |     "        alpha = agg_results['alpha']\n",
 66 |     "        test_size = (1.0 - agg_results['tr_proportion'])*agg_results['sample_size']\n",
 67 |     "        title = '%s. %d trials. test size: %d. $\\\\alpha$ = %.2g.'%\\\n",
 68 |     "            ( agg_results['prob_label'], repeats, test_size, alpha)\n",
 69 |     "        return title\n",
 70 |     "    #plt.figure(figsize=(10,5))\n",
 71 |     "    results = plot.plot_prob_reject(\n",
 72 |     "        ex, fname, func_xvalues, xlabel, func_title=func_title)\n",
 73 |     "    \n",
 74 |     "    plt.title('')\n",
 75 |     "    plt.gca().legend(loc='best').set_visible(show_legend)\n",
 76 |     "    if show_legend:\n",
 77 |     "        plt.legend(bbox_to_anchor=(1.80, 1.08))\n",
 78 |     "    \n",
 79 |     "    plt.grid(False)\n",
 80 |     "        \n",
 81 |     "    return results\n",
 82 |     "\n",
 83 |     "\n",
 84 |     "def load_runtime_vs_params(fname, xlabel='Problem parameter', \n",
 85 |     "                      show_legend=True, xscale='linear', yscale='linear'):\n",
 86 |     "    func_xvalues = lambda agg_results: agg_results['prob_params']\n",
 87 |     "    ex = 2\n",
 88 |     "    def func_title(agg_results):\n",
 89 |     "        repeats, _, n_methods = agg_results['job_results'].shape\n",
 90 |     "        alpha = agg_results['alpha']\n",
 91 |     "        title = '%s. %d trials. $\\\\alpha$ = %.2g.'%\\\n",
 92 |     "            ( agg_results['prob_label'], repeats, alpha)\n",
 93 |     "        return title\n",
 94 |     "    \n",
 95 |     "    #plt.figure(figsize=(10,6))\n",
 96 |     "    \n",
 97 |     "    results = plot.plot_runtime(ex, fname,  \n",
 98 |     "                                func_xvalues, xlabel=xlabel, func_title=func_title)\n",
 99 |     "    \n",
100 |     "    plt.title('')\n",
101 |     "    plt.gca().legend(loc='best').set_visible(show_legend)\n",
102 |     "    if show_legend:\n",
103 |     "        plt.legend(bbox_to_anchor=(1.80, 1.05))\n",
104 |     "    \n",
105 |     "    plt.grid(False)\n",
106 |     "    if xscale is not None:\n",
107 |     "        plt.xscale(xscale)\n",
108 |     "    if yscale is not None:\n",
109 |     "        plt.yscale(yscale)\n",
110 |     "        \n",
111 |     "    return results\n"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {
118 |     "collapsed": false
119 |    },
120 |    "outputs": [],
121 |    "source": [
122 |     "# # Gaussian mean difference. Fix dimension. Vary the mean\n",
123 |     "# #gmd_fname = 'ex2-gmd_d10_ms-me5_n1000_rs100_pmi0.000_pma0.600_a0.050_trp0.50.p'\n",
124 |     "# gmd_fname = 'ex2-gmd_d10_ms-me4_n2000_rs50_pmi0.000_pma0.060_a0.050_trp0.50.p'\n",
125 |     "# gmd_results = load_plot_vs_params(gmd_fname, xlabel='$m$', show_legend=True)\n",
126 |     "# #plt.ylim([0.03, 0.1])\n",
127 |     "# #plt.savefig(bsg_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "$$p(x) = \\mathcal{N}(0, I) \\\\\n",
135 |     "q(x) = \\mathcal{N}((m,0,\\ldots), I)$$"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {
142 |     "collapsed": true
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "# # Gaussian increasing variance. Variance below 1.\n",
147 |     "# gvsub1_d1_fname = 'ex2-gvsub1_d1_vs-me8_n1000_rs100_pmi0.100_pma0.700_a0.050_trp0.50.p'\n",
148 |     "# gvsub1_d1_results = load_plot_vs_params(gvsub1_d1_fname, xlabel='$v$')\n",
149 |     "# plt.title('d=1')\n",
150 |     "# # plt.ylim([0.02, 0.08])\n",
151 |     "# # plt.xlim([0, 4])\n",
152 |     "# #plt.legend(bbox_to_anchor=(1.70, 1.05))\n",
153 |     "# #plt.savefig(gsign_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {
160 |     "collapsed": false,
161 |     "scrolled": true
162 |    },
163 |    "outputs": [],
164 |    "source": [
165 |     "# # Gaussian increasing variance\n",
166 |     "# gvinc_d5_fname = 'ex2-gvinc_d5-me8_n1000_rs100_pmi1.000_pma2.500_a0.050_trp0.50.p'\n",
167 |     "# gvinc_d5_results = load_plot_vs_params(gvinc_d5_fname, xlabel='$v$', \n",
168 |     "#     show_legend=True)\n",
169 |     "# plt.title('d=5')\n",
170 |     "# # plt.ylim([0.02, 0.08])\n",
171 |     "# # plt.xlim([0, 4])\n",
172 |     "# #plt.legend(bbox_to_anchor=(1.70, 1.05))\n",
173 |     "# #plt.savefig(gsign_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "$$p(x)=\\mathcal{N}(0, I) \\\\\n",
181 |     "q(x)=\\mathcal{N}(0, vI)$$"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {
188 |     "collapsed": false,
189 |     "scrolled": false
190 |    },
191 |    "outputs": [],
192 |    "source": [
193 |     "# # Gaussian variance difference (GVD)\n",
194 |     "# gvd_fname = 'ex2-gvd-me4_n1000_rs100_pmi1.000_pma15.000_a0.050_trp0.50.p'\n",
195 |     "# # gvd_fname = 'ex2-gvd-me4_n1000_rs50_pmi1.000_pma15.000_a0.050_trp0.80.p'\n",
196 |     "# gvd_results = load_plot_vs_params(gvd_fname, xlabel='$d$', show_legend=True)\n",
197 |     "# plt.figure()\n",
198 |     "# load_runtime_vs_params(gvd_fname);\n"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "$$p(x)=\\mathcal{N}(0, I) \\\\\n",
206 |     "q(x)=\\mathcal{N}(0, \\mathrm{diag}(2,1,1,\\ldots))$$"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {
213 |     "collapsed": false,
214 |     "scrolled": false
215 |    },
216 |    "outputs": [],
217 |    "source": [
218 |     "# Gauss-Bernoulli RBM\n",
219 |     "# gb_rbm_fname = 'ex2-gbrbm_dx50_dh10-me4_n1000_rs200_pmi0.000_pma0.001_a0.050_trp0.20.p'\n",
220 |     "# gb_rbm_fname = 'ex2-gbrbm_dx50_dh10-me4_n1000_rs200_pmi0.000_pma0.000_a0.050_trp0.20.p'\n",
221 |     "# gb_rbm_fname = 'ex2-gbrbm_dx50_dh10-me6_n1000_rs300_pmi0.000_pma0.001_a0.050_trp0.20.p'\n",
222 |     "gb_rbm_fname = 'ex2-gbrbm_dx50_dh10-me6_n1000_rs200_pmi0_pma0.06_a0.050_trp0.20.p'\n",
223 |     "gb_rbm_results = load_plot_vs_params(gb_rbm_fname, xlabel='Perturbation SD $\\sigma_{per}$', \n",
224 |     "                                     show_legend=False)\n",
225 |     "plt.savefig(gb_rbm_fname.replace('.p', '.pdf', 1), bbox_inches='tight')\n",
226 |     "# plt.xlim([-0.1, -0.2])"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {
233 |     "collapsed": false,
234 |     "scrolled": true
235 |    },
236 |    "outputs": [],
237 |    "source": [
238 |     "load_runtime_vs_params(gb_rbm_fname,  xlabel='Perturbation SD $\\sigma_{per}$', yscale='linear', show_legend=False);\n",
239 |     "plt.savefig(gb_rbm_fname.replace('.p', '_time.pdf', 1), bbox_inches='tight')"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": null,
245 |    "metadata": {
246 |     "collapsed": false
247 |    },
248 |    "outputs": [],
249 |    "source": [
250 |     "# gbrbm_highd_fname = 'ex2-gbrbm_dx50_dh40-me6_n1000_rs200_pmi0_pma0.06_a0.050_trp0.20.p'\n",
251 |     "# gbrbm_highd_fname = 'ex2-gbrbm_dx50_dh40-me2_n1000_rs200_pmi0_pma0.06_a0.050_trp0.20.p'\n",
252 |     "gbrbm_highd_fname = 'ex2-gbrbm_dx50_dh40-me1_n1000_rs200_pmi0_pma0.06_a0.050_trp0.20.p'\n",
253 |     "gbrbm_highd_results = load_plot_vs_params(\n",
254 |     "    gbrbm_highd_fname, \n",
255 |     "#     xlabel='Perturbation SD $\\sigma_{per}$', \n",
256 |     "    xlabel='Perturbation noise', \n",
257 |     "                                          show_legend=False)\n",
258 |     "plt.xticks([0, 0.02, 0.04, 0.06])\n",
259 |     "plt.yticks([0, 0.5, 1])\n",
260 |     "plt.ylim([0, 1.05])\n",
261 |     "plt.ylabel('P(detect difference)', fontsize=26)\n",
262 |     "plt.box(True)\n",
263 |     "plt.savefig(gbrbm_highd_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {
270 |     "collapsed": false
271 |    },
272 |    "outputs": [],
273 |    "source": [
274 |     "load_runtime_vs_params(gbrbm_highd_fname,  xlabel='Perturbation SD $\\sigma_{per}$', \n",
275 |     "                       yscale='linear', show_legend=False);\n",
276 |     "plt.savefig(gbrbm_highd_fname.replace('.p', '_time.pdf', 1), bbox_inches='tight')"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": null,
282 |    "metadata": {
283 |     "collapsed": false,
284 |     "scrolled": false
285 |    },
286 |    "outputs": [],
287 |    "source": [
288 |     "## p: Gaussian, q: Laplace. Vary d\n",
289 |     "# glaplace_fname = 'ex2-glaplace-me4_n1000_rs100_pmi1.000_pma15.000_a0.050_trp0.50.p'\n",
290 |     "# glaplace_fname = 'ex2-glaplace-me4_n1000_rs200_pmi1.000_pma15.000_a0.050_trp0.20.p'\n",
291 |     "# glaplace_fname = 'ex2-glaplace-me5_n1000_rs400_pmi1.000_pma15.000_a0.050_trp0.20.p'\n",
292 |     "glaplace_fname = 'ex2-glaplace-me6_n1000_rs200_pmi1_pma15_a0.050_trp0.20.p'\n",
293 |     "glaplace_results = load_plot_vs_params(glaplace_fname, xlabel='dimension $d$', show_legend=False)\n",
294 |     "plt.savefig(glaplace_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": null,
300 |    "metadata": {
301 |     "collapsed": false
302 |    },
303 |    "outputs": [],
304 |    "source": [
305 |     "load_runtime_vs_params(glaplace_fname, xlabel='dimension $d$', show_legend=False, yscale='linear');\n",
306 |     "plt.savefig(glaplace_fname.replace('.p', '_time.pdf', 1), bbox_inches='tight')"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "markdown",
311 |    "metadata": {},
312 |    "source": [
313 |     "$$p(x)=\\mathcal{N}(0, 1) \\\\\n",
314 |     "q(x)=\\mathrm{Laplace}(0, 1/\\sqrt{2})$$\n",
315 |     "\n",
316 |     "q has the same unit variance as p."
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": null,
322 |    "metadata": {
323 |     "collapsed": false
324 |    },
325 |    "outputs": [],
326 |    "source": []
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": null,
331 |    "metadata": {
332 |     "collapsed": true
333 |    },
334 |    "outputs": [],
335 |    "source": []
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": null,
340 |    "metadata": {
341 |     "collapsed": true
342 |    },
343 |    "outputs": [],
344 |    "source": []
345 |   }
346 |  ],
347 |  "metadata": {
348 |   "kernelspec": {
349 |    "display_name": "Python 2",
350 |    "language": "python",
351 |    "name": "python2"
352 |   },
353 |   "language_info": {
354 |    "codemirror_mode": {
355 |     "name": "ipython",
356 |     "version": 2
357 |    },
358 |    "file_extension": ".py",
359 |    "mimetype": "text/x-python",
360 |    "name": "python",
361 |    "nbconvert_exporter": "python",
362 |    "pygments_lexer": "ipython2",
363 |    "version": "2.7.13"
364 |   }
365 |  },
366 |  "nbformat": 4,
367 |  "nbformat_minor": 0
368 | }
369 | 


--------------------------------------------------------------------------------
/ipynb/ex3_results.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "A notebook to process experimental results of ex3_vary_nlocs.py. p(reject) as the number of test locations J is varied."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "%load_ext autoreload\n",
 19 |     "%autoreload 2\n",
 20 |     "%matplotlib inline\n",
 21 |     "#%config InlineBackend.figure_format = 'svg'\n",
 22 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 23 |     "\n",
 24 |     "import numpy as np\n",
 25 |     "\n",
 26 |     "import matplotlib\n",
 27 |     "import matplotlib.pyplot as plt\n",
 28 |     "import kgof.data as data\n",
 29 |     "import kgof.glo as glo\n",
 30 |     "import kgof.goftest as gof\n",
 31 |     "import kgof.kernel as kernel\n",
 32 |     "import kgof.plot as plot\n",
 33 |     "import kgof.util as util\n",
 34 |     "\n",
 35 |     "import scipy.stats as stats"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "import kgof.plot\n",
 45 |     "kgof.plot.set_default_matplotlib_options()"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {
 52 |     "scrolled": false
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "# np.random.seed(0)\n",
 57 |     "# x = np.linspace(-5., 5., 50)\n",
 58 |     "# y = 3 * np.exp(-0.5 * (x - 1.3)**2 / 0.8**2)\n",
 59 |     "# y += np.random.normal(0., 0.2, x.shape)\n",
 60 |     "# f = plt.figure(0)\n",
 61 |     "# plt.plot(x,y, 'b-')\n",
 62 |     "# plt.xlabel(r'$\\alpha$')\n",
 63 |     "# plt.ylabel('Test power')\n",
 64 |     "# f.savefig('test.pdf', bbox_inches='tight')"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {
 71 |     "collapsed": true
 72 |    },
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "def load_plot_vs_Js(fname, show_legend=True, xscale='log', yscale='linear'):\n",
 76 |     "    \"\"\"\n",
 77 |     "    J = number of test locations.\n",
 78 |     "    \"\"\"\n",
 79 |     "    func_xvalues = lambda agg_results: agg_results['Js']\n",
 80 |     "    ex = 3\n",
 81 |     "    def func_title(agg_results):\n",
 82 |     "        repeats, _, n_methods = agg_results['job_results'].shape\n",
 83 |     "        alpha = agg_results['alpha']\n",
 84 |     "        test_size = (1.0 - agg_results['tr_proportion'])*agg_results['sample_size']\n",
 85 |     "        title = '%s. %d trials. test size: %d. $\\\\alpha$ = %.2g.'%\\\n",
 86 |     "            ( agg_results['prob_label'], repeats, test_size, alpha)\n",
 87 |     "        return title\n",
 88 |     "    #plt.figure(figsize=(10,5))\n",
 89 |     "    results = plot.plot_prob_reject(\n",
 90 |     "        ex, fname, func_xvalues, '', func_title=func_title)\n",
 91 |     "    \n",
 92 |     "    plt.title('')\n",
 93 |     "    \n",
 94 |     "    if xscale is not None:\n",
 95 |     "        plt.xscale(xscale)\n",
 96 |     "    if yscale is not None:\n",
 97 |     "        plt.yscale(yscale)\n",
 98 |     "    \n",
 99 |     "    plt.xlabel('$J$')\n",
100 |     "    plt.gca().legend(loc='best').set_visible(show_legend)\n",
101 |     "    if show_legend:\n",
102 |     "        plt.legend(bbox_to_anchor=(1.70, 1.05))\n",
103 |     "        \n",
104 |     "    plt.grid(False)\n",
105 |     "        \n",
106 |     "    return results\n",
107 |     "\n",
108 |     "\n",
109 |     "def load_runtime_vs_Js(fname, xlabel='$J$ parameter', \n",
110 |     "                      show_legend=True, xscale='linear', yscale='linear'):\n",
111 |     "    func_xvalues = lambda agg_results: agg_results['Js']\n",
112 |     "    ex = 3\n",
113 |     "    def func_title(agg_results):\n",
114 |     "        repeats, _, n_methods = agg_results['job_results'].shape\n",
115 |     "        alpha = agg_results['alpha']\n",
116 |     "        title = '%s. %d trials. $\\\\alpha$ = %.2g.'%\\\n",
117 |     "            ( agg_results['prob_label'], repeats, alpha)\n",
118 |     "        return title\n",
119 |     "    \n",
120 |     "    #plt.figure(figsize=(10,6))\n",
121 |     "    \n",
122 |     "    results = plot.plot_runtime(ex, fname,  \n",
123 |     "                                func_xvalues, xlabel=xlabel, func_title=func_title)\n",
124 |     "    \n",
125 |     "    plt.title('')\n",
126 |     "    plt.gca().legend(loc='best').set_visible(show_legend)\n",
127 |     "    if show_legend:\n",
128 |     "        plt.legend(bbox_to_anchor=(1.70, 1.05))\n",
129 |     "    \n",
130 |     "    plt.grid(False)\n",
131 |     "    if xscale is not None:\n",
132 |     "        plt.xscale(xscale)\n",
133 |     "    if yscale is not None:\n",
134 |     "        plt.yscale(yscale)\n",
135 |     "        \n",
136 |     "    return results\n"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "# GMD\n",
146 |     "# gmd_fname = 'ex3-gmd1-me2_n500_rs100_Jmi2_Jma32_a0.050_trp0.50.p'\n",
147 |     "# gmd_results = load_plot_vs_Js(gmd_fname, show_legend=True)"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {
154 |     "scrolled": false
155 |    },
156 |    "outputs": [],
157 |    "source": [
158 |     "# p: normal, q: Gaussian mixture\n",
159 |     "# g_vs_gmm_fname = 'ex3-g_vs_gmm_d5-me2_n500_rs50_Jmi2_Jma384_a0.050_trp0.50.p'\n",
160 |     "# g_vs_gmm_fname = 'ex3-g_vs_gmm_d5-me2_n500_rs100_Jmi2_Jma384_a0.050_trp0.50.p'\n",
161 |     "# g_vs_gmm_fname = 'ex3-g_vs_gmm_d2-me2_n500_rs50_Jmi2_Jma384_a0.050_trp0.50.p'\n",
162 |     "# g_vs_gmm_fname = 'ex3-g_vs_gmm_d1-me2_n500_rs50_Jmi2_Jma384_a0.050_trp0.50.p'\n",
163 |     "g_vs_gmm_fname = 'ex3-g_vs_gmm_d1-me2_n500_rs200_Jmi2_Jma384_a0.050_trp0.50.p'\n",
164 |     "# g_vs_gmm_fname = 'ex3-g_vs_gmm_d1-me2_n800_rs50_Jmi2_Jma384_a0.050_trp0.50.p'\n",
165 |     "g_vs_gmm_results = load_plot_vs_Js(g_vs_gmm_fname, show_legend=False)\n",
166 |     "plt.xticks([1, 10, 1e2, 1e3])\n",
167 |     "plt.savefig(g_vs_gmm_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "# Gaussian mixture\n",
177 |     "# gmm_fname = 'ex3-gmm_d1-me2_n500_rs100_Jmi2_Jma32_a0.050_trp0.50.p'\n",
178 |     "# gmm_results = load_plot_vs_Js(gmm_fname)"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": null,
184 |    "metadata": {
185 |     "scrolled": true
186 |    },
187 |    "outputs": [],
188 |    "source": [
189 |     "# Same Gaussian\n",
190 |     "sg5_fname = \"ex3-sg5-me2_n500_rs100_Jmi2_Jma384_a0.050_trp0.50.p\"\n",
191 |     "sg5_results = load_plot_vs_Js(sg5_fname, show_legend=False)\n",
192 |     "plt.ylim([0, 0.05])\n",
193 |     "plt.xticks([1, 10, 1e2, 1e3])\n",
194 |     "plt.savefig(sg5_fname.replace('.p', '.pdf', 1), bbox_inches='tight')"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "markdown",
199 |    "metadata": {},
200 |    "source": [
201 |     "$$p(x)=\\mathcal{N}(0, I) \\\\\n",
202 |     "q(x)=\\mathcal{N}(0, I)$$"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {
209 |     "scrolled": false
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "# Gaussian variance difference. \n",
214 |     "gvd5_fname = 'ex3-gvd5-me2_n500_rs100_Jmi2_Jma384_a0.050_trp0.50.p'\n",
215 |     "gvd5_results = load_plot_vs_Js(gvd5_fname, show_legend=True)\n",
216 |     "plt.legend(bbox_to_anchor=(1.8, 1.05))\n",
217 |     "plt.xticks([1, 10, 1e2, 1e3])\n",
218 |     "plt.savefig(gvd5_fname.replace('.p', '.pdf', 1), bbox_inches='tight')\n",
219 |     "# plt.legend(ncol=2)\n",
220 |     "#plt.ylim([0.03, 0.1])\n"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "$$p(x)=\\mathcal{N}(0, I) \\\\\n",
228 |     "q(x)=\\mathcal{N}(0, \\mathrm{diag}(2,1,1,\\ldots))$$"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "# Gauss-Bernoulli RBM. H1 case\n",
238 |     "# rbm_h1_fname = 'ex3-gbrbm_dx5_dh3_v5em3-me2_n500_rs100_Jmi2_Jma384_a0.050_trp0.50.p'\n",
239 |     "# rbm_h1_results = load_plot_vs_Js(rbm_h1_fname, show_legend=True)"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": null,
245 |    "metadata": {
246 |     "collapsed": true
247 |    },
248 |    "outputs": [],
249 |    "source": []
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {
255 |     "collapsed": true
256 |    },
257 |    "outputs": [],
258 |    "source": []
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "metadata": {
264 |     "collapsed": true
265 |    },
266 |    "outputs": [],
267 |    "source": []
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {
273 |     "collapsed": true
274 |    },
275 |    "outputs": [],
276 |    "source": []
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": null,
281 |    "metadata": {
282 |     "collapsed": true
283 |    },
284 |    "outputs": [],
285 |    "source": []
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "metadata": {
291 |     "collapsed": true
292 |    },
293 |    "outputs": [],
294 |    "source": []
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {
300 |     "collapsed": true
301 |    },
302 |    "outputs": [],
303 |    "source": []
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": null,
308 |    "metadata": {},
309 |    "outputs": [],
310 |    "source": []
311 |   }
312 |  ],
313 |  "metadata": {
314 |   "kernelspec": {
315 |    "display_name": "Python 3",
316 |    "language": "python",
317 |    "name": "python3"
318 |   },
319 |   "language_info": {
320 |    "codemirror_mode": {
321 |     "name": "ipython",
322 |     "version": 3
323 |    },
324 |    "file_extension": ".py",
325 |    "mimetype": "text/x-python",
326 |    "name": "python",
327 |    "nbconvert_exporter": "python",
328 |    "pygments_lexer": "ipython3",
329 |    "version": "3.6.3"
330 |   }
331 |  },
332 |  "nbformat": 4,
333 |  "nbformat_minor": 1
334 | }
335 | 


--------------------------------------------------------------------------------
/ipynb/fssd_locs_surface.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "Plot the Stein witness function, and the mean/std objective as a function of the test locations."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "%load_ext autoreload\n",
 19 |     "%autoreload 2\n",
 20 |     "%matplotlib inline\n",
 21 |     "#%config InlineBackend.figure_format = 'svg'\n",
 22 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 23 |     "\n",
 24 |     "import kgof\n",
 25 |     "import kgof.data as data\n",
 26 |     "import kgof.density as density\n",
 27 |     "import kgof.goftest as gof\n",
 28 |     "import kgof.kernel as kernel\n",
 29 |     "import kgof.util as util\n",
 30 |     "import matplotlib\n",
 31 |     "import matplotlib.pyplot as plt\n",
 32 |     "import autograd.numpy as np\n",
 33 |     "import scipy.stats as stats"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "# # font options\n",
 43 |     "# font = {\n",
 44 |     "#     #'family' : 'normal',\n",
 45 |     "#     #'weight' : 'bold',\n",
 46 |     "#     'size'   : 18\n",
 47 |     "# }\n",
 48 |     "\n",
 49 |     "# plt.rc('font', **font)\n",
 50 |     "# plt.rc('lines', linewidth=2)\n",
 51 |     "# matplotlib.rcParams['pdf.fonttype'] = 42\n",
 52 |     "# matplotlib.rcParams['ps.fonttype'] = 42\n",
 53 |     "\n",
 54 |     "import kgof.plot\n",
 55 |     "kgof.plot.set_default_matplotlib_options()"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "## Surface plots in 2d"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "def generic_contourf(p, dat, k, func):\n",
 72 |     "    \"\"\"\n",
 73 |     "    func: (p, dat, k, V) |-> value. A function computing the values to plot.\n",
 74 |     "    \"\"\"\n",
 75 |     "    # should be an n x 2 matrix. 2d data.\n",
 76 |     "    X = dat.data()\n",
 77 |     "    max0, max1 = np.max(X, 0)\n",
 78 |     "    min0, min1 = np.min(X, 0)\n",
 79 |     "    \n",
 80 |     "    #sd1, sd2 = np.std(XY, 0)\n",
 81 |     "    margin_scale = 0.2\n",
 82 |     "    sd0, sd1 = ((max0-min0)*margin_scale, (max1-min1)*margin_scale)\n",
 83 |     "    # form a test location grid to try \n",
 84 |     "    nd0 = 50\n",
 85 |     "    nd1 = 50\n",
 86 |     "    loc0_cands = np.linspace(min0-sd0/2, max0+sd0/2, nd0)\n",
 87 |     "    loc1_cands = np.linspace(min1-sd1/2, max1+sd1/2, nd1)\n",
 88 |     "    lloc0, lloc1 = np.meshgrid(loc0_cands, loc1_cands)\n",
 89 |     "    # nd1 x nd0 x 2\n",
 90 |     "    loc3d = np.dstack((lloc0, lloc1))\n",
 91 |     "    # #candidates x 2\n",
 92 |     "    all_loc2s = np.reshape(loc3d, (-1, 2) )\n",
 93 |     "\n",
 94 |     "    # all_locs = #candidates x J x 2\n",
 95 |     "    #all_locs = np.array( [np.vstack((c, loc1)) for c in all_loc2s] )\n",
 96 |     "    \n",
 97 |     "    # evaluate the function on each candidate T on the grid. Size = (#candidates, )\n",
 98 |     "    stat_grid = np.array([func(p, dat, k, np.array([T])) for T in all_loc2s])\n",
 99 |     "    stat_grid = np.reshape(stat_grid, (nd1, nd0) )\n",
100 |     "\n",
101 |     "    den_grid = np.exp(p.log_normalized_den(all_loc2s))\n",
102 |     "    den_grid = np.reshape(den_grid, (nd1, nd0))\n",
103 |     "    #ax = fig.gca(projection='3d')\n",
104 |     "    #ax.plot_surface(lloc1, lloc2, stat_grid, rstride=8, cstride=8, alpha=0.3)\n",
105 |     "    #cset = ax.contourf(lloc1, lloc2, stat_grid, zdir='z', offset=0, cmap=cm.coolwarm)\n",
106 |     "    plt.figure(figsize=(10, 6))\n",
107 |     "    # Plot contours of the model density p (exp of p.log_normalized_den)\n",
108 |     "    CS = plt.contour(\n",
109 |     "        lloc0, lloc1, den_grid, alpha=0.9, \n",
110 |     "        #colors=('#500000', '#900000', '#d00000'),\n",
111 |     "        cmap=plt.cm.Reds,\n",
112 |     "        \n",
113 |     "    )\n",
114 |     "    #plt.clabel(CS, fontsize=12, inline=1, fmt='%1.1f', colors='k')\n",
115 |     "    plt.contourf(lloc0, lloc1, stat_grid, cmap=plt.cm.Greys, alpha=0.7)\n",
116 |     "    \n",
117 |     "    #plt.gca().get_xaxis().set_visible(False)\n",
118 |     "    #plt.gca().get_yaxis().set_visible(False)\n",
119 |     "    #plt.axis('off')\n",
120 |     "    #plt.colorbar()\n",
121 |     "\n",
122 |     "    max_stat = np.max(stat_grid)\n",
123 |     "   \n",
124 |     "    n = X.shape[0]\n",
125 |     "    #ax.view_init(elev=max_stat*2, azim=90)\n",
126 |     "\n",
127 |     "    # plot the data\n",
128 |     "    plt.plot(X[:, 0], X[:, 1], '.b', markeredgecolor='b', markersize=3, alpha=0.7)\n",
129 |     "    ax = plt.gca()\n",
130 |     "#     ax.set_aspect('auto')\n",
131 |     "    plt.xlim([-4, 4]);\n",
132 |     "    plt.ylim([-4, 4]);\n",
133 |     "    \n",
134 |     "    # return the locations V\n",
135 |     "    \n",
136 |     "    max_ind = np.argmax(stat_grid.reshape(-1))\n",
137 |     "    V = all_loc2s[max_ind]\n",
138 |     "    print('V: %s'%V)\n",
139 |     "    \n",
140 |     "    # put a star at the highest location\n",
141 |     "    plt.plot(V[0], V[1], '*', color='#EC008C', markersize=30)\n",
142 |     "    return V\n",
143 |     " \n",
144 |     "def func_fssd(p, dat, k, V):\n",
145 |     "    \"\"\"\n",
146 |     "    Return the value of FSSD test statistic.\n",
147 |     "    \"\"\"\n",
148 |     "    fssd = gof.FSSD(p, k, V, alpha=0.01, null_sim=None)\n",
149 |     "    return fssd.compute_stat(dat)\n",
150 |     "\n",
151 |     "def func_fssd_power_criterion(p, dat, k, V):\n",
152 |     "    \"\"\"\n",
153 |     "    Return the value of the power criterion of FSSD.\n",
154 |     "    \"\"\"\n",
155 |     "    return gof.FSSD.power_criterion(p, dat, k, V, reg=1e-6, use_unbiased=False)\n",
156 |     "    \n",
157 |     "def func_fssd_ustat_std(p, dat, k, V):\n",
158 |     "    \"\"\"\n",
159 |     "    Return the standard deviation of the U-statistic\n",
160 |     "    \"\"\"\n",
161 |     "    fssd = gof.FSSD(p, k, V, alpha=0.01, null_sim=None)\n",
162 |     "    X = dat.data()\n",
163 |     "    fea_tensor = fssd.feature_tensor(X)\n",
164 |     "    _, variance = gof.FSSD.ustat_h1_mean_variance(fea_tensor, return_variance=True)\n",
165 |     "    return np.sqrt(variance)"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "# sample\n",
175 |     "n = 2000\n",
176 |     "# true p\n",
177 |     "seed = 20\n",
178 |     "d = 2\n",
179 |     "mean = np.zeros(d)\n",
180 |     "variance = 1\n",
181 |     "isonorm = density.IsotropicNormal(mean, variance)\n",
182 |     "#------------------------------\n",
183 |     "\n",
184 |     "# only one dimension of the mean is shifted\n",
185 |     "#draw_mean = mean + np.hstack((1, np.zeros(d-1)))\n",
186 |     "draw_mean = mean + 0\n",
187 |     "draw_variance = np.diag([variance+1, variance])\n",
188 |     "X = util.randn(n, d, seed=seed+3).dot(np.sqrt(draw_variance)) + draw_mean\n",
189 |     "dat = data.Data(X)\n",
190 |     "\n",
191 |     "\n",
192 |     "# Scaling of 1/sqrt(2) will make the variance 1.\n",
193 |     "ds_laplace = data.DSLaplace(d=d, loc=0, scale=1.0/np.sqrt(2))\n",
194 |     "dat = ds_laplace.sample(n, seed=seed)\n",
195 |     "\n",
196 |     "# problem_name = 'pgauss_qgauss'\n",
197 |     "problem_name = 'pgauss_qlaplace'"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": null,
203 |    "metadata": {
204 |     "scrolled": false
205 |    },
206 |    "outputs": [],
207 |    "source": [
208 |     "# Kernel\n",
209 |     "X = dat.data()\n",
210 |     "sig2 = util.meddistance(X, subsample=1000)**2\n",
211 |     "k = kernel.KGauss(sig2/2)\n",
212 |     "\n",
213 |     "# Test\n",
214 |     "J = 1\n",
215 |     "alpha = 0.01\n",
216 |     "\n",
217 |     "# random test locations\n",
218 |     "V = util.fit_gaussian_draw(X, J, seed=seed+1)\n",
219 |     "null_sim = gof.FSSDH0SimCovObs(n_simulate=1000, seed=10)\n",
220 |     "fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)\n",
221 |     "\n",
222 |     "fssd.perform_test(dat)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": null,
228 |    "metadata": {
229 |     "collapsed": true
230 |    },
231 |    "outputs": [],
232 |    "source": [
233 |     "p = isonorm"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {
240 |     "scrolled": false
241 |    },
242 |    "outputs": [],
243 |    "source": [
244 |     "generic_contourf(p, dat, k, func_fssd)\n",
245 |     "plt.title('Stein witness')\n",
246 |     "#plt.colorbar()\n",
247 |     "#plt.grid()\n",
248 |     "plt.savefig('{}_witness.pdf'.format(problem_name), bbox_inches='tight')"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {
255 |     "scrolled": false
256 |    },
257 |    "outputs": [],
258 |    "source": [
259 |     "generic_contourf(p, dat, k, func_fssd_power_criterion)\n",
260 |     "plt.title(r'$\\widehat{\\mathrm{FSSD^2}}/\\widehat{\\sigma_{H_1}}$')\n",
261 |     "# plt.colorbar()\n",
262 |     "# plt.grid()\n",
263 |     "plt.savefig('{}_optobj.pdf'.format(problem_name), bbox_inches='tight')"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "markdown",
268 |    "metadata": {},
269 |    "source": [
270 |     "--------------"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {
277 |     "scrolled": false
278 |    },
279 |    "outputs": [],
280 |    "source": [
281 |     "generic_contourf(p, dat, k, func_fssd_ustat_std)\n",
282 |     "plt.title('U-statistic standard deviation')\n",
283 |     "plt.colorbar()\n",
284 |     "plt.grid()\n",
285 |     "plt.savefig('{}_h1sd.pdf'.format(problem_name), bbox_inches='tight')"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "## Plots in 1D"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": null,
298 |    "metadata": {
299 |     "collapsed": true
300 |    },
301 |    "outputs": [],
302 |    "source": [
303 |     "def generic_1d_locs_plot(p, dat, k, func, func_label=None, cond_locs=None, \n",
304 |     "                         noise_level=None, qden_func=None, n_max_stats=0):\n",
305 |     "    \"\"\"\n",
306 |     "    func: (p, dat, k, V) |-> value. A function computing the values to plot.\n",
307 |     "    cond_locs: J'xd matrix of test locations to condition on\n",
308 |     "    func_label: plot label for the function \n",
309 |     "    qden_func: a function taking an array and producing the density values of q\n",
310 |     "    \n",
311 |     "    - n_max_stats: a count to show top few locations of the highest stats.\n",
312 |     "    \"\"\"\n",
313 |     "    \n",
314 |     "    # should be an n x 1 matrix. 1d data.\n",
315 |     "    X = dat.data()\n",
316 |     "    max0 = np.max(X, 0)\n",
317 |     "    min0 = np.min(X, 0)\n",
318 |     "    \n",
319 |     "    sd0  = (max0-min0)*0.3\n",
320 |     "    # form a test location grid to try \n",
321 |     "    nd0 = 600\n",
322 |     "    loc0_cands = np.hstack((\n",
323 |     "        np.linspace(min0-sd0/2, max0+sd0/2, nd0),\n",
324 |     "        np.linspace(-1e-4, 1e-4, 100),\n",
325 |     "        [0, -1e-6, 1e-6]\n",
326 |     "    ))\n",
327 |     "    \n",
328 |     "    loc0_cands.sort()\n",
329 |     "    # #candidates x 1\n",
330 |     "    all_locs = np.reshape(loc0_cands, (-1, 1) )\n",
331 |     "    \n",
332 |     "    # evaluate the function on each candidate on the grid. Size = (#candidates, )\n",
333 |     "    n_cand = len(loc0_cands)\n",
334 |     "    stat_grid = np.zeros(n_cand)\n",
335 |     "    \n",
336 |     "    for i in range(n_cand):\n",
337 |     "        vi = np.reshape(all_locs[i], (-1, 1))\n",
338 |     "        V = vi if cond_locs is None else np.vstack((vi, cond_locs))\n",
339 |     "        stat_grid[i] = func(p, dat, k, V)\n",
340 |     "    den_grid = np.exp(p.log_normalized_den(all_locs))\n",
341 |     "    \n",
342 |     "    plt.figure(figsize=(8, 4))\n",
343 |     "    # Plot the unnormalized density\n",
344 |     "    max_func = np.max(stat_grid)\n",
345 |     "    max_den = np.max(den_grid)\n",
346 |     "    #abs_max = max(max_func, max_den)\n",
347 |     "    abs_max = max_func\n",
348 |     "    \n",
349 |     "    rescaled_den = den_grid/max_den*abs_max\n",
350 |     "    rescaled_stat = stat_grid/np.max(stat_grid)*np.max(den_grid)*1.2\n",
351 |     "#     rescaled_stat = stat_grid\n",
352 |     "    if n_max_stats > 0:\n",
353 |     "        I = np.argsort(-rescaled_stat)\n",
354 |     "        for i in range(n_max_stats):\n",
355 |     "            best_loc_i = all_locs[I[i]]\n",
356 |     "            plt.plot([best_loc_i, best_loc_i], [0, rescaled_stat[I[i]]], 'k--')\n",
357 |     "\n",
358 |     "    plt.plot(all_locs, den_grid, 'b-', \n",
359 |     "#              label='$p(\\mathbf{x})$'\n",
360 |     "              label='$p$'\n",
361 |     "            )\n",
362 |     "    if qden_func is not None:\n",
363 |     "        qden = qden_func(all_locs)\n",
364 |     "        plt.plot(all_locs, qden, 'r-', \n",
365 |     "#                  label='$q(\\mathbf{x})$'\n",
366 |     "                 label='$q$'\n",
367 |     "                )\n",
368 |     "    plt.plot(all_locs, rescaled_stat, 'g-', label=func_label)\n",
369 |     "    # plot the data\n",
370 |     "    n = X.shape[0]\n",
371 |     "    if noise_level is None:\n",
372 |     "        noise_level = max(rescaled_den)*0.01\n",
373 |     "    with util.NumpySeedContext(seed=20):\n",
374 |     "        noise = np.random.randn(n)*noise_level\n",
375 |     "#         plt.plot(X[:, 0], noise, 'm.', \n",
376 |     "#                  markeredgecolor='m', markersize=4, alpha=0.7, label='data')\n",
377 |     "    \n",
378 |     "    # plot the conditioning test locations\n",
379 |     "    if cond_locs is not None:\n",
380 |     "        for i in range(len(cond_locs)):\n",
381 |     "            loci = cond_locs[i]\n",
382 |     "            plt.stem(loci, [abs_max/2.0], 'g-', label='Cond. features')\n",
383 |     "    \n",
384 |     "    maxi = np.argmax(stat_grid)\n",
385 |     "    # plot the location achieving the peak of the function\n",
386 |     "#     plt.plot([all_locs[maxi], all_locs[maxi]], [0, stat_grid[maxi]], 'k--')\n",
387 |     "#     plt.plot(all_locs[maxi], 0., 'k^', markersize=20, label='arg max')\n",
388 |     "    plt.tight_layout()\n",
389 |     "    plt.legend(\n",
390 |     "#         bbox_to_anchor=(1.5, 1)\n",
391 |     "    )\n",
392 |     "    #plt.xlabel('$X$')\n",
393 |     "    #plt.ylabel('$Y$')\n",
394 |     "    \n",
395 |     "    "
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "metadata": {},
402 |    "outputs": [],
403 |    "source": [
404 |     "# true p\n",
405 |     "seed = 21\n",
406 |     "d = 1\n",
407 |     "mean = np.zeros(d)\n",
408 |     "variance = 1\n",
409 |     "p = density.IsotropicNormal(mean, variance)\n",
410 |     "\n",
411 |     "# sample\n",
412 |     "n = 3000\n",
413 |     "\n",
414 |     "# only one dimension of the mean is shifted\n",
415 |     "#draw_mean = mean + np.hstack((1, np.zeros(d-1)))\n",
416 |     "# draw_mean = mean + 1\n",
417 |     "# draw_variance = variance + 0\n",
418 |     "# X = util.randn(n, d, seed=seed+3)*np.sqrt(draw_variance) + draw_mean\n",
419 |     "# dat = data.Data(X)\n",
420 |     "\n",
421 |     "# ds = data.DSIsotropicNormal(mean=mean, variance=variance)\n",
422 |     "# dat = ds.sample(n, seed=seed+3)\n",
423 |     "# X = dat.data()\n",
424 |     "\n",
425 |     "ds = data.DSLaplace(d=1, loc=0, scale=1.0/np.sqrt(2))\n",
426 |     "qden_func = lambda x: stats.laplace.pdf(x, loc=0, scale=1.0/np.sqrt(2))\n",
427 |     "dat = ds.sample(n, seed=seed+9)\n",
428 |     "\n",
429 |     "# tdf = 7\n",
430 |     "# qden_func = lambda x: stats.t.pdf(x, df=tdf)\n",
431 |     "# dst = data.DSTDistribution(df=tdf)\n",
432 |     "# dat = dst.sample(n, seed=21)\n",
433 |     "\n",
434 |     "X = dat.data()"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": null,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "# Kernel\n",
444 |     "sig2 = util.meddistance(X, subsample=1000)**2\n",
445 |     "k = kernel.KGauss(sig2)\n",
446 |     "\n",
447 |     "# Test\n",
448 |     "J = 1\n",
449 |     "alpha = 0.01\n",
450 |     "\n",
451 |     "# random test locations\n",
452 |     "V = util.fit_gaussian_draw(X, J, seed=seed+1)\n",
453 |     "null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)\n",
454 |     "fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)\n",
455 |     "fssd.perform_test(dat)"
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "markdown",
460 |    "metadata": {},
461 |    "source": [
462 |     "### Plot"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": null,
468 |    "metadata": {},
469 |    "outputs": [],
470 |    "source": [
471 |     "generic_1d_locs_plot(p, dat, k, \n",
472 |     "                     func_fssd_power_criterion, \n",
473 |     "#                      func_fssd,\n",
474 |     "#                      func_fssd_ustat_std,\n",
475 |     "                     func_label=r'$\\frac{\\mathrm{FSSD^2}}{\\sigma_{H_1}}$', \n",
476 |     "                    cond_locs=None, qden_func=qden_func, n_max_stats=0)\n",
477 |     "# plt.title('mean/sd')\n",
478 |     "plt.legend(loc='best', fontsize=26)\n",
479 |     "plt.gca().get_yaxis().set_visible(False)\n",
480 |     "plt.box(False)\n",
481 |     "# plt.xlabel('$v$', fontsize=30)\n",
482 |     "xax = plt.gca().get_xaxis()\n",
483 |     "xax.set_ticks_position('bottom')\n",
484 |     "plt.xlim([-5, 5])\n",
485 |     "# plt.tick_params(\n",
486 |     "#     axis='x',          # changes apply to the x-axis\n",
487 |     "#     which='both',      # both major and minor ticks are affected\n",
488 |     "#     bottom='on',      # ticks along the bottom edge are on\n",
489 |     "#     top='off',         # ticks along the top edge are off\n",
490 |     "#     labelbottom='on') # labels along the bottom edge are on\n",
491 |     "# plt.grid()\n",
492 |     "# plt.savefig('gauss_vs_t_obj.pdf', bbox_inches='tight')\n",
493 |     "ax = plt.gca()\n",
494 |     "\n",
495 |     "# plot with n =100000, seed=30\n",
496 |     "# loc1 = -0.895\n",
497 |     "# loc2 = 0.92\n",
498 |     "# plt.plot([loc1, loc1], [0, 0.46], 'k--')\n",
499 |     "# plt.plot([loc2, loc2], [0, 0.46], 'k--')\n",
500 |     "# ax.annotate('$v^*$', xy=(185, 26), xycoords='figure pixels' , fontsize=38)\n",
501 |     "# ax.annotate('$v^*$', xy=(275, 26), xycoords='figure pixels' , fontsize=38)\n",
502 |     "\n",
503 |     "plt.savefig('gauss_vs_laplace_obj.pdf', bbox_inches='tight')"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": null,
509 |    "metadata": {
510 |     "scrolled": false
511 |    },
512 |    "outputs": [],
513 |    "source": [
514 |     "generic_1d_locs_plot(p, dat, k, func_fssd, func_label=None, cond_locs=None)\n",
515 |     "plt.title('$\\mathrm{FSSD}^2$')\n",
516 |     "plt.grid()"
517 |    ]
518 |   },
519 |   {
520 |    "cell_type": "code",
521 |    "execution_count": null,
522 |    "metadata": {},
523 |    "outputs": [],
524 |    "source": [
525 |     "generic_1d_locs_plot(p, dat, k, func_fssd_ustat_std, func_label=None, cond_locs=None)\n",
526 |     "plt.title('FSSD standard deviation')\n",
527 |     "plt.grid()"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": null,
533 |    "metadata": {},
534 |    "outputs": [],
535 |    "source": [
536 |     "from ipywidgets import interact, interactive, fixed\n",
537 |     "from IPython.display import display\n",
538 |     "import ipywidgets as widgets\n",
539 |     "import math\n",
540 |     "\n",
541 |     "def interactive_1d_locs_plot(f, func_label=None, cond_loc=None):\n",
542 |     "    cond_locs = np.array([[cond_loc]])\n",
543 |     "    generic_1d_locs_plot(p, dat, k, f, func_label=func_label, \n",
544 |     "                    cond_locs=cond_locs, noise_level=1)\n",
545 |     "    plt.grid()\n",
546 |     "\n",
547 |     "X = dat.data()\n",
548 |     "minx = np.min(X)\n",
549 |     "maxx = np.max(X)\n",
550 |     "sdx = np.std(X)\n",
551 |     "gap = 1\n",
552 |     "vs = interactive(interactive_1d_locs_plot,\n",
553 |     "    f=fixed(func_fssd_power_criterion), func_label=fixed('mean/std'), \n",
554 |     "    cond_loc=(math.floor(minx-gap), math.ceil(maxx+gap), 0.2)\n",
555 |     ")\n",
556 |     "display(vs)"
557 |    ]
558 |   },
559 |   {
560 |    "cell_type": "code",
561 |    "execution_count": null,
562 |    "metadata": {
563 |     "collapsed": true
564 |    },
565 |    "outputs": [],
566 |    "source": []
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": null,
571 |    "metadata": {
572 |     "collapsed": true
573 |    },
574 |    "outputs": [],
575 |    "source": []
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": null,
580 |    "metadata": {
581 |     "collapsed": true
582 |    },
583 |    "outputs": [],
584 |    "source": []
585 |   },
586 |   {
587 |    "cell_type": "code",
588 |    "execution_count": null,
589 |    "metadata": {
590 |     "collapsed": true
591 |    },
592 |    "outputs": [],
593 |    "source": []
594 |   },
595 |   {
596 |    "cell_type": "code",
597 |    "execution_count": null,
598 |    "metadata": {
599 |     "collapsed": true
600 |    },
601 |    "outputs": [],
602 |    "source": []
603 |   },
604 |   {
605 |    "cell_type": "raw",
606 |    "metadata": {},
607 |    "source": []
608 |   },
609 |   {
610 |    "cell_type": "code",
611 |    "execution_count": null,
612 |    "metadata": {
613 |     "collapsed": true
614 |    },
615 |    "outputs": [],
616 |    "source": []
617 |   },
618 |   {
619 |    "cell_type": "code",
620 |    "execution_count": null,
621 |    "metadata": {
622 |     "collapsed": true
623 |    },
624 |    "outputs": [],
625 |    "source": []
626 |   },
627 |   {
628 |    "cell_type": "code",
629 |    "execution_count": null,
630 |    "metadata": {
631 |     "collapsed": true
632 |    },
633 |    "outputs": [],
634 |    "source": []
635 |   }
636 |  ],
637 |  "metadata": {
638 |   "anaconda-cloud": {},
639 |   "kernelspec": {
640 |    "display_name": "Python 3",
641 |    "language": "python",
642 |    "name": "python3"
643 |   },
644 |   "language_info": {
645 |    "codemirror_mode": {
646 |     "name": "ipython",
647 |     "version": 3
648 |    },
649 |    "file_extension": ".py",
650 |    "mimetype": "text/x-python",
651 |    "name": "python",
652 |    "nbconvert_exporter": "python",
653 |    "pygments_lexer": "ipython3",
654 |    "version": "3.6.3"
655 |   }
656 |  },
657 |  "nbformat": 4,
658 |  "nbformat_minor": 1
659 | }
660 | 


--------------------------------------------------------------------------------
/ipynb/gof_kernel_stein.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "A notebook to test and demonstrate `KernelSteinTest`, which implements the kernelized Stein discrepancy test of Chwialkowski et al. (2016) and Liu et al. (2016), both published at ICML 2016."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stdout",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "The autoreload extension is already loaded. To reload it, use:\n",
 20 |       "  %reload_ext autoreload\n"
 21 |      ]
 22 |     }
 23 |    ],
 24 |    "source": [
 25 |     "%load_ext autoreload\n",
 26 |     "%autoreload 2\n",
 27 |     "%matplotlib inline\n",
 28 |     "#%config InlineBackend.figure_format = 'svg'\n",
 29 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 30 |     "\n",
 31 |     "import kgof\n",
 32 |     "import kgof.data as data\n",
 33 |     "import kgof.density as density\n",
 34 |     "import kgof.goftest as gof\n",
 35 |     "import kgof.kernel as ker\n",
 36 |     "import kgof.util as util\n",
 37 |     "import matplotlib\n",
 38 |     "import matplotlib.pyplot as plt\n",
 39 |     "import numpy as np\n",
 40 |     "import scipy.stats as stats"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 2,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "# font options\n",
 50 |     "font = {\n",
 51 |     "    #'family' : 'normal',\n",
 52 |     "    #'weight' : 'bold',\n",
 53 |     "    'size'   : 18\n",
 54 |     "}\n",
 55 |     "\n",
 56 |     "plt.rc('font', **font)\n",
 57 |     "plt.rc('lines', linewidth=2)\n",
 58 |     "matplotlib.rcParams['pdf.fonttype'] = 42\n",
 59 |     "matplotlib.rcParams['ps.fonttype'] = 42"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "--------------"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "## Problem: p = Isotropic normal distribution"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "# true p\n",
 83 |     "seed = 13\n",
 84 |     "d = 1\n",
 85 |     "# sample\n",
 86 |     "n = 800\n",
 87 |     "\n",
 88 |     "mean = np.zeros(d)\n",
 89 |     "variance = 1.0\n",
 90 |     "qmean = mean.copy()\n",
 91 |     "qmean[0] = 0\n",
 92 |     "qvariance = variance\n",
 93 |     "\n",
 94 |     "p = density.IsotropicNormal(mean, variance)\n",
 95 |     "ds = data.DSIsotropicNormal(qmean, qvariance)\n",
 96 |     "# ds = data.DSLaplace(d=d, loc=0, scale=1.0/np.sqrt(2))\n",
 97 |     "dat = ds.sample(n, seed=seed+1)\n",
 98 |     "X = dat.data()"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# Test\n",
108 |     "alpha = 0.01\n",
109 |     "\n",
110 |     "# Gaussian kernel with median heuristic\n",
111 |     "sig2 = util.meddistance(X, subsample=1000)**2\n",
112 |     "k = ker.KGauss(sig2)\n",
113 |     "\n",
114 |     "# inverse multiquadric kernel\n",
115 |     "# From Gorham & Mackey 2017 (https://arxiv.org/abs/1703.01717)\n",
116 |     "# k = ker.KIMQ(b=-0.5, c=1.0)\n",
117 |     "\n",
118 |     "bootstrapper = gof.bootstrapper_rademacher\n",
119 |     "kstein = gof.KernelSteinTest(p, k, bootstrapper=bootstrapper, \n",
120 |     "                             alpha=alpha, n_simulate=500, seed=seed+1)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {
127 |     "scrolled": true
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "kstein_result = kstein.perform_test(dat, return_simulated_stats=True,\n",
132 |     "                                   return_ustat_gram=True)\n",
133 |     "kstein_result\n",
134 |     "#kstein.compute_stat(dat)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "print('p-value: ', kstein_result['pvalue'])\n",
144 |     "print('reject H0: ', kstein_result['h0_rejected'])"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "sim_stats = kstein_result['sim_stats']\n",
154 |     "plt.figure(figsize=(10, 4))\n",
155 |     "plt.hist(sim_stats, bins=20, normed=True);\n",
156 |     "plt.stem([kstein_result['test_stat']], [0.03], 'r-o', label='Stat')\n",
157 |     "plt.legend()"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "## Test original implementation\n",
165 |     "\n",
166 |     "Original implementation of Chwialkowski et al., 2016"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "from scipy.spatial.distance import squareform, pdist\n",
176 |     "\n",
177 |     "def simulatepm(N, p_change):\n",
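178 |     "    '''\n",
179 |     "    Simulate a sequence of -1/+1 signs in which each entry flips the sign of the previous entry with probability p_change.\n",
180 |     "    :param N: length of the sign sequence to generate\n",
181 |     "    :param p_change: probability of flipping the sign at each position\n",
182 |     "    :return: numpy array of length N with entries in {-1, +1}\n",
183 |     "    '''\n",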
184 |     "    X = np.zeros(N) - 1\n",
185 |     "    change_sign = np.random.rand(N) < p_change\n",
186 |     "    for i in range(N):\n",
187 |     "        if change_sign[i]:\n",
188 |     "            X[i] = -X[i - 1]\n",
189 |     "        else:\n",
190 |     "            X[i] = X[i - 1]\n",
191 |     "    return X\n",
192 |     "\n",
193 |     "\n",
194 |     "class _GoodnessOfFitTest:\n",
195 |     "    def __init__(self, grad_log_prob, scaling=1):\n",
196 |     "        #scaling is the sigma^2 as in exp(-|x-y|^2/(2*sigma^2))\n",
197 |     "        self.scaling = scaling*2\n",
198 |     "        self.grad = grad_log_prob\n",
199 |     "        # construct (slow) multiple gradient handle if efficient one is not given\n",
200 |     "        \n",
201 |     "\n",
202 |     "    def grad_multiple(self, X):\n",
203 |     "        #print self.grad\n",
204 |     "        return np.array([(self.grad)(x) for x in X])\n",
205 |     "    \n",
206 |     "    def kernel_matrix(self, X):\n",
207 |     "\n",
208 |     "        # sanity check: X must have more rows (samples) than columns (dimensions)\n",
209 |     "        assert X.shape[0] > X.shape[1]\n",
210 |     "\n",
211 |     "        sq_dists = squareform(pdist(X, 'sqeuclidean'))\n",
212 |     "\n",
213 |     "        K = np.exp(-sq_dists/ self.scaling)\n",
214 |     "        return K\n",
215 |     "\n",
216 |     "    def gradient_k_wrt_x(self, X, K, dim):\n",
217 |     "\n",
218 |     "        X_dim = X[:, dim]\n",
219 |     "        assert X_dim.ndim == 1\n",
220 |     "\n",
221 |     "        differences = X_dim.reshape(len(X_dim), 1) - X_dim.reshape(1, len(X_dim))\n",
222 |     "\n",
223 |     "        return -2.0 / self.scaling * K * differences\n",
224 |     "\n",
225 |     "    def gradient_k_wrt_y(self, X, K, dim):\n",
226 |     "        return -self.gradient_k_wrt_x(X, K, dim)\n",
227 |     "\n",
228 |     "    def second_derivative_k(self, X, K, dim):\n",
229 |     "        X_dim = X[:, dim]\n",
230 |     "        assert X_dim.ndim == 1\n",
231 |     "\n",
232 |     "        differences = X_dim.reshape(len(X_dim), 1) - X_dim.reshape(1, len(X_dim))\n",
233 |     "\n",
234 |     "        sq_differences = differences ** 2\n",
235 |     "\n",
236 |     "        return 2.0 * K * (self.scaling - 2 * sq_differences) / self.scaling ** 2\n",
237 |     "\n",
238 |     "    def get_statistic_multiple_dim(self, samples, dim):\n",
239 |     "        num_samples = len(samples)\n",
240 |     "\n",
241 |     "        log_pdf_gradients = self.grad_multiple(samples)\n",
242 |     "        # n x 1\n",
243 |     "        log_pdf_gradients = log_pdf_gradients[:, dim]\n",
244 |     "        # n x n\n",
245 |     "        K = self.kernel_matrix(samples)\n",
246 |     "        assert K.shape[0]==K.shape[1]\n",
247 |     "        # n x n\n",
248 |     "        gradient_k_x = self.gradient_k_wrt_x(samples, K, dim)\n",
249 |     "        assert gradient_k_x.shape[0] == gradient_k_x.shape[1]\n",
250 |     "        # n x n\n",
251 |     "        gradient_k_y = self.gradient_k_wrt_y(samples, K, dim)\n",
252 |     "        # n x n \n",
253 |     "        second_derivative = self.second_derivative_k(samples, K, dim)\n",
254 |     "        assert second_derivative.shape[0] == second_derivative.shape[1]\n",
255 |     "\n",
256 |     "        # use broadcasting to mimic the element wise looped call\n",
257 |     "        pairwise_log_gradients = log_pdf_gradients.reshape(num_samples, 1) \\\n",
258 |     "                                 * log_pdf_gradients.reshape(1, num_samples)\n",
259 |     "        A = pairwise_log_gradients * K\n",
260 |     "\n",
261 |     "        B = gradient_k_x * log_pdf_gradients\n",
262 |     "        C = (gradient_k_y.T * log_pdf_gradients).T\n",
263 |     "        D = second_derivative\n",
264 |     "\n",
265 |     "        V_statistic = A + B + C + D\n",
266 |     "        #V_statistic =  C\n",
267 |     "\n",
268 |     "        stat = num_samples * np.mean(V_statistic)\n",
269 |     "        return V_statistic, stat\n",
270 |     "\n",
271 |     "    def compute_pvalues_for_processes(self, U_matrix, chane_prob, num_bootstrapped_stats=300):\n",
272 |     "        N = U_matrix.shape[0]\n",
273 |     "        bootsraped_stats = np.zeros(num_bootstrapped_stats)\n",
274 |     "\n",
275 |     "        with util.NumpySeedContext(seed=10):\n",
276 |     "            for proc in range(num_bootstrapped_stats):\n",
277 |     "                # W = np.sign(orsetinW[:,proc])\n",
278 |     "                W = simulatepm(N, chane_prob)\n",
279 |     "                WW = np.outer(W, W)\n",
280 |     "                st = np.mean(U_matrix * WW)\n",
281 |     "                bootsraped_stats[proc] = N * st\n",
282 |     "\n",
283 |     "        stat = N * np.mean(U_matrix)\n",
284 |     "\n",
285 |     "        return float(np.sum(bootsraped_stats > stat)) / num_bootstrapped_stats\n",
286 |     "\n",
287 |     "    def is_from_null(self, alpha, samples, chane_prob):\n",
288 |     "        dims = samples.shape[1]\n",
289 |     "        boots = 10 * int(dims / alpha)\n",
290 |     "        num_samples = samples.shape[0]\n",
291 |     "        U = np.zeros((num_samples, num_samples))\n",
292 |     "        for dim in range(dims):\n",
293 |     "            U2, _ = self.get_statistic_multiple_dim(samples, dim)\n",
294 |     "            U += U2\n",
295 |     "\n",
296 |     "        p = self.compute_pvalues_for_processes(U, chane_prob, boots)\n",
297 |     "        return p, U\n",
298 |     " \n"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": null,
304 |    "metadata": {
305 |     "scrolled": true
306 |    },
307 |    "outputs": [],
308 |    "source": [
309 |     "#sigma = np.array([[1, 0.2, 0.1], [0.2, 1, 0.4], [0.1, 0.4, 1]])\n",
310 |     "def grad_log_correleted(x):\n",
311 |     "    #sigmaInv = np.linalg.inv(sigma)\n",
312 |     "    #return - np.dot(sigmaInv.T + sigmaInv, x) / 2.0\n",
313 |     "    return -(x-mean)/variance\n",
314 |     "\n",
315 |     "#me = _GoodnessOfFitTest(grad_log_correleted)\n",
316 |     "\n",
317 |     "qm = _GoodnessOfFitTest(grad_log_correleted, scaling=sig2)\n",
318 |     "#X = np.random.multivariate_normal([0, 0, 0], sigma, 200)\n",
319 |     "\n",
320 |     "p_val, U = qm.is_from_null(0.05, X, 0.1)\n",
321 |     "print(p_val)\n"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": null,
327 |    "metadata": {
328 |     "scrolled": true
329 |    },
330 |    "outputs": [],
331 |    "source": [
332 |     "plt.imshow(U, interpolation='none')\n",
333 |     "plt.colorbar()"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "# U-statistic matrix from the new implementation\n",
343 |     "H = kstein_result['H']\n",
344 |     "plt.imshow(H, interpolation='none')\n",
345 |     "plt.colorbar()"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": null,
351 |    "metadata": {},
352 |    "outputs": [],
353 |    "source": [
354 |     "plt.imshow(U-H, interpolation='none')\n",
355 |     "plt.colorbar()"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "markdown",
360 |    "metadata": {},
361 |    "source": [
362 |     "### "
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": null,
368 |    "metadata": {},
369 |    "outputs": [],
370 |    "source": [
371 |     "x = np.random.randint(1, 5, 5)\n",
372 |     "y = np.random.randint(1, 3, 3)"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {},
379 |    "outputs": [],
380 |    "source": [
381 |     "x"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": null,
387 |    "metadata": {},
388 |    "outputs": [],
389 |    "source": [
390 |     "y"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": null,
396 |    "metadata": {},
397 |    "outputs": [],
398 |    "source": [
399 |     "x[:, np.newaxis] - y[np.newaxis, :]"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": null,
405 |    "metadata": {},
406 |    "outputs": [],
407 |    "source": []
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {},
413 |    "outputs": [],
414 |    "source": []
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": null,
419 |    "metadata": {},
420 |    "outputs": [],
421 |    "source": []
422 |   }
423 |  ],
424 |  "metadata": {
425 |   "kernelspec": {
426 |    "display_name": "Python 3",
427 |    "language": "python",
428 |    "name": "python3"
429 |   },
430 |   "language_info": {
431 |    "codemirror_mode": {
432 |     "name": "ipython",
433 |     "version": 3
434 |    },
435 |    "file_extension": ".py",
436 |    "mimetype": "text/x-python",
437 |    "name": "python",
438 |    "nbconvert_exporter": "python",
439 |    "pygments_lexer": "ipython3",
440 |    "version": "3.6.4"
441 |   }
442 |  },
443 |  "nbformat": 4,
444 |  "nbformat_minor": 1
445 | }
446 | 


--------------------------------------------------------------------------------
/ipynb/gof_linear_kernel_stein.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "A notebook to test and demonstrate `LinearKernelSteinTest`."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "%load_ext autoreload\n",
 17 |     "%autoreload 2\n",
 18 |     "%matplotlib inline\n",
 19 |     "#%config InlineBackend.figure_format = 'svg'\n",
 20 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 21 |     "\n",
 22 |     "import kgof\n",
 23 |     "import kgof.data as data\n",
 24 |     "import kgof.density as density\n",
 25 |     "import kgof.goftest as gof\n",
 26 |     "import kgof.kernel as ker\n",
 27 |     "import kgof.util as util\n",
 28 |     "import matplotlib\n",
 29 |     "import matplotlib.pyplot as plt\n",
 30 |     "import numpy as np\n",
 31 |     "import scipy.stats as stats"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "metadata": {
 38 |     "collapsed": true
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "# font options\n",
 43 |     "font = {\n",
 44 |     "    #'family' : 'normal',\n",
 45 |     "    #'weight' : 'bold',\n",
 46 |     "    'size'   : 18\n",
 47 |     "}\n",
 48 |     "\n",
 49 |     "plt.rc('font', **font)\n",
 50 |     "plt.rc('lines', linewidth=2)\n",
 51 |     "matplotlib.rcParams['pdf.fonttype'] = 42\n",
 52 |     "matplotlib.rcParams['ps.fonttype'] = 42"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "## Problem: p = Isotropic normal distribution"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "metadata": {
 66 |     "collapsed": true
 67 |    },
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "# true p\n",
 71 |     "seed = 20\n",
 72 |     "d = 1\n",
 73 |     "mean = np.zeros(d)\n",
 74 |     "variance = 1\n",
 75 |     "isonorm = density.IsotropicNormal(mean, variance)"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {
 82 |     "collapsed": true
 83 |    },
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "# sample\n",
 87 |     "n = 1000\n",
 88 |     "\n",
 89 |     "# only one dimension of the mean is shifted\n",
 90 |     "#draw_mean = mean + np.hstack((1, np.zeros(d-1)))\n",
 91 |     "draw_mean = mean + 0\n",
 92 |     "draw_variance = variance + 0\n",
 93 |     "X = util.randn(n, d, seed=seed+2)*np.sqrt(draw_variance) + draw_mean\n",
 94 |     "dat = data.Data(X)"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "# Test\n",
104 |     "alpha = 0.01\n",
105 |     "\n",
106 |     "sig2 = util.meddistance(X, subsample=1000)**2\n",
107 |     "k = ker.KGauss(sig2)\n",
108 |     "#k = ker.KGauss(1)\n",
109 |     "\n",
110 |     "lin_kstein = gof.LinearKernelSteinTest(isonorm, k, alpha=alpha, seed=seed+1)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "lin_kstein_result = lin_kstein.perform_test(dat)\n",
120 |     "lin_kstein_result\n",
121 |     "#kstein.compute_stat(dat)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {
128 |     "collapsed": true
129 |    },
130 |    "outputs": [],
131 |    "source": []
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {
137 |     "collapsed": true
138 |    },
139 |    "outputs": [],
140 |    "source": []
141 |   }
142 |  ],
143 |  "metadata": {
144 |   "kernelspec": {
145 |    "display_name": "Python 3",
146 |    "language": "python",
147 |    "name": "python3"
148 |   },
149 |   "language_info": {
150 |    "codemirror_mode": {
151 |     "name": "ipython",
152 |     "version": 3
153 |    },
154 |    "file_extension": ".py",
155 |    "mimetype": "text/x-python",
156 |    "name": "python",
157 |    "nbconvert_exporter": "python",
158 |    "pygments_lexer": "ipython3",
159 |    "version": "3.6.3"
160 |   }
161 |  },
162 |  "nbformat": 4,
163 |  "nbformat_minor": 1
164 | }
165 | 


--------------------------------------------------------------------------------
/ipynb/gof_me_test.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "A notebook to test and demonstrate the `METest` of Jitkrittum et al., 2016 (NIPS 2016) used as a goodness-of-fit test."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "%load_ext autoreload\n",
 17 |     "%autoreload 2\n",
 18 |     "%matplotlib inline\n",
 19 |     "#%config InlineBackend.figure_format = 'svg'\n",
 20 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 21 |     "\n",
 22 |     "import freqopttest.tst as tst\n",
 23 |     "import kgof\n",
 24 |     "import kgof.data as data\n",
 25 |     "import kgof.density as density\n",
 26 |     "import kgof.goftest as gof\n",
 27 |     "import kgof.intertst as tgof\n",
 28 |     "import kgof.kernel as ker\n",
 29 |     "import kgof.util as util\n",
 30 |     "import matplotlib\n",
 31 |     "import matplotlib.pyplot as plt\n",
 32 |     "import numpy as np\n",
 33 |     "import scipy.stats as stats"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {
 40 |     "collapsed": true
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# font options\n",
 45 |     "font = {\n",
 46 |     "    #'family' : 'normal',\n",
 47 |     "    #'weight' : 'bold',\n",
 48 |     "    'size'   : 18\n",
 49 |     "}\n",
 50 |     "\n",
 51 |     "plt.rc('font', **font)\n",
 52 |     "plt.rc('lines', linewidth=2)\n",
 53 |     "matplotlib.rcParams['pdf.fonttype'] = 42\n",
 54 |     "matplotlib.rcParams['ps.fonttype'] = 42"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "## Test with random test locations"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {
 68 |     "collapsed": true
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "# true p\n",
 73 |     "seed = 20\n",
 74 |     "d = 1\n",
 75 |     "# sample\n",
 76 |     "n = 800\n",
 77 |     "alpha = 0.05\n",
 78 |     "# number of test locations to use\n",
 79 |     "J = 2\n",
 80 |     "\n",
 81 |     "mean = np.zeros(d)\n",
 82 |     "variance = 1"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {
 89 |     "collapsed": true
 90 |    },
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "p = density.IsotropicNormal(mean, variance)\n",
 94 |     "q_mean = mean.copy()\n",
 95 |     "q_variance = variance\n",
 96 |     "# q_mean[0] = 1\n",
 97 |     "\n",
 98 |     "# ds = data.DSIsotropicNormal(q_mean, q_variance)\n",
 99 |     "q_means = np.array([ [0], [0]])\n",
100 |     "q_variances = np.array([0.01, 1])\n",
101 |     "ds = data.DSIsoGaussianMixture(q_means, q_variances, pmix=[0.2, 0.8])\n",
102 |     "# ds = data.DSIsoGaussianMixture(p_means, p_variances)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {
109 |     "collapsed": true
110 |    },
111 |    "outputs": [],
112 |    "source": [
113 |     "dat = ds.sample(n, seed=seed+2)\n",
114 |     "tr, te = dat.split_tr_te(tr_proportion=0.5, seed=2)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {
121 |     "collapsed": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "# Test\n",
126 |     "Xtr = tr.data()\n",
127 |     "sig2 = util.meddistance(Xtr, subsample=1000)**2\n",
128 |     "\n",
129 |     "# random test locations\n",
130 |     "V0 = util.fit_gaussian_draw(Xtr, J, seed=seed+1)\n",
131 |     "me_rand = tgof.GaussMETest(p, sig2, V0, alpha=alpha, seed=seed)"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "me_rand_result = me_rand.perform_test(te)\n",
141 |     "me_rand_result\n",
142 |     "#kstein.compute_stat(dat)"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {
148 |     "collapsed": true
149 |    },
150 |    "source": [
151 |     "## Test with optimized test locations"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {
158 |     "collapsed": true
159 |    },
160 |    "outputs": [],
161 |    "source": [
162 |     "op = {'n_test_locs': J, 'seed': seed+5, 'max_iter': 200, \n",
163 |     "     'batch_proportion': 1.0, 'locs_step_size': 1.0, \n",
164 |     "      'gwidth_step_size': 0.1, 'tol_fun': 1e-4}\n",
165 |     "# optimize on the training set\n",
166 |     "me_opt = tgof.GaussMETestOpt(p, n_locs=J, tr_proportion=0.5, alpha=alpha, seed=seed+1)"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "# Give the ME test the full data. Internally the data are divided into tr and te.\n",
176 |     "me_opt_result = me_opt.perform_test(dat, op)\n",
177 |     "me_opt_result"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": null,
183 |    "metadata": {
184 |     "collapsed": true
185 |    },
186 |    "outputs": [],
187 |    "source": []
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {
193 |     "collapsed": true
194 |    },
195 |    "outputs": [],
196 |    "source": []
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {
202 |     "collapsed": true
203 |    },
204 |    "outputs": [],
205 |    "source": []
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {
211 |     "collapsed": true
212 |    },
213 |    "outputs": [],
214 |    "source": []
215 |   }
216 |  ],
217 |  "metadata": {
218 |   "kernelspec": {
219 |    "display_name": "Python 3",
220 |    "language": "python",
221 |    "name": "python3"
222 |   },
223 |   "language_info": {
224 |    "codemirror_mode": {
225 |     "name": "ipython",
226 |     "version": 3
227 |    },
228 |    "file_extension": ".py",
229 |    "mimetype": "text/x-python",
230 |    "name": "python",
231 |    "nbconvert_exporter": "python",
232 |    "pygments_lexer": "ipython3",
233 |    "version": "3.6.3"
234 |   }
235 |  },
236 |  "nbformat": 4,
237 |  "nbformat_minor": 1
238 | }
239 | 


--------------------------------------------------------------------------------
/ipynb/gof_mmd_test.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "A notebook to test and demonstrate the `MMD test` of Gretton et al., 2012, used as a goodness-of-fit test. This requires the ability to sample from the density `p`."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "%load_ext autoreload\n",
 17 |     "%autoreload 2\n",
 18 |     "%matplotlib inline\n",
 19 |     "#%config InlineBackend.figure_format = 'svg'\n",
 20 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 21 |     "\n",
 22 |     "import freqopttest.tst as tst\n",
 23 |     "import kgof\n",
 24 |     "import kgof.data as data\n",
 25 |     "import kgof.density as density\n",
 26 |     "import kgof.goftest as gof\n",
 27 |     "import kgof.mmd as mgof\n",
 28 |     "import kgof.kernel as ker\n",
 29 |     "import kgof.util as util\n",
 30 |     "import matplotlib\n",
 31 |     "import matplotlib.pyplot as plt\n",
 32 |     "import numpy as np\n",
 33 |     "import scipy.stats as stats"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {
 40 |     "collapsed": true
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# font options\n",
 45 |     "font = {\n",
 46 |     "    #'family' : 'normal',\n",
 47 |     "    #'weight' : 'bold',\n",
 48 |     "    'size'   : 16\n",
 49 |     "}\n",
 50 |     "\n",
 51 |     "plt.rc('font', **font)\n",
 52 |     "plt.rc('lines', linewidth=2)\n",
 53 |     "matplotlib.rcParams['pdf.fonttype'] = 42\n",
 54 |     "matplotlib.rcParams['ps.fonttype'] = 42"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "## MMD test (as a goodness-of-fit test)"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {
 68 |     "collapsed": true
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "# true p\n",
 73 |     "seed = 20\n",
 74 |     "d = 2\n",
 75 |     "# sample\n",
 76 |     "n = 400\n",
 77 |     "alpha = 0.05\n",
 78 |     "\n",
 79 |     "mean = np.zeros(d)\n",
 80 |     "variance = 1"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {
 87 |     "collapsed": true
 88 |    },
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "p = density.IsotropicNormal(mean, variance)\n",
 92 |     "q_mean = mean.copy()\n",
 93 |     "q_variance = variance\n",
 94 |     "# q_mean[0] = 1\n",
 95 |     "\n",
 96 |     "ds = data.DSIsotropicNormal(q_mean+1, q_variance)\n",
 97 |     "# q_means = np.array([ [0], [0]])\n",
 98 |     "# q_variances = np.array([0.01, 1])\n",
 99 |     "# ds = data.DSIsoGaussianMixture(q_means, q_variances, pmix=[0.2, 0.8])"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {
106 |     "collapsed": true
107 |    },
108 |    "outputs": [],
109 |    "source": [
110 |     "# Test\n",
111 |     "dat = ds.sample(n, seed=seed+2)\n",
112 |     "X = dat.data()\n",
113 |     "# Use median heuristic to determine the Gaussian kernel width\n",
114 |     "sig2 = util.meddistance(X, subsample=1000)**2\n",
115 |     "k = ker.KGauss(sig2)\n"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {
122 |     "scrolled": true
123 |    },
124 |    "outputs": [],
125 |    "source": [
126 |     "mmd_test = mgof.QuadMMDGof(p, k, n_permute=300, alpha=alpha, seed=seed)\n",
127 |     "mmd_result = mmd_test.perform_test(dat)\n",
128 |     "mmd_result"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "scrolled": false
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "print('Reject H0?: {0}'.format(mmd_result['h0_rejected']))\n",
140 |     "\n",
141 |     "sim_stats = mmd_result['list_permuted_mmd2']\n",
142 |     "stat = mmd_result['test_stat']\n",
143 |     "unif_weights = np.ones_like(sim_stats)/float(len(sim_stats))\n",
144 |     "plt.hist(sim_stats, label='Simulated', weights=unif_weights)\n",
145 |     "plt.plot([stat, stat], [0, 0], 'r*', markersize=30, label='Stat')\n",
146 |     "plt.legend(loc='best')\n"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {
152 |     "collapsed": true
153 |    },
154 |    "source": [
155 |     "## MMD test with parameter search"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {
162 |     "collapsed": true
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "def gbrbm_perturb(std_perturb_B, dx=50, dh=10):\n",
167 |     "    \"\"\"\n",
168 |     "    Get a Gaussian-Bernoulli RBM problem where the first entry of the B matrix\n",
169 |     "    (the matrix linking the latent and the observation) is perturbed.\n",
170 |     "\n",
171 |     "    - std_perturb_B: standard deviation of the Gaussian noise used to perturb B.\n",
172 |     "    - dx: observed dimension\n",
173 |     "    - dh: latent dimension\n",
174 |     "\n",
175 |     "    Return p (density), data source\n",
176 |     "    \"\"\"\n",
177 |     "    with util.NumpySeedContext(seed=10):\n",
178 |     "        B = np.random.randint(0, 2, (dx, dh))*2 - 1.0\n",
179 |     "        b = np.random.randn(dx)\n",
180 |     "        c = np.random.randn(dh)\n",
181 |     "        p = density.GaussBernRBM(B, b, c)\n",
182 |     "\n",
183 |     "        B_perturb = B.copy()\n",
184 |     "        if std_perturb_B > 1e-7:\n",
185 |     "            B_perturb[0, 0] = B_perturb[0, 0] + \\\n",
186 |     "                np.random.randn(1)*std_perturb_B\n",
187 |     "        ds = data.DSGaussBernRBM(B_perturb, b, c, burnin=2000)\n",
188 |     "\n",
189 |     "    return p, ds\n",
190 |     "\n",
191 |     "def gbrbm_perturb_all(std_perturb_B, dx=50, dh=10):\n",
192 |     "    \"\"\"\n",
193 |     "    Get a Gaussian-Bernoulli RBM problem where all entries of B\n",
194 |     "    (the matrix linking the latent and the observation) are perturbed.\n",
195 |     "\n",
196 |     "    - std_perturb_B: standard deviation of the Gaussian noise used to perturb B.\n",
197 |     "    - dx: observed dimension\n",
198 |     "    - dh: latent dimension\n",
199 |     "\n",
200 |     "    Return p (density), data source\n",
201 |     "    \"\"\"\n",
202 |     "    with util.NumpySeedContext(seed=11):\n",
203 |     "        B = np.random.randint(0, 2, (dx, dh))*2 - 1.0\n",
204 |     "        b = np.random.randn(dx)\n",
205 |     "        c = np.random.randn(dh)\n",
206 |     "        p = density.GaussBernRBM(B, b, c)\n",
207 |     "\n",
208 |     "        if std_perturb_B > 1e-7:\n",
209 |     "            B_perturb = B + np.random.randn(dx, dh)*std_perturb_B\n",
210 |     "        ds = data.DSGaussBernRBM(B_perturb, b, c, burnin=2000)\n",
211 |     "\n",
212 |     "    return p, ds\n"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {
219 |     "collapsed": true
220 |    },
221 |    "outputs": [],
222 |    "source": [
223 |     "n = 1000\n",
224 |     "d = 50\n",
225 |     "seed = 991\n",
226 |     "# p, qds = gbrbm_perturb_all(0.06, dx=d, dh=10)\n",
227 |     "p, qds = gbrbm_perturb(np.sqrt(0.1), dx=d, dh=10)\n",
228 |     "qdat = qds.sample(n, seed=seed+3)\n",
229 |     "Y = qdat.data()\n",
230 |     "\n",
231 |     "pds = p.get_datasource()\n",
232 |     "datX = pds.sample(n, seed=seed+1)\n",
233 |     "X = datX.data()\n",
234 |     "XY = np.vstack((X, Y))"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "np.var(X, 0)"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {},
250 |    "outputs": [],
251 |    "source": [
252 |     "np.var(Y, 0)"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "# Get the median heuristic for each dimension\n",
262 |     "med_factors = 2.0**np.linspace(-5, 5, 30)\n",
263 |     "meds = np.zeros(d)\n",
264 |     "for i in range(d):\n",
265 |     "    medi = util.meddistance(XY[:, [i]], subsample=1000)\n",
266 |     "    meds[i] = medi\n",
267 |     "\n",
268 |     "candidate_kernels = []\n",
269 |     "for i in range(len(med_factors)):\n",
270 |     "    ki = ker.KDiagGauss( (meds**2)*med_factors[i] )\n",
271 |     "    candidate_kernels.append(ki)\n",
272 |     "    \n",
273 |     "# k = ker.KDiagGauss(2*meds**2)\n",
274 |     "\n",
275 |     "# Construct a list of kernels to try based on multiples of the median\n",
276 |     "# heuristic\n",
277 |     "\n",
278 |     "# med = util.meddistance(XY, subsample=1000)\n",
279 |     "# candidate_kernels = [ker.KGauss(f*med**2) for f in med_factors]\n",
280 |     "# k = ker.KGauss((2.0**-1)*med**2)\n",
281 |     "# candidate_kernels = [k]"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": null,
287 |    "metadata": {
288 |     "scrolled": true
289 |    },
290 |    "outputs": [],
291 |    "source": [
292 |     "mmd_opt = mgof.QuadMMDGofOpt(p, n_permute=300, alpha=alpha, seed=seed+3)\n",
293 |     "mmd_result = mmd_opt.perform_test(qdat,\n",
294 |     "        candidate_kernels=candidate_kernels,\n",
295 |     "        tr_proportion=0.2, reg=1e-3)\n",
296 |     "mmd_result"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "markdown",
301 |    "metadata": {},
302 |    "source": [
303 |     "------------"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {
310 |     "collapsed": true,
311 |     "scrolled": false
312 |    },
313 |    "outputs": [],
314 |    "source": [
315 |     "Kxy = k.eval(X, Y)\n",
316 |     "Kxx = k.eval(X, X)\n",
317 |     "Kyy = k.eval(Y, Y)\n",
318 |     "plt.figure(figsize=(8, 8))\n",
319 |     "plt.imshow(Kxy)\n",
320 |     "plt.title('Kxy')\n",
321 |     "plt.colorbar()"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": null,
327 |    "metadata": {
328 |     "collapsed": true,
329 |     "scrolled": false
330 |    },
331 |    "outputs": [],
332 |    "source": [
333 |     "plt.hist(Kxy.ravel(), bins=50)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {
340 |     "collapsed": true
341 |    },
342 |    "outputs": [],
343 |    "source": [
344 |     "plt.figure(figsize=(8, 8))\n",
345 |     "plt.imshow(Kxx)\n",
346 |     "plt.title('Kxx')\n",
347 |     "plt.colorbar()"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {
354 |     "collapsed": true
355 |    },
356 |    "outputs": [],
357 |    "source": [
358 |     "plt.figure(figsize=(8, 8))\n",
359 |     "plt.imshow(Kyy)\n",
360 |     "plt.title('Kyy')\n",
361 |     "plt.colorbar()"
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": null,
367 |    "metadata": {
368 |     "collapsed": true
369 |    },
370 |    "outputs": [],
371 |    "source": [
372 |     "mmd = np.mean(Kxx+Kyy-2*Kxy)\n",
373 |     "mmd"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": null,
379 |    "metadata": {
380 |     "collapsed": true
381 |    },
382 |    "outputs": [],
383 |    "source": []
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": null,
388 |    "metadata": {
389 |     "collapsed": true
390 |    },
391 |    "outputs": [],
392 |    "source": []
393 |   },
394 |   {
395 |    "cell_type": "code",
396 |    "execution_count": null,
397 |    "metadata": {
398 |     "collapsed": true
399 |    },
400 |    "outputs": [],
401 |    "source": []
402 |   }
403 |  ],
404 |  "metadata": {
405 |   "kernelspec": {
406 |    "display_name": "Python 3",
407 |    "language": "python",
408 |    "name": "python3"
409 |   },
410 |   "language_info": {
411 |    "codemirror_mode": {
412 |     "name": "ipython",
413 |     "version": 3
414 |    },
415 |    "file_extension": ".py",
416 |    "mimetype": "text/x-python",
417 |    "name": "python",
418 |    "nbconvert_exporter": "python",
419 |    "pygments_lexer": "ipython3",
420 |    "version": "3.6.3"
421 |   }
422 |  },
423 |  "nbformat": 4,
424 |  "nbformat_minor": 1
425 | }
426 | 


--------------------------------------------------------------------------------
/ipynb/preliminary.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "The first notebook to test the idea. "
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "%load_ext autoreload\n",
 19 |     "%autoreload 2\n",
 20 |     "%matplotlib inline\n",
 21 |     "#%config InlineBackend.figure_format = 'svg'\n",
 22 |     "#%config InlineBackend.figure_format = 'pdf'\n",
 23 |     "\n",
 24 |     "import kgof\n",
 25 |     "import kgof.data as data\n",
 26 |     "import kgof.density as density\n",
 27 |     "import kgof.goftest as gof\n",
 28 |     "import kgof.kernel as kernel\n",
 29 |     "import kgof.util as util\n",
 30 |     "import matplotlib\n",
 31 |     "import matplotlib.pyplot as plt\n",
 32 |     "import autograd.numpy as np\n",
 33 |     "import scipy.stats as stats"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {
 40 |     "collapsed": true
 41 |    },
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# font options\n",
 45 |     "font = {\n",
 46 |     "    #'family' : 'normal',\n",
 47 |     "    #'weight' : 'bold',\n",
 48 |     "    'size'   : 18\n",
 49 |     "}\n",
 50 |     "\n",
 51 |     "plt.rc('font', **font)\n",
 52 |     "plt.rc('lines', linewidth=2)\n",
 53 |     "matplotlib.rcParams['pdf.fonttype'] = 42\n",
 54 |     "matplotlib.rcParams['ps.fonttype'] = 42"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {
 60 |     "collapsed": true
 61 |    },
 62 |    "source": [
 63 |     "## Isotropic normal distribution"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "metadata": {
 70 |     "collapsed": true
 71 |    },
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "# true p\n",
 75 |     "seed = 22\n",
 76 |     "d = 40\n",
 77 |     "mean = np.zeros(d)\n",
 78 |     "variance = 1"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": null,
 84 |    "metadata": {
 85 |     "collapsed": true
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "# sample\n",
 90 |     "n = 700\n",
 91 |     "\n",
 92 |     "# only one dimension of the mean is shifted\n",
 93 |     "#draw_mean = mean + np.hstack((1, np.zeros(d-1)))\n",
 94 |     "\n",
 95 |     "p = density.IsotropicNormal(mean, variance)\n",
 96 |     "qvariance = 2.5\n",
 97 |     "ds = data.DSIsotropicNormal(mean+0, qvariance)\n",
 98 |     "\n",
 99 |     "# # Gaussian mixture\n",
100 |     "# p_means = np.array([ [0], [3.0]])\n",
101 |     "# p_variances = np.array([1, 0.01])\n",
102 |     "# # p = density.IsoGaussianMixture(p_means, p_variances)\n",
103 |     "# p = density.IsotropicNormal(np.zeros(1), 1)\n",
104 |     "\n",
105 |     "# q_means = np.array([ [0], [0]])\n",
106 |     "# q_variances = np.array([0.01, 1])\n",
107 |     "# ds = data.DSIsoGaussianMixture(q_means, q_variances, pmix=[0.2, 0.8])\n",
108 |     "# # ds = data.DSIsoGaussianMixture(p_means, p_variances)\n",
109 |     "dat = ds.sample(n, seed=seed+1)\n",
110 |     "\n",
111 |     "X = dat.data()\n",
112 |     "tr, te = dat.split_tr_te(tr_proportion=0.2, seed=seed+1)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {
119 |     "collapsed": true
120 |    },
121 |    "outputs": [],
122 |    "source": [
123 |     "# Plot the density and generated data\n",
124 |     "if p.dim()==1:\n",
125 |     "#     dat2 = ds.sample(2000, seed=seed+2)\n",
126 |     "#     X2 = dat2.X\n",
127 |     "    sd = np.std(X)\n",
128 |     "    dom = np.linspace(np.min(X)-sd, np.max(X)+sd, 500)\n",
129 |     "    unden = np.exp(p.log_normalized_den(dom[:, np.newaxis]))\n",
130 |     "    plt.figure(figsize=(10, 5))\n",
131 |     "    plt.hist(X, bins=40, normed=True, label='Data', color='r')\n",
132 |     "    plt.plot(dom, unden, 'b-', label='p')\n",
133 |     "    plt.legend(loc='best')"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "# Test\n",
143 |     "J = 5\n",
144 |     "alpha = 0.01\n",
145 |     "\n",
146 |     "X = dat.X\n",
147 |     "gwidth0 = util.meddistance(X, subsample=1000)**2\n",
148 |     "# random test locations\n",
149 |     "V0 = util.fit_gaussian_draw(X, J, seed=seed+1)\n",
150 |     "# V0[0, 0] = 3\n",
151 |     "# print V0\n",
152 |     "print('Gaussian width^2: {0}'.format(gwidth0))"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "metadata": {
159 |     "scrolled": true
160 |    },
161 |    "outputs": [],
162 |    "source": [
163 |     "k0 = kernel.KGauss(gwidth0)\n",
164 |     "null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)\n",
165 |     "# null_sim = gof.FSSDH0SimCovDraw(n_simulate=2000, seed=10)\n",
166 |     "\n",
167 |     "fssd = gof.FSSD(p, k0, V0, null_sim=null_sim, alpha=alpha)\n",
168 |     "fssd.perform_test(te)"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {
175 |     "scrolled": true
176 |    },
177 |    "outputs": [],
178 |    "source": [
179 |     "fssd.get_H1_mean_variance(te)"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {},
185 |    "source": [
186 |     "----------------"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "markdown",
191 |    "metadata": {},
192 |    "source": [
193 |     "## Optimized Gaussian FSSD"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {
200 |     "scrolled": false
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "opts = {\n",
205 |     "    'reg': 1e-3,\n",
206 |     "    'max_iter': 30, \n",
207 |     "    'tol_fun':1e-9, \n",
208 |     "#     'disp':True\n",
209 |     "}\n",
210 |     "\n",
211 |     "V_opt, gw_opt, opt_result = gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts)\n",
212 |     "del(opt_result['jac'])\n",
213 |     "del(opt_result['x'])\n",
214 |     "opt_result"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {
221 |     "scrolled": true
222 |    },
223 |    "outputs": [],
224 |    "source": [
225 |     "gw_opt"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {
232 |     "scrolled": true
233 |    },
234 |    "outputs": [],
235 |    "source": [
236 |     "# construct a test\n",
237 |     "k_opt = kernel.KGauss(gw_opt)\n",
238 |     "null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)\n",
239 |     "# null_sim = gof.FSSDH0SimCovDraw(n_simulate=2000, seed=10)\n",
240 |     "fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)\n",
241 |     "fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)\n",
242 |     "fssd_opt_result"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {
249 |     "scrolled": true
250 |    },
251 |    "outputs": [],
252 |    "source": [
253 |     "# get the mean and variance under H1 of the test statistic\n",
254 |     "fssd_opt.get_H1_mean_variance(te)"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "metadata": {},
261 |    "outputs": [],
262 |    "source": [
263 |     "sim_stats = fssd_opt_result['sim_stats']\n",
264 |     "plt.hist(sim_stats, bins=20, normed=True);\n",
265 |     "plt.stem([fssd_opt_result['test_stat']], [0.03], 'r-o', label='Stat')\n",
266 |     "plt.legend()"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "markdown",
280 |    "metadata": {
281 |     "collapsed": true
282 |    },
283 |    "source": [
284 |     "## Analyze Gaussian-Bernoulli RBM"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": null,
290 |    "metadata": {
291 |     "collapsed": true
292 |    },
293 |    "outputs": [],
294 |    "source": [
295 |     "def gbrbm_perturb(var_perturb_B, dx=50, dh=10):\n",
296 |     "    \"\"\"\n",
297 |     "    Get a Gaussian-Bernoulli RBM problem where the first entry of the B matrix\n",
298 |     "    (the matrix linking the latent and the observation) is perturbed.\n",
299 |     "\n",
300 |     "    - var_perturb_B: Gaussian noise variance for perturbing B.\n",
301 |     "    - dx: observed dimension\n",
302 |     "    - dh: latent dimension\n",
303 |     "\n",
304 |     "    Return p (density), data source\n",
305 |     "    \"\"\"\n",
306 |     "    with util.NumpySeedContext(seed=10):\n",
307 |     "        B = np.random.randint(0, 2, (dx, dh))*2 - 1.0\n",
308 |     "        b = np.random.randn(dx)\n",
309 |     "        c = np.random.randn(dh)\n",
310 |     "        p = density.GaussBernRBM(B, b, c)\n",
311 |     "\n",
312 |     "        B_perturb = np.copy(B)\n",
313 |     "        B_perturb[0, 0] = B_perturb[0, 0] + \\\n",
314 |     "            np.random.randn(1)*np.sqrt(var_perturb_B)\n",
315 |     "        ds = data.DSGaussBernRBM(B_perturb, b, c, burnin=50)\n",
316 |     "\n",
317 |     "    return p, ds\n"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": null,
323 |    "metadata": {
324 |     "collapsed": true
325 |    },
326 |    "outputs": [],
327 |    "source": [
328 |     "p, ds_per =  gbrbm_perturb(1e-1, dx=2, dh=8)\n",
329 |     "ds = p.get_datasource()\n",
330 |     "dat = ds.sample(n=200, seed=5)\n",
331 |     "dat_per = ds_per.sample(n=200, seed=4)\n"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "X = dat.data()\n",
341 |     "X_per = dat_per.data()\n",
342 |     "plt.plot(X[:, 0], X[:, 1], 'bx')\n",
343 |     "plt.plot(X_per[:, 0], X_per[:, 1], 'rx')"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "markdown",
348 |    "metadata": {
349 |     "collapsed": true
350 |    },
351 |    "source": [
352 |     "## Visually compare IMQ and Gaussian kernels"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "metadata": {},
359 |    "outputs": [],
360 |    "source": [
361 |     "b = -0.5\n",
362 |     "k_imq = kernel.KIMQ(b=b, c=1)\n",
363 |     "k_g = kernel.KGauss(sigma2=1.0)\n",
364 |     "\n",
365 |     "dom = np.linspace(-8, 8, 100)[:, np.newaxis]\n",
366 |     "v = 0\n",
367 |     "plt.plot(dom, k_imq.eval(dom, np.array([[v]])), 'b-', label='IMQ kernel')\n",
368 |     "plt.plot(dom, k_g.eval(dom, np.array([[v]])), 'r-', label='Gaussian kernel')\n",
369 |     "\n",
370 |     "plt.legend()"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {
377 |     "collapsed": true
378 |    },
379 |    "outputs": [],
380 |    "source": []
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": null,
385 |    "metadata": {
386 |     "collapsed": true
387 |    },
388 |    "outputs": [],
389 |    "source": []
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": null,
394 |    "metadata": {
395 |     "collapsed": true
396 |    },
397 |    "outputs": [],
398 |    "source": []
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {
404 |     "collapsed": true
405 |    },
406 |    "outputs": [],
407 |    "source": []
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": null,
412 |    "metadata": {
413 |     "collapsed": true
414 |    },
415 |    "outputs": [],
416 |    "source": []
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": null,
421 |    "metadata": {
422 |     "collapsed": true
423 |    },
424 |    "outputs": [],
425 |    "source": []
426 |   },
427 |   {
428 |    "cell_type": "code",
429 |    "execution_count": null,
430 |    "metadata": {
431 |     "collapsed": true
432 |    },
433 |    "outputs": [],
434 |    "source": []
435 |   }
436 |  ],
437 |  "metadata": {
438 |   "kernelspec": {
439 |    "display_name": "Python 3",
440 |    "language": "python",
441 |    "name": "python3"
442 |   },
443 |   "language_info": {
444 |    "codemirror_mode": {
445 |     "name": "ipython",
446 |     "version": 3
447 |    },
448 |    "file_extension": ".py",
449 |    "mimetype": "text/x-python",
450 |    "name": "python",
451 |    "nbconvert_exporter": "python",
452 |    "pygments_lexer": "ipython3",
453 |    "version": "3.6.3"
454 |   }
455 |  },
456 |  "nbformat": 4,
457 |  "nbformat_minor": 1
458 | }
459 | 


--------------------------------------------------------------------------------
/kgof/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wittawatj/kernel-gof/039a95ed9d8062e283da6bd051b7161a190b4876/kgof/__init__.py


--------------------------------------------------------------------------------
/kgof/config.py:
--------------------------------------------------------------------------------
 1 | 
 2 | """
 3 | This file defines global configuration of the project.
 4 | Casual usage of the package should not need to change this. 
 5 | """
 6 | 
 7 | import kgof.glo as glo
 8 | import os
 9 | 
# Paths consulted when running the experiments. Every value is a full path.
expr_configs = dict(
    # Directory holding temporary files created while experiments run.
    scratch_path='/nfs/data3/wittawat/tmp/',

    # Directory in which the experimental results are saved.
    expr_results_path='/nfs/data3/wittawat/kgof/results/',

    # Data directory: the 'data' folder located in the parent directory of
    # whatever glo.get_root() returns.
    data_path=os.path.join(os.path.dirname(glo.get_root()), 'data'),
)
21 | 
22 | 


--------------------------------------------------------------------------------
/kgof/density.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Module containing implementations of some unnormalized probability density 
  3 | functions.
  4 | """
  5 | from __future__ import division
  6 | 
  7 | from builtins import range
  8 | from past.utils import old_div
  9 | from builtins import object
 10 | from future.utils import with_metaclass
 11 | __author__ = 'wittawat'
 12 | 
 13 | from abc import ABCMeta, abstractmethod
 14 | import autograd
 15 | import autograd.numpy as np
 16 | import kgof.data as data
 17 | import scipy.stats as stats
 18 | #import warnings
 19 | import logging
 20 | 
 21 | def warn_bounded_domain(self):
 22 |     logging.warning('{} has a bounded domain. This may have an unintended effect to the test result of FSSD.'.format(self.__class__) )
 23 | 
 24 | def from_log_den(d, f):
 25 |     """
 26 |     Construct an UnnormalizedDensity from the function f, implementing the log 
 27 |     of an unnormalized density.
 28 | 
 29 |     f: X -> den where X: n x d and den is a numpy array of length n.
 30 |     """
 31 |     return UDFromCallable(d, flog_den=f)
 32 | 
 33 | def from_grad_log(d, g):
 34 |     """
 35 |     Construct an UnnormalizedDensity from the function g, implementing the
 36 |     gradient of the log of an unnormalized density.
 37 | 
 38 |     g: X -> grad where X: n x d and grad is n x d (2D numpy array)
 39 |     """
 40 |     return UDFromCallable(d, fgrad_log=g)
 41 | 
 42 | 
 43 | class UnnormalizedDensity(with_metaclass(ABCMeta, object)):
 44 |     """
 45 |     An abstract class of an unnormalized probability density function.  This is
 46 |     intended to be used to represent a model of the data for goodness-of-fit
 47 |     testing.
 48 |     """
 49 | 
 50 |     @abstractmethod
 51 |     def log_den(self, X):
 52 |         """
 53 |         Evaluate this log of the unnormalized density on the n points in X.
 54 | 
 55 |         X: n x d numpy array
 56 | 
 57 |         Return a one-dimensional numpy array of length n.
 58 |         """
 59 |         raise NotImplementedError()
 60 | 
 61 |     def log_normalized_den(self, X):
 62 |         """
 63 |         Evaluate the exact normalized log density. The difference to log_den()
 64 |         is that this method adds the normalizer. This method is not
 65 |         compulsory. Subclasses do not need to override.
 66 |         """
 67 |         raise NotImplementedError()
 68 | 
 69 |     def get_datasource(self):
 70 |         """
 71 |         Return a DataSource that allows sampling from this density.
 72 |         May return None if no DataSource is implemented.
 73 |         Implementation of this method is not enforced in the subclasses.
 74 |         """
 75 |         return None
 76 | 
 77 |     def grad_log(self, X):
 78 |         """
 79 |         Evaluate the gradients (with respect to the input) of the log density at
 80 |         each of the n points in X. This is the score function. Given an
 81 |         implementation of log_den(), this method will automatically work.
 82 |         Subclasses may override this if a more efficient implementation is
 83 |         available.
 84 | 
 85 |         X: n x d numpy array.
 86 | 
 87 |         Return an n x d numpy array of gradients.
 88 |         """
 89 |         g = autograd.elementwise_grad(self.log_den)
 90 |         G = g(X)
 91 |         return G
 92 | 
 93 |     @abstractmethod
 94 |     def dim(self):
 95 |         """
 96 |         Return the dimension of the input.
 97 |         """
 98 |         raise NotImplementedError()
 99 | 
100 | # end UnnormalizedDensity
101 | 
class UDFromCallable(UnnormalizedDensity):
    """
    An UnnormalizedDensity whose log_den() and grad_log() are supplied as
    callable objects.
    """
    def __init__(self, d, flog_den=None, fgrad_log=None):
        """
        At least one of flog_den and fgrad_log is required.
        If only flog_den is given, the gradient is computed automatically
        with autograd.

        d: the dimension of the domain of the density
        flog_den: a callable object (function) implementing the log of an
            unnormalized density. See UnnormalizedDensity.log_den.
        fgrad_log: a callable object (function) implementing the gradient of
            the log of an unnormalized density.
        """
        nothing_given = flog_den is None and fgrad_log is None
        if nothing_given:
            raise ValueError('At least one of {log_den, grad_log} must be specified.')
        self.d = d
        self.flog_den = flog_den
        self.fgrad_log = fgrad_log

    def log_den(self, X):
        """
        Evaluate the user-supplied log density on X.
        Raise ValueError if no log-density callable was provided.
        """
        if self.flog_den is None:
            raise ValueError('log_den callable object is None.')
        return self.flog_den(X)

    def grad_log(self, X):
        """
        Evaluate the score function on X, using the user-supplied gradient if
        available and autograd on the log density otherwise.
        """
        if self.fgrad_log is not None:
            return self.fgrad_log(X)
        # no explicit gradient: differentiate the log density with autograd
        auto_grad = autograd.elementwise_grad(self.flog_den)
        return auto_grad(X)

    def dim(self):
        """Return the dimension d given at construction."""
        return self.d
141 | 
142 | # end UDFromCallable
143 | 
144 | 
class IsotropicNormal(UnnormalizedDensity):
    """
    Unnormalized density of a multivariate normal distribution whose
    covariance is a scalar multiple of the identity.
    """
    def __init__(self, mean, variance):
        """
        mean: a numpy array of length d for the mean
        variance: a positive floating-point number for the variance.
        """
        self.mean = mean
        self.variance = variance

    def log_den(self, X):
        """
        Return -||x - mean||^2 / (2*variance) for each row x of X.
        """
        # squared Euclidean distance of each row of X to the mean
        sq_dist = np.sum((X - self.mean)**2, 1)
        return old_div(-sq_dist, (2.0*self.variance))

    def log_normalized_den(self, X):
        """
        Return the exact log density evaluated with
        scipy.stats.multivariate_normal.
        """
        cov = self.variance*np.eye(self.dim())
        return stats.multivariate_normal.logpdf(X, mean=self.mean, cov=cov)

    def get_datasource(self):
        """Return a data.DSIsotropicNormal with the same mean and variance."""
        return data.DSIsotropicNormal(self.mean, self.variance)

    def dim(self):
        """Return the length of the mean vector."""
        return len(self.mean)
172 | 
173 | 
174 | 
class Normal(UnnormalizedDensity):
    """
    A multivariate normal distribution with a full covariance matrix.
    """
    def __init__(self, mean, cov):
        """
        mean: a numpy array of length d.
        cov: d x d numpy array for the covariance.

        Raise ValueError if any eigenvalue of cov has magnitude <= 1e-7.
        """
        self.mean = mean
        self.cov = cov
        assert mean.shape[0] == cov.shape[0]
        assert cov.shape[0] == cov.shape[1]
        eigvals, eigvecs = np.linalg.eigh(cov)
        near_zero = np.abs(eigvals) <= 1e-7
        if np.any(near_zero):
            raise ValueError('covariance matrix is not full rank.')
        # Precision matrix assembled from the eigendecomposition of cov
        inv_eigvals = np.diag(old_div(1.0, eigvals))
        self.prec = np.dot(np.dot(eigvecs, inv_eigvals), eigvecs.T)

    def log_den(self, X):
        """
        Return -0.5*(x - mean)^T prec (x - mean) for each row x of X.
        """
        centered = X - self.mean
        quad_form = np.sum(np.dot(centered, self.prec)*centered, 1)
        return old_div(-quad_form, 2.0)

    def get_datasource(self):
        """Return a data.DSNormal with the same mean and covariance."""
        return data.DSNormal(self.mean, self.cov)

    def dim(self):
        """Return the length of the mean vector."""
        return len(self.mean)
207 | 
208 | # end Normal
209 | 
class IsoGaussianMixture(UnnormalizedDensity):
    """
    UnnormalizedDensity of a Gaussian mixture in R^d where each component 
    is an isotropic multivariate normal distribution.

    Let k be the number of mixture components.
    """
    def __init__(self, means, variances, pmix=None):
        """
        means: a k x d 2d array specifying the means.
        variances: a one-dimensional length-k array of variances
        pmix: a one-dimensional length-k array of mixture weights. Sum to one.
            If None, uniform weights 1/k are used.

        Raise ValueError if the number of means and variances differ, or if
        the mixture weights do not sum to one (tolerance 1e-8).
        """
        k, d = means.shape
        if k != len(variances):
            raise ValueError('Number of components in means and variances do not match.')

        if pmix is None:
            pmix = old_div(np.ones(k),float(k))

        if np.abs(np.sum(pmix) - 1) > 1e-8:
            raise ValueError('Mixture weights do not sum to 1.')

        self.pmix = pmix
        self.means = means
        self.variances = variances

    def log_den(self, X):
        """
        Log density at the n points in X. This class returns the normalized
        log density (same as log_normalized_den()).
        """
        return self.log_normalized_den(X)

    def log_normalized_den(self, X):
        """
        Evaluate the log of the mixture density at each row of X.

        X: n x d numpy array

        Return a one-dimensional numpy array of length n.
        """
        pmix = self.pmix
        means = self.means
        variances = self.variances
        k, d = self.means.shape
        n = X.shape[0]
        den = np.zeros(n, dtype=float)
        for i in range(k):
            norm_den_i = IsoGaussianMixture.normal_density(means[i],
                    variances[i], X)
            den = den + norm_den_i*pmix[i]
        return np.log(den)

    @staticmethod
    def normal_density(mean, variance, X):
        """
        Exact density (not log density) of an isotropic Gaussian.
        mean: length-d array
        variance: scalar variances
        X: n x d 2d-array

        Return a one-dimensional numpy array of length n.
        """
        d = X.shape[1]
        # Normalizer of N(mean, variance*I_d) is (2*pi*variance)^(d/2).
        # Using sqrt(2*pi*variance) is only correct for d=1; for d>1 it
        # mis-weights components that have different variances.
        Z = (2.0*np.pi*variance)**(0.5*d)
        unden = np.exp(old_div(-np.sum((X-mean)**2.0, 1),(2.0*variance)) )
        den = old_div(unden,Z)
        assert len(den) == X.shape[0]
        return den

    def get_datasource(self):
        """Return a data.DSIsoGaussianMixture with the same parameters."""
        return data.DSIsoGaussianMixture(self.means, self.variances, self.pmix)

    def dim(self):
        """Return d, the number of columns of the means array."""
        k, d = self.means.shape
        return d
289 | 
290 | # end class IsoGaussianMixture
291 | 
class GaussianMixture(UnnormalizedDensity):
    """
    UnnormalizedDensity of a Gaussian mixture in R^d in which every
    component may have an arbitrary covariance matrix. This is the most
    general form of a Gaussian mixture.

    Let k be the number of mixture components.
    """
    def __init__(self, means, variances, pmix=None):
        """
        means: a k x d 2d array specifying the means.
        variances: a k x d x d numpy array containing a stack of k covariance
            matrices, one for each mixture component.
        pmix: a one-dimensional length-k array of mixture weights. Sum to one.
            Uniform weights are used when None.
        """
        n_components, _ = means.shape
        if n_components != variances.shape[0]:
            raise ValueError('Number of components in means and variances do not match.')

        if pmix is None:
            pmix = old_div(np.ones(n_components), float(n_components))

        weight_gap = np.abs(np.sum(pmix) - 1)
        if weight_gap > 1e-8:
            raise ValueError('Mixture weights do not sum to 1.')

        self.pmix = pmix
        self.means = means
        self.variances = variances

    def log_den(self, X):
        """Same as log_normalized_den()."""
        return self.log_normalized_den(X)

    def log_normalized_den(self, X):
        """
        Evaluate the log of the mixture density at each row of X.

        X: n x d numpy array

        Return a one-dimensional numpy array of length n.
        """
        n_components, _ = self.means.shape
        total = np.zeros(X.shape[0], dtype=float)
        # accumulate the weighted component densities one at a time
        for j in range(n_components):
            comp_den = GaussianMixture.multivariate_normal_density(
                    self.means[j], self.variances[j], X)
            total = total + comp_den*self.pmix[j]
        return np.log(total)

    @staticmethod
    def multivariate_normal_density(mean, cov, X):
        """
        Exact density (not log density) of a multivariate Gaussian.
        mean: length-d array
        cov: a dxd covariance matrix
        X: n x d 2d-array
        """
        eigvals, eigvecs = np.linalg.eigh(cov)
        # inverse square root of cov from its eigendecomposition
        inv_sqrt_cov = eigvecs.dot(np.diag(eigvals**(-0.5))).dot(eigvecs.T)
        whitened = np.dot(X-mean, inv_sqrt_cov)
        sq_maha = np.sum(whitened**2, 1)
        unnormalized = np.exp(-0.5*sq_maha)

        normalizer = np.sqrt(np.linalg.det(2.0*np.pi*cov))
        density_vals = unnormalized/normalizer
        assert len(density_vals) == X.shape[0]
        return density_vals

    def get_datasource(self):
        """Return a data.DSGaussianMixture with the same parameters."""
        return data.DSGaussianMixture(self.means, self.variances, self.pmix)

    def dim(self):
        """Return d, the number of columns of the means array."""
        _, d = self.means.shape
        return d
364 | 
365 | # end GaussianMixture
366 | 
class GaussBernRBM(UnnormalizedDensity):
    """
    Gaussian-Bernoulli Restricted Boltzmann Machine.
    The joint density takes the form
        p(x, h) = Z^{-1} exp(0.5*x^T B h + b^T x + c^T h - 0.5||x||^2)
    where h is a vector of {-1, 1}.
    """
    def __init__(self, B, b, c):
        """
        B: a dx x dh matrix 
        b: a numpy array of length dx
        c: a numpy array of length dh
        """
        dh = len(c)
        dx = len(b)
        assert B.shape[0] == dx
        assert B.shape[1] == dh
        assert dx > 0
        assert dh > 0
        self.B = B
        self.b = b
        self.c = c

    def log_den(self, X):
        """
        Evaluate the log of the unnormalized marginal density of x (with h
        summed out) at each row of X.

        X: n x dx numpy array

        Return a one-dimensional numpy array of length n.
        """
        B = self.B
        b = self.b
        c = self.c

        XBC = 0.5*np.dot(X, B) + c
        # log(exp(a) + exp(-a)) computed with logaddexp so that large |a|
        # does not overflow exp() and produce inf.
        log_2cosh = np.logaddexp(XBC, -XBC)
        unden = np.dot(X, b) - 0.5*np.sum(X**2, 1) + np.sum(log_2cosh, 1)
        assert len(unden) == X.shape[0]
        return unden

    def grad_log(self, X):
        """
        Evaluate the gradients (with respect to the input) of the log density at
        each of the n points in X. This is the score function.

        X: n x d numpy array.

        Return an n x d numpy array of gradients.
        """
        Y = 0.5*np.dot(X, self.B) + self.c
        # n x dh. (exp(2y)-1)/(exp(2y)+1) equals tanh(y); calling tanh
        # directly avoids inf/inf = nan when exp(2y) overflows.
        Phi = np.tanh(Y)
        # n x dx
        T = np.dot(Phi, 0.5*self.B.T)
        S = self.b - X + T
        return S

    def get_datasource(self, burnin=2000):
        """
        Return a data.DSGaussBernRBM with the same parameters.

        burnin: forwarded to data.DSGaussBernRBM as its burnin argument.
        """
        return data.DSGaussBernRBM(self.B, self.b, self.c, burnin=burnin)

    def dim(self):
        """Return dx, the length of b."""
        return len(self.b)
430 | 
431 | # end GaussBernRBM
432 | 
class ISIPoissonLinear(UnnormalizedDensity):
    """
    Unnormalized density of inter-arrival times from nonhomogeneous poisson
    process with linear intensity function.
    lambda = 1 + bt
    """
    def __init__(self, b):
        """
        b: slope of the linear function
        """
        warn_bounded_domain(self)
        self.b = b

    def log_den(self, X):
        """
        Return -sum_j (0.5*b*x_j^2 + x_j - log(1 + b*x_j)) for each row x of X.
        """
        slope = self.b
        per_coord = 0.5*slope*X**2 + X - np.log(1.0 + slope*X)
        return -np.sum(per_coord, 1)

    def dim(self):
        """Return 1."""
        return 1
452 | 
453 | # end ISIPoissonLinear
454 | 
class ISIPoissonSine(UnnormalizedDensity):
    """
    Unnormalized density of inter-arrival times from nonhomogeneous poisson
    process with sine intensity function.
    lambda = b*(1+sin(w*X))
    """
    def __init__(self, w=10.0,b=1.0):
        """
        w: the frequency of sine function
        b: amplitude of intensity function
        """
        warn_bounded_domain(self)
        self.b = b
        self.w = w

    def log_den(self, X):
        """
        Return sum_j [ b*(-x_j + (cos(w*x_j) - 1)/w) + log(b*(1 + sin(w*x_j))) ]
        for each row x of X.
        """
        amp = self.b
        freq = self.w
        neg_cumulative = amp*(-X + old_div((np.cos(freq*X)-1), freq))
        log_intensity = np.log(amp*(1+np.sin(freq*X)))
        return np.sum(neg_cumulative + log_intensity, 1)

    def dim(self):
        """Return 1."""
        return 1
477 | 
478 | # end ISIPoissonSine
479 | 
class Gamma(UnnormalizedDensity):
    """
    A gamma distribution.
    The unnormalized density is x^(alpha-1) * exp(-beta*x) in each coordinate.
    """
    def __init__(self, alpha, beta = 1.0):
        """
        alpha: shape parameter
        beta: multiplier of x in the exponent, i.e., it enters log_den() as
            -beta*x (a rate-style parameterization).
        """
        warn_bounded_domain(self)
        self.alpha = alpha 
        self.beta = beta
        
    def log_den(self, X):
        """
        Return sum_j [ -beta*x_j + (alpha-1)*log(x_j) ] for each row x of X.

        X: n x d numpy array

        Return a one-dimensional numpy array of length n.
        """
        alpha = self.alpha
        beta = self.beta
        unden = np.sum(-beta*X + (alpha-1)*np.log(X), 1)
        return unden

    def get_datasource(self):
        """
        No DataSource is implemented for this density; return None, as the
        base class permits.

        The previous implementation built a data.DSNormal from self.mean and
        self.cov, attributes this class never defines, so every call raised
        AttributeError.
        """
        return None

    def dim(self):
        """Return 1."""
        return 1
506 | 
507 | 
class LogGamma(UnnormalizedDensity):
    """
    A gamma distribution with transformed domain.
    t = exp(x),  t in R+,  x in R
    """
    def __init__(self, alpha, beta = 1.0):
        """
        alpha: shape parameter
        beta: multiplier of exp(x) in the exponent, i.e., it enters
            log_den() as -beta*exp(x).
        """
        self.alpha = alpha
        self.beta = beta
        
    def log_den(self, X):
        """
        Return sum_j [ -beta*exp(x_j) + (alpha-1)*x_j + x_j ] for each row x
        of X. 

        X: n x d numpy array

        Return a one-dimensional numpy array of length n.
        """
        alpha = self.alpha
        beta = self.beta
        unden = np.sum(-beta*np.exp(X) + (alpha-1)*X + X , 1)
        return unden

    def get_datasource(self):
        """
        No DataSource is implemented for this density; return None, as the
        base class permits.

        The previous implementation built a data.DSNormal from self.mean and
        self.cov, attributes this class never defines, so every call raised
        AttributeError.
        """
        return None

    def dim(self):
        """Return 1."""
        return 1
533 | # end LogGamma
534 | 
535 | 
536 | 
class ISILogPoissonLinear(UnnormalizedDensity):
    """
    Unnormalized density of inter-arrival times from nonhomogeneous poisson
    process with linear intensity function, evaluated on exp(X).
    lambda = 1 + bt
    """
    def __init__(self, b):
        """
        b: slope of the linear function
        """
        warn_bounded_domain(self)
        self.b = b

    def log_den(self, X):
        """
        With t = exp(x), return
        -sum_j (0.5*b*t_j^2 + t_j - log(1 + b*t_j) - x_j) for each row x of X.
        """
        slope = self.b
        T = np.exp(X)
        per_coord = 0.5*slope*T**2 + T - np.log(1.0 + slope*T) - X
        return -np.sum(per_coord, 1)

    def dim(self):
        """Return 1."""
        return 1

# end ISILogPoissonLinear
558 | 
class ISIPoisson2D(UnnormalizedDensity):
    """
    Unnormalized density of nonhomogeneous spatial poisson process
    """
    def __init__(self):
        """
        lambda_(X,Y) = X^2 + Y^2
        """
        warn_bounded_domain(self)

    def quadratic_intensity(self,X,Y):
        """
        Return -(X^2 + Y^2)*X*Y + 3*log(X^2 + Y^2), evaluated elementwise.

        X, Y: arrays of the same length holding the two coordinates.
        """
        int_intensity = -(X**2+Y**2)*X*Y + 3*np.log(X**2+Y**2)
        return int_intensity

    def log_den(self, X):
        """
        Evaluate quadratic_intensity() on the first two columns of X.

        X: n x 2 numpy array

        Return a one-dimensional numpy array of length n.
        """
        unden = self.quadratic_intensity(X[:,0],X[:,1])
        return unden

    def dim(self):
        """
        Return the dimension of the input. log_den() reads columns 0 and 1
        of X, so the input is two-dimensional (this previously returned 1).
        """
        return 2
579 | 
580 | # end class ISIPoisson2D
581 | 
582 | 
class ISISigmoidPoisson2D(UnnormalizedDensity):
    """
    Unnormalized density of nonhomogeneous spatial poisson process with sigmoid transformation
    """
    def __init__(self, intensity = 'quadratic', w = 1.0, a=1.0):
        """
        lambda_(X,Y) = a* X^2 + Y^2
        X = 1/(1+exp(s))
        Y = 1/(1+exp(t))
        X, Y in [0,1], s,t in R

        intensity: name of the intensity function. Only 'quadratic' is
            implemented.
        w: stored on the instance; not used by the quadratic intensity.
        a: coefficient of X^2 in the quadratic intensity.

        Raise NotImplementedError for intensity='sine' and ValueError for any
        other unknown name.
        """
        warn_bounded_domain(self)
        self.a = a
        self.w = w
        if intensity == 'quadratic':
            self.intensity = self.quadratic_intensity
        elif intensity == 'sine':
            # No sine_intensity method exists on this class. The previous
            # code dereferenced self.sine_intensity here and failed with an
            # opaque AttributeError.
            raise NotImplementedError('sine intensity is not implemented.')
        else:
            raise ValueError('Not intensity function found')

    def sigmoid(self,x):
        """Return 1/(1+exp(x)), evaluated elementwise."""
        sig = old_div(1,(1+np.exp(x)))
        return sig

    def quadratic_intensity(self,s,t):
        """
        Map s and t through sigmoid() to X and Y, and return
        -(a*X^2 + Y^2)*X*Y + 3*(log(a*X^2 + Y^2) + log(X*(X-1)*Y*(Y-1))).
        """
        X = self.sigmoid(s)
        Y = self.sigmoid(t)
        int_intensity = -(self.a*X**2+Y**2)*X*Y + 3*(np.log(self.a*X**2+Y**2)+np.log((X*(X-1)*Y*(Y-1))))
        return int_intensity

    def log_den(self, S):
        """
        Evaluate the intensity function selected at construction on the
        first two columns of S.

        S: n x 2 numpy array

        Return a one-dimensional numpy array of length n.
        """
        # Dispatch through self.intensity so the constructor's choice is
        # honored (previously quadratic_intensity was hard-coded here).
        unden = self.intensity(S[:,0],S[:,1])
        return unden

    def dim(self):
        """
        Return the dimension of the input. log_den() reads columns 0 and 1
        of S, so the input is two-dimensional (this previously returned 1).
        """
        return 2
620 | 
621 | # end class ISISigmoidPoisson2D
622 | 
623 | 
class Poisson2D(UnnormalizedDensity):
    """
    Unnormalized density of nonhomogeneous spatial poisson process
    """
    def __init__(self, w=1.0):
        """
        lambda_(X) = prod_i sin(w*pi*X_i), as computed by lamb_sin().

        w: frequency multiplier inside the sine.
        """
        self.w = w

    def lamb_sin(self, X):
        """
        Return the product over columns of sin(w*pi*X) for each row of X.
        """
        return np.prod(np.sin(self.w*np.pi*X),1)

    def log_den(self, X):
        """
        Return the log of lamb_sin(X).

        X: n x d numpy array

        Return a one-dimensional numpy array of length n.
        """
        # The previous code called self.gmm_den, which does not exist on this
        # class and raised AttributeError. lamb_sin is the only intensity
        # defined here.
        unden = np.log(self.lamb_sin(X))
        return unden

    def dim(self):
        """
        Return the dimension of the input.
        NOTE(review): set to 2 based on the class name (a 2D spatial
        process); this previously returned 1. Confirm against callers.
        """
        return 2
643 |     
class Resample(UnnormalizedDensity):
    """
    Unnormalized density of a real dataset with an estimated intensity
    function. `fit` is the function that evaluates the density of the
    resampled data.
    """
    def __init__(self, fit):
        """
        fit: a callable taking X and returning density values.
        """
        self.fit = fit

    def log_den(self, X):
        """Return the log of fit(X)."""
        density_vals = self.fit(X)
        return np.log(density_vals)

    def dim(self):
        """Return 1."""
        return 1

# end class Resample
660 | 
class GaussCosFreqs(UnnormalizedDensity):
    """
    p(x) \\propto exp(-||x||^2/2sigma^2)*(1+ prod_{i=1}^d cos(w_i*x_i))

    where w1,..wd are frequencies of each dimension.
    sigma^2 is the overall variance.
    """

    def __init__(self, sigma2, freqs):
        """
        sigma2: overall scale of the distribution. A positive scalar.
        freqs: a 1-d array of length d for the frequencies.

        Raise ValueError if sigma2 is not positive.
        """
        if sigma2 <= 0 :
            raise ValueError('sigma2 must be > 0')
        self.sigma2 = sigma2
        self.freqs = freqs

    def log_den(self, X):
        """
        Evaluate the log of the unnormalized density at each row of X:
            -||x||^2/(2*sigma2) + log(1 + prod_i cos(w_i*x_i))

        X: n x d numpy array

        Return a one-dimensional numpy array of length n.
        """
        sigma2 = self.sigma2
        freqs = self.freqs
        # The cosine factor multiplies the Gaussian factor in p(x), so its
        # logarithm is added here. The previous code added the factor itself
        # (1 + prod cos) rather than its log, which is a different density.
        log_gauss = old_div(-np.sum(X**2, 1),(2.0*sigma2))
        log_cos = np.log(1+np.prod(np.cos(X*freqs), 1))
        log_unden = log_gauss + log_cos
        return log_unden

    def dim(self):
        """Return d, the number of frequencies."""
        return len(self.freqs)

    def get_datasource(self):
        """Return a data.DSGaussCosFreqs with the same parameters."""
        return data.DSGaussCosFreqs(self.sigma2, self.freqs)
690 | 
691 | 


--------------------------------------------------------------------------------
/kgof/ex/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wittawatj/kernel-gof/039a95ed9d8062e283da6bd051b7161a190b4876/kgof/ex/__init__.py


--------------------------------------------------------------------------------
/kgof/ex/ex3_vary_nlocs.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Simulation to examine the P(reject) as the number of test locations
  3 | increases.  
  4 | """
  5 | __author__ = 'wittawat'
  6 | 
  7 | import kgof
  8 | import kgof.data as data
  9 | import kgof.glo as glo
 10 | import kgof.density as density
 11 | import kgof.goftest as gof
 12 | import kgof.util as util 
 13 | import kgof.kernel as kernel 
 14 | 
 15 | # need independent_jobs package 
 16 | # https://github.com/karlnapf/independent-jobs
 17 | # The independent_jobs and kgof have to be in the global search path (.bashrc)
 18 | import independent_jobs as inj
 19 | from independent_jobs.jobs.IndependentJob import IndependentJob
 20 | from independent_jobs.results.SingleResult import SingleResult
 21 | from independent_jobs.aggregators.SingleResultAggregator import SingleResultAggregator
 22 | from independent_jobs.engines.BatchClusterParameters import BatchClusterParameters
 23 | from independent_jobs.engines.SerialComputationEngine import SerialComputationEngine
 24 | from independent_jobs.engines.SlurmComputationEngine import SlurmComputationEngine
 25 | from independent_jobs.tools.Log import logger
 26 | import logging
 27 | import math
 28 | #import numpy as np
 29 | import autograd.numpy as np
 30 | import os
 31 | import sys 
 32 | import time
 33 | 
 34 | """
 35 | All the job functions return a dictionary with the following keys:
 36 |     - goftest: test object. (may or may not return)
 37 |     - test_result: the result from calling perform_test(te).
 38 |     - time_secs: run time in seconds 
 39 | """
 40 | 
 41 | def job_fssdq_med(p, data_source, tr, te, r, J, null_sim=None):
 42 |     """
 43 |     FSSD test with a Gaussian kernel, where the test locations are randomized,
 44 |     and the Gaussian width is set with the median heuristic. Use full sample.
 45 |     No training/testing splits.
 46 | 
 47 |     p: an UnnormalizedDensity
 48 |     data_source: a DataSource
 49 |     tr, te: Data
 50 |     r: trial number (positive integer)
 51 |     """
 52 |     if null_sim is None:
 53 |         null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)
 54 | 
 55 |     # full data
 56 |     data = tr + te
 57 |     X = data.data()
 58 |     with util.ContextTimer() as t:
 59 |         # median heuristic 
 60 |         med = util.meddistance(X, subsample=1000)
 61 |         k = kernel.KGauss(med**2)
 62 |         V = util.fit_gaussian_draw(X, J, seed=r+1)
 63 | 
 64 |         fssd_med = gof.FSSD(p, k, V, null_sim=null_sim, alpha=alpha)
 65 |         fssd_med_result = fssd_med.perform_test(data)
 66 |     return { 'test_result': fssd_med_result, 'time_secs': t.secs}
 67 | 
 68 | 
 69 | 
 70 | def job_fssdq_opt(p, data_source, tr, te, r, J, null_sim=None):
 71 |     """
 72 |     FSSD with optimization on tr. Test on te. Use a Gaussian kernel.
 73 |     """
 74 |     if null_sim is None:
 75 |         null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=r)
 76 | 
 77 |     Xtr = tr.data()
 78 |     with util.ContextTimer() as t:
 79 |         # Use grid search to initialize the gwidth
 80 |         n_gwidth_cand = 5
 81 |         gwidth_factors = 2.0**np.linspace(-3, 3, n_gwidth_cand) 
 82 |         med2 = util.meddistance(Xtr, 1000)**2
 83 | 
 84 |         k = kernel.KGauss(med2*2)
 85 |         # fit a Gaussian to the data and draw to initialize V0
 86 |         V0 = util.fit_gaussian_draw(Xtr, J, seed=r+1, reg=1e-6)
 87 |         list_gwidth = np.hstack( ( (med2)*gwidth_factors ) )
 88 |         besti, objs = gof.GaussFSSD.grid_search_gwidth(p, tr, V0, list_gwidth)
 89 |         gwidth = list_gwidth[besti]
 90 |         assert util.is_real_num(gwidth), 'gwidth not real. Was %s'%str(gwidth)
 91 |         assert gwidth > 0, 'gwidth not positive. Was %.3g'%gwidth
 92 |         logging.info('After grid search, gwidth=%.3g'%gwidth)
 93 |         
 94 |         ops = {
 95 |             'reg': 1e-2,
 96 |             'max_iter': 50,
 97 |             'tol_fun': 1e-4,
 98 |             'disp': True,
 99 |             'locs_bounds_frac': 10.0,
100 |             'gwidth_lb': 1e-1,
101 |             'gwidth_ub': 1e3,
102 |             }
103 | 
104 |         V_opt, gwidth_opt, info = gof.GaussFSSD.optimize_locs_widths(p, tr,
105 |                 gwidth, V0, **ops) 
106 |         # Use the optimized parameters to construct a test
107 |         k_opt = kernel.KGauss(gwidth_opt)
108 |         fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
109 |         fssd_opt_result = fssd_opt.perform_test(te)
110 |     return {'test_result': fssd_opt_result, 'time_secs': t.secs, 
111 |             'goftest': fssd_opt, 'opt_info': info,
112 |             }
113 | 
def job_fssdp_opt(p, data_source, tr, te, r, J):
    """
    Same as job_fssdq_opt(), but with a gof.FSSDH0SimCovDraw null simulator.
    The suffix p means that p is sampled to get a sample for computing the
    covariance matrix under H0.
    """
    draw_null_sim = gof.FSSDH0SimCovDraw(n_draw=2000, n_simulate=2000, seed=r)
    result = job_fssdq_opt(p, data_source, tr, te, r, J,
            null_sim=draw_null_sim)
    return result
121 | 
122 | 
123 | # Define our custom Job, which inherits from base class IndependentJob
class Ex3Job(IndependentJob):
    """
    One job of experiment 3: run a single test method (job_func) on one
    problem (p, data_source) for one repetition and one number of test
    locations.

    Relies on the module-level settings sample_size, tr_proportion, alpha and
    ex.
    """
   
    def __init__(self, aggregator, p, data_source,
            prob_label, rep, job_func, n_locs):
        """
        aggregator: the result aggregator passed on to IndependentJob
        p: an UnnormalizedDensity
        data_source: a DataSource to sample the data from
        prob_label: string label of the problem
        rep: repetition (trial) number, used for seeding
        job_func: one of the job_* functions in this module
        n_locs: number of test locations J
        """
        #walltime = 60*59*24 
        walltime = 60*59
        memory = int(tr_proportion*sample_size*1e-2) + 50

        IndependentJob.__init__(self, aggregator, walltime=walltime,
                               memory=memory)
        # p: an UnnormalizedDensity
        self.p = p
        self.data_source = data_source
        self.prob_label = prob_label
        self.rep = rep
        self.job_func = job_func
        self.n_locs = n_locs

    # we need to define the abstract compute method. It has to return an instance
    # of JobResult base class
    def compute(self):
        """
        Sample the data, split it into tr/te, run job_func, submit the
        result to the aggregator and save it with glo.ex_save_result().
        """

        p = self.p
        data_source = self.data_source 
        r = self.rep
        n_locs = self.n_locs
        job_func = self.job_func
        prob_label = self.prob_label
        func_name = job_func.__name__
        # sample_size is a global variable
        data = data_source.sample(sample_size, seed=r)
        with util.ContextTimer() as t:
            tr, te = data.split_tr_te(tr_proportion=tr_proportion, seed=r+21 )
            # Single-line format string: a backslash continuation inside the
            # literal would embed the next line's indentation in the message.
            logger.info("computing. %s. prob=%s, r=%d, J=%d"%(func_name,
                prob_label, r, n_locs))

            job_result = job_func(p, data_source, tr, te, r, n_locs)

            # create ScalarResult instance
            result = SingleResult(job_result)
            # submit the result to my own aggregator
            self.aggregator.submit_result(result)
        # Report the experiment number from the module-level `ex` (the
        # message previously hard-coded "ex2").
        logger.info("done. ex%d: %s, prob=%s, r=%d, J=%d. Took: %.3g s "%(ex,
            func_name, prob_label, r, n_locs, t.secs))

        # save result
        fname = '%s-%s-n%d_r%d_J%d_a%.3f_trp%.2f.p' \
                %(prob_label, func_name, sample_size, r, n_locs, alpha,
                        tr_proportion)
        glo.ex_save_result(ex, job_result, prob_label, fname)
174 | 
175 | 
176 | # This import is needed so that pickle knows about the class Ex3Job.
177 | # pickle is used when collecting the results from the submitted jobs.
178 | from kgof.ex.ex3_vary_nlocs import Ex3Job
179 | from kgof.ex.ex3_vary_nlocs import job_fssdq_med
180 | from kgof.ex.ex3_vary_nlocs import job_fssdq_opt
181 | from kgof.ex.ex3_vary_nlocs import job_fssdp_opt
182 | 
#--- experimental setting -----
# experiment number; passed to glo.ex_save_result() when saving results
ex = 3

# sample size = n (the training and test sizes are n/2)
sample_size = 500

# alpha is passed to gof.FSSD as its alpha argument
# (presumably the significance level of the test -- confirm in goftest)
alpha = 0.05
# proportion of the sample used as the training set (see split_tr_te)
tr_proportion = 0.5
# repetitions for each parameter setting
reps = 300

# list of number of test locations/frequencies J
#Js = [5, 10, 15, 20, 25]
#Js = range(2, 6+1)
#Js = [2**x for x in range(5)]
Js = [2, 8, 32, 96, 384 ]
#Js = [2, 8, 32]

# the test methods (job functions) to run
method_job_funcs = [ job_fssdq_med, job_fssdq_opt, 
        #job_fssdp_opt, 
        ]

# If is_rerun==False, do not rerun the experiment if a result file for the current
# setting already exists.
is_rerun = False
209 | #---------------------------
210 | 
def gaussbern_rbm_tuple(var, dx=50, dh=10, n=sample_size):
    """
    Construct one Gaussian-Bernoulli RBM problem as a (p, DataSource) pair.
    The parameter settings follow section 6 of Liu et al., 2016.

    - var: Gaussian noise variance for perturbing B.
    - dx: observed dimension
    - dh: latent dimension
    - n: not used in this function

    Return (p, ds), where p is a density.GaussBernRBM with the unperturbed B
    and ds is a data.DSGaussBernRBM built from the perturbed B.
    """
    with util.NumpySeedContext(seed=1000):
        # entries of B are -1.0 or 1.0
        weights = np.random.randint(0, 2, (dx, dh))*2 - 1.0
        visible_bias = np.random.randn(dx)
        hidden_bias = np.random.randn(dh)
        model = density.GaussBernRBM(weights, visible_bias, hidden_bias)

        # the data source uses a noisy copy of B
        noise = np.random.randn(dx, dh)*np.sqrt(var)
        perturbed_weights = weights + noise
        source = data.DSGaussBernRBM(perturbed_weights, visible_bias,
                hidden_bias, burnin=50)

    return model, source
233 | 
def get_pqsource(prob_label):
    """
    Return (p, ds), a tuple of
    - p: a Density representing the distribution p
    - ds: a DataSource which generates sample from q.

    - prob_label: a string key identifying the problem. See the keys of
        prob2tuples below.

    Only the requested problem is constructed. Each value in the table is a
    zero-argument factory, so looking up one problem does not pay for building
    all the others (in particular, the Gaussian-Bernoulli RBM problems).

    Raise ValueError if prob_label is not a known problem label.
    """
    prob2tuples = {
            # H0 is true. P = Q = N(0, I). d = 5
            'sg5': lambda: (density.IsotropicNormal(np.zeros(5), 1),
                data.DSIsotropicNormal(np.zeros(5), 1) ),

            # P = N(0, I), Q = N( (0.2,..0), I)
            'gmd5': lambda: (density.IsotropicNormal(np.zeros(5), 1),
                data.DSIsotropicNormal(np.hstack((0.2, np.zeros(4))), 1) ),

            # d = 1. P = N(0, 1), Q = N(0.2, 1)
            'gmd1': lambda: (density.IsotropicNormal(np.zeros(1), 1),
                data.DSIsotropicNormal(np.ones(1)*0.2, 1) ),

            # P = N(0, I), Q = N( (1,..0), I)
            'gmd100': lambda: (density.IsotropicNormal(np.zeros(100), 1),
                data.DSIsotropicNormal(np.hstack((1, np.zeros(99))), 1) ),

            # Gaussian variance difference problem. Only the variance
            # of the first dimension differs (2 instead of 1).
            'gvd5': lambda: (density.Normal(np.zeros(5), np.eye(5) ),
                data.DSNormal(np.zeros(5), np.diag(np.hstack((2, np.ones(4)))) )),

            'gvd10': lambda: (density.Normal(np.zeros(10), np.eye(10) ),
                data.DSNormal(np.zeros(10), np.diag(np.hstack((2, np.ones(9)))) )),

            # Gaussian Bernoulli RBM. dx=50, dh=10. H0 is true
            'gbrbm_dx50_dh10_v0': lambda: gaussbern_rbm_tuple(0,
                dx=50, dh=10, n=sample_size),

            # Gaussian Bernoulli RBM. dx=5, dh=3. H0 is true
            'gbrbm_dx5_dh3_v0': lambda: gaussbern_rbm_tuple(0,
                dx=5, dh=3, n=sample_size),

            # Gaussian Bernoulli RBM. dx=50, dh=10.
            # Perturb with noise variance = 1e-3.
            'gbrbm_dx50_dh10_v1em3': lambda: gaussbern_rbm_tuple(1e-3,
                dx=50, dh=10, n=sample_size),

            # Gaussian Bernoulli RBM. dx=5, dh=3.
            # Perturb with noise variance = 5e-3.
            'gbrbm_dx5_dh3_v5em3': lambda: gaussbern_rbm_tuple(5e-3,
                dx=5, dh=3, n=sample_size),

            # Gaussian mixture of two components. Uniform mixture weights.
            # p = 0.5*N(0, 1) + 0.5*N(3, 0.01)
            # q = 0.5*N(-3, 0.01) + 0.5*N(0, 1)
            'gmm_d1': lambda: (
                density.IsoGaussianMixture(np.array([[0], [3.0]]), np.array([1, 0.01]) ),
                data.DSIsoGaussianMixture(np.array([[-3.0], [0]]), np.array([0.01, 1]) )
                ),

            # p = N(0, I)
            # q = 0.1*N([0,0,..0], 0.0001) + 0.9*N([0,0,..0], 1)
            # NOTE(review): an earlier comment here described the first
            # component as N([-10, 0,..0], 0.001). The code uses a zero mean
            # and 0.0001. Confirm which one is intended.
            'g_vs_gmm_d5': lambda: (
                    density.IsotropicNormal(np.zeros(5), 1),
                    data.DSIsoGaussianMixture(
                        np.vstack(( np.hstack((0.0, np.zeros(4))), np.zeros(5) )),
                        np.array([0.0001, 1]), pmix=[0.1, 0.9] )
                    ),

            # p = N(0, I)
            # q = 0.1*N([0, 0], 0.01) + 0.9*N([0, 0], 1)
            'g_vs_gmm_d2': lambda: (
                    density.IsotropicNormal(np.zeros(2), 1),
                    data.DSIsoGaussianMixture(
                        np.vstack(( np.hstack((0.0, np.zeros(1))), np.zeros(2) )),
                        np.array([0.01, 1]), pmix=[0.1, 0.9] )
                    ),

            # p = N(0, 1)
            # q = 0.1*N(0, 0.01) + 0.9*N(0, 1)
            'g_vs_gmm_d1': lambda: (
                    density.IsotropicNormal(np.zeros(1), 1),
                    data.DSIsoGaussianMixture(np.array([[0.0], [0]]),
                        np.array([0.01, 1]), pmix=[0.1, 0.9] )
                    ),
            }
    if prob_label not in prob2tuples:
        # sorted(..) gives a plain, stable list of labels in the message
        raise ValueError('Unknown problem label. Need to be one of %s'%str(sorted(prob2tuples)) )
    # construct only the requested problem
    return prob2tuples[prob_label]()
313 | 
314 | 
def run_problem(prob_label):
    """
    Run the experiment on the problem identified by prob_label.

    For every combination of (repetition r, number of test locations J,
    method f in method_job_funcs), either load the previously saved result
    file (when is_rerun is False and the file exists), or submit an Ex3Job to
    the Slurm computation engine. After all jobs finish, collect the
    individual results into an array of shape (reps, len(Js), #methods) and
    save it, together with the experiment settings, with glo.ex_save_result.

    - prob_label: a problem label accepted by get_pqsource()

    Return nothing.
    """
    p, ds = get_pqsource(prob_label)

    # ///////  submit jobs //////////
    # create folder name string
    from kgof.config import expr_configs
    tmp_dir = expr_configs['scratch_path']
    foldername = os.path.join(tmp_dir, 'kgof_slurm', 'e%d'%ex)
    logger.info("Setting engine folder to %s" % foldername)

    # create parameter instance that is needed for any batch computation engine
    logger.info("Creating batch parameter instance")
    batch_parameters = BatchClusterParameters(
        foldername=foldername, job_name_base="e%d_"%ex, parameter_prefix="")

    # Use the following line if Slurm queue is not used.
    #engine = SerialComputationEngine()
    engine = SlurmComputationEngine(batch_parameters)
    n_methods = len(method_job_funcs)
    # repetitions x len(Js) x #methods
    aggregators = np.empty((reps, len(Js), n_methods ), dtype=object)
    for r in range(reps):
        for ji, J in enumerate(Js):
            for mi, f in enumerate(method_job_funcs):
                # name used to save the result
                func_name = f.__name__
                fname = '%s-%s-n%d_r%d_J%d_a%.3f_trp%.2f.p' \
                        %(prob_label, func_name, sample_size, r, J, alpha,
                                tr_proportion)
                if not is_rerun and glo.ex_file_exists(ex, prob_label, fname):
                    logger.info('%s exists. Load and return.'%fname)
                    job_result = glo.ex_load_result(ex, prob_label, fname)

                    # wrap the loaded result so that it can be collected
                    # in the same way as a freshly computed one
                    sra = SingleResultAggregator()
                    sra.submit_result(SingleResult(job_result))
                    aggregators[r, ji, mi] = sra
                else:
                    # result not exists or rerun

                    # p: an UnnormalizedDensity object
                    job = Ex3Job(SingleResultAggregator(), p, ds, prob_label,
                            r, f, J)
                    agg = engine.submit_job(job)
                    aggregators[r, ji, mi] = agg

    # let the engine finish its business
    logger.info("Wait for all call in engine")
    engine.wait_for_all()

    # ////// collect the results ///////////
    logger.info("Collecting results")
    job_results = np.empty((reps, len(Js), n_methods), dtype=object)
    for r in range(reps):
        for ji, J in enumerate(Js):
            for mi, f in enumerate(method_job_funcs):
                # J is formatted with %d, as in the result file name above
                logger.info("Collecting result (%s, r=%d, J=%d)" %
                        (f.__name__, r, J))
                # let the aggregator finalize things
                aggregators[r, ji, mi].finalize()

                # get_final_result() returns a SingleResult instance,
                # from which we need to extract the actual result
                job_result = aggregators[r, ji, mi].get_final_result().result
                job_results[r, ji, mi] = job_result

    # save results
    results = {'job_results': job_results, 'data_source': ds,
            'alpha': alpha, 'repeats': reps, 'Js': Js,
            'p': p,
            'tr_proportion': tr_proportion,
            'method_job_funcs': method_job_funcs, 'prob_label': prob_label,
            'sample_size': sample_size,
            }

    # file name of the aggregated result
    fname = 'ex%d-%s-me%d_n%d_rs%d_Jmi%d_Jma%d_a%.3f_trp%.2f.p' \
        %(ex, prob_label, n_methods, sample_size, reps, min(Js),
                max(Js), alpha, tr_proportion)

    glo.ex_save_result(ex, results, fname)
    logger.info('Saved aggregated results to %s'%fname)
402 | 
403 | 
def main():
    """
    Command-line entry point. Expect exactly one argument: the problem label.
    Print a usage message and exit with status 1 otherwise.
    """
    argv = sys.argv
    if len(argv) == 2:
        run_problem(argv[1])
        return

    # wrong number of arguments
    print('Usage: %s problem_label'%argv[0])
    sys.exit(1)
411 | 
412 | if __name__ == '__main__':
413 |     main()
414 | 
415 | 


--------------------------------------------------------------------------------
/kgof/ex/run_ex1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash 
 2 | 
 3 | screen -AdmS ex1_kgof -t tab0 bash 
 4 | # launch each problem in parallell, each in its own screen tab
 5 | # See http://unix.stackexchange.com/questions/74785/how-to-open-tabs-windows-in-gnu-screen-execute-commands-within-each-one
 6 | # http://stackoverflow.com/questions/7120426/invoke-bash-run-commands-inside-new-shell-then-give-control-back-to-user
 7 | 
 8 | #screen -S ex1_kgof -X screen -t tab6 bash -lic "python ex1_vary_n.py gmd_p03_d10_ns"
 9 | 
10 | 
11 | screen -S ex1_kgof -X screen -t tab7 bash -lic "python ex1_vary_n.py gbrbm_dx50_dh10_h0"
12 | screen -S ex1_kgof -X screen -t tab7 bash -lic "python ex1_vary_n.py gbrbm_dx50_dh40_h0"
13 | screen -S ex1_kgof -X screen -t tab7 bash -lic "python ex1_vary_n.py gbrbm_dx50_dh10_vp1"
14 | screen -S ex1_kgof -X screen -t tab7 bash -lic "python ex1_vary_n.py gbrbm_dx50_dh40_vp1"
15 | 
16 | 


--------------------------------------------------------------------------------
/kgof/ex/run_ex2.sh:
--------------------------------------------------------------------------------
#!/bin/bash 
# Launcher for experiment 2 (ex2_prob_params.py).
# Starts a detached GNU screen session named ex2_kgof, then opens one screen
# window per problem label and runs the experiment script in it.
# Comment/uncomment the lines below to choose which problems to run.

# create the detached session with an initial window titled tab0
screen -AdmS ex2_kgof -t tab0 bash 
# launch each problem in parallel, each in its own screen tab
# See http://unix.stackexchange.com/questions/74785/how-to-open-tabs-windows-in-gnu-screen-execute-commands-within-each-one
# http://stackoverflow.com/questions/7120426/invoke-bash-run-commands-inside-new-shell-then-give-control-back-to-user

#screen -S ex2_kgof -X screen -t tab1 bash -lic "python ex2_prob_params.py gmd"
#screen -S ex2_kgof -X screen -t tab3 bash -lic "python ex2_prob_params.py gvinc_d5"
#screen -S ex2_kgof -X screen -t tab3 bash -lic "python ex2_prob_params.py gvsub1_d1"
#screen -S ex2_kgof -X screen -t tab4 bash -lic "python ex2_prob_params.py gmd_d10_ms"
#screen -S ex2_kgof -X screen -t tab5 bash -lic "python ex2_prob_params.py gvd"


#screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh10"
# the only problem currently enabled
screen -S ex2_kgof -X screen -t tab6 bash -lic "python ex2_prob_params.py gbrbm_dx50_dh40"
#screen -S ex2_kgof -X screen -t tab7 bash -lic "python ex2_prob_params.py glaplace"
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/kgof/ex/run_ex3.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash 
 2 | 
 3 | screen -AdmS ex3_kgof -t tab0 bash 
 4 | # launch each problem in parallell, each in its own screen tab
 5 | # See http://unix.stackexchange.com/questions/74785/how-to-open-tabs-windows-in-gnu-screen-execute-commands-within-each-one
 6 | # http://stackoverflow.com/questions/7120426/invoke-bash-run-commands-inside-new-shell-then-give-control-back-to-user
 7 | 
 8 | #screen -S ex2_kgof -X screen -t tab1 bash -lic "python ex2_prob_params.py gmd"
 9 | #screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py gmd1"
10 | #screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py g_vs_gmm_d5"
11 | #screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py g_vs_gmm_d2"
12 | #screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py gvd10"
13 | #screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py gbrbm_dx5_dh3_v0"
14 | #screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py gbrbm_dx5_dh3_v5em3"
15 | 
16 | 
17 | screen -S ex3_kgof -X screen -t tab2 bash -lic "python ex3_vary_nlocs.py sg5"
18 | screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py gvd5"
19 | screen -S ex3_kgof -X screen -t tab3 bash -lic "python ex3_vary_nlocs.py g_vs_gmm_d1"
20 | 


--------------------------------------------------------------------------------
/kgof/glo.py:
--------------------------------------------------------------------------------
  1 | """A global module containing functions for managing the project."""
  2 | 
  3 | from future import standard_library
  4 | standard_library.install_aliases()
  5 | __author__ = 'wittawat'
  6 | 
  7 | import kgof
  8 | import os
  9 | import pickle
 10 | 
 11 | 
 12 | def get_root():
 13 |     """Return the full path to the root of the package"""
 14 |     return os.path.abspath(os.path.dirname(kgof.__file__))
 15 | 
 16 | def result_folder():
 17 |     """Return the full path to the result/ folder containing experimental result 
 18 |     files"""
 19 |     import kgof.config as config
 20 |     results_path = config.expr_configs['expr_results_path']
 21 |     return results_path
 22 |     #return os.path.join(get_root(), 'result')
 23 | 
 24 | def data_folder():
 25 |     """
 26 |     Return the full path to the data folder 
 27 |     """
 28 |     import kgof.config as config
 29 |     data_path = config.expr_configs['data_path']
 30 |     return data_path
 31 |     #return os.path.join(get_root(), 'data')
 32 | 
 33 | def data_file(*relative_path):
 34 |     """
 35 |     Access the file under the data folder. The path is relative to the 
 36 |     data folder
 37 |     """
 38 |     dfolder = data_folder()
 39 |     return os.path.join(dfolder, *relative_path)
 40 | 
 41 | def load_data_file(*relative_path):
 42 |     fpath = data_file(*relative_path)
 43 |     return pickle_load(fpath)
 44 | 
 45 | def ex_result_folder(ex):
 46 |     """Return the full path to the folder containing result files of the specified 
 47 |     experiment. 
 48 |     ex: a positive integer. """
 49 |     rp = result_folder()
 50 |     fpath = os.path.join(rp, 'ex%d'%ex )
 51 |     if not os.path.exists(fpath):
 52 |         create_dirs(fpath)
 53 |     return fpath
 54 | 
 55 | def create_dirs(full_path):
 56 |     """Recursively create the directories along the specified path. 
 57 |     Assume that the path refers to a folder. """
 58 |     if not os.path.exists(full_path):
 59 |         os.makedirs(full_path)
 60 | 
 61 | def ex_result_file(ex, *relative_path ):
 62 |     """Return the full path to the file identified by the relative path as a list 
 63 |     of folders/files under the result folder of the experiment ex. """
 64 |     rf = ex_result_folder(ex)
 65 |     return os.path.join(rf, *relative_path)
 66 | 
 67 | def ex_save_result(ex, result, *relative_path):
 68 |     """Save a dictionary object result for the experiment ex. Serialization is 
 69 |     done with pickle. 
 70 |     EX: ex_save_result(1, result, 'data', 'result.p'). Save under result/ex1/data/result.p 
 71 |     EX: ex_save_result(1, result, 'result.p'). Save under result/ex1/result.p 
 72 |     """
 73 |     fpath = ex_result_file(ex, *relative_path)
 74 |     dir_path = os.path.dirname(fpath)
 75 |     create_dirs(dir_path)
 76 |     # 
 77 |     with open(fpath, 'wb') as f:
 78 |         # expect result to be a dictionary
 79 |         pickle.dump(result, f)
 80 | 
 81 | def ex_load_result(ex, *relative_path):
 82 |     """Load a result identified by the  path from the experiment ex"""
 83 |     fpath = ex_result_file(ex, *relative_path)
 84 |     return pickle_load(fpath)
 85 | 
 86 | def ex_file_exists(ex, *relative_path):
 87 |     """Return true if the result file in under the specified experiment folder
 88 |     exists"""
 89 |     fpath = ex_result_file(ex, *relative_path)
 90 |     return os.path.isfile(fpath)
 91 | 
 92 | def pickle_load(fpath):
 93 |     if not os.path.isfile(fpath):
 94 |         raise ValueError('%s does not exist' % fpath)
 95 | 
 96 |     with open(fpath, 'rb') as f:
 97 |         # expect a dictionary
 98 |         result = pickle.load(f)
 99 |     return result
100 | 
101 | 


--------------------------------------------------------------------------------
/kgof/intertst.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Module containing the two-sample tests of Jitkrittum et al., 2016 (NIPS 2016)
  3 | disguised as goodness-of-fit tests. Require the ability to
  4 | sample from the specified density. This module depends on external packages.
  5 | 
  6 | freqopttest https://github.com/wittawatj/interpretable-test
  7 | 
  8 | """
  9 | 
 10 | from builtins import str
 11 | __author__ = 'wittawat'
 12 | 
 13 | from abc import ABCMeta, abstractmethod
 14 | import autograd
 15 | import autograd.numpy as np
 16 | # Require freqopttest https://github.com/wittawatj/interpretable-test
 17 | import freqopttest.tst as tst
 18 | import freqopttest.data as fdata
 19 | import kgof.data as data
 20 | import kgof.goftest as gof
 21 | import kgof.util as util
 22 | import kgof.kernel as kernel
 23 | import logging
 24 | import matplotlib.pyplot as plt
 25 | 
 26 | import scipy
 27 | import scipy.stats as stats
 28 | 
 29 | 
 30 | class GaussMETest(gof.GofTest):
 31 |     """
 32 |     Goodness-of-fit test by drawing sample from the density p and test with
 33 |     the mean embeddings test of Jitkrittum et al., 2016 (NIPS 2016). Use a
 34 |     Gaussian kernel. Test locations are specified, not optimized. 
 35 | 
 36 |     H0: the sample follows p
 37 |     H1: the sample does not follow p
 38 | 
 39 |     p is specified to the constructor in the form of an UnnormalizedDensity.
 40 |     """
 41 | 
 42 |     def __init__(self, p, gwidth2, test_locs, alpha=0.01, seed=28):
 43 |         """
 44 |         p: an instance of UnnormalizedDensity
 45 |         gwidth2: Gaussian width squared for the Gaussian kernel
 46 |         test_locs: J x d numpy array of J locations to test the difference
 47 |         alpha: significance level 
 48 |         """
 49 |         super(GaussMETest, self).__init__(p, alpha)
 50 |         self.gwidth2 = gwidth2
 51 |         self.test_locs = test_locs
 52 |         self.seed = seed
 53 |         ds = p.get_datasource()
 54 |         if ds is None:
 55 |             raise ValueError('%s test requires a density p which implements get_datasource(', str(GaussMETest))
 56 | 
 57 |         # Construct the ME test
 58 |         metest = tst.MeanEmbeddingTest(test_locs, gwidth2, alpha=alpha)
 59 |         self.metest = metest
 60 | 
 61 |     def perform_test(self, dat):
 62 |         """
 63 |         dat: an instance of Data
 64 |         """
 65 |         with util.ContextTimer() as t:
 66 |             seed = self.seed
 67 |             metest = self.metest
 68 |             p = self.p
 69 | 
 70 |             # Draw sample from p. #sample to draw is the same as that of dat
 71 |             ds = p.get_datasource()
 72 |             p_sample = ds.sample(dat.sample_size(), seed=seed)
 73 | 
 74 |             # Run the two-sample test on p_sample and dat
 75 |             # Make a two-sample test data
 76 |             tst_data = fdata.TSTData(p_sample.data(), dat.data())
 77 |             # Test 
 78 |             results = metest.perform_test(tst_data)
 79 | 
 80 |         results['time_secs'] = t.secs
 81 |         return results
 82 | 
 83 |     def compute_stat(self, dat):
 84 |         metest = self.metest
 85 |         p = self.p
 86 |         # Draw sample from p. #sample to draw is the same as that of dat
 87 |         ds = p.get_datasource()
 88 |         p_sample = ds.sample(dat.sample_size(), seed=self.seed)
 89 | 
 90 |         # Make a two-sample test data
 91 |         tst_data = fdata.TSTData(p_sample.data(), dat.data())
 92 |         s = metest.compute_stat(tst_data)
 93 |         return s
 94 | 
 95 |         
 96 | # end GaussMETest
 97 | 
 98 | class GaussMETestOpt(gof.GofTest):
 99 |     """
100 |     Goodness-of-fit test by drawing sample from the density p and test with
101 |     the mean embeddings test of Jitkrittum et al., 2016 (NIPS 2016). Use a
102 |     Gaussian kernel. 
103 |     
104 |     For each given dataset dat, automatically optimize the test locations and
105 |     the Gaussian width by dividing the dat into two disjoint halves: tr
106 |     (training) and te (test set). The size of tr is specified by tr_proportion.
107 | 
108 |     H0: the sample follows p
109 |     H1: the sample does not follow p
110 | 
111 |     p is specified to the constructor in the form of an UnnormalizedDensity.
112 |     """
113 | 
114 |     def __init__(self, p, n_locs, tr_proportion=0.5, alpha=0.01, seed=29):
115 |         """
116 |         p: an instance of UnnormalizedDensity
117 |         n_locs: number of test locations to use
118 |         tr_proportion: proportion of the training set. A number in (0, 1).
119 |         alpha: significance level 
120 |         """
121 |         super(GaussMETestOpt, self).__init__(p, alpha)
122 |         if tr_proportion <= 0 or tr_proportion >= 1:
123 |             raise ValueError('tr_proportion must be between 0 and 1 (exclusive)')
124 |         self.n_locs = n_locs
125 |         self.tr_proportion = tr_proportion
126 |         self.seed = seed
127 |         ds = p.get_datasource()
128 |         if ds is None:
129 |             raise ValueError('%s test requires a density p which implements get_datasource(', str(GaussMETest))
130 | 
131 |     def perform_test(self, dat, op=None, return_metest=False):
132 |         """
133 |         dat: an instance of Data
134 |         op: a dictionary specifying options for the optimization of the ME test.
135 |             Can be None (use default).
136 |         """
137 | 
138 |         with util.ContextTimer() as t:
139 |             metest, tr_tst_data, te_tst_data = self._get_metest_opt(dat, op)
140 | 
141 |             # Run the two-sample test 
142 |             results = metest.perform_test(te_tst_data)
143 | 
144 |         results['time_secs'] = t.secs
145 |         if return_metest:
146 |             results['metest'] = metest
147 |         return results
148 | 
149 |     def _get_metest_opt(self, dat, op=None):
150 |         seed = self.seed
151 |         if op is None:
152 |             op = {'n_test_locs': self.n_locs, 'seed': seed+5, 'max_iter': 100, 
153 |                  'batch_proportion': 1.0, 'locs_step_size': 1.0, 
154 |                   'gwidth_step_size': 0.1, 'tol_fun': 1e-4, 'reg':1e-6}
155 |         seed = self.seed
156 |         alpha = self.alpha
157 |         p = self.p
158 |         # Draw sample from p. #sample to draw is the same as that of dat
159 |         ds = p.get_datasource()
160 |         p_sample = ds.sample(dat.sample_size(), seed=seed)
161 |         xtr, xte = p_sample.split_tr_te(tr_proportion=self.tr_proportion, seed=seed+18)
162 |         # ytr, yte are of type data.Data
163 |         ytr, yte = dat.split_tr_te(tr_proportion=self.tr_proportion, seed=seed+12)
164 | 
165 |         # training and test data
166 |         tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
167 |         te_tst_data = fdata.TSTData(xte.data(), yte.data())
168 | 
169 |         # Train the ME test
170 |         V_opt, gw2_opt, _ = tst.MeanEmbeddingTest.optimize_locs_width(tr_tst_data, alpha, **op)
171 |         metest = tst.MeanEmbeddingTest(V_opt, gw2_opt, alpha)
172 |         return metest, tr_tst_data, te_tst_data
173 | 
174 |     def compute_stat(self, dat, op=None):
175 |         metest, tr_tst_data, te_tst_data = self._get_metest_opt(dat, op)
176 | 
177 |         # Make a two-sample test data
178 |         s = metest.compute_stat(te_tst_data)
179 |         return s
180 | 
181 | 


--------------------------------------------------------------------------------
/kgof/mmd.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Module containing the MMD two-sample test of Gretton et al., 2012 
  3 | "A Kernel Two-Sample Test" disguised as goodness-of-fit tests. Require the
  4 | ability to sample from the specified density. This module depends on an external
  5 | package
  6 | 
  7 | freqopttest https://github.com/wittawatj/interpretable-test
  8 | 
  9 | providing an implementation to the MMD test.
 10 | 
 11 | """
 12 | 
 13 | from builtins import str
 14 | __author__ = 'wittawat'
 15 | 
 16 | from abc import ABCMeta, abstractmethod
 17 | import autograd
 18 | import autograd.numpy as np
 19 | # Require freqopttest https://github.com/wittawatj/interpretable-test
 20 | import freqopttest.tst as tst
 21 | import freqopttest.data as fdata
 22 | import kgof.data as data
 23 | import kgof.goftest as gof
 24 | import kgof.util as util
 25 | import kgof.kernel as kernel
 26 | import logging
 27 | import matplotlib.pyplot as plt
 28 | 
 29 | import scipy
 30 | import scipy.stats as stats
 31 | 
 32 | class QuadMMDGof(gof.GofTest):
 33 |     """
 34 |     Goodness-of-fit test by drawing sample from the density p and test with
 35 |     the MMD test of Gretton et al., 2012. 
 36 | 
 37 |     H0: the sample follows p
 38 |     H1: the sample does not follow p
 39 | 
 40 |     p is specified to the constructor in the form of an UnnormalizedDensity.
 41 |     """
 42 | 
 43 |     def __init__(self, p, k, n_permute=400, alpha=0.01, seed=28):
 44 |         """
 45 |         p: an instance of UnnormalizedDensity
 46 |         k: an instance of Kernel
 47 |         n_permute: number of times to permute the samples to simulate from the 
 48 |             null distribution (permutation test)
 49 |         alpha: significance level 
 50 |         seed: random seed
 51 |         """
 52 |         super(QuadMMDGof, self).__init__(p, alpha)
 53 |         # Construct the MMD test
 54 |         self.mmdtest = tst.QuadMMDTest(k, n_permute=n_permute, alpha=alpha)
 55 |         self.k = k
 56 |         self.seed = seed
 57 |         ds = p.get_datasource()
 58 |         if ds is None:
 59 |             raise ValueError('%s test requires a density p which implements get_datasource(', str(QuadMMDGof))
 60 | 
 61 | 
 62 |     def perform_test(self, dat):
 63 |         """
 64 |         dat: an instance of Data
 65 |         """
 66 |         with util.ContextTimer() as t:
 67 |             seed = self.seed
 68 |             mmdtest = self.mmdtest
 69 |             p = self.p
 70 | 
 71 |             # Draw sample from p. #sample to draw is the same as that of dat
 72 |             ds = p.get_datasource()
 73 |             p_sample = ds.sample(dat.sample_size(), seed=seed+12)
 74 | 
 75 |             # Run the two-sample test on p_sample and dat
 76 |             # Make a two-sample test data
 77 |             tst_data = fdata.TSTData(p_sample.data(), dat.data())
 78 |             # Test 
 79 |             results = mmdtest.perform_test(tst_data)
 80 | 
 81 |         results['time_secs'] = t.secs
 82 |         return results
 83 | 
 84 |     def compute_stat(self, dat):
 85 |         mmdtest = self.mmdtest
 86 |         p = self.p
 87 |         # Draw sample from p. #sample to draw is the same as that of dat
 88 |         ds = p.get_datasource()
 89 |         p_sample = ds.sample(dat.sample_size(), seed=self.seed)
 90 | 
 91 |         # Make a two-sample test data
 92 |         tst_data = fdata.TSTData(p_sample.data(), dat.data())
 93 |         s = mmdtest.compute_stat(tst_data)
 94 |         return s
 95 | 
 96 |         
 97 | # end QuadMMDGof
 98 | 
 99 | class QuadMMDGofOpt(gof.GofTest):
100 |     """
101 |     Goodness-of-fit test by drawing sample from the density p and test with the
102 |     MMD test of Gretton et al., 2012. Optimize the kernel by the power
103 |     criterion as in Sutherland et al., 2016. Need to split the data into
104 |     training and test sets.
105 | 
106 |     H0: the sample follows p
107 |     H1: the sample does not follow p
108 | 
109 |     p is specified to the constructor in the form of an UnnormalizedDensity.
110 |     """
111 | 
112 |     def __init__(self, p, n_permute=400, alpha=0.01, seed=28):
113 |         """
114 |         p: an instance of UnnormalizedDensity
115 |         k: an instance of Kernel
116 |         n_permute: number of times to permute the samples to simulate from the 
117 |             null distribution (permutation test)
118 |         alpha: significance level 
119 |         seed: random seed
120 |         """
121 |         super(QuadMMDGofOpt, self).__init__(p, alpha)
122 |         self.n_permute = n_permute
123 |         self.seed = seed
124 |         ds = p.get_datasource()
125 |         if ds is None:
126 |             raise ValueError('%s test requires a density p which implements get_datasource(', str(QuadMMDGof))
127 | 
128 | 
129 |     def perform_test(self, dat, candidate_kernels=None, return_mmdtest=False,
130 |             tr_proportion=0.2, reg=1e-3):
131 |         """
132 |         dat: an instance of Data
133 |         candidate_kernels: a list of Kernel's to choose from
134 |         tr_proportion: proportion of sample to be used to choosing the best
135 |             kernel
136 |         reg: regularization parameter for the test power criterion 
137 |         """
138 |         with util.ContextTimer() as t:
139 |             seed = self.seed
140 |             p = self.p
141 |             ds = p.get_datasource()
142 |             p_sample = ds.sample(dat.sample_size(), seed=seed+77)
143 |             xtr, xte = p_sample.split_tr_te(tr_proportion=tr_proportion, seed=seed+18)
144 |             # ytr, yte are of type data.Data
145 |             ytr, yte = dat.split_tr_te(tr_proportion=tr_proportion, seed=seed+12)
146 | 
147 |             # training and test data
148 |             tr_tst_data = fdata.TSTData(xtr.data(), ytr.data())
149 |             te_tst_data = fdata.TSTData(xte.data(), yte.data())
150 | 
151 |             if candidate_kernels is None:
152 |                 # Assume a Gaussian kernel. Construct a list of 
153 |                 # kernels to try based on multiples of the median heuristic
154 |                 med = util.meddistance(tr_tst_data.stack_xy(), 1000)
155 |                 list_gwidth = np.hstack( ( (med**2) *(2.0**np.linspace(-4, 4, 10) ) ) )
156 |                 list_gwidth.sort()
157 |                 candidate_kernels = [kernel.KGauss(gw2) for gw2 in list_gwidth]
158 | 
159 |             alpha = self.alpha
160 | 
161 |             # grid search to choose the best Gaussian width
162 |             besti, powers = tst.QuadMMDTest.grid_search_kernel(tr_tst_data,
163 |                     candidate_kernels, alpha, reg=reg)
164 |             # perform test 
165 |             best_ker = candidate_kernels[besti]
166 |             mmdtest = tst.QuadMMDTest(best_ker, self.n_permute, alpha=alpha)
167 |             results = mmdtest.perform_test(te_tst_data)
168 |             if return_mmdtest:
169 |                 results['mmdtest'] = mmdtest
170 | 
171 |         results['time_secs'] = t.secs
172 |         return results
173 | 
174 |     def compute_stat(self, dat):
175 |         raise NotImplementedError('Not implemented yet.')
176 | 
177 |         
178 | # end QuadMMDGofOpt
179 | 


--------------------------------------------------------------------------------
/kgof/plot.py:
--------------------------------------------------------------------------------
  1 | """Module containing convenient functions for plotting"""
  2 | 
  3 | from builtins import range
  4 | from builtins import object
  5 | __author__ = 'wittawat'
  6 | 
  7 | import kgof.glo as glo
  8 | import matplotlib
  9 | import matplotlib.pyplot as plt
 10 | import autograd.numpy as np
 11 | 
 12 | 
 13 | def get_func_tuples():
 14 |     """
 15 |     Return a list of tuples where each tuple is of the form
 16 |         (func_name used in the experiments, label name, plot line style)
 17 |     """
 18 |     func_tuples = [
 19 |             ('job_fssdJ1q_med', 'FSSD-rand J1', 'r--^'),
 20 |             ('job_fssdJ5q_med', 'FSSD-rand', 'r--^'),
 21 |             ('job_fssdq_med', 'FSSD-rand', 'r--^'),
 22 | 
 23 |             ('job_fssdJ1q_opt', 'FSSD-opt J1', 'r-s'),
 24 |             ('job_fssdq_opt', 'FSSD-opt', 'r-s'),
 25 |             ('job_fssdJ5q_opt', 'FSSD-opt', 'r-s'),
 26 |             ('job_fssdJ5q_imq_optv', 'FSSD-IMQv', 'k-h'),
 27 |             ('job_fssdJ5q_imqb1_optv', 'FSSD-IMQ-1', 'k--s'),
 28 |             ('job_fssdJ5q_imqb2_optv', 'FSSD-IMQ-2', 'k-->'),
 29 |             ('job_fssdJ5q_imqb3_optv', 'FSSD-IMQ-3', 'k-.*'),
 30 |             ('job_fssdJ5q_imq_opt', 'FSSD-IMQ', 'y-x'),
 31 |             ('job_fssdJ5q_imq_optbv', 'FSSD-IMQ-bv', 'y--d'),
 32 |             ('job_fssdJ10q_opt', 'FSSD-opt', 'k-s'),
 33 | 
 34 |             ('job_fssdJ5p_opt', 'FSSD-opt J5', 'm-s'),
 35 |             ('job_fssdp_opt', 'FSSDp-opt', 'm-s'),
 36 |             ('job_fssdJ10p_opt', 'FSSDp-opt J10', 'k-s'),
 37 | 
 38 |             ('job_fssdJ1q_opt2', 'FSSD-opt2 J1', 'b-^'),
 39 |             ('job_fssdJ5q_opt2', 'FSSD-opt2 J5', 'r-^'),
 40 |             ('job_me_opt', 'ME-opt', 'b-d'),
 41 | 
 42 |             ('job_kstein_med', 'KSD', 'g-o'),
 43 |             ('job_kstein_imq', 'KSD-IMQ', 'c-*'),
 44 |             ('job_lin_kstein_med', 'LKS', 'g-.h'),
 45 |             ('job_mmd_med', 'MMD', 'm--^'),
 46 |             ('job_mmd_opt', 'MMD-opt', 'm-<'),
 47 |             ('job_mmd_dgauss_opt', 'MMD-dopt', 'y-<'),
 48 |             ]
 49 |     return func_tuples
 50 | 
 51 | def set_default_matplotlib_options():
 52 |     # font options
 53 |     font = {
 54 |     #     'family' : 'normal',
 55 |         #'weight' : 'bold',
 56 |         'size'   : 30
 57 |     }
 58 |     matplotlib.rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
 59 | 
 60 | 
 61 |     # matplotlib.use('cairo')
 62 |     matplotlib.rc('text', usetex=True)
 63 |     matplotlib.rcParams['text.usetex'] = True
 64 |     plt.rc('font', **font)
 65 |     plt.rc('lines', linewidth=3, markersize=10)
 66 |     # matplotlib.rcParams['ps.useafm'] = True
 67 |     # matplotlib.rcParams['pdf.use14corefonts'] = True
 68 | 
 69 |     matplotlib.rcParams['pdf.fonttype'] = 42
 70 |     matplotlib.rcParams['ps.fonttype'] = 42
 71 | 
 72 | def get_func2label_map():
 73 |     # map: job_func_name |-> plot label
 74 |     func_tuples = get_func_tuples()
 75 |     #M = {k:v for (k,v) in zip(func_names, labels)}
 76 |     M = {k:v for (k,v,_) in func_tuples}
 77 |     return M
 78 | 
 79 | 
 80 | def func_plot_fmt_map():
 81 |     """
 82 |     Return a map from job function names to matplotlib plot styles 
 83 |     """
 84 |     # line_styles = ['o-', 'x-',  '*-', '-_', 'D-', 'h-', '+-', 's-', 'v-', 
 85 |     #               ',-', '1-']
 86 |     func_tuples = get_func_tuples()
 87 |     M = {k:v for (k, _, v) in func_tuples}
 88 |     return M
 89 | 
 90 | 
 91 | class PlotValues(object):
 92 |     """
 93 |     An object encapsulating values of a plot where there are many curves, 
 94 |     each corresponding to one method, with common x-axis values.
 95 |     """
 96 |     def __init__(self, xvalues, methods, plot_matrix):
 97 |         """
 98 |         xvalues: 1d numpy array of x-axis values
 99 |         methods: a list of method names
100 |         plot_matrix: len(methods) x len(xvalues) 2d numpy array containing 
101 |             values that can be used to plot
102 |         """
103 |         self.xvalues = xvalues
104 |         self.methods = methods
105 |         self.plot_matrix = plot_matrix
106 | 
107 |     def ascii_table(self, tablefmt="pipe"):
108 |         """
109 |         Return an ASCII string representation of the table.
110 | 
111 |         tablefmt: "plain", "fancy_grid", "grid", "simple" might be useful.
112 |         """
113 |         methods = self.methods
114 |         xvalues = self.xvalues
115 |         plot_matrix = self.plot_matrix
116 | 
117 |         import tabulate
118 |         # https://pypi.python.org/pypi/tabulate
119 |         aug_table = np.hstack((np.array(methods)[:, np.newaxis], plot_matrix))
120 |         return tabulate.tabulate(aug_table, xvalues, tablefmt=tablefmt)
121 | 
122 | # end of class PlotValues
123 | 
def plot_prob_reject(ex, fname, func_xvalues, xlabel, func_title=None, 
        return_plot_values=False):
    """
    Plot the empirical probability that the statistic is above the threshold.
    This can be interpreted as type-1 error (when H0 is true) or test power 
    (when H1 is true). The plot is against the specified x-axis.

    - ex: experiment number 
    - fname: file name of the aggregated result
    - func_xvalues: function taking aggregated results dictionary and return the values 
        to be used for the x-axis values.            
    - xlabel: label of the x-axis. 
    - func_title: a function: results dictionary -> title of the plot.
        If None, a default title is built from results['prob_label'], the
        number of trials and results['alpha'].
    - return_plot_values: if true, also return a PlotValues as the second
      output value.

    Return the loaded results, or the tuple (results, PlotValues) when
    return_plot_values is true.
    """
    results = glo.ex_load_result(ex, fname)

    def rej_accessor(jr):
        # When used with vectorize(), making the value float will make the
        # resulting numpy array to be of float. nan values can be stored.
        return float(jr['test_result']['h0_rejected'])

    # results['job_results'] is a 3d array whose first axis is the trial
    # and last axis the method; each entry is a dictionary having a
    # 'test_result' key.
    job_results = results['job_results']
    rejs = np.vectorize(rej_accessor)(job_results)
    repeats, _, n_methods = job_results.shape

    # yvalues (corresponding to xvalues) x #methods
    mean_rejs = np.mean(rejs, axis=0)

    xvalues = func_xvalues(results)

    line_styles = func_plot_fmt_map()
    method_labels = get_func2label_map()

    func_names = [f.__name__ for f in results['method_job_funcs']]
    plotted_methods = []
    for i in range(n_methods):
        fmt = line_styles[func_names[i]]
        method_label = method_labels[func_names[i]]
        plotted_methods.append(method_label)
        plt.plot(xvalues, mean_rejs[:, i], fmt, label=method_label)

    ylabel = 'Rejection rate'
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.xticks(np.hstack((xvalues) ))

    alpha = results['alpha']
    plt.legend(loc='best')
    title = '%s. %d trials. $\\alpha$ = %.2g.'%( results['prob_label'],
            repeats, alpha) if func_title is None else func_title(results)
    plt.title(title)
    plt.grid()
    if return_plot_values:
        return results, PlotValues(xvalues=xvalues, methods=plotted_methods,
                plot_matrix=mean_rejs.T)
    else:
        return results
210 |         
211 | 
def plot_runtime(ex, fname, func_xvalues, xlabel, func_title=None):
    """
    Plot the average runtime of each method against the specified x-axis,
    with error bars of one standard deviation over the trials. The y-axis
    is in log scale.

    - ex: experiment number 
    - fname: file name of the aggregated result
    - func_xvalues: function taking aggregated results dictionary and return the values 
        to be used for the x-axis values.            
    - xlabel: label of the x-axis. 
    - func_title: a function: results dictionary -> title of the plot.
        If None, a default title is built from results['prob_label'] and
        the number of trials.

    Return loaded results
    """
    results = glo.ex_load_result(ex, fname)
    value_accessor = lambda job_results: job_results['time_secs']
    # results['job_results'] is a 3d array whose first axis is the trial
    # and last axis the method; each entry is a dictionary having a
    # 'time_secs' key.
    job_results = results['job_results']
    times = np.vectorize(value_accessor)(job_results)
    repeats, _, n_methods = job_results.shape
    time_avg = np.mean(times, axis=0)
    time_std = np.std(times, axis=0)

    xvalues = func_xvalues(results)

    line_styles = func_plot_fmt_map()
    method_labels = get_func2label_map()

    func_names = [f.__name__ for f in results['method_job_funcs']]
    for i in range(n_methods):
        fmt = line_styles[func_names[i]]
        method_label = method_labels[func_names[i]]
        plt.errorbar(xvalues, time_avg[:, i], yerr=time_std[:,i], fmt=fmt,
                label=method_label)

    ylabel = 'Time (s)'
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.xlim([np.min(xvalues), np.max(xvalues)])
    plt.xticks( xvalues, xvalues )
    plt.legend(loc='best')
    plt.gca().set_yscale('log')
    title = '%s. %d trials. '%( results['prob_label'],
            repeats ) if func_title is None else func_title(results)
    plt.title(title)
    #plt.grid()
    return results
252 | 
253 | 
def box_meshgrid(func, xbound, ybound, nx=50, ny=50):
    """
    Build a regular grid (to be used with a contour plot) over the box
    xbound x ybound and evaluate [func]: (n x 2) -> n at every grid point.
    
    - xbound: a tuple (xmin, xmax)
    - ybound: a tuple (ymin, ymax)
    - nx: number of points to evaluate in the x direction
    - ny: number of points to evaluate in the y direction
    
    return XX, YY, ZZ where each is a 2D nd-array of size ny x nx.
    XX, YY hold the grid coordinates and ZZ the values of func.
    """
    x_lo, x_hi = xbound
    y_lo, y_hi = ybound
    x_cands = np.linspace(x_lo, x_hi, nx)
    y_cands = np.linspace(y_lo, y_hi, ny)
    XX, YY = np.meshgrid(x_cands, y_cands)

    # ny x nx x 2, flattened to (#grid points) x 2 before calling func
    grid_points = np.reshape(np.dstack((XX, YY)), (-1, 2))
    ZZ = np.reshape(func(grid_points), (ny, nx))

    assert XX.shape[0] == ny
    assert XX.shape[1] == nx
    assert np.all(XX.shape == YY.shape)

    return XX, YY, ZZ
285 | 
def get_density_cmap():
    """
    Return a colormap for plotting the model density p, going from
    white (very low density) to red (high density).
    """
    # Matplotlib's 'Reds' colors with two completely white entries put in front
    white = (1, 1, 1)
    colors = [white, white] + list(plt.cm.datad['Reds'])
    return matplotlib.colors.LinearSegmentedColormap.from_list("my_Reds", colors)
300 | 


--------------------------------------------------------------------------------
/kgof/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wittawatj/kernel-gof/039a95ed9d8062e283da6bd051b7161a190b4876/kgof/test/__init__.py


--------------------------------------------------------------------------------
/kgof/test/test_density.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Module for testing density module.
 3 | """
 4 | 
 5 | __author__ = 'wittawat'
 6 | 
 7 | import numpy as np
 8 | import matplotlib.pyplot as plt
 9 | import kgof.data as data
10 | import kgof.density as density
11 | import kgof.util as util
12 | import kgof.kernel as kernel
13 | import kgof.goftest as gof
14 | import kgof.glo as glo
15 | import scipy.stats as stats
16 | 
17 | import unittest
18 | 
19 | 
20 | class TestIsotropicNormal(unittest.TestCase):
21 |     def setUp(self):
22 |         pass
23 | 
24 | 
25 |     def test_log_den(self):
26 |         n = 7
27 |         with util.NumpySeedContext(seed=16):
28 |             for d in [3, 1]:
29 |                 variance = 1.1
30 |                 mean = np.random.randn(d)
31 |                 X = np.random.rand(n, d) + 1
32 | 
33 |                 isonorm = density.IsotropicNormal(mean, variance)
34 |                 log_dens = isonorm.log_den(X)
35 |                 my_log_dens = -np.sum((X-mean)**2, 1)/(2.0*variance)
36 | 
37 |                 # check correctness 
38 |                 np.testing.assert_almost_equal(log_dens, my_log_dens)
39 | 
40 |     def test_grad_log(self):
41 |         n = 8
42 |         with util.NumpySeedContext(seed=17):
43 |             for d in [4, 1]:
44 |                 variance = 1.2
45 |                 mean = np.random.randn(d) + 1
46 |                 X = np.random.rand(n, d) - 2 
47 | 
48 |                 isonorm = density.IsotropicNormal(mean, variance)
49 |                 grad_log = isonorm.grad_log(X)
50 |                 my_grad_log = -(X-mean)/variance
51 | 
52 |                 # check correctness 
53 |                 np.testing.assert_almost_equal(grad_log, my_grad_log)
54 | 
55 |     def tearDown(self):
56 |         pass
57 | 
58 | 
59 | class TestGaussianMixture(unittest.TestCase):
60 | 
61 |     def test_multivariate_normal_density(self):
62 |         for i in range(4):
63 |             with util.NumpySeedContext(seed=i+8):
64 |                 d = i + 2
65 |                 cov = stats.wishart(df=10+d, scale=np.eye(d)).rvs(size=1)
66 |                 mean = np.random.randn(d)
67 |                 X = np.random.randn(11, d)
68 |                 den_estimate = density.GaussianMixture.multivariate_normal_density(mean, cov, X)
69 | 
70 |                 mnorm = stats.multivariate_normal(mean=mean, cov=cov)
71 |                 den_truth = mnorm.pdf(X)
72 | 
73 |                 np.testing.assert_almost_equal(den_estimate, den_truth)
74 | 
75 | 
if __name__ == '__main__':
    # run all tests of this module when it is executed as a script
    unittest.main()
78 | 
79 | 


--------------------------------------------------------------------------------
/kgof/test/test_goftest.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Module for testing goftest module.
  3 | """
  4 | 
  5 | __author__ = 'wittawat'
  6 | 
  7 | import numpy as np
  8 | import numpy.testing as testing
  9 | import matplotlib.pyplot as plt
 10 | import kgof.data as data
 11 | import kgof.density as density
 12 | import kgof.util as util
 13 | import kgof.kernel as kernel
 14 | import kgof.goftest as gof
 15 | import kgof.glo as glo
 16 | import scipy.stats as stats
 17 | 
 18 | import unittest
 19 | 
 20 | 
 21 | class TestFSSD(unittest.TestCase):
 22 |     def setUp(self):
 23 |         pass
 24 | 
 25 |     def test_basic(self):
 26 |         """
 27 |         Nothing special. Just test basic things.
 28 |         """
 29 |         seed = 12
 30 |         # sample
 31 |         n = 100
 32 |         alpha = 0.01
 33 |         for d in [1, 4]:
 34 |             mean = np.zeros(d)
 35 |             variance = 1
 36 |             isonorm = density.IsotropicNormal(mean, variance)
 37 | 
 38 |             # only one dimension of the mean is shifted
 39 |             #draw_mean = mean + np.hstack((1, np.zeros(d-1)))
 40 |             draw_mean = mean +0
 41 |             draw_variance = variance + 1
 42 |             X = util.randn(n, d, seed=seed)*np.sqrt(draw_variance) + draw_mean
 43 |             dat = data.Data(X)
 44 | 
 45 |             # Test
 46 |             for J in [1, 3]:
 47 |                 sig2 = util.meddistance(X, subsample=1000)**2
 48 |                 k = kernel.KGauss(sig2)
 49 | 
 50 |                 # random test locations
 51 |                 V = util.fit_gaussian_draw(X, J, seed=seed+1)
 52 |                 null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
 53 |                 fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)
 54 | 
 55 |                 tresult = fssd.perform_test(dat, return_simulated_stats=True)
 56 | 
 57 |                 # assertions
 58 |                 self.assertGreaterEqual(tresult['pvalue'], 0)
 59 |                 self.assertLessEqual(tresult['pvalue'], 1)
 60 | 
 61 |     def test_optimized_fssd(self):
 62 |         """
 63 |         Test FSSD test with parameter optimization.
 64 |         """
 65 |         seed = 4
 66 |         # sample size
 67 |         n = 179 
 68 |         alpha = 0.01
 69 |         for d in [1, 3]:
 70 |             mean = np.zeros(d)
 71 |             variance = 1.0
 72 |             p = density.IsotropicNormal(mean, variance)
 73 |             # Mean difference. obvious reject
 74 |             ds = data.DSIsotropicNormal(mean+4, variance+0)
 75 |             dat = ds.sample(n, seed=seed)
 76 |             # test 
 77 |             for J in [1, 4]:
 78 |                 opts = {
 79 |                     'reg': 1e-2,
 80 |                     'max_iter': 10, 
 81 |                     'tol_fun':1e-3, 
 82 |                     'disp':False
 83 |                 }
 84 |                 tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed+1)
 85 | 
 86 |                 Xtr = tr.X
 87 |                 gwidth0 = util.meddistance(Xtr, subsample=1000)**2
 88 |                 # random test locations
 89 |                 V0 = util.fit_gaussian_draw(Xtr, J, seed=seed+1)
 90 |                 V_opt, gw_opt, opt_result = \
 91 |                 gof.GaussFSSD.optimize_locs_widths(p, tr, gwidth0, V0, **opts)
 92 | 
 93 |                 # construct a test
 94 |                 k_opt = kernel.KGauss(gw_opt)
 95 |                 null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
 96 |                 fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
 97 |                 fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
 98 |                 assert fssd_opt_result['h0_rejected']
 99 | 
100 |     def test_auto_init_opt_fssd(self):
101 |         """
102 |         Test FSSD-opt test with automatic parameter initialization.
103 |         """
104 |         seed = 5
105 |         # sample size
106 |         n = 191 
107 |         alpha = 0.01
108 |         for d in [1, 4]:
109 |             mean = np.zeros(d)
110 |             variance = 1.0
111 |             p = density.IsotropicNormal(mean, variance)
112 |             # Mean difference. obvious reject
113 |             ds = data.DSIsotropicNormal(mean+4, variance+0)
114 |             dat = ds.sample(n, seed=seed)
115 |             # test 
116 |             for J in [1, 3]:
117 |                 opts = {
118 |                     'reg': 1e-2,
119 |                     'max_iter': 10, 
120 |                     'tol_fun': 1e-3, 
121 |                     'disp':False
122 |                 }
123 |                 tr, te = dat.split_tr_te(tr_proportion=0.3, seed=seed+1)
124 | 
125 |                 V_opt, gw_opt, opt_result = \
126 |                 gof.GaussFSSD.optimize_auto_init(p, tr, J, **opts)
127 | 
128 |                 # construct a test
129 |                 k_opt = kernel.KGauss(gw_opt)
130 |                 null_sim = gof.FSSDH0SimCovObs(n_simulate=2000, seed=10)
131 |                 fssd_opt = gof.FSSD(p, k_opt, V_opt, null_sim=null_sim, alpha=alpha)
132 |                 fssd_opt_result = fssd_opt.perform_test(te, return_simulated_stats=True)
133 |                 assert fssd_opt_result['h0_rejected']
134 | 
135 |     def test_ustat_h1_mean_variance(self):
136 |         seed = 20
137 |         # sample
138 |         n = 200
139 |         alpha = 0.01
140 |         for d in [1, 4]:
141 |             mean = np.zeros(d)
142 |             variance = 1
143 |             isonorm = density.IsotropicNormal(mean, variance)
144 | 
145 |             draw_mean = mean + 2
146 |             draw_variance = variance + 1
147 |             X = util.randn(n, d, seed=seed)*np.sqrt(draw_variance) + draw_mean
148 |             dat = data.Data(X)
149 | 
150 |             # Test
151 |             for J in [1, 3]:
152 |                 sig2 = util.meddistance(X, subsample=1000)**2
153 |                 k = kernel.KGauss(sig2)
154 | 
155 |                 # random test locations
156 |                 V = util.fit_gaussian_draw(X, J, seed=seed+1)
157 | 
158 |                 null_sim = gof.FSSDH0SimCovObs(n_simulate=200, seed=3)
159 |                 fssd = gof.FSSD(isonorm, k, V, null_sim=null_sim, alpha=alpha)
160 |                 fea_tensor = fssd.feature_tensor(X)
161 | 
162 |                 u_mean, u_variance = gof.FSSD.ustat_h1_mean_variance(fea_tensor)
163 | 
164 |                 # assertions
165 |                 self.assertGreaterEqual(u_variance, 0)
166 |                 # should reject H0
167 |                 self.assertGreaterEqual(u_mean, 0)
168 | 
169 |     def tearDown(self):
170 |         pass
171 | 
172 | # end class TestFSSD
173 | 
174 | class TestSteinWitness(unittest.TestCase):
175 |     def test_basic(self):
176 |         d = 3
177 |         p = density.IsotropicNormal(mean=np.zeros(d), variance=3.0)
178 |         q = density.IsotropicNormal(mean=np.zeros(d)+2, variance=3.0)
179 |         k = kernel.KGauss(2.0)
180 | 
181 |         ds = q.get_datasource()
182 |         n = 97
183 |         dat = ds.sample(n, seed=3)
184 | 
185 |         witness = gof.SteinWitness(p, k, dat)
186 |         # points to evaluate the witness
187 |         J = 4
188 |         V = np.random.randn(J, d)*2
189 |         evals = witness(V)
190 | 
191 |         testing.assert_equal(evals.shape, (J, d))
192 | 
193 | # end class TestSteinWitness
194 | 
195 | 
if __name__ == '__main__':
    # run all tests of this module when it is executed as a script
    unittest.main()
198 | 
199 | 


--------------------------------------------------------------------------------
/kgof/test/test_kernel.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Module for testing kernel module.
  3 | """
  4 | 
  5 | __author__ = 'wittawat'
  6 | 
  7 | import autograd
  8 | import autograd.numpy as np
  9 | import matplotlib.pyplot as plt
 10 | import kgof.data as data
 11 | import kgof.density as density
 12 | import kgof.util as util
 13 | import kgof.kernel as kernel
 14 | import kgof.goftest as gof
 15 | import kgof.glo as glo
 16 | import scipy.stats as stats
 17 | import numpy.testing as testing
 18 | 
 19 | import unittest
 20 | 
 21 | 
 22 | class TestKGauss(unittest.TestCase):
 23 |     def setUp(self):
 24 |         pass
 25 | 
 26 |     def test_basic(self):
 27 |         """
 28 |         Nothing special. Just test basic things.
 29 |         """
 30 |         # sample
 31 |         n = 10
 32 |         d = 3
 33 |         with util.NumpySeedContext(seed=29):
 34 |             X = np.random.randn(n, d)*3
 35 |             k = kernel.KGauss(sigma2=1)
 36 |             K = k.eval(X, X)
 37 | 
 38 |             self.assertEqual(K.shape, (n, n))
 39 |             self.assertTrue(np.all(K >= 0-1e-6))
 40 |             self.assertTrue(np.all(K <= 1+1e-6), 'K not bounded by 1')
 41 | 
 42 |     def test_pair_gradX_Y(self):
 43 |         # sample
 44 |         n = 11
 45 |         d = 3
 46 |         with util.NumpySeedContext(seed=20):
 47 |             X = np.random.randn(n, d)*4
 48 |             Y = np.random.randn(n, d)*2
 49 |             k = kernel.KGauss(sigma2=2.1)
 50 |             # n x d
 51 |             pair_grad = k.pair_gradX_Y(X, Y)
 52 |             loop_grad = np.zeros((n, d))
 53 |             for i in range(n):
 54 |                 for j in range(d):
 55 |                     loop_grad[i, j] = k.gradX_Y(X[[i], :], Y[[i], :], j)
 56 | 
 57 |             testing.assert_almost_equal(pair_grad, loop_grad)
 58 | 
 59 | 
 60 |     def test_gradX_y(self):
 61 |         n = 10
 62 |         with util.NumpySeedContext(seed=10):
 63 |             for d in [1, 3]:
 64 |                 y = np.random.randn(d)*2
 65 |                 X = np.random.rand(n, d)*3
 66 | 
 67 |                 sigma2 = 1.3
 68 |                 k = kernel.KGauss(sigma2=sigma2)
 69 |                 # n x d
 70 |                 G = k.gradX_y(X, y)
 71 |                 # check correctness 
 72 |                 K = k.eval(X, y[np.newaxis, :])
 73 |                 myG = -K/sigma2*(X-y)
 74 | 
 75 |                 self.assertEqual(G.shape, myG.shape)
 76 |                 testing.assert_almost_equal(G, myG)
 77 | 
 78 | 
 79 |     def test_gradXY_sum(self):
 80 |         n = 11
 81 |         with util.NumpySeedContext(seed=12):
 82 |             for d in [3, 1]:
 83 |                 X = np.random.randn(n, d)
 84 |                 sigma2 = 1.4
 85 |                 k = kernel.KGauss(sigma2=sigma2)
 86 | 
 87 |                 # n x n
 88 |                 myG = np.zeros((n, n))
 89 |                 K = k.eval(X, X)
 90 |                 for i in range(n):
 91 |                     for j in range(n):
 92 |                         diffi2 = np.sum( (X[i, :] - X[j, :])**2 )
 93 |                         #myG[i, j] = -diffi2*K[i, j]/(sigma2**2)+ d*K[i, j]/sigma2
 94 |                         myG[i, j] = K[i, j]/sigma2*(d - diffi2/sigma2)
 95 | 
 96 |                 # check correctness 
 97 |                 G = k.gradXY_sum(X, X)
 98 | 
 99 |                 self.assertEqual(G.shape, myG.shape)
100 |                 testing.assert_almost_equal(G, myG)
101 | 
102 | 
103 |     def tearDown(self):
104 |         pass
105 | 
106 | 
if __name__ == '__main__':
    # run all tests of this module when it is executed as a script
    unittest.main()
109 | 
110 | 


--------------------------------------------------------------------------------
/kgof/util.py:
--------------------------------------------------------------------------------
  1 | """A module containing convenient methods for general machine learning"""
  2 | from __future__ import print_function
  3 | from __future__ import division
  4 | from __future__ import unicode_literals
  5 | from __future__ import absolute_import
  6 | 
  7 | from builtins import zip
  8 | from builtins import int
  9 | from builtins import range
 10 | from future import standard_library
 11 | standard_library.install_aliases()
 12 | from past.utils import old_div
 13 | from builtins import object
 14 | __author__ = 'wittawat'
 15 | 
 16 | import autograd.numpy as np
 17 | import time 
 18 | 
 19 | class ContextTimer(object):
 20 |     """
 21 |     A class used to time an execution of a code snippet. 
 22 |     Use it with with .... as ...
 23 |     For example, 
 24 | 
 25 |         with ContextTimer() as t:
 26 |             # do something 
 27 |         time_spent = t.secs
 28 | 
 29 |     From https://www.huyng.com/posts/python-performance-analysis
 30 |     """
 31 | 
 32 |     def __init__(self, verbose=False):
 33 |         self.verbose = verbose
 34 | 
 35 |     def __enter__(self):
 36 |         self.start = time.time()
 37 |         return self
 38 | 
 39 |     def __exit__(self, *args):
 40 |         self.end = time.time()
 41 |         self.secs = self.end - self.start 
 42 |         if self.verbose:
 43 |             print('elapsed time: %f ms' % (self.secs*1000))
 44 | 
 45 | # end class ContextTimer
 46 | 
 47 | class NumpySeedContext(object):
 48 |     """
 49 |     A context manager to reset the random seed by numpy.random.seed(..).
 50 |     Set the seed back at the end of the block. 
 51 |     """
 52 |     def __init__(self, seed):
 53 |         self.seed = seed 
 54 | 
 55 |     def __enter__(self):
 56 |         rstate = np.random.get_state()
 57 |         self.cur_state = rstate
 58 |         np.random.seed(self.seed)
 59 |         return self
 60 | 
 61 |     def __exit__(self, *args):
 62 |         np.random.set_state(self.cur_state)
 63 | 
 64 | # end NumpySeedContext
 65 | 
 66 | class ChunkIterable(object):
 67 |     """
 68 |     Construct an Iterable such that each call to its iterator returns a tuple
 69 |     of two indices (f, t) where f is the starting index, and t is the ending
 70 |     index of a chunk. f and t are (chunk_size) apart except for the last tuple
 71 |     which will always cover the rest.
 72 |     """
 73 |     def __init__(self, start, end, chunk_size):
 74 |         self.start = start
 75 |         self.end = end
 76 |         self.chunk_size = chunk_size
 77 |     
 78 |     def __iter__(self):
 79 |         s = self.start
 80 |         e = self.end
 81 |         c = self.chunk_size
 82 |         # Probably not a good idea to use list. Waste memory.
 83 |         L = list(range(s, e, c))
 84 |         L.append(e)
 85 |         return zip(L, L[1:])
 86 | 
 87 | # end ChunkIterable
 88 | 
 89 | def constrain(val, min_val, max_val):
 90 |     return min(max_val, max(min_val, val))
 91 | 
 92 | def dist_matrix(X, Y):
 93 |     """
 94 |     Construct a pairwise Euclidean distance matrix of size X.shape[0] x Y.shape[0]
 95 |     """
 96 |     sx = np.sum(X**2, 1)
 97 |     sy = np.sum(Y**2, 1)
 98 |     D2 =  sx[:, np.newaxis] - 2.0*X.dot(Y.T) + sy[np.newaxis, :] 
 99 |     # to prevent numerical errors from taking sqrt of negative numbers
100 |     D2[D2 < 0] = 0
101 |     D = np.sqrt(D2)
102 |     return D
103 | 
def dist2_matrix(X, Y):
    """
    Return the matrix of pairwise **squared** Euclidean distances between
    the rows of X and the rows of Y. Its size is X.shape[0] x Y.shape[0].
    No clipping is done, so tiny negative entries can appear due to
    floating-point errors.
    """
    sq_norm_x = np.sum(X**2, 1)
    sq_norm_y = np.sum(Y**2, 1)
    cross = np.dot(X, Y.T)
    # ||x||^2 - 2 x'y + ||y||^2
    return sq_norm_x[:, np.newaxis] - 2.0*cross + sq_norm_y[np.newaxis, :]
113 | 
def meddistance(X, subsample=None, mean_on_fail=True):
    """
    Compute the median of pairwise distances (not distance squared) of points
    in the matrix.  Useful as a heuristic for setting Gaussian kernel's width.

    Parameters
    ----------
    X : n x d numpy array
    subsample : None or a positive integer. If given, the median is computed
        on min(subsample, n) rows of X chosen without replacement with a
        fixed seed. The global numpy random state is left untouched.
    mean_on_fail: True/False. If True, use the mean when the median distance is 0.
        This can happen especially, when the data are discrete e.g., 0/1, and 
        there are more slightly more 0 than 1. In this case, the median
        distance can be 0. If False, the median is returned as is.

    Return
    ------
    median distance (or the mean distance; see mean_on_fail)
    """
    if subsample is None:
        D = dist_matrix(X, X)
        # strictly lower triangle: each pair counted once, no diagonal
        Itri = np.tril_indices(D.shape[0], -1)
        Tri = D[Itri]
        med = np.median(Tri)
        if med <= 0 and mean_on_fail:
            # use the mean
            return np.mean(Tri)
        return med

    else:
        assert subsample > 0
        rand_state = np.random.get_state()
        try:
            np.random.seed(9827)
            n = X.shape[0]
            ind = np.random.choice(n, min(subsample, n), replace=False)
        finally:
            # put the global random state back even if sampling fails
            np.random.set_state(rand_state)
        # recursion just one
        return meddistance(X[ind, :], None, mean_on_fail)
149 | 
150 | 
def is_real_num(X):
    """return true if x is a real number. 
    Work for a numpy array as well. Return an array of the same dimension.

    An element counts as a real number if float() accepts it and it is
    neither nan nor infinite. An empty input gives an empty boolean array.
    """
    def each_elem_true(x):
        try:
            float(x)
            return not (np.isnan(x) or np.isinf(x))
        except Exception:
            # not convertible/comparable as a number. Exception (rather than
            # a bare except) lets KeyboardInterrupt/SystemExit pass through.
            return False
    # otypes is given explicitly so that empty inputs are accepted
    f = np.vectorize(each_elem_true, otypes=[bool])
    return f(X)
162 |     
163 | 
def tr_te_indices(n, tr_proportion, seed=9282 ):
    """Get two logical vectors for indexing train/test points.

    - n: total number of points
    - tr_proportion: proportion of the points to mark as training points.
        int(tr_proportion*n) points are chosen without replacement.
    - seed: seed for choosing the training points. The global numpy random
        state is restored before returning (also when an error is raised).

    Return (tr_ind, te_ind), two boolean vectors of length n which are the
    negation of each other.
    """
    rand_state = np.random.get_state()
    try:
        np.random.seed(seed)

        Itr = np.zeros(n, dtype=bool)
        tr_ind = np.random.choice(n, int(tr_proportion*n), replace=False)
        Itr[tr_ind] = True
        Ite = np.logical_not(Itr)
    finally:
        # do not leave the global generator re-seeded if choice() fails
        np.random.set_state(rand_state)
    return (Itr, Ite)
179 | 
def subsample_ind(n, k, seed=32):
    """
    Return indices choosing k out of n without replacement. The choice is
    made under NumpySeedContext(seed), so it is reproducible for a fixed
    seed.
    """
    with NumpySeedContext(seed=seed):
        return np.random.choice(n, k, replace=False)
187 | 
def subsample_rows(X, k, seed=29):
    """
    Pick k rows of the matrix X, chosen via subsample_ind(n, k, seed).

    Raise a ValueError when k is larger than the number of rows of X.
    """
    n_rows = X.shape[0]
    if n_rows < k:
        raise ValueError('k exceeds the number of rows.')
    chosen = subsample_ind(n_rows, k, seed=seed)
    return X[chosen, :]
197 |     
198 | 
def fit_gaussian_draw(X, J, seed=28, reg=1e-7, eig_pow=1.0):
    """
    Fit a multivariate normal distribution to the rows of X (n x d), then
    sample J points from the fitted distribution.

    - seed: seed passed to NumpySeedContext around the computation
    - reg: multiple of the identity added to the covariance matrix
    - eig_pow: exponent applied to the eigenvalues of the empirical covariance
        when rebuilding the covariance used for sampling. Useful to shrink
        the spread of the variance.

    Return the J drawn points.
    """
    with NumpySeedContext(seed=seed):
        dim = X.shape[1]
        center = np.mean(X, 0)
        cov = np.cov(X.T)
        if dim == 1:
            # np.cov gives a scalar for one variable; make it a 1 x 1 matrix.
            cov = np.array([[cov]])
        eigvals, eigvecs = np.linalg.eig(cov)
        # Keep only the real parts and clip negative eigenvalues at zero.
        eigvals = np.maximum(0, np.real(eigvals))
        assert np.all(np.isfinite(eigvals))
        eigvecs = np.real(eigvecs)
        spectrum = np.diag(eigvals**eig_pow)
        ridge = reg*np.eye(dim)
        new_cov = eigvecs.dot(spectrum).dot(eigvecs.T) + ridge
        draws = np.random.multivariate_normal(center, new_cov, J)
    return draws
221 | 
def bound_by_data(Z, Data):
    """
    Clamp every point in Z to the per-dimension [min, max] range observed
    in Data.

    Z: m x d
    Data: n x d

    Return the clamped copy of Z, of size m x d.
    """
    n_points, _ = Z.shape
    lower = np.min(Data, 0)
    upper = np.max(Data, 0)
    # Stack the bounds once per row of Z so they line up element-wise.
    lower_mat = np.tile(lower[np.newaxis, :], (n_points, 1))
    upper_mat = np.tile(upper[np.newaxis, :], (n_points, 1))

    raised = np.maximum(lower_mat, Z)
    return np.minimum(upper_mat, raised)
241 | 
242 | 
def one_of_K_code(arr):
    """
    Build a one-of-K (one-hot) encoding of the numpy array arr.
    Columns follow the sorted unique values of arr. For example,
    arr = ([0, 1, 0, 2]) gives the 2d array
     [[1, 0, 0],
      [0, 1, 0],
      [1, 0, 0],
      [0, 0, 1]]
    """
    values = np.unique(arr)
    coded = np.zeros((len(arr), len(values)))
    for col, val in enumerate(values):
        # Entries within 1e-8 of the unique value are treated as equal to it.
        hits = np.nonzero(np.abs(arr - val) < 1e-8)
        coded[hits[0], col] = 1
    return coded
261 | 
def fullprint(*args, **kwargs):
    """Pretty-print the arguments with numpy's array summarization disabled.

    All positional and keyword arguments are forwarded to pprint.pprint.
    The numpy print options in effect before the call are restored
    afterwards, even when printing raises.

    https://gist.github.com/ZGainsforth/3a306084013633c52881
    """
    from pprint import pprint
    import sys
    import numpy
    opt = numpy.get_printoptions()
    try:
        # threshold must be numeric; sys.maxsize means "never summarize".
        # (The string 'nan' used previously is rejected by current numpy.)
        numpy.set_printoptions(threshold=sys.maxsize)
        pprint(*args, **kwargs)
    finally:
        numpy.set_printoptions(**opt)
270 | 
271 | 
def standardize(X):
    """
    Subtract the column-wise mean of X and divide by the column-wise
    standard deviation (both taken along axis 0).

    Asserts that every entry of the result is finite.
    """
    col_mean = np.mean(X, 0)
    col_std = np.std(X, axis=0)
    centered = X - col_mean
    # Standard deviations are assumed to be non-zero.
    scaled = old_div(centered, col_std)
    assert np.all(np.isfinite(scaled))
    return scaled
279 | 
def outer_rows(X, Y):
    """
    Row-wise outer product: for each i, take the outer product of row i of X
    with row i of Y.

    X: n x dx numpy array
    Y: n x dy numpy array

    Return an n x dx x dy numpy array.
    """
    # An earlier Matlab-style version tiled the columns of X and Y,
    # multiplied element-wise and reshaped to (n, dx, dy). According to
    # Jonathan Huggins that is not efficient, hence einsum is used instead.
    # Subscripts: n = row index, a = column of X, b = column of Y.
    return np.einsum('na,nb->nab', X, Y)
299 | 
def randn(m, n, seed=3):
    """
    Return an m x n array from np.random.randn, drawn inside a
    NumpySeedContext created with the given seed.
    """
    with NumpySeedContext(seed=seed):
        sample = np.random.randn(m, n)
    return sample
303 | 
def matrix_inner_prod(A, B):
    """
    Matrix inner product <A, B> = trace(A^T * B), computed as the dot
    product of the flattened matrices.

    A and B must agree in their first two dimensions.
    """
    rows_a, cols_a = A.shape[0], A.shape[1]
    assert rows_a == B.shape[0]
    assert cols_a == B.shape[1]
    flat_a = A.reshape(-1)
    flat_b = B.reshape(-1)
    return flat_a.dot(flat_b)
311 | 
def get_classpath(obj):
    """
    Build the dotted "module.ClassName" path of obj's class, e.g.
    kgof.density.IsotropicNormal

    Return a string.
    """
    cls = obj.__class__
    return '.'.join([cls.__module__, cls.__name__])
320 | 
def merge_dicts(*dict_args):
    """
    Merge any number of dicts into one new dict (shallow copy). When a key
    occurs in several dicts, the value from the later dict wins.

    http://stackoverflow.com/questions/38987/how-to-merge-two-python-dictionaries-in-a-single-expression
    """
    merged = dict()
    for current in dict_args:
        merged.update(current)
    return merged
332 | 
333 | 


--------------------------------------------------------------------------------
/run_unittest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash 
2 | python -m unittest discover -s kgof/test 
3 | 
4 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
# Packaging metadata: README.md is used as the project's description file.
[metadata]
description_file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | """A setuptools based setup module.
 2 | 
 3 | See:
 4 | https://packaging.python.org/en/latest/distributing.html
 5 | https://github.com/pypa/sampleproject
 6 | """
 7 | 
 8 | # Always prefer setuptools over distutils
 9 | from setuptools import setup, find_packages
10 | # To use a consistent encoding
11 | from codecs import open
12 | from os import path
13 | 
14 | here = path.abspath(path.dirname(__file__))
15 | 
16 | # Get the long description from the README file
17 | with open(path.join(here, 'README.md'), encoding='utf-8') as f:
18 |     long_description = f.read()
19 | 
20 | setup(
21 |     name='kgof',
22 | 
23 |     # Versions should comply with PEP440.  For a discussion on single-sourcing
24 |     # the version across setup.py and the project code, see
25 |     # https://packaging.python.org/en/latest/single_source_version.html
26 |     version='0.1.0',
27 | 
28 |     description='Fast kernel-based goodness-of-fit tests',
29 |     long_description=long_description,
30 | 
31 |     # The project's main homepage.
32 |     url='https://github.com/wittawatj/kernel-gof',
33 | 
34 |     # Author details
35 |     author='Wittawat Jitkrittum',
36 |     author_email='wittawatj@gmail.com',
37 | 
38 |     # Choose your license
39 |     license='MIT',
40 | 
41 |     # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
42 |     classifiers=[
43 |         # How mature is this project? Common values are
44 |         #   3 - Alpha
45 |         #   4 - Beta
46 |         #   5 - Production/Stable
47 |         'Development Status :: 3 - Alpha',
48 | 
49 |         # Indicate who your project is intended for
50 |         'Intended Audience :: Developers',
51 | 
52 |         # Pick your license as you wish (should match "license" above)
53 |         'License :: OSI Approved :: MIT License',
54 |         'Operating System :: OS Independent',
55 | 
56 |         # Specify the Python versions you support here. In particular, ensure
57 |         # that you indicate whether you support Python 2, Python 3 or both.
58 |         'Programming Language :: Python :: 2',
59 |         'Programming Language :: Python :: 2.7',
60 | 
61 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
62 |         'Topic :: Scientific/Engineering :: Mathematics',
63 |     ],
64 | 
65 |     # What does your project relate to?
66 |     keywords='hypothesis-test kernel-methods machine-learning AI goodness-of-fit',
67 | 
68 |     # You can just specify the packages manually here if your project is
69 |     # simple. Or you can use find_packages().
70 |     packages=find_packages(exclude=['data', '*.ex']),
71 |     
72 |     # See https://www.python.org/dev/peps/pep-0440/#version-specifiers
73 |     python_requires='>= 2.7',
74 | 
75 |     # Alternatively, if you want to distribute just a my_module.py, uncomment
76 |     # this:
77 |     #py_modules=["gofte"],
78 | 
79 |     # List run-time dependencies here.  These will be installed by pip when
80 |     # your project is installed. For an analysis of "install_requires" vs pip's
81 |     # requirements files see:
82 |     # https://packaging.python.org/en/latest/requirements.html
83 |     install_requires=['numpy', 'autograd', 'scipy', 'matplotlib'],
84 | )
85 | 


--------------------------------------------------------------------------------