├── .gitignore
├── LICENSE
├── README.md
├── data
    ├── __init__.py
    ├── basic.py
    ├── distributions.py
    ├── erdos_renyi.py
    └── generator.py
├── main.py
├── models
    ├── autoreg_base.py
    ├── bge_model.py
    ├── factorised_base.py
    └── vcn.py
├── requirements.txt
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | .ipynb_checkpoints/
  2 | .vscode/
  3 | __pycache__/
  4 | .DS_Store
  5 | *.ipynb
  6 | =======
  7 | # Byte-compiled / optimized / DLL files
  8 | __pycache__/
  9 | *.py[cod]
 10 | *$py.class
 11 | 
 12 | # C extensions
 13 | *.so
 14 | 
 15 | # Distribution / packaging
 16 | .Python
 17 | build/
 18 | develop-eggs/
 19 | dist/
 20 | downloads/
 21 | eggs/
 22 | .eggs/
 23 | lib/
 24 | lib64/
 25 | parts/
 26 | sdist/
 27 | var/
 28 | wheels/
 29 | pip-wheel-metadata/
 30 | share/python-wheels/
 31 | *.egg-info/
 32 | .installed.cfg
 33 | *.egg
 34 | MANIFEST
 35 | 
 36 | # PyInstaller
 37 | #  Usually these files are written by a python script from a template
 38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 39 | *.manifest
 40 | *.spec
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .nox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | *.py,cover
 57 | .hypothesis/
 58 | .pytest_cache/
 59 | 
 60 | # Translations
 61 | *.mo
 62 | *.pot
 63 | 
 64 | # Django stuff:
 65 | *.log
 66 | local_settings.py
 67 | db.sqlite3
 68 | db.sqlite3-journal
 69 | 
 70 | # Flask stuff:
 71 | instance/
 72 | .webassets-cache
 73 | 
 74 | # Scrapy stuff:
 75 | .scrapy
 76 | 
 77 | # Sphinx documentation
 78 | docs/_build/
 79 | 
 80 | # PyBuilder
 81 | target/
 82 | 
 83 | # Jupyter Notebook
 84 | .ipynb_checkpoints
 85 | 
 86 | # IPython
 87 | profile_default/
 88 | ipython_config.py
 89 | 
 90 | # pyenv
 91 | .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 | 
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 | 
107 | # SageMath parsed files
108 | *.sage.py
109 | 
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 | 
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 | 
123 | # Rope project settings
124 | .ropeproject
125 | 
126 | # mkdocs documentation
127 | /site
128 | 
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 | 
134 | # Pyre type checker
135 | .pyre/
136 | #Apple
137 | .DS_Store
138 | 
139 | out/
140 | results/
141 | create_job.py
142 | run_eval_all.py
143 | jobs/
144 | run_eval_all_.py
145 | del.py
146 | results_new/
147 | results_tp/
148 | results_iclr/
149 | weights_tp/
150 | plots/
151 | data/MNIST/
152 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Variational Causal Networks
 2 |  Pytorch implementation of [Variational Causal Networks: Approximate Bayesian Inference over Causal Structures](https://arxiv.org/abs/2106.07635) (Annadani et al. 2021).
 3 |  
 4 | [Yashas Annadani](https://yashasannadani.com), [Jonas Rothfuss](https://las.inf.ethz.ch/people/jonas-rothfuss), [Alexandre Lacoste](https://ca.linkedin.com/in/alexandre-lacoste-4032465), [Nino Scherrer](https://ch.linkedin.com/in/ninoscherrer), [Anirudh Goyal](https://anirudh9119.github.io/), [Yoshua Bengio](https://mila.quebec/en/yoshua-bengio/), [Stefan Bauer](https://www.is.mpg.de/~sbauer)
 5 |  
 6 | 
 7 | ## Installation
 8 | You can install the dependencies using 
 9 | `pip install -r requirements.txt
10 | `
11 | 
12 | Create Directory structure which looks as follows: `[save_path]/er_1/`
13 | 
14 | ## Examples
15 | 
16 | Run
17 | 
18 | `python main.py --num_nodes [num_nodes] --data_seed [data_seed] --anneal --save_path [save_path]`
19 | 
20 | In the paper we run the model on 20 different data seeds to obtain confidence intervals. If you would like to compare with factorised distribution, run:
21 | 
22 | `python main.py --num_nodes [num_nodes] --data_seed [data_seed] --anneal --save_path [save_path] --no_autoreg_base`
23 | 
24 | ## Contact
25 | 
26 | If you have any questions, please address them to: Yashas Annadani `yashas.annadani@gmail.com`
27 | 
28 | 
29 | 
30 | If you use this work, please cite:
31 | 
32 | 	@article{annadani2021variational,
33 | 	title={Variational Causal Networks: Approximate Bayesian Inference over Causal Structures},
34 | 	author={Annadani, Yashas and Rothfuss, Jonas and Lacoste, Alexandre and Scherrer, Nino and Goyal, Anirudh and Bengio, Yoshua and Bauer, Stefan},
35 | 	journal={arXiv preprint arXiv:2106.07635},
36 | 	year={2021}
37 | 	}
38 | 


--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/data/basic.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | import time
  4 | import numpy as np
  5 | import scipy
  6 | import tqdm
  7 | from scipy.stats import multivariate_normal
  8 | import igraph as ig
  9 | import itertools
 10 | import torch
 11 | from utils import all_combinations, mat_to_graph
 12 | 
 13 | class BasicModel:
 14 |     """
 15 |     Basic observational model
 16 |     Given 
 17 |         p(G)
 18 | 
 19 |     Implements
 20 | 
 21 |         p(theta | G)
 22 |         p(x | theta, G)
 23 |     
 24 |     """
 25 | 
 26 |     def __init__(self, *, g_dist, verbose=False, seed = None):
 27 |         super(BasicModel, self).__init__()
 28 |         
 29 |         self.verbose = verbose
 30 |         self.g_dist = g_dist
 31 |         self.reseed(seed = seed)
 32 |        
 33 |     def reseed(self, seed = None):
 34 |         if seed is None:
 35 |             return
 36 |         np.random.seed(seed)
 37 |         torch.manual_seed(seed)
 38 | 
 39 |     def sample_parameters(self, g):
 40 |         """Samples parameters given igraph.Graph g
 41 |         For each variable i, sample parameters for every possible state of parents
 42 |         Returns:
 43 |             theta 
 44 |         """
 45 |         raise NotImplementedError
 46 | 
 47 |     def sample_obs(self, n_samples, g, theta = None, toporder=None):
 48 |         """Samples `n_samples` observations given index i of graph and theta
 49 |             n_samples : int
 50 |             g : graph
 51 |             theta : [n_edges]
 52 |         Returns:
 53 |             x : [n_samples, n_vars] 
 54 |         """
 55 |         if theta is None:
 56 |             theta = self.sample_parameters(g)
 57 |         if toporder is None:
 58 |             toporder = g.topological_sorting()
 59 | 
 60 |         x = np.zeros((n_samples, len(g.vs)))
 61 |         z = scipy.stats.norm.rvs(loc=0.0, scale=self.sig_obs, size=(n_samples, len(g.vs)))
 62 |         
 63 |         # ancestral sampling
 64 |         for j in toporder:
 65 |             parent_edges = g.incident(j, mode='in')
 66 |             parents = list(g.es[e].source for e in parent_edges)
 67 |             if parents:
 68 |                 mean = x[:, parents] @ theta[parents, j]
 69 |                 x[:, j] = mean + z[:, j]
 70 |             else:
 71 |                 x[:, j] = z[:, j]
 72 |         return x.astype(np.float32)
 73 |         
 74 |         
 75 |     def log_prob_parameters(self, theta, g):
 76 |         """Computes p(theta | G)"""
 77 | 
 78 |         raise NotImplementedError
 79 | 
 80 |     def log_likelihood(self, x, theta, g):
 81 |         """Computes p(x | theta, G)"""
 82 | 
 83 |         raise NotImplementedError
 84 |         
 85 | 
 86 |     def log_marginal_likelihood_given_g(self, g, x):
 87 |         """Computes log p(x | G) 
 88 |             x : [n_samples, n_vars]
 89 |             g: graph
 90 |         """
 91 | 
 92 |         raise NotImplementedError
 93 |        
 94 | 
 95 |     def log_marginal_likelihood(self, x, all_g, z_g=None, numpy = False):
 96 |         """Computes log p(x) in closed form using conjugacy properties of Dirichlet-Categorical
 97 |             x : [n_samples, n_vars]
 98 |             all_g : list of all possible igraph.Graph objects in domain; is exhaustively summed over
 99 |         """
100 | 
101 |         # log p(x, G)
102 |         log_prob_obs_g = np.zeros(len(all_g))
103 | 
104 |         # normalizing constant for log p(G) using exhaustive normalization
105 |         if z_g is None:
106 |             z_g = self.g_dist.log_normalization_constant(all_g=all_g)
107 | 
108 |         # log p(x, G)
109 |         for i, g in enumerate(tqdm.tqdm(all_g, desc='p(X) log_marginal_likelihood', disable=not self.verbose)):
110 |             if numpy:
111 |                 g = mat_to_graph(g)
112 |             # log p(x, G) = log (p(G)/Z) + log p(x | G)
113 |             log_prob_obs_g[i] = self.g_dist.unnormalized_log_prob(g=g) - z_g \
114 |                 + self.log_marginal_likelihood_given_g(g=g, x=x)
115 | 
116 |         # log p(x) = log(sum_G exp(log p(x, G)))
117 |         return scipy.special.logsumexp(log_prob_obs_g)
118 | 
119 |     def log_posterior_graph_given_obs(self, g, x, z_g, _log_marginal_likelihood = None):
120 |         """Computes p(G | D) given the previously computed normalization constant
121 |             x : [..., n_vars]
122 |             i : int (graph)
123 |         """
124 | 
125 |         log_prob_g = self.g_dist.unnormalized_log_prob(g=g) - z_g
126 |         log_marginal_likelihood_given_g = self.log_marginal_likelihood_given_g(
127 |             g=g, x=x)
128 |         if _log_marginal_likelihood is None:
129 |             return log_prob_g + log_marginal_likelihood_given_g
130 |         return log_prob_g + log_marginal_likelihood_given_g - _log_marginal_likelihood
131 | 
132 |     def sample_posterior_weights_given_obs(self, g, x):
133 |         """Computes p(theta | G, D) 
134 |             x : [..., n_vars]
135 |             i : int (graph)
136 |         """
137 | 
138 |         raise NotImplementedError
139 |     ####
140 |     # Monte Carlo Integration to validate (closed-form computation) of marginal likelihood
141 |     ####
142 | 
143 |     def log_prob_parameters_mc(self, theta, n_samples=3e4):
144 |         """Approximates p(theta) using Monte Carlo integration
145 |             theta : parameters
146 |         """
147 | 
148 |         logliks = []
149 |         for tt in range(int(n_samples)):
150 | 
151 |             # sample from p(G)
152 |             g = self.g_dist.sample_G()
153 | 
154 |             # evaluate log prob p(theta | G)
155 |             logliks.append(self.log_prob_parameters(theta=theta, g=g))
156 | 
157 |             # print
158 |             if not tt % int(n_samples / 1000) and tt > 0:
159 |                 curr = scipy.special.logsumexp(
160 |                     np.array(logliks[:tt + 1]) - np.log(tt + 1))
161 |                 print(f'iter = {tt}: log p(theta | G) [MC] = {curr}', end='\r')
162 | 
163 |         log_prob_obs = scipy.special.logsumexp(
164 |             np.array(logliks) - np.log(n_samples))
165 |         return log_prob_obs
166 | 
167 |     def log_marginal_likelihood_given_g_mc(self, x, g, n_samples=3e4):
168 |         """Approximates p(x | G) using Monte Carlo integration
169 |             x : [n_samples, n_vars]
170 |             g : graph
171 |         """
172 | 
173 |         logliks = []
174 |         for tt in range(int(n_samples)):
175 | 
176 |             # sample from p(theta | G)
177 |             theta = self.sample_parameters(g=g)
178 | 
179 |             # evaluate likelihood log p(X | theta, G)
180 |             logliks.append(self.log_likelihood(x=x, theta=theta, g=g))
181 | 
182 |             # print
183 |             if not tt % int(n_samples / 1000) and tt > 0:
184 |                 curr = scipy.special.logsumexp(
185 |                     np.array(logliks[:tt + 1]) - np.log(tt + 1))
186 |                 print(f'iter = {tt}: log p(X | G) [MC] = {curr}', end='\r')
187 | 
188 |         log_prob_obs = scipy.special.logsumexp(
189 |             np.array(logliks) - np.log(n_samples))
190 |         return log_prob_obs
191 | 
192 |     def log_marginal_likelihood_mc(self, x, n_samples=3e4):
193 |         """Approximates normalization constant p(x) using Monte Carlo integration
194 |             x : [n_samples, n_vars]
195 |         """
196 | 
197 |         logliks = []
198 |         for tt in range(int(n_samples)):
199 | 
200 |             # sample from p(G, theta) = p(G) p(theta | G)
201 |             g = self.g_dist.sample_G()
202 |             theta = self.sample_parameters(g=g)
203 | 
204 |             # evaluate likelihood log p(X | theta, G)
205 |             logliks.append(self.log_likelihood(x=x, theta=theta, g=g))
206 | 
207 |             # print
208 |             if not tt % int(n_samples / 1000) and tt > 0:
209 |                 curr = scipy.special.logsumexp(
210 |                     (logliks[:tt + 1] - np.log(tt + 1)))
211 |                 print(f'iter = {tt}: log p(X) [MC] = {curr}', end='\r')
212 |         print()
213 |         log_prob_obs = scipy.special.logsumexp((logliks - np.log(n_samples)))
214 |         return log_prob_obs
215 | 


--------------------------------------------------------------------------------
/data/distributions.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import numpy as np
  3 | import tqdm
  4 | import scipy
  5 | from scipy.stats import multivariate_normal
  6 | import itertools
  7 | import networkx as nx
  8 | 
  9 | from utils import expm_np, all_combinations
 10 | import torch
 11 | 
 12 | class GraphDistribution:
 13 |     """
 14 |     Class to represent distributions over graphs.
 15 |     """
 16 | 
 17 |     def __init__(self, n_vars, verbose=False):
 18 |         self.n_vars = n_vars
 19 |         self.verbose = verbose
 20 | 
 21 |     def sample_G(self, return_mat=False):
 22 |         """
 23 |         Samples graph according to distribution
 24 | 
 25 |         n: number of vertices
 26 |         Returns:
 27 |             g: igraph.Graph 
 28 |         """
 29 |         raise NotImplementedError
 30 | 
 31 |     def unnormalized_log_prob(self, g):
 32 |         """
 33 |         g: igraph.Graph object
 34 |         Returns:
 35 |             float   log p(G) + const, i.e. unnormalized
 36 |         """
 37 |         raise NotImplementedError
 38 | 
 39 |     def log_normalization_constant(self, all_g):
 40 |         """
 41 |         Computes normalization constant for log p(G), i.e. `Z = log(sum_G p(g))`
 42 |         all_g: list of igraph.Graph objects
 43 |         Returns:
 44 |             float
 45 |         """
 46 |         log_prob_g_unn = np.zeros(len(all_g))
 47 |         for i, g in enumerate(tqdm.tqdm(all_g, desc='p(G) log_normalization_constant', disable=not self.verbose)):
 48 |             log_prob_g_unn[i] = self.unnormalized_log_prob(g=g)
 49 |         log_prob_sum_g = scipy.special.logsumexp(log_prob_g_unn)
 50 |         return log_prob_sum_g
 51 | 
 52 | 
 53 | 
 54 | 
 55 | 
 56 | 
 57 | class UniformDAGDistributionRejection(GraphDistribution):
 58 |     """
 59 |     Uniform distribution over DAGs
 60 |     """
 61 | 
 62 |     def __init__(self, n_vars, verbose=False):
 63 |         super(UniformDAGDistributionRejection, self).__init__(n_vars=n_vars, verbose=verbose)
 64 |         self.n_vars = n_vars 
 65 |         self.verbose = verbose
 66 | 
 67 |     def sample_G(self, return_mat=False):
 68 |         """Samples uniformly random DAG"""
 69 |         while True:
 70 |             mat = np.random.choice(2, size=self.n_vars * self.n_vars).reshape(self.n_vars, self.n_vars)
 71 |             if expm_np(mat) == 0:
 72 |                 if return_mat:
 73 |                     return mat
 74 |                 else:
 75 |                     return nx.DiGraph(mat)
 76 | 
 77 |     def unnormalized_log_prob(self, g):
 78 |         """
 79 |         p(G) ~ 1
 80 |         """
 81 | 
 82 |         return 0.0
 83 | 
 84 | class GibbsUniformDAGDistribution(GraphDistribution):
 85 |     """
 86 |     Almost Uniform distribution over DAGs based on the DAG constraint
 87 |     """
 88 | 
 89 |     def __init__(self, n_vars, gibbs_temp=10., sparsity_factor = 0.0, verbose=False):
 90 |         super(GibbsUniformDAGDistribution, self).__init__(n_vars=n_vars, verbose=verbose)
 91 |         self.n_vars = n_vars 
 92 |         self.verbose = verbose
 93 |         self.gibbs_temp = gibbs_temp
 94 |         self.sparsity_factor = sparsity_factor
 95 |         self.z_g = None
 96 | 
 97 |     def sample_G(self, return_mat=False):
 98 |         """Samples almost uniformly random DAG"""
 99 |         raise NotImplementedError
100 | 
101 |     def unnormalized_log_prob(self, g):
102 |         """
103 |         p(G) ~ 1
104 |         """
105 |         mat = g
106 |         dagness = expm_np(mat, self.n_vars)
107 |         return -self.gibbs_temp*dagness - self.sparsity_factor*np.sum(mat)
108 | 
109 | class GibbsDAGDistributionFull(GraphDistribution):
110 |     """
111 |     Almost Uniform distribution over DAGs based on the DAG constraint
112 |     """
113 | 
114 |     def __init__(self, n_vars, gibbs_temp=10., sparsity_factor = 0.0, verbose=False):
115 |         super(GibbsDAGDistributionFull, self).__init__(n_vars=n_vars, verbose=verbose)
116 |         assert n_vars<=4, 'Cannot use this for higher dimensional variables, Try UniformDAGDistributionRejection instead'
117 |         self.n_vars = n_vars 
118 |         self.verbose = verbose
119 |         self.gibbs_temp = gibbs_temp
120 |         self.sparsity_factor = sparsity_factor
121 |         all_g = all_combinations(n_vars, return_adj = True) #Do not store this in interest of memory
122 |         dagness = np.zeros(len(all_g))
123 |         for i, j in enumerate(all_g):
124 |             dagness[i] = expm_np(j, self.n_vars)
125 |         self.logits = -gibbs_temp*dagness - sparsity_factor*np.sum(all_g, axis = (-1, -2))
126 |         self.z_g = scipy.special.logsumexp(self.logits)
127 | 
128 |     def sample_G(self, return_mat=False):
129 |         """Samples almost uniformly random DAG"""
130 |         all_g = all_combinations(self.n_vars, return_adj = True)
131 |         mat_id = torch.distributions.Categorical(logits = torch.tensor(self.logits)).sample()
132 |         mat = all_g[mat_id]
133 |         if return_mat:
134 |             return mat
135 |         else:
136 |             return nx.DiGraph(mat)
137 | 
138 |     def unnormalized_log_prob(self, g):
139 |         """
140 |         p(G) ~ 1
141 |         """
142 |         mat = g
143 |         dagness = expm_np(mat, self.n_vars)
144 |         return -self.gibbs_temp*dagness - self.sparsity_factor*np.sum(mat)


--------------------------------------------------------------------------------
/data/erdos_renyi.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | from .generator import Generator
 4 | import networkx as nx 
 5 | import graphical_models
 6 | 
 7 | class ER(Generator):
 8 | 	"""Generate erdos renyi random graphs using networkx's native random graph builder
 9 | 		Args: 
10 | 		num_nodes - Number of Nodes in the graph
11 | 		exp_edges - Expected Number of edges in Erdos Renyi graph
12 | 		noise_type - Type of exogenous variables
13 | 		noise_sigma - Std of the noise type
14 | 		num_sampels - number of observations
15 | 		mu_prior - prior of weights mean(gaussian)
16 | 		sigma_prior - prior of weights sigma (gaussian)
17 | 		seed - random seed for data
18 | 	"""
19 | 	
20 | 	def __init__(self, num_nodes, exp_edges = 1, noise_type='isotropic-gaussian', noise_sigma = 1.0, num_samples=1000, mu_prior = 2.0, sigma_prior = 1.0, seed = 10):
21 | 		self.noise_sigma = noise_sigma
22 | 		p = float(exp_edges)/ (num_nodes-1)
23 | 		acyclic = 0
24 | 		mmec = 0
25 | 		count = 1
26 | 		while not (acyclic and mmec):
27 | 			if exp_edges <= 2:
28 | 				self.graph = nx.generators.random_graphs.fast_gnp_random_graph(num_nodes, p, directed = True, seed = seed*count)
29 | 			else:
30 | 				self.graph = nx.generators.random_graphs.gnp_random_graph(num_nodes, p, directed = True, seed = seed*count)
31 | 			acyclic = expm_np(nx.to_numpy_matrix(self.graph), num_nodes) == 0
32 | 			if acyclic:
33 | 				mmec = num_mec(self.graph) >=2
34 | 			count += 1
35 | 		super().__init__(num_nodes, len(self.graph.edges), noise_type, num_samples, mu_prior = mu_prior , sigma_prior = sigma_prior, seed = seed)
36 | 		self.init_sampler()
37 | 		self.samples = self.sample(self.num_samples)
38 | 
39 | 	def __getitem__(self, index):
40 | 		return self.samples[index]
41 | 
42 | def matrix_poly_np(matrix, d):
43 | 	x = np.eye(d) + matrix/d
44 | 	return np.linalg.matrix_power(x, d)
45 | 
46 | def expm_np(A, m):
47 | 	expm_A = matrix_poly_np(A, m)
48 | 	h_A = np.trace(expm_A) - m
49 | 	return h_A
50 | 
51 | def num_mec(m):
52 | 	a = graphical_models.DAG.from_nx(m)
53 | 	skeleton = a.cpdag() ##Find the skeleton
54 | 	all_dags = skeleton.all_dags() #Find all DAGs in MEC
55 | 	return len(all_dags)
56 | 


--------------------------------------------------------------------------------
/data/generator.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import torch
  3 | import networkx as nx
  4 | PRESETS = ['chain', 'collider','fork', 'random']
  5 | NOISE_TYPES = ['gaussian', 'isotropic-gaussian', 'exponential', 'gumbel']
  6 | VARIABLE_TYPES = ['gaussian', 'non-gaussian', 'categorical']
  7 | 
  8 | class Generator(torch.utils.data.Dataset):
  9 | 
 10 | 	""" Base class for generating different graphs and performing ancestral sampling"""
 11 | 
 12 | 	def __init__(self, num_nodes, num_edges, noise_type, num_samples, mu_prior = None, sigma_prior = None, seed = None):
 13 | 		self.num_nodes = num_nodes
 14 | 		self.num_edges = num_edges
 15 | 		assert noise_type in NOISE_TYPES, 'Noise types must correspond to {} but got {}'.format(NOISE_TYPES, noise_type)
 16 | 		self.noise_type = noise_type
 17 | 		self.num_samples = num_samples
 18 | 		self.mu_prior = mu_prior
 19 | 		self.sigma_prior = sigma_prior
 20 | 		if seed is not None:
 21 | 			self.reseed(seed)
 22 | 		if not "self.weighted_adjacency_matrix" in locals():
 23 | 			self.sample_weights()
 24 | 			self.build_graph()
 25 | 	
 26 | 	def reseed(self, seed = None):
 27 | 		torch.manual_seed(seed)
 28 | 		np.random.seed(seed)
 29 | 
 30 | 	def __getitem__(self, index):
 31 | 		raise NotImplementedError
 32 | 
 33 | 	def build_graph(self):
 34 | 		""" Initilises the adjacency matrix and the weighted adjacency matrix"""
 35 | 
 36 | 		self.adjacency_matrix = nx.to_numpy_matrix(self.graph)
 37 | 		self.weighted_adjacency_matrix = self.adjacency_matrix.copy()
 38 | 		edge_pointer = 0
 39 | 		for i in nx.topological_sort(self.graph):
 40 | 			parents = list(self.graph.predecessors(i))
 41 | 			if len(parents) == 0:
 42 | 				continue
 43 | 			else:
 44 | 				for j in parents:
 45 | 					self.weighted_adjacency_matrix[j, i] = self.weights[edge_pointer]
 46 | 					edge_pointer += 1
 47 | 
 48 | 	def init_sampler(self):
 49 | 		if self.noise_type.endswith('gaussian'):
 50 | 			#Identifiable
 51 | 			if self.noise_type == 'isotropic-gaussian':
 52 | 				noise_std= [self.noise_sigma]*self.num_nodes
 53 | 			elif self.noise_type == 'gaussian':
 54 | 				noise_std = np.linspace(0.1, 3., self.num_nodes)
 55 | 			for i in range(self.num_nodes):
 56 | 				self.graph.nodes[i]['sampler'] = torch.distributions.normal.Normal(0., noise_std[i])
 57 | 
 58 | 		elif self.noise_type == 'exponential':
 59 | 			noise_std= [self.noise_sigma]*self.num_nodes
 60 | 			for i in range(self.num_nodes):
 61 | 				self.graph.nodes[i]['sampler'] = torch.distributions.exponential.Exponential(noise_std[i])
 62 | 
 63 | 	def sample_weights(self):
 64 | 		"""Sample the edge weights"""
 65 | 
 66 | 		if self.mu_prior is not None:
 67 | 		
 68 | 			self.weights = torch.distributions.normal.Normal(self.mu_prior, self.sigma_prior).sample([self.num_edges])
 69 | 		else:
 70 | 			dist = torch.distributions.uniform.Uniform(-5, 5)
 71 | 			self.weights = torch.zeros(self.num_edges)
 72 | 			for k in range(self.num_edges):
 73 | 				sample = 0.
 74 | 				while sample > -0.5 and sample < 0.5:
 75 | 					sample = dist.sample()
 76 | 					self.weights[k] = sample
 77 | 
 78 | 		print(self.weights)
 79 | 
 80 | 	def sample(self, num_samples, graph = None, node = None, value = None):
 81 | 		"""Sample observations given a graph
 82 | 		num_samples: Scalar
 83 | 		graph: networkx DiGraph
 84 | 		node: If intervention is performed, specify which node
 85 | 		value: value set to node after intervention
 86 | 
 87 | 		Outputs: Observations [num_samples x num_nodes]
 88 | 		"""
 89 | 
 90 | 		if graph is None:
 91 | 			graph = self.graph
 92 | 
 93 | 		samples = torch.zeros(num_samples, self.num_nodes)
 94 | 		edge_pointer = 0
 95 | 		for i in nx.topological_sort(graph):
 96 | 			if i == node:
 97 | 				noise = torch.tensor([value]*num_samples)
 98 | 			else:
 99 | 				noise = self.graph.nodes[i]['sampler'].sample([num_samples])
100 | 			parents = list(self.graph.predecessors(i))
101 | 			if len(parents) == 0:
102 | 				samples[:,i] = noise
103 | 			else:
104 | 				curr = 0.
105 | 				for j in parents:
106 | 					curr += self.weighted_adjacency_matrix[j, i]*samples[:,j]
107 | 					edge_pointer += 1
108 | 				curr += noise
109 | 				samples[:, i] = curr
110 | 		return samples
111 | 
112 | 	def intervene(self, num_samples, node = None, value = None):
113 | 		
114 | 		"""Perform intervention to obtain a mutilated graph"""
115 | 
116 | 		if node is None:
117 | 			node = torch.randint(self.num_nodes, (1,))
118 | 		if value is None:
119 | 			#value = torch.distributions.uniform.Uniform(-5,5).sample()
120 | 			value = torch.tensor(2.0)
121 | 
122 | 		mutated_graph = self.adjacency_matrix.copy()
123 | 		mutated_graph[:, node] = 0. #Cut off all the parents
124 | 
125 | 		return self.sample(num_samples, nx.DiGraph(mutated_graph), node.item(), value.item()), node, value
126 | 	
127 | 	def __len__(self):
128 | 		return self.num_samples
129 | 
130 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | import os.path as osp
  3 | import numpy as np
  4 | import torch
  5 | import argparse
  6 | from datetime import datetime
  7 | import pickle as pkl
  8 | import shutil
  9 | import networkx as nx
 10 | import time
 11 | 
 12 | import utils
 13 | import matplotlib.pyplot as plt
 14 | from models import vcn, autoreg_base, factorised_base, bge_model
 15 | from data import erdos_renyi, distributions
 16 | import graphical_models
 17 | from sklearn import metrics
 18 | 
 19 | def parse_args():
 20 |     parser = argparse.ArgumentParser(description='Variational Causal Networks')
 21 |     parser.add_argument('--save_path', type=str, default = 'results_anneal/',
 22 |                     help='Path to save result files')
 23 |     parser.add_argument('--no_autoreg_base', action='store_true', default=False,
 24 |                     help='Use factorisable disrtibution')
 25 |     parser.add_argument('--seed', type=int, default=10,
 26 |                     help='random seed (default: 10)')
 27 |     parser.add_argument('--data_seed', type=int, default=20,
 28 |                     help='random seed for generating data(default: 20)')
 29 |     parser.add_argument('--batch_size', type=int, default=1000,
 30 |                     help='Batch Size for training')
 31 |     parser.add_argument('--lr', type=float, default=1e-2,
 32 |                     help='Learning rate')
 33 |     parser.add_argument('--gibbs_temp', type=float, default=1000.0,
 34 |                     help='Temperature for the Graph Gibbs Distribution')
 35 |     parser.add_argument('--sparsity_factor', type=float, default=0.001,
 36 |                     help='Hyperparameter for sparsity regularizer')
 37 |     parser.add_argument('--epochs', type=int, default=30000,
 38 |                     help='Number of iterations to train')
 39 |     parser.add_argument('--num_nodes', type=int, default=2,
 40 |                     help='Number of nodes in the causal model')
 41 |     parser.add_argument('--num_samples', type=int, default=100,
 42 |                     help='Total number of samples in the synthetic data')
 43 |     parser.add_argument('--noise_type', type=str, default='isotropic-gaussian',
 44 |                     help='Type of noise of causal model')
 45 |     parser.add_argument('--noise_sigma', type=float, default=1.0,
 46 |                     help='Std of Noise Variables')
 47 |     parser.add_argument('--theta_mu', type=float, default=2.0,
 48 |                     help='Mean of Parameter Variables')
 49 |     parser.add_argument('--theta_sigma', type=float, default=1.0,
 50 |                     help='Std of Parameter Variables')
 51 |     parser.add_argument('--data_type', type=str, default='er',
 52 |                     help='Type of data')
 53 |     parser.add_argument('--exp_edges', type=float, default=1.0,
 54 |                     help='Expected number of edges in the random graph')
 55 |     parser.add_argument('--eval_only', action='store_true', default=False,
 56 |                     help='Perform Just Evaluation')
 57 |     parser.add_argument('--anneal', action='store_true', default=False,
 58 |                     help='Perform gibbs temp annealing')
 59 | 
 60 |     args = parser.parse_args()
 61 |     args.data_size = args.num_nodes * (args.num_nodes-1)
 62 |     root = args.save_path
 63 |     list_dir = os.listdir(args.save_path)
 64 |     args.save_path = os.path.join(args.save_path, args.data_type + '_' + str(int(args.exp_edges)), str(args.num_nodes) + '_' + str(args.seed) + '_' + str(args.data_seed) + '_' + str(args.num_samples) + '_' + \
 65 |       str(args.sparsity_factor) +'_' + str(args.gibbs_temp) + '_' + str(args.no_autoreg_base)) 
 66 |     if not os.path.exists(args.save_path):
 67 |         os.makedirs(args.save_path)
 68 |     if args.num_nodes == 2:
 69 |         args.exp_edges = 0.8
 70 | 
 71 |     args.gibbs_temp_init = 10.
 72 |     args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 73 |     
 74 |     torch.manual_seed(args.seed)
 75 |     np.random.seed(args.seed)
 76 | 
 77 |     return args
 78 | 
 79 | def auroc(model, ground_truth, num_samples = 1000):
 80 |     """Compute the AUROC of the model as given in 
 81 |     https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0009202"""
 82 | 
 83 |     gt = utils.adj_mat_to_vec(torch.from_numpy(ground_truth).unsqueeze(0), model.num_nodes).numpy().squeeze()
 84 |     num_nodes = model.num_nodes
 85 |     bs = 10000
 86 |     i = 0
 87 |     samples = []
 88 |     with torch.no_grad():
 89 |         while i<num_samples:
 90 |             curr = min(bs, num_samples-i)
 91 |             samples.append(model.graph_dist.sample([curr]).cpu().numpy().squeeze())
 92 |             i+=curr
 93 |     samples = np.concatenate(samples, axis = 0)
 94 |     samples_mean = np.mean(samples, axis = 0)
 95 |     sorted_beliefs_index = np.argsort(samples_mean)[::-1]
 96 |     fpr = np.zeros((samples_mean.shape[-1]))
 97 |     tpr = np.zeros((samples_mean.shape[-1]))
 98 |     tnr = np.zeros((samples_mean.shape[-1]))
 99 |     for i in range(samples_mean.shape[-1]):
100 |         indexes = np.zeros((samples_mean.shape[-1]))
101 |         indexes[sorted_beliefs_index[:i]] = 1
102 |         tp = np.sum(np.logical_and(gt == 1, indexes == 1))
103 |         fn = np.sum(np.logical_and(indexes==0 , gt != indexes))
104 |         tn = np.sum(np.logical_and(gt==0, indexes==0))
105 |         fp = np.sum(np.logical_and(indexes==1, gt!=indexes))
106 |         fpr[i] = float(fp)/(fp+tn)
107 |         tpr[i] = float(tp)/(tp + fn)
108 |         tnr[i] = float(tn)/(tn + fp)
109 |     auroc = metrics.auc(fpr, tpr)
110 |     return auroc
111 | 
112 |         
113 | def exp_shd(model, ground_truth, num_samples = 1000):
114 |     """Compute the Expected Structural Hamming Distance of the model"""
115 |     shd = 0
116 |     prc = 0.
117 |     rec = 0.
118 |     with torch.no_grad():
119 |         samples = model.graph_dist.sample([num_samples])
120 |         G = utils.vec_to_adj_mat(samples, model.num_nodes) 
121 |         for i in range(num_samples):
122 |             metrics = utils.shd(G[i].cpu().numpy(), ground_truth)
123 |             shd += metrics['shd']
124 |             prc += metrics['prc']
125 |             rec += metrics['rec']
126 |     return shd/num_samples, prc/num_samples, rec/num_samples
127 |  
128 | def full_kl_and_hellinger(model, bge_train, g_dist, device):
129 |     """Compute the KL Divergence and Hellinger distance in lower dimensional settings (d<=4)"""
130 | 
131 |     bs = 100000
132 |     all_adj = utils.all_combinations(model.num_nodes, return_adj = True).astype(np.float32)
133 |     all_adj_vec = utils.all_combinations(model.num_nodes, return_adj = False).astype(np.float32)
134 |     log_posterior_graph = torch.zeros(len(all_adj))
135 |     log_prob_g = torch.zeros(len(all_adj))
136 |     log_prob_model = torch.zeros(len(all_adj))
137 |     with torch.no_grad():
138 |         for tt in range(0,len(all_adj),bs):
139 |             log_posterior_graph[tt:tt+bs] = bge_train.log_marginal_likelihood_given_g(w = torch.tensor(all_adj[tt:tt+bs]).to(device)).cpu() #Unnormalized Log Probabilities
140 |             log_prob_model[tt:tt+bs] = model.graph_dist.log_prob(torch.tensor(all_adj_vec[tt:tt+bs]).to(device).unsqueeze(2)).cpu().squeeze()
141 |         for tt in range(len(all_adj)):
142 |             log_prob_g[tt] = g_dist.unnormalized_log_prob(g=all_adj[tt])
143 |     graph_p = torch.distributions.categorical.Categorical(logits = log_posterior_graph + log_prob_g)
144 |     graph_q = torch.distributions.categorical.Categorical(logits = log_prob_model)
145 |     hellinger = (1./np.sqrt(2)) * torch.sqrt((torch.sqrt(graph_p.probs) - torch.sqrt(graph_q.probs)).pow(2).sum()) 
146 |     return torch.distributions.kl.kl_divergence(graph_q, graph_p).item(), hellinger.item()
147 | 
148 | 
149 | def train(model, bge_train, optimizer, baseline, batch_size, e, device):
150 |     kl_graphs = 0.
151 |     losses = 0.
152 |     likelihoods = 0.
153 | 
154 |     model.train()
155 |    
156 |     optimizer.zero_grad()
157 |     likelihood, kl_graph, log_probs = model(batch_size, bge_train, e)  #TODO: Check if additional entropy regularization is required
158 |     score_val = ( - likelihood + kl_graph).detach()
159 |     per_sample_elbo = log_probs*(score_val-baseline)
160 |     baseline = 0.95 * baseline + 0.05 * score_val.mean() 
161 |     loss = (per_sample_elbo).mean()
162 |     loss.backward()
163 |     optimizer.step()
164 |     
165 |     likelihoods = -likelihood.mean().item()
166 |     kl_graphs = kl_graph.mean().item() 
167 |     losses = ( -likelihood  + kl_graph).mean().item()
168 |         
169 |     return  losses, likelihoods,  kl_graphs, baseline
170 |     
171 | 
172 | 
173 | def evaluate(model, bge_test, batch_size, e, device):
174 |     model.eval()
175 | 
176 |     with torch.no_grad():
177 |         
178 |         likelihood, kl_graph, _ = model(batch_size, bge_test, e) 
179 |         elbo = (-likelihood + kl_graph).mean().item()
180 |         likelihoods = -likelihood.mean().item()
181 |         
182 |     return elbo, likelihoods
183 |         
184 | def load_model(args):
185 |     if not args.no_autoreg_base:
186 |         graph_dist = autoreg_base.AutoregressiveBase(args.num_nodes, device = args.device, temp_rsample = 0.1).to(args.device)
187 |     else:
188 |         graph_dist = factorised_base.FactorisedBase(args.num_nodes, device = args.device, temp_rsample = 0.1).to(args.device)
189 |     
190 |     def _gibbs_update(curr, epoch):
191 |         if epoch < args.epochs*0.05:
192 |             return curr
193 |         else:
194 |             return args.gibbs_temp_init+ (args.gibbs_temp - args.gibbs_temp_init)*(10**(-2 * max(0, (args.epochs - 1.1*epoch)/args.epochs)))
195 |     
196 |     if args.anneal:
197 |         gibbs_update = _gibbs_update
198 |     else:
199 |         gibbs_update = None
200 | 
201 |     model = vcn.VCN(num_nodes = args.num_nodes, graph_dist = graph_dist, sparsity_factor = args.sparsity_factor, gibbs_temp_init = args.gibbs_temp_init, gibbs_update = gibbs_update).to(args.device)
202 |     print(model, flush = True)
203 |     return model
204 | 
205 | def load_data(args):
206 |     if args.data_type == 'd4':
207 |         train_data = data_map[args.data_type](file_name = int(args.exp_edges))
208 |     else:
209 |         train_data = data_map[args.data_type](num_nodes = args.num_nodes, exp_edges = args.exp_edges, noise_type = args.noise_type, noise_sigma = args.noise_sigma, \
210 |             num_samples = args.num_samples, mu_prior = args.theta_mu, sigma_prior = args.theta_sigma, seed = args.data_seed)
211 | 
212 |     bge_train = bge_model.BGe(mean_obs = [args.theta_mu]*args.num_nodes, alpha_mu = 1.0, alpha_lambd=args.alpha_lambd, data = train_data.samples, device = args.device)
213 |     return bge_train, train_data
214 | 
215 | def main(args):
216 |     model = load_model(args)
217 | 
218 |     optimizer = torch.optim.Adam(model.parameters() , args.lr)
219 |     
220 |     bge_train, train_data = load_data(args)
221 |     if args.num_nodes <=4:
222 |         g_dist = distributions.GibbsDAGDistributionFull(args.num_nodes, args.gibbs_temp, args.sparsity_factor)
223 |     else:
224 |         g_dist = distributions.GibbsUniformDAGDistribution(args.num_nodes, args.gibbs_temp, args.sparsity_factor)
225 |     
226 |     best_elbo = 1e20
227 |     likelihood = []
228 |     kl_graph = []
229 |     elbo_train = []
230 |     val_elbo = []
231 |     baseline = 0.
232 |     best_likelihood = 1e20
233 |     best_kl = 1e20
234 |     
235 |     time_epoch = []
236 |     if not args.eval_only:    
237 |         for e in range(1, args.epochs + 1):
238 |             temp_time = time.time()
239 |             el, li, kl_g, baseline = train(model, bge_train, optimizer, baseline, args.batch_size, e, args.device)
240 |             time_epoch.append(time.time()- temp_time)
241 |             likelihood.append(li), kl_graph.append(kl_g), elbo_train.append(el)
242 |             elbo_epoch, likelihood_epoch = evaluate(model, bge_train, args.batch_size, e, args.device)
243 |             val_elbo.append(elbo_epoch)
244 | 
245 |             if e % 100 == 0:
246 |                 kl_full, hellinger_full = 0., 0.
247 |                 if args.num_nodes<=4:
248 |                     kl_full, hellinger_full = full_kl_and_hellinger(model, bge_train, g_dist, args.device)
249 | 
250 |                 print('Epoch {}:  TRAIN - ELBO: {:.5f} likelihood: {:.5f} kl graph: {:.5f} VAL-ELBO: {:.5f} Temp Target {:.4f} Time {:.2f}'.\
251 |                     format(e, el, li,kl_g, elbo_epoch, model.gibbs_temp, np.sum(time_epoch[e-100:e]), flush = True))
252 | 
253 |                 torch.save({'model':model.state_dict(), 'best_elbo':best_elbo, 'saved_epoch': e, 'time': time_epoch,\
254 |                       'likelihood': likelihood, 'kl_graph': kl_graph, 'elbo_train': elbo_train, 'val_elbo': val_elbo, 'baseline': baseline}, osp.join(args.save_path, 'last_saved_model.pth'))
255 |                 
256 |         torch.save({'model':model.state_dict(), 'best_elbo':best_elbo, 'saved_epoch': args.epochs, 'time': time_epoch,\
257 |                     'likelihood': likelihood, 'kl_graph': kl_graph, 'elbo_train': elbo_train, 'val_elbo': val_elbo, 'baseline': baseline}, osp.join(args.save_path, 'best_model.pth'))
258 | 
259 |     model.load_state_dict(torch.load(osp.join(args.save_path,'best_model.pth'))['model'])
260 |     shd, prc, rec = exp_shd(model, train_data.adjacency_matrix)
261 |     kl_full = 0.
262 |     hellinger_full = 0.
263 |     auroc_score = 0.
264 |     if args.num_nodes<=4:
265 |         kl_full, hellinger_full = full_kl_and_hellinger(model, bge_train, g_dist, args.device)
266 |     else:
267 |         auroc_score = auroc(model, train_data.adjacency_matrix)
268 | 
269 |     print('Exp SHD:', shd,  'Exp Precision:', prc, 'Exp Recall:', rec, 'Kl_full:', kl_full, 'hellinger_full:', hellinger_full,\
270 |     'auroc:', auroc_score)
271 | 
272 |     with open(osp.join(args.save_path, 'results.pkl'), 'wb') as bb:
273 |             pkl.dump({'likelihood':likelihood, 'kl_graph':kl_graph, 'elbo_train': elbo_train,\
274 |             'elbo_val':val_elbo, 'kl_best_full': kl_full, 'hellinger_best_full': hellinger_full, 'time': time_epoch\
275 |             , 'baseline': baseline, 'exp_shd': shd, 'exp_prc': prc, 'exp_rec': rec, 'auroc': auroc_score}, bb)
276 | 
277 | if __name__ == '__main__':
278 |     args = parse_args()
279 |     if args.num_nodes <=4:
280 |         args.alpha_lambd = 10.
281 |     else:
282 |         args.alpha_lambd = 1000.
283 |     data_map = {'er': erdos_renyi.ER}
284 |     main(args)
285 | 


--------------------------------------------------------------------------------
/models/autoreg_base.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import utils
  3 | 
  4 | class AutoregressiveBase(torch.nn.Module):
  5 |     """Autoregressive bernoulli sampler based on LSTM"""
  6 |     
  7 |     def __init__(self, n_nodes, hidden_dim=48, n_layers=3, temp_rsample=0.1, device = 'cpu'):
  8 |         super().__init__()
  9 |         self.n_nodes = n_nodes
 10 |         self.n_dim_out = n_nodes*(n_nodes-1)
 11 |         self.hidden_dim = hidden_dim
 12 |         self.n_classes = 1
 13 |         self.n_layers = n_layers
 14 |         self.temp_rsample = temp_rsample
 15 |         self.device = device
 16 | 
 17 |         self.rnn = torch.nn.LSTM(self.hidden_dim, self.hidden_dim, num_layers=n_layers, batch_first=True)
 18 |         self.proj = torch.nn.Linear(self.hidden_dim, self.n_classes)
 19 |         self.embed = torch.nn.Linear(self.n_classes, self.hidden_dim)
 20 | 
 21 |         self.h0 = torch.nn.Parameter(1e-3*torch.randn(1, self.n_layers, self.hidden_dim))
 22 |         self.c0 = torch.nn.Parameter(1e-3*torch.randn(1, self.n_layers, self.hidden_dim))
 23 | 
 24 |         # create variable for the initial input of the LSTM
 25 |         self._init_input_param = torch.nn.Parameter(torch.zeros(1, 1, self.n_classes))
 26 | 
 27 |     def forward(self, inputs, state):
 28 |         inputs = self.embed(inputs)
 29 |         out, state = self.rnn(inputs, self._t(state))
 30 |         state = self._t(state)
 31 |         logit = self.proj(out)
 32 |         return logit, state
 33 | 
 34 |     def sample(self, size, return_states=False, start_step=0, start_state = None, init_input = None):
 35 |         samples, states, logits = self._sample(size, reparametrized=False, start_step=start_step, start_state=start_state, init_input = init_input)
 36 |         if return_states:
 37 |             return samples, states, logits
 38 |         else:
 39 |             return samples
 40 | 
 41 |     def rsample(self, size, return_states=False, temp=None, hard=False, start_step=0, start_state=None, init_input = None):
 42 |         samples, states, logits = self._sample(size, reparametrized=True, temp=temp, hard=hard, start_step=start_step, start_state=start_state, init_input = init_input)
 43 |         if return_states:
 44 |             return samples, states, logits
 45 |         else:
 46 |             return samples
 47 | 
 48 |     def _sample(self, batch_size, reparametrized=False, temp=None, hard=False, start_step=0, start_state=None, init_input = None):
 49 |         assert len(batch_size) == 1
 50 |         batch_size = batch_size[0]
 51 | 
 52 |         if temp is None:
 53 |             temp = [self.temp_rsample]*self.n_dim_out
 54 |         if start_state is None:
 55 |             state = self._get_state(batch_size) # hidden / cell state at t=0
 56 |         else:
 57 |             state = start_state
 58 |         if init_input is None:
 59 |             input = self._init_input(batch_size) # input at t=0
 60 |         else:
 61 |             input = init_input
 62 | 
 63 |         sampled_tokens = []
 64 |         state_array_1 = []
 65 |         state_array_2 = []
 66 |         logit_array = []
 67 | 
 68 |         for t in range(start_step, self.n_dim_out):
 69 |             logits, state = self.forward(input, state)
 70 |             if reparametrized:
 71 |                 _sample = torch.distributions.RelaxedBernoulli(temperature=temp[t], logits=logits).rsample()
 72 |             else:
 73 |                 _sample = torch.distributions.Bernoulli(logits=logits).sample()
 74 |             input = _sample
 75 |             sampled_tokens.append(_sample)
 76 |             state_array_1.append(state[0])
 77 |             state_array_2.append(state[1])
 78 |             logit_array.append(logits)
 79 |         
 80 |         samples = torch.cat(sampled_tokens, dim=1)
 81 |         states = [torch.stack(state_array_1, dim=1), torch.stack(state_array_2, dim=1)]
 82 |         logits =  logit_array
 83 |         if reparametrized and hard:
 84 |             samples_hard = utils.one_hot(torch.argmax(samples, -1), 2)
 85 |             samples = (samples_hard - samples).detach() + samples
 86 |         
 87 |         return samples, states, logits
 88 | 
 89 | 
 90 |     def log_prob(self, value, return_logits = False):
 91 |         batch_size, n_dim_out_value,_ = value.shape
 92 |         assert n_dim_out_value == self.n_dim_out
 93 |         # add start value
 94 |         state = self._get_state(batch_size) # hidden / cell state at t=0
 95 |         input = self._init_input(batch_size) # input at t=0
 96 |         value = torch.cat([input, value], dim=-2)
 97 |         logits, _ = self.forward(value, state)
 98 |         logits = logits[:, :-1, :]
 99 |         value = value[:, 1:]
100 |         log_probs = torch.distributions.Bernoulli(logits = logits).log_prob(value).sum(1)
101 |         if return_logits:
102 |             return log_probs, logits
103 |         return log_probs
104 | 
105 |     def entropy(self, n_samples=10 ** 6):
106 |         bs = 100000
107 |         curr = 0
108 |         ent = 0.
109 |         while curr < n_samples:
110 |             curr_batch_size = min(bs, n_samples-curr)
111 |             ent -= torch.sum(self.log_prob(self.sample([curr_batch_size])))
112 |             curr += curr_batch_size
113 |         return ent/n_samples
114 | 
115 |     def _get_state(self, batch_size=1):
116 |         return (self.h0.repeat(batch_size, 1, 1), self.c0.repeat(batch_size, 1, 1))
117 | 
118 |     def _init_input(self, batch_size):
119 |         return self._init_input_param.expand(batch_size, 1, self.n_classes)
120 | 
121 |     def mode(self, n_samples=1000, return_adj = True, return_logprob=False):
122 |         samples, logprobs = self.sample((n_samples,), return_logprobs=True)
123 |         max_idx = torch.argmax(logprobs)
124 |         mode_ = samples[max_idx].unsqueeze(0)
125 |         if return_adj:
126 |             mode_ = utils.vec_to_adj_mat(mode_.unsqueeze(0), self.n_nodes).squeeze()
127 |         if return_logprob:
128 |             return mode_, logprobs[max_idx]
129 |         return mode_
130 | 
131 |     @staticmethod
132 |     def _t(a):
133 |         return [t.transpose(0, 1).contiguous() for t in a]


--------------------------------------------------------------------------------
/models/bge_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import numpy as np
  3 | 
  4 | class BGe(torch.nn.Module):
  5 | 	"""
  6 | 	Pytorch implementation of Linear Gaussian-Gaussian Model (Continuous Gaussian data)
  7 | 	Supports batched version.
  8 | 	 Each variable distributed as Gaussian with mean being the linear combination of its parents 
  9 | 	weighted by a Gaussian parameter vector (i.e., with Gaussian-valued edges).
 10 | 	The parameter prior over (mu, lambda) of the joint Gaussian distribution (mean `mu`, precision `lambda`) over x is Gaussian-Wishart, 
 11 | 	as introduced in 
 12 | 		Geiger et al (2002):  https://projecteuclid.org/download/pdf_1/euclid.aos/1035844981
 13 | 	Computation is based on
 14 | 		Kuipers et al (2014): https://projecteuclid.org/download/suppdf_1/euclid.aos/1407420013 
 15 | 	Note: 
 16 | 		- (mu, Sigma) of joint is not factorizable into independent theta, but there exists a one-to-one correspondence.
 17 | 		- lambda = Sigma^{-1}
 18 | 		- assumes default diagonal parametric matrix T
 19 | 
 20 | 		This pytorch implementation is based on Jax implementation from Lars Lorch and Jonas Rothfuss.
 21 | 	"""
 22 | 
 23 | 	def __init__(self, *,
 24 | 			mean_obs,
 25 | 			alpha_mu,
 26 | 			alpha_lambd,
 27 | 			data,
 28 | 			device = "cpu"
 29 | 			):
 30 | 		"""
 31 | 		mean_obs : [num_nodes] : Mean of each Gaussian distributed variable (Prior)
 32 | 		alpha_mu : torch.float64 : Hyperparameter of Wishart Prior
 33 | 		alpha_lambd : 
 34 | 		data : [n_samples, num_nodes] If provided, precomputes the posterior parameter R
 35 | 		"""
 36 | 		super(BGe, self).__init__()
 37 | 
 38 | 		self.mean_obs = torch.tensor(mean_obs)
 39 | 		self.alpha_mu = alpha_mu
 40 | 		self.alpha_lambd = alpha_lambd
 41 | 		self.device = device
 42 | 
 43 | 		if not isinstance(data, torch.DoubleTensor):
 44 | 			data = torch.tensor(data).to(torch.float64)
 45 | 
 46 | 		self.N, self.d = data.shape        
 47 | 		# pre-compute matrices
 48 | 		small_t = (self.alpha_mu * (self.alpha_lambd - self.d - 1)) / \
 49 | 			(self.alpha_mu + 1)
 50 | 		T = small_t * torch.eye(self.d)
 51 | 
 52 | 		x_bar = data.mean(axis=0, keepdims=True)
 53 | 		x_center = data - x_bar
 54 | 		s_N = torch.matmul(x_center.t(), x_center)  # [d, d]
 55 | 
 56 | 		# Kuipers (2014) states R wrongly in the paper, using alpha_lambd rather than alpha_mu;
 57 | 		# the supplementary contains the correct term
 58 | 		self.R = (T + s_N + ((self.N * self.alpha_mu) / (self.N + self.alpha_mu)) * \
 59 | 		(torch.matmul((x_bar - self.mean_obs).t(), x_bar - self.mean_obs))).to(self.device)
 60 | 		
 61 | 		all_l = torch.arange(self.d)
 62 | 		
 63 | 		self.log_gamma_terms = (
 64 | 		0.5 * (np.log(self.alpha_mu) - np.log(self.N + self.alpha_mu))
 65 | 		+ torch.special.gammaln(0.5 * (self.N + self.alpha_lambd - self.d + all_l + 1))
 66 | 		- torch.special.gammaln(0.5 * (self.alpha_lambd - self.d + all_l + 1))
 67 | 		- 0.5 * self.N * np.log(np.pi)
 68 | 		# log det(T_JJ)^(..) / det(T_II)^(..) for default T
 69 | 		+ 0.5 * (self.alpha_lambd - self.d + 2 * all_l + 1) * \
 70 | 		np.log(small_t)).to(self.device)
 71 | 		
 72 | 	def slogdet_pytorch(self, parents, R = None):
 73 | 		"""
 74 | 		Batched log determinant of a submatrix
 75 | 		Done by masking everything but the submatrix and
 76 | 		adding a diagonal of ones everywhere else for the 
 77 | 		valid determinant
 78 | 		"""
 79 | 
 80 | 		if R is None:
 81 | 			R = self.R.clone()
 82 | 		batch_size = parents.shape[0]
 83 | 		R = R.unsqueeze(0).expand(batch_size,-1,-1)
 84 | 		parents = parents.to(torch.float64).to(self.device)
 85 | 		mask = torch.matmul(parents.unsqueeze(2), parents.unsqueeze(1)).to(torch.bool) #[batch_size, d,d]
 86 | 		R = torch.where(mask, R, torch.tensor([np.nan], device = self.device).to(torch.float64))
 87 | 		submat = torch.where(torch.isnan(R), torch.eye(self.d, dtype = torch.float64).unsqueeze(0).expand(batch_size,-1,-1).to(self.device), R)
 88 | 		return torch.linalg.slogdet(submat)[1]
 89 | 
 90 | 	def log_marginal_likelihood_given_g_j(self, j, w):
 91 | 		"""
 92 | 		Computes node specific terms of BGe metric
 93 | 		j : Node to compute the marginal likelihood. Marginal Likelihood decomposes over each node.
 94 | 		w : [batch_size, num_nodes, num_nodes] : {0,1} adjacency matrix 
 95 | 		"""
 96 | 
 97 | 		batch_size = w.shape[0]
 98 | 		isj = (torch.arange(self.d) == j).unsqueeze(0).expand(batch_size, -1).to(self.device)
 99 | 		parents = w[:, :, j] == 1
100 | 		parents_and_j = parents | isj
101 | 		n_parents = (w.sum(axis=1)[:,j]).long()
102 | 		n_parents_mask = n_parents == 0
103 | 		_log_term_r_no_parents = - 0.5 * (self.N + self.alpha_lambd - self.d + 1) * torch.log(torch.abs(self.R[j, j]))
104 | 
105 | 		_log_term_r = 0.5 * (self.N + self.alpha_lambd - self.d + n_parents[~n_parents_mask]) *\
106 | 						self.slogdet_pytorch(parents[~n_parents_mask])\
107 | 					- 0.5 * (self.N + self.alpha_lambd - self.d + n_parents[~n_parents_mask] + 1) *\
108 | 						self.slogdet_pytorch(parents_and_j[~n_parents_mask])     # log det(R_II)^(..) / det(R_JJ)^(..)
109 | 	
110 | 		log_term_r = torch.zeros(batch_size, dtype = torch.float64, device = self.device)
111 | 		log_term_r[n_parents_mask] = _log_term_r_no_parents
112 | 		log_term_r[~n_parents_mask] = _log_term_r
113 | 		return log_term_r + self.log_gamma_terms[n_parents]
114 | 
115 | 
116 | 	def log_marginal_likelihood_given_g(self, *, w, interv_targets=None):
117 | 		"""Computes log p(x | G) in closed form using conjugacy properties
118 | 			w:     [batch_size, num_nodes, num_nodes]	{0,1} adjacency marix
119 | 			interv_targets: [batch_size, num_nodes] boolean mask of whether or not a node was intervened on
120 | 					intervened nodes are ignored in likelihood computation
121 | 		"""
122 | 		batch_size = w.shape[0]
123 | 		if interv_targets is None:
124 | 			interv_targets = torch.zeros(batch_size,self.d).to(torch.bool)
125 | 		interv_targets = (~interv_targets).to(self.device)
126 | 		# sum scores for all nodes
127 | 		mll = torch.zeros(batch_size, dtype = torch.float64, device = self.device)
128 | 		for i in range(self.d):
129 | 			#print(self.log_marginal_likelihood_given_g_j(i, w)[interv_targets[:,i]]) 
130 | 			mll[interv_targets[:,i]] += self.log_marginal_likelihood_given_g_j(i, w)[interv_targets[:,i]]  ##TODO: Possible to use torch.vmap but should be okay for now 
131 | 		return mll
132 | 
133 | if __name__ == "__main__":
134 | 	from ..graph import distributions, utils
135 | 	from .. models import linearGaussianGaussian, linearGaussianGaussianEquivalent  
136 | 	import tqdm
137 | 
138 | 	np.random.seed(13)
139 | 	torch.manual_seed(13)
140 | 	device = "cpu"
141 | 	if torch.cuda.is_available():
142 | 		device = "cuda"
143 | 	n_vars = 3
144 | 	
145 | 	g_dist = distributions.GibbsUniformDAGDistribution(n_vars, gibbs_temp = 1000.)
146 | 	data_dist_1 = linearGaussianGaussian.LinearGaussianGaussian(g_dist = g_dist, mean_edge = 2., sig_obs = 1.0, sig_edge = 1.0)
147 | 	true_graph = g_dist.sample_G()
148 | 	print(utils.graph_to_mat(true_graph))
149 | 	theta = data_dist_1.sample_parameters(g = true_graph)
150 | 	x = data_dist_1.sample_obs(n_samples = 100, g = true_graph, theta = theta)
151 | 
152 | 	data_dist_ = BGe(mean_obs = [2.0]*n_vars, alpha_mu = 1.0, alpha_lambd=1000., data = x, device = device)
153 | 	data_dist = linearGaussianGaussianEquivalent.BGe(g_dist = g_dist, mu_0 = [2.0]*n_vars, alpha_mu = 1.0, alpha_lambd=1000.)
154 | 	
155 | 	z_g = g_dist.z_g
156 | 	
157 | 
158 | 	all_adj = utils.all_combinations(n_vars, return_adj = True)
159 | 	"""
160 | 	print(all_adj[1])
161 | 	log_posterior_graph_1 = data_dist.log_posterior_graph_given_obs(g = true_graph,x=  x, log_marginal_likelihood = 0.0, z_g = z_g) #Unnormalized Log Probabilities
162 | 	log_posterior_graph_2 = data_dist.log_posterior_graph_given_obs(g = utils.mat_to_graph(all_adj[1]),x=  x, log_marginal_likelihood = 0.0, z_g = z_g) #Unnormalized Log Probabilities
163 | 	log_prob_g = torch.tensor([g_dist.unnormalized_log_prob(g=true_graph) - z_g, g_dist.unnormalized_log_prob(g=utils.mat_to_graph(all_adj[1])) - z_g])
164 | 	w= torch.stack([torch.tensor(utils.graph_to_mat(true_graph)), torch.tensor(all_adj[1])], dim = 0)
165 | 	mll = data_dist_.log_marginal_likelihood_given_g(w = w.to(device)).cpu() + log_prob_g
166 | 	print(log_posterior_graph_1, log_posterior_graph_2, mll)
167 | 	"""
168 | 
169 | 	log_posterior_graph = np.zeros(len(all_adj))
170 | 	log_prob_g = torch.zeros(len(all_adj))
171 | 	for tt in tqdm.tqdm(range(len(all_adj)), desc = 'Calculating p(G|D)', disable = False):
172 | 		log_posterior_graph[tt] = data_dist.log_posterior_graph_given_obs(g = utils.mat_to_graph( all_adj[tt]),x=  x, log_marginal_likelihood = 0.0, z_g = z_g) #Unnormalized Log Probabilities
173 | 		log_prob_g[tt] = g_dist.unnormalized_log_prob(g=utils.mat_to_graph( all_adj[tt])) - z_g
174 | 	graph_p = torch.distributions.categorical.Categorical(logits = torch.tensor(log_posterior_graph))
175 | 	
176 | 	a, b = torch.topk(graph_p.probs, 10, largest = True) if n_vars>2 else [torch.arange(len(all_adj)), None]
177 | 	for i,j in zip(a,b):
178 | 		print(all_adj[j], i)
179 | 
180 | 	mll = data_dist_.log_marginal_likelihood_given_g(w = torch.tensor(all_adj).to(device))
181 | 
182 | 	
183 | 	graph_q = torch.distributions.categorical.Categorical(logits = mll.cpu() + log_prob_g)
184 | 	
185 | 	a, b = torch.topk(graph_q.probs, 10, largest = True) if n_vars>2 else [torch.arange(len(all_adj)), None]
186 | 	for i,j in zip(a,b):
187 | 		print(all_adj[j], i)
188 | 	
189 | 	print(torch.distributions.kl.kl_divergence(graph_p, graph_q))
190 | 	print(graph_q.probs.cpu().numpy(), graph_p.probs.cpu().numpy())
191 | 	
192 | 	assert np.allclose(graph_q.probs.cpu().numpy(), graph_p.probs.cpu().numpy(), atol = 1e-3)
193 | 	


--------------------------------------------------------------------------------
/models/factorised_base.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import utils
 3 | 
 4 | class FactorisedBase(torch.nn.Module):
 5 | 
 6 |     def __init__(self, n_nodes, temp_rsample=0.1, device = 'cpu'):
 7 |         super().__init__()
 8 |         self.n_nodes = n_nodes
 9 |         self.n_dim_out = n_nodes* (n_nodes-1)
10 |         self.temp_rsample = temp_rsample
11 |         self.device = device
12 | 
13 |         base_log_probs = torch.randn(self.n_dim_out,1).clone().detach().requires_grad_(True).to(device)
14 | 
15 |         self.params = torch.nn.Parameter(base_log_probs)
16 | 
17 |     def forward(self, inputs):
18 |         dist = torch.distributions.Bernoulli(logits=self.params)
19 |         log_prob = dist.log_prob(inputs).sum(dim = (1))
20 |         return log_prob
21 | 
22 |     def sample(self, size, return_logprobs=False):
23 |         samples, logprobs = self._sample(size, reparametrized=False)
24 |         if return_logprobs:
25 |             return samples, logprobs
26 |         else:
27 |             return samples
28 | 
29 |     def rsample(self, size, return_logprobs=False, temp=None, hard=False):
30 |         samples, logprobs = self._sample(size, reparametrized=True, temp=temp, hard=hard)
31 |         if return_logprobs:
32 |             return samples, logprobs
33 |         else:
34 |             return samples
35 | 
36 |     def _sample(self, batch_size, reparametrized=False, temp=None, hard=False):
37 | 
38 |         if temp is None:
39 |             temp = self.temp_rsample
40 | 
41 |         if reparametrized:
42 |             _sample = torch.distributions.RelaxedBernoulli(temperature=temp, logits=self.params).rsample(batch_size)
43 |         else:
44 |             _sample = torch.distributions.Bernoulli(logits=self.params).sample(batch_size)
45 |             
46 |         samples = _sample
47 |         if reparametrized and hard:
48 |             samples_hard = utils.one_hot(torch.argmax(_sample, -1), 2)
49 |             samples = (samples_hard - samples).detach() + samples
50 |         
51 |         logprobs = self.forward(samples)
52 |         return samples, logprobs
53 | 
54 | 
55 |     def log_prob(self, value):
56 |         if len(value.shape) < 3:
57 |             value = value.unsqueeze(-1)
58 |         return self.forward(value)
59 | 
60 |     def entropy(self, n_samples=10 ** 6):
61 |         bs = 100000
62 |         curr = 0
63 |         ent = 0.
64 |         while curr < n_samples:
65 |             curr_batch_size = min(bs, n_samples-curr)
66 |             ent -= torch.sum(self.log_prob(self.sample([curr_batch_size])))
67 |             curr += curr_batch_size
68 |         return ent/n_samples
69 | 
70 | 
71 |     def mode(self, n_samples=1000, return_adj = True, return_logprob = False):
72 |         samples, logprobs = self.sample((n_samples,), return_logprobs=True)
73 |         max_idx = torch.argmax(logprobs)
74 |         mode_ = samples[max_idx].unsqueeze(0)
75 |         if return_adj:
76 |             mode_ = utils.vec_to_adj_mat(mode_.unsqueeze(0),self.n_nodes).squeeze()
77 |         if return_logprob:
78 |             return mode_, logprobs[max_idx]
79 |         return mode_


--------------------------------------------------------------------------------
/models/vcn.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import math
 4 | import utils
 5 | 
 6 | class VCN(nn.Module):
 7 | 	def __init__(self, *, num_nodes, graph_dist, sparsity_factor = 0.0, gibbs_temp_init = 10., gibbs_update = None):
 8 | 		super().__init__()
 9 | 		self.num_nodes = num_nodes
10 | 		self.graph_dist = graph_dist
11 | 		self.sparsity_factor = sparsity_factor
12 | 		self.gibbs_temp = gibbs_temp_init
13 | 		self.gibbs_update = gibbs_update
14 | 
15 | 	def forward(self, n_samples, bge_model, e, interv_targets = None):
16 | 		samples = self.graph_dist.sample([n_samples])	
17 | 		log_probs = self.graph_dist.log_prob(samples).squeeze()
18 | 
19 | 		G = utils.vec_to_adj_mat(samples, self.num_nodes) 
20 | 		likelihood = bge_model.log_marginal_likelihood_given_g(w = G, interv_targets=interv_targets)
21 | 
22 | 		dagness = utils.expm(G, self.num_nodes)
23 | 		self.update_gibbs_temp(e)
24 | 		kl_graph = log_probs + self.gibbs_temp*dagness + self.sparsity_factor*torch.sum(G, axis = [-1, -2]) 
25 | 		return likelihood, kl_graph, log_probs
26 | 	
27 | 
28 | 	def sample(self, num_samples = 10000):
29 | 		samples = self.graph_dist.sample([num_samples])
30 | 		G = utils.vec_to_adj_mat(ret, self.num_nodes) 
31 | 		return G
32 | 
33 | 	def update_gibbs_temp(self, e):
34 | 		if self.gibbs_update is None:
35 | 			return 0
36 | 		else:
37 | 			self.gibbs_temp =  self.gibbs_update(self.gibbs_temp, e)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | backcall==0.2.0
 2 | cycler==0.10.0
 3 | decorator==4.4.2
 4 | graphical-models==0.1a10
 5 | ipdb==0.13.9
 6 | ipython==7.24.1
 7 | ipython-genutils==0.2.0
 8 | jedi==0.18.0
 9 | joblib==1.0.1
10 | kiwisolver==1.3.1
11 | matplotlib==3.4.2
12 | matplotlib-inline==0.1.2
13 | networkx==2.5.1
14 | numpy==1.20.3
15 | parso==0.8.2
16 | pexpect==4.8.0
17 | pickleshare==0.7.5
18 | Pillow==8.2.0
19 | prompt-toolkit==3.0.19
20 | ptyprocess==0.7.0
21 | Pygments==2.9.0
22 | pyparsing==2.4.7
23 | python-dateutil==2.8.1
24 | scikit-learn==0.24.2
25 | scipy==1.6.3
26 | six==1.16.0
27 | sklearn==0.0
28 | threadpoolctl==2.1.0
29 | toml==0.10.2
30 | torch==1.9.0
31 | tqdm==4.61.1
32 | traitlets==5.0.5
33 | typing-extensions==3.10.0.0
34 | wcwidth==0.2.5
35 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import os.path as osp
  2 | import shutil, os
  3 | 
  4 | import torch
  5 | import torch.nn as nn
  6 | 
  7 | import numpy as np 
  8 | import matplotlib.pyplot as plt
  9 | import pickle 
 10 | from scipy.special import comb as scomb
 11 | from scipy.spatial.distance import cdist
 12 | from itertools import product
 13 | import networkx as nx
 14 | 
 15 | def simulate_gaussian_sem(graphs, w_logvar):
 16 | 	std = torch.exp(0.5*w_logvar)
 17 | 	samples = torch.zeros(len(graphs), len(graphs[0].nodes)).to(w_logvar.device)
 18 | 	for i in range(len(graphs)):
 19 | 		graph = graphs[i]
 20 | 		for j in nx.topological_sort(graph):
 21 | 			noise = torch.normal(mean = 0., std = std[i,j])
 22 | 			parents = list(graph.predecessors(j))
 23 | 			if len(parents) == 0:
 24 | 				samples[i,j] = noise
 25 | 			else:
 26 | 				curr = noise
 27 | 				for k in parents:
 28 | 					curr += graph.edges[k,j]['weight']*samples[i,k]
 29 | 				samples[i, j] = curr
 30 | 	return samples
 31 | 
 32 | def matrix_poly(matrix, d):
 33 | 	x = torch.eye(d).to(matrix.device) + torch.div(matrix, d)
 34 | 	return torch.matrix_power(x, d)
 35 | 
 36 | def expm(A, m):
 37 | 	expm_A = matrix_poly(A, m)
 38 | 	h_A = expm_A.diagonal(dim1=-2, dim2=-1).sum(-1) - m
 39 | 	return h_A
 40 | 
 41 | def matrix_poly_np(matrix, d):
 42 | 	x = np.eye(d) + np.divide(matrix, d)
 43 | 	return np.linalg.matrix_power(x, d)
 44 | 
 45 | def expm_np(A, m):
 46 | 	expm_A = matrix_poly_np(A, m)
 47 | 	h_A = np.trace(expm_A) - m
 48 | 	return h_A
 49 | 
 50 | def all_combinations(num_nodes, num_classes=2, return_adj = False):
 51 | 	comb = list(product(list(range(num_classes)),repeat = num_nodes*(num_nodes-1)))
 52 | 	comb = np.array(comb)
 53 | 	if return_adj:
 54 | 		comb = vec_to_adj_mat_np(comb, num_nodes)
 55 | 	return comb
 56 | 
 57 | def full_prior(num_nodes, num_classes=2, data_size = None, gibbs_temp = 10., sparsity_factor=0.):
 58 | 	if data_size is None:
 59 | 		data_size = num_nodes*(num_nodes-1)
 60 | 	comb = all_combinations(num_classes, data_size)
 61 | 
 62 | 	ref_config = np.zeros(len(comb))
 63 | 	comb_full = np.zeros((len(comb), num_nodes, num_nodes))
 64 | 	for i in range(len(comb)):
 65 | 		#print(i, len(comb))
 66 | 		temp_ = np.concatenate((np.zeros((num_nodes,1)),(comb[i].reshape((num_nodes, num_nodes -1)))), axis=1)
 67 | 		for j in range(num_nodes):
 68 | 			comb_full[i,j] = np.roll(temp_[j],j)
 69 | 		resid = expm_np(comb_full[i], num_nodes)
 70 | 		ref_config[i] = np.exp(-gibbs_temp*resid-sparsity_factor*np.sum(temp_))  
 71 | 	norm_factor = np.sum(ref_config)
 72 | 	ref_config = ref_config/norm_factor
 73 | 	return ref_config, comb, norm_factor, comb_full
 74 | 
 75 | def vec_to_adj_mat(matrix, num_nodes):
 76 | 	matrix = matrix.view(-1, num_nodes, num_nodes-1)
 77 | 	matrix_full = torch.cat((torch.zeros(matrix.shape[0], num_nodes,1).to(matrix.device), matrix), dim = -1)
 78 | 	for xx in range(num_nodes):
 79 | 		matrix_full[:,xx] = torch.roll(matrix_full[:,xx], xx, -1) 
 80 | 	return matrix_full
 81 | 
 82 | def vec_to_adj_mat_np(matrix, num_nodes):
 83 | 	matrix = np.reshape(matrix, (-1, num_nodes, num_nodes-1))
 84 | 	matrix_full = np.concatenate((np.zeros((matrix.shape[0], num_nodes,1), dtype = matrix.dtype), matrix), axis = -1)
 85 | 	for xx in range(num_nodes):
 86 | 		matrix_full[:,xx] = np.roll(matrix_full[:,xx], xx, axis = -1) 
 87 | 	return matrix_full
 88 | 
 89 | def adj_mat_to_vec(matrix_full, num_nodes):
 90 | 	for xx in range(num_nodes):
 91 | 		matrix_full[:,xx] = torch.roll(matrix_full[:,xx], -xx, -1) 
 92 | 	matrix = matrix_full[..., 1:]
 93 | 	return matrix.reshape(-1, num_nodes*(num_nodes-1))
 94 | 
 95 | def adj_mat_to_vec_np(matrix_full, num_nodes):
 96 | 	for xx in range(num_nodes):
 97 | 		matrix_full[:,xx] = np.roll(matrix_full[:,xx], -xx, axis = -1) 
 98 | 	matrix = np.reshape(matrix_full[..., 1:], (matrix_full.shape[0], num_nodes*(num_nodes-1)))
 99 | 	return matrix
100 | 
101 | def shd(B_est, B_true):
102 | 	"""Compute various accuracy metrics for B_est.
103 | 
104 | 	true positive = predicted association exists in condition in correct direction
105 | 	reverse = predicted association exists in condition in opposite direction
106 | 	false positive = predicted association does not exist in condition
107 | 
108 | 	Args:
109 | 		B_true (np.ndarray): [d, d] ground truth graph, {0, 1}
110 | 		B_est (np.ndarray): [d, d] estimate, {0, 1, -1}, -1 is undirected edge in CPDAG
111 | 
112 | 	Returns:
113 | 		fdr: (reverse + false positive) / prediction positive
114 | 		tpr: (true positive) / condition positive
115 | 		fpr: (reverse + false positive) / condition negative
116 | 		shd: undirected extra + undirected missing + reverse
117 | 		nnz: prediction positive
118 | 
119 | 		Taken from https://github.com/xunzheng/notears
120 | 	"""
121 | 	if (B_est == -1).any():  # cpdag
122 | 		if not ((B_est == 0) | (B_est == 1) | (B_est == -1)).all():
123 | 			raise ValueError('B_est should take value in {0,1,-1}')
124 | 		if ((B_est == -1) & (B_est.T == -1)).any():
125 | 			raise ValueError('undirected edge should only appear once')
126 | 	else:  # dag
127 | 		if not ((B_est == 0) | (B_est == 1)).all():
128 | 			raise ValueError('B_est should take value in {0,1}')
129 | 		#if not is_dag(B_est):
130 | 		#    raise ValueError('B_est should be a DAG')
131 | 	d = B_true.shape[0]
132 | 	# linear index of nonzeros
133 | 	pred_und = np.flatnonzero(B_est == -1)
134 | 	pred = np.flatnonzero(B_est == 1)
135 | 	cond = np.flatnonzero(B_true)
136 | 	cond_reversed = np.flatnonzero(B_true.T)
137 | 	cond_skeleton = np.concatenate([cond, cond_reversed])
138 | 	# true pos
139 | 	true_pos = np.intersect1d(pred, cond, assume_unique=True)
140 | 	# treat undirected edge favorably
141 | 	true_pos_und = np.intersect1d(pred_und, cond_skeleton, assume_unique=True)
142 | 	true_pos = np.concatenate([true_pos, true_pos_und])
143 | 	# false pos
144 | 	false_pos = np.setdiff1d(pred, cond_skeleton, assume_unique=True)
145 | 	false_pos_und = np.setdiff1d(pred_und, cond_skeleton, assume_unique=True)
146 | 	false_pos = np.concatenate([false_pos, false_pos_und])
147 | 	# reverse
148 | 	extra = np.setdiff1d(pred, cond, assume_unique=True)
149 | 	reverse = np.intersect1d(extra, cond_reversed, assume_unique=True)
150 | 	# compute ratio
151 | 	pred_size = len(pred) + len(pred_und)
152 | 	cond_neg_size = 0.5 * d * (d - 1) - len(cond)
153 | 	fdr = float(len(reverse) + len(false_pos)) / max(pred_size, 1)
154 | 	tpr = float(len(true_pos)) / max(len(cond), 1)
155 | 	fpr = float(len(reverse) + len(false_pos)) / max(cond_neg_size, 1)
156 | 	# structural hamming distance
157 | 	pred_lower = np.flatnonzero(np.tril(B_est + B_est.T))
158 | 	cond_lower = np.flatnonzero(np.tril(B_true + B_true.T))
159 | 	extra_lower = np.setdiff1d(pred_lower, cond_lower, assume_unique=True)
160 | 	missing_lower = np.setdiff1d(cond_lower, pred_lower, assume_unique=True)
161 | 	shd = len(extra_lower) + len(missing_lower) + len(reverse)
162 | 	shd_wc = shd + len(pred_und)
163 | 	prc = float(len(true_pos)) / max(float(len(true_pos)+len(reverse) + len(false_pos)), 1.)
164 | 	rec = tpr
165 | 	return {'fdr': fdr, 'tpr': tpr, 'fpr': fpr, 'prc': prc, 'rec' : rec, 'shd': shd, 'shd_wc': shd_wc, 'nnz': pred_size}
166 | 
167 | def kl_mdag(log_probs, probs_gt):
168 | 	return torch.nn.functional.kl_div(log_probs, probs_gt)
169 | 
170 | def one_hot(inputs, vocab_size = None):
171 | 	"""Returns one hot of data over each element of the inputs"""
172 | 	if vocab_size is None:
173 | 		vocab_size = inputs.max() + 1
174 | 	input_shape = inputs.shape
175 | 	inputs = inputs.flatten().unsqueeze(1).long()
176 | 	z = torch.zeros(len(inputs), vocab_size).to(inputs.device)
177 | 	z.scatter_(1, inputs, 1.)
178 | 	return z.view(*input_shape, vocab_size)
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 


--------------------------------------------------------------------------------