├── theory.pdf
├── figures
│   ├── NADE_full.png
│   └── NADE_expand.png
├── tests
│   ├── sampling
│   │   ├── test_samples.pdf
│   │   ├── make_samples.jl
│   │   ├── NADE_probability
│   │   ├── test_sampling.py
│   │   └── test_utils.py
│   └── __pycache__
│       └── test_utils.cpython-37.pyc
├── tfim1D_psi
├── run.jl
├── README.md
├── NADE.jl
├── Theory.md
└── LICENSE

/theory.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isaacdevlugt/GreNADE/HEAD/theory.pdf
--------------------------------------------------------------------------------

/figures/NADE_full.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isaacdevlugt/GreNADE/HEAD/figures/NADE_full.png
--------------------------------------------------------------------------------

/figures/NADE_expand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isaacdevlugt/GreNADE/HEAD/figures/NADE_expand.png
--------------------------------------------------------------------------------

/tests/sampling/test_samples.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isaacdevlugt/GreNADE/HEAD/tests/sampling/test_samples.pdf
--------------------------------------------------------------------------------

/tests/__pycache__/test_utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/isaacdevlugt/GreNADE/HEAD/tests/__pycache__/test_utils.cpython-37.pyc
--------------------------------------------------------------------------------

/tfim1D_psi:
--------------------------------------------------------------------------------
0.47989754026195663 0.0
0.25534814770632036 0.0
0.16666666666666344 0.0
0.22454939255564144 0.0
0.16666666666666344 0.0
0.10878394077768933 0.0
0.14656420692863573 0.0
0.25534814770633 0.0
0.25534814770632036 0.0
0.14656420692863514 0.0
0.1087839407776893 0.0
0.16666666666666932 0.0
0.22454939255564138 0.0
0.16666666666666943 0.0
0.25534814770633 0.0
0.47989754026198733 0.0
--------------------------------------------------------------------------------

/tests/sampling/make_samples.jl:
--------------------------------------------------------------------------------
using DelimitedFiles

include("../../NADE.jl")

# arbitrary parameters
N = 5
Nh = 5
initialize_parameters()

space = generate_hilbert_space()
prob = probability(space)

num_samples = 10000
samples = convert(Array{Int,2}, sample(num_samples))

open("NADE_probability", "w") do io
    writedlm(io, prob)
end

open("NADE_samples", "w") do io
    writedlm(io, samples)
end
--------------------------------------------------------------------------------

/tests/sampling/NADE_probability:
--------------------------------------------------------------------------------
0.057075029372455864
0.02382279597524013
0.03309019631754118
0.01382504657568428
0.052844819156432236
0.020152280751678343
0.031199428209144543
0.011879179208153743
0.07145333072871282
0.02816952256724967
0.04126468920264857
0.016265106888402932
0.072271741828387
0.02610983995993663
0.04245955079094831
0.015279370540930004
0.044374482132848926
0.019340328398845766
0.02515769414535398
0.010992585620458224
0.04290737538748568
0.016998351701803728
0.02473137903375502
0.009811539667219663
0.05537174658464753
0.022954644218934308
0.03135251370240837
0.013042049161616828
0.05805809275410411
0.02181359684069279
0.03337939536337305
0.01255229721290572
--------------------------------------------------------------------------------

/tests/sampling/test_sampling.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

import test_utils

N = 5
num_samples = 10000

NADE_prob = np.loadtxt("NADE_probability")
NADE_samples = np.loadtxt("NADE_samples", dtype=int)

prob_inds, prob_samples = test_utils.gen_samples(num_samples, N, NADE_prob)
sample_inds = test_utils.gen_inds_from_samples(NADE_samples)

prob_uniques, prob_counts = np.unique(prob_inds, return_counts=True)
sample_uniques, sample_counts = np.unique(sample_inds, return_counts=True)

prob_counts = prob_counts / len(prob_inds)
sample_counts = sample_counts / len(sample_inds)


plt.figure()
plt.bar(prob_uniques + 0.1, prob_counts, color='blue',
        label="NADE_probability samples", align='center', width=0.25)
plt.bar(sample_uniques - 0.1, sample_counts, color='green',
        label="NADE_samples samples", align='center', width=0.25)
plt.xlabel("Basis state index")
plt.ylabel("Fractional frequency")
plt.legend()
plt.xticks(np.arange(0, 2**N, 2))
plt.savefig("test_samples.pdf", dpi=500, bbox_inches='tight')
--------------------------------------------------------------------------------

/run.jl:
--------------------------------------------------------------------------------
using Flux
using Flux.Optimise: update!
using DelimitedFiles
using Random
using Distributions
using LinearAlgebra
using ArgParse

include("NADE.jl")
include("postprocess.jl")

function parse_commandline()
    s = ArgParseSettings()
    @add_arg_table! s begin
        "--Nh"
            help = "number of hidden units"
            arg_type=Int
        "--train_path"
            help = "training data path"
            arg_type=String
        "--psi_path"
            help = "true psi path"
            arg_type=String
    end
    return parse_args(s)
end

parsed_args = parse_commandline()

Nh = parsed_args["Nh"]
train_path = parsed_args["train_path"]
psi_path = parsed_args["psi_path"]

train_data = Int.(readdlm(train_path))
true_psi = readdlm(psi_path)[:,1]

N = size(train_data, 2)
NADE_ID = rand(0:10000)

# names of files to save things to
fidelity_path = "fidelities/fidelity_N=$N"*"_Nh=$Nh"*"_ID=$NADE_ID"
parameter_path = "params/parameters_N=$N"*"_Nh=$Nh"*"_ID=$NADE_ID"

function fidelity_stopping(current_fid, desired_fid)
    return current_fid >= desired_fid
end

# Change these hyperparameters to your liking
η = 0.01
batch_size = 100
epochs = 10000
log_every = 100
opt = ADAM(η)

desired_fid = 0.995
initialize_parameters(seed=9999)

args = train(
    train_data,
    batch_size=batch_size,
    opt=opt,
    epochs=epochs,
    calc_fidelity=true,
    target=true_psi,
    early_stopping=fidelity_stopping,
    early_stopping_args=desired_fid,
    log_every=log_every
)

fidelities = args[1]

if fidelities[size(fidelities,1)] >= desired_fid
    println("Reached desired fidelity")
    open(fidelity_path, "w") do io
        writedlm(io, fidelities)
    end
    @save parameter_path θ
else
    println("Increasing Nh by 5")
    Nh += 5
    submit_new_job(Nh, train_path, psi_path)
end
--------------------------------------------------------------------------------

/tests/sampling/test_utils.py:
--------------------------------------------------------------------------------
import numpy as np
from qucumber.utils import unitaries
from itertools import product
from tqdm import tqdm  # needed by gen_data below

# Some code directly from https://github.com/emerali/rand_wvfn_sampler/blob/master/data_gen_py.ipynb

'''
Check sampling algorithm by:

- generating samples from DMRG wavefunction
- take the wavefunction from DMRG and directly sample it
- take the ED wavefunction and directly sample it
- bin everything and compare

Do this for every basis
'''


def generate_hilbert_space(size):
    dim = np.arange(2 ** size)
    space = (((dim[:, None] & (1 << np.arange(size)))) > 0)[:, ::-1]
    space = space.astype(int)
    return space


def get_samples_from_psi_indices(indices, N):
    return (((indices[:, None] & (1 << np.arange(N)))) > 0)[:, ::-1].astype(int)


def gen_samples(num_samples, N, probs):
    indices = np.random.choice(len(probs), size=num_samples, p=probs)
    return indices, get_samples_from_psi_indices(indices, N)


def gen_inds_from_samples(samples):
    inds = np.zeros(len(samples))
    for i in range(len(samples)):
        inds[i] = int("".join(str(s) for s in samples[i]), base=2)
    return inds.astype(int)


def convert_torch_cplx(tensor):
    real_part = tensor[0].detach().numpy()
    imag_part = tensor[1].detach().numpy()

    return real_part + (1j * imag_part)


def gen_all_bases(unitary_dict, num_sites):
    local_bases = unitary_dict.keys()
    return list("".join(i) for i in product(local_bases, repeat=num_sites))

def rotate_psi(unitary_dict, basis, psi):
    # assumes a two-site system: one local unitary per site
    U1 = unitary_dict[basis[0]]
    U2 = unitary_dict[basis[1]]
    unitary = np.kron(U1, U2)
    return np.dot(unitary, psi)


# NOTE: gen_data is carried over from the notebook linked above. It expects a
# basis-aware gen_samples(num_samples_per_basis, N, psi, basis, unitary_dict, vis)
# and a psi variable, neither of which is defined in this file, so it is not
# runnable as-is.
def gen_data(N, num_samples_per_basis, unitary_dict, DMRG_psi, ED_psi):

    size = 2 ** N
    vis = generate_hilbert_space(N)

    all_bases = gen_all_bases(unitary_dict, N)

    tr_bases = np.zeros((len(all_bases), N), dtype=str)
    samples = np.zeros(
        (len(all_bases), num_samples_per_basis, N), dtype=int)

    for i, basis in enumerate(tqdm(all_bases)):
        tr_bases[i, :] = np.array(list(basis))
        samples[i, :, :] = gen_samples(
            num_samples_per_basis, N, psi, basis, unitary_dict, vis)

    tr_bases = np.repeat(
        tr_bases[:, None, :], num_samples_per_basis, axis=1).reshape(-1, N)
    samples = samples.reshape(-1, N)

    all_bases = np.array(list(map(list, all_bases)))

    return all_bases, tr_bases, samples, psi
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# GreNADE

GreNADE performs quantum state reconstruction using Neural Autoregressive Distribution Estimators (NADEs).

### Usage

First, include ```NADE.jl```.

```julia
include("NADE.jl")
```

```NADE.jl``` contains all of the functions needed to train a NADE. However, the user must specify the following.

- ```train_data``` : a file containing binary samples (one configuration per row)
- ```Nh``` : the number of hidden units

Now, to initialize the NADE parameters, call the ```initialize_parameters()``` function. There are two keyword arguments for this:

- ```seed```: (default: 1234) the random seed for initializing the NADE weights
- ```zero_weights```: (Bool, default: false) whether to initialize the NADE weights to zero. This overrides the seed if one was specified: ```initialize_parameters(seed=9999, zero_weights=true)``` ignores ```seed``` and is equivalent to ```initialize_parameters(zero_weights=true)```.

The biases of the NADE are always initialized to zero. Now, specify what is needed to call the ```train``` function to train the NADE.

- ```train_data```: binary input data
- ```batch_size```: (integer, default: 100) the mini-batch size used for calculating gradients
- ```opt```: the optimization method (e.g. ```ADAM()```). These are the optimizers available in Flux.
- ```epochs```: (integer, default: 1000) number of training steps (passes through the input data)
- ```calc_fidelity```: (Bool, default: ```false```) whether to monitor the fidelity while training the NADE
- ```target```: the target quantum state. If ```calc_fidelity=true```, this is required (of course!)
- ```calc_observable```: (Bool, default: ```false```) whether to monitor an observable while training the NADE
- ```num_samples```: (integer, default: ```nothing```) if ```calc_observable=true```, the number of samples to generate from the NADE for calculating the observable
- ```observable```: (function, default: ```nothing```, returns: the value of the observable on one sample) a user-specified function that calculates the value of an observable given one sample from the NADE
- ```log_every```: the frequency (in epochs) at which the training metric (fidelity or an observable) is recorded
- ```early_stopping```: (function, default: ```nothing```, returns: Bool) a user-specified function defining a stopping criterion that, once met during training, stops the training early (i.e. before the last epoch). Its arguments must be the "current" metric value (e.g. if you're calculating fidelity, the current fidelity during training) followed by any other required arguments (see ```early_stopping_args```)
- ```early_stopping_args```: any other arguments required by the ```early_stopping``` function

If you're at all confused, see the minimal example below, or ```run.jl``` for a complete script that trains a NADE.
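Putting this together, a minimal training script might look like the following. This is only a sketch: the file name ```tfim1D_samples``` is a placeholder (it is not shipped with this repository), and the hyperparameters are arbitrary. ```run.jl``` does the same thing, driven by command-line arguments.

```julia
include("NADE.jl")

# Placeholder file names -- point these at your own data.
train_data = Int.(readdlm("tfim1D_samples"))  # binary configurations, one per row
true_psi = readdlm("tfim1D_psi")[:, 1]        # target state amplitudes

# NADE.jl reads these as globals
N = size(train_data, 2)  # number of sites
Nh = 10                  # number of hidden units
batch_size = 100         # also read by gradients()

initialize_parameters(seed=1234)

results = train(
    train_data,
    batch_size=batch_size,
    opt=ADAM(0.01),
    epochs=1000,
    calc_fidelity=true,
    target=true_psi,
    log_every=100
)

fidelities = results[1]  # fidelity recorded every log_every epochs
```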
--------------------------------------------------------------------------------

/NADE.jl:
--------------------------------------------------------------------------------
using Flux
using Flux.Optimise: update!
using DelimitedFiles
using Random
using Distributions
using LinearAlgebra
using Statistics
using JLD2

function initialize_parameters(;seed=1234, zero_weights=false)
    b = zeros(N)
    c = zeros(Nh)

    if zero_weights
        W = zeros(Nh, N)
        U = zeros(N, Nh)
    else
        r = MersenneTwister(seed)
        W = randn(r, Float64, (Nh, N)) / sqrt(N)
        U = randn(r, Float64, (N, Nh)) / sqrt(N)
    end

    # θ = (visible biases, hidden biases, hidden-to-output weights U, input-to-hidden weights W)
    global θ = (b, c, U, W)

end

function activation(v, idx)
    # hidden activations h_idx(v_{<idx}) = σ(c + W[:,1:idx-1] * v[1:idx-1])
    if idx == 1
        if length(size(v)) == 1
            return ones(Nh)
        else
            return ones(Nh, size(v,1))
        end

    else
        if length(size(v)) == 1
            return σ.(θ[2] + θ[4][:,1:idx-1] * v[1:idx-1])
        else
            return σ.(θ[2] .+ θ[4][:,1:idx-1] * transpose(v[:,1:idx-1]))
        end
    end

end

function Flux.Optimise.update!(opt, xs::Tuple, gs)
    for (x, g) in zip(xs, gs)
        update!(opt, x, g)
    end
end

function prob_v_given_vlt(vlt, idx)
    # conditional probability p(v_idx = 1 | v_{<idx})
    h = activation(vlt, idx)
    return σ.(θ[1][idx] .+ transpose(h) * θ[3][idx,:])
end

function probability(v)

    if length(size(v)) == 1
        prob = 1
        a = θ[2]
        for i in 1:N
            h = σ.(a)
            p = σ.(θ[1][i] .+ transpose(h) * θ[3][i,:])
            prob *= ( p^(v[i]) * (1 - p)^(1 - v[i]) )
            a += θ[4][:,i] * v[i]
        end

    else
        prob = ones(size(v,1))
        a = θ[2]
        for i in 2:size(v,1)
            a = hcat(a, θ[2])
        end

        for i in 1:N
            h = σ.(a)
            p = σ.(θ[1][i] .+ transpose(h) * θ[3][i,:])
            prob .*= ( p .^ (v[:,i]) .* (1 .- p) .^ (1 .- v[:,i]) )
            a .+= θ[4][:,i] .* transpose(v[:,i])
        end

    end

    return prob
end

function psi(v)
    return sqrt.(probability(v))
end

function sample(num_samples)
    # meant for > 1 sample

    v = [] # put samples here
    a = θ[2]
    for i in 2:num_samples
        a = hcat(a, θ[2])
    end

    for i in 1:N
        h = σ.(a)
        prob = σ.(θ[1][i] .+ transpose(h) * θ[3][i,:])
        v_i = rand.(Bernoulli.(prob))
        if i == 1
            v = v_i
            v = reshape(v, (num_samples,1))
        else
            v = hcat(v, v_i)
        end
        a .+= θ[4][:,i] .* transpose(v[:,i])
    end

    return v
end

function NLL(v)
    # negative log-likelihood: -Σ_i [ v_i log p_i + (1 - v_i) log(1 - p_i) ]
    if length(size(v)) == 1
        nll = 0.0
        for idx in 1:N
            p = prob_v_given_vlt(v, idx)
            nll -= v[idx] * log(p) + (1 - v[idx]) * log(1 - p)
        end
    else
        nll = zeros(size(v,1))
        for idx in 1:N
            p = prob_v_given_vlt(v, idx)
            nll .-= v[:,idx] .* log.(p) .+ (1 .- v[:,idx]) .* log.(1 .- p)
        end
        nll = sum(nll) / size(v,1)
    end

    return nll
end
function gradients(v)
    # analytic gradients of the batch-averaged NLL; please make 'v' a batch
    # (note: batch_size is read from the global scope)
    grads = [
        zeros(size(θ[1],1), batch_size),
        zeros(size(θ[2],1), batch_size),
        zeros(size(θ[3],1), size(θ[3],2), batch_size),
        zeros(size(θ[4],1), size(θ[4],2), batch_size)
    ]
    da = zeros(Nh, batch_size)

    # Loop over sites in reverse so that, when the gradient for W[:,i] is formed,
    # da holds the contributions from the conditionals at sites k > i -- these are
    # the only ones whose activations depend on W[:,i].
    for i in N:-1:1
        p = prob_v_given_vlt(v, i)
        h = activation(v, i)
        dh = transpose((p .- v[:,i]) * transpose(θ[3][i,:])) .* h .* (ones(size(h)) .- h)

        grads[1][i,:] = p .- v[:,i]
        grads[2] .+= dh
        grads[3][i, :, :] = transpose((p .- v[:,i]) .* transpose(h))
        grads[4][:,i,:] = transpose(v[:,i] .* transpose(da))

        da .+= dh
    end

    for i in 1:size(grads,1)
        grads[i] = reshape(
            sum(grads[i], dims=length(size(grads[i]))),
            size(θ[i])
        ) / batch_size
    end

    # must return a tuple
    return (grads[1], grads[2], grads[3], grads[4])
end

function fidelity(space, target)
    return dot(target, sqrt.(probability(space)))
end

function statistics_from_observable(observable, samples; args=nothing)
    obs = zeros(size(samples,1))
    for i in 1:size(samples, 1)
        obs[i] += observable(samples[i,:], args=args)
    end
    mean = sum(obs) / size(samples,1)
    variance = var(obs)
    std_error = std(obs) / sqrt(size(samples,1))

    return [mean variance std_error]
end

function train(
    train_data;
    batch_size=100,
    opt=ADAM(),
    epochs=1000,
    parameter_path=nothing,
    log_every=100,
    calc_fidelity=false,
    target=nothing,
    calc_observable=false,
    num_samples=nothing,
    observable=nothing,
    observable_args=nothing,
    early_stopping=nothing,
    early_stopping_args=nothing
)

    return_args = []

    # TODO: what if train_size % batch_size != 0
    num_batches = Int(size(train_data, 1) / batch_size)

    # allocate space for monitoring metrics
    if calc_fidelity
        space = generate_hilbert_space()
        fidelities = []
    end

    if calc_observable
        # observable value (mean), variance, std error
        observable_stats = []
    end

    count = 1
    for ep in 1:epochs
        # shuffle training data
        train_data = train_data[randperm(size(train_data, 1)),:]

        for n in 0:num_batches-1
            # pass through train_data
            batch = train_data[(n*batch_size+1):(n+1)*batch_size, :]
            grads = gradients(batch)
            update!(opt, θ, grads)
        end

        if ep%log_every == 0
            println("epoch: ", ep)

            if calc_fidelity
                fid = fidelity(space, target)
                fidelities = vcat(fidelities, fid)
                println("Fidelity = ", fid)

                if early_stopping != nothing
                    if early_stopping(fid, early_stopping_args)
                        println("Met early stopping criteria.")
                        break
                    end
                end

            end

            if calc_observable
                samples = sample(num_samples)
                stats = statistics_from_observable(
                    observable, samples, args=observable_args
                )
                if count == 1
                    observable_stats = stats
                else
                    observable_stats = vcat(observable_stats, stats)
                end

                println(string(observable)*" = ", stats)
                #if early_stopping != nothing
                #    if early_stopping(observable_stats[count,:], early_stopping_args)
println("Met early stopping criteria.") 260 | # break 261 | # end 262 | #end 263 | 264 | end 265 | 266 | count += 1 267 | 268 | end 269 | 270 | end 271 | 272 | if calc_fidelity 273 | push!(return_args, fidelities) 274 | end 275 | 276 | if calc_observable 277 | push!(return_args, observable_stats) 278 | end 279 | return return_args 280 | 281 | end 282 | 283 | function save_params(path) 284 | @save path θ 285 | end 286 | 287 | function generate_hilbert_space() 288 | dim = [i for i in 0:2^N-1] 289 | space = space = parse.(Int64, split(bitstring(dim[1])[end-N+1:end],"")) 290 | 291 | for i in 2:length(dim) 292 | tmp = parse.(Int64, split(bitstring(dim[i])[end-N+1:end],"")) 293 | space = hcat(space, tmp) 294 | end 295 | 296 | return transpose(space) 297 | end 298 | -------------------------------------------------------------------------------- /Theory.md: -------------------------------------------------------------------------------- 1 | # Neural Autoregressive Distribution Estimators (NADEs) 2 | 3 | ## Introduction 4 | 5 | Modeling states (ground or thermal) in computational physics requires calculating the partition function - an expression that scales exponentially with the number of constituents and is thus generally intractable. As a workaround, physicists commonly use probabilistic sampling methods, many of which are based on a Markov Chain (MC). A huge drawback about MC-based sampling is that the equilibration time required to generate uncorrelated samples can be very long. 6 | 7 | Restricted Boltzmann Machines (RBMs) are a class of generative models that have many appealing properties as a tool for statistical physics, yet it too is burdened by a MC-like procedure called Gibbs sampling. 8 | 9 | In this post, we discuss an alternative that can be used for the purpose of state modeling: Neural Autoregressive Distributions Estimators (NADEs). The NADE is a generative model which is inspired by the RBM architecture, but unlike the RBM, it does not employ a MC-based sampling method. Algorithms wherein the partition function need not be calculated, yet the probability distribution defined by the model can be directly sampled, are called autoregressive. 10 | 11 | 12 | 13 | ## An RBM as a Bayesian Network 14 | 15 | The probability of a sample's occurence, as modelled by an RBM, requires the calculation of the partition function, which is intractable. Recall that for an RBM, 16 | 17 | $$ 18 | Z = \sum_{\mathbf{h} \in \mathcal{H}_{\mathbf{h}}} \sum_{\mathbf{v} \in \mathcal{H}_{\mathbf{v}}} e^{-E(\mathbf{v},\mathbf{h})}, 19 | $$ 20 | 21 | and 22 | 23 | $$ 24 | p(\mathbf{v}) = \frac{e^{-\sum_{\mathbf{h} \in \mathcal{H}_{\mathbf{h}}}E(\mathbf{v},h)}}{Z}, 25 | $$ 26 | 27 | where $\mathbf{v}$ and $\mathbf{h}$ denote the visible and hidden layer of the RBM, respectively. Autoregressive models define a probability distribution that is the product of conditional disitributions of the $i^{\text{th}}$ visible unit ($v_i$) given all preceeding visible units ($\mathbf{v}_{i}, \mathbf{h} \vert \mathbf{v}_{i}, \mathbf{h} \vert \mathbf{v}_{i}, \mathbf{h} \vert \mathbf{v}_{i}, \mathbf{h} \vert \mathbf{v}_{i} \vert \mathbf{v}_{i}) \\ 46 | =& q(v_i \vert \mathbf{v}_{ i} q(v_j \vert \mathbf{v}_{