├── .gitignore
├── variantNet
│   ├── __init__.py
│   ├── utils.py
│   └── vn.py
├── images
│   └── CYP2D6_example.jpg
├── fetch_data.sh
├── helper_scripts
│   ├── get_variant_set.py
│   ├── get_train_data_chr22.sh
│   ├── get_train_data_chr21.sh
│   ├── get_SNP_candidates.py
│   └── get_alignment_tensor.py
├── README.md
└── jupyter_nb
    └── demo.ipynb

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/variantNet/__init__.py:
--------------------------------------------------------------------------------
from . import *

--------------------------------------------------------------------------------
/images/CYP2D6_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pb-jchin/VariantNET/HEAD/images/CYP2D6_example.jpg

--------------------------------------------------------------------------------
/fetch_data.sh:
--------------------------------------------------------------------------------
# get testing BAM files and reference sequences
wget https://www.dropbox.com/s/fwyvu24hdxqv06i/VariantNet_testing_data.tgz
tar zxvf VariantNet_testing_data.tgz
# get preprocessed alignment tensor data for training and validation
wget https://www.dropbox.com/s/745m7mhzkj3rx28/VariantNet_wd.tgz
tar zxvf VariantNet_wd.tgz

--------------------------------------------------------------------------------
/helper_scripts/get_variant_set.py:
--------------------------------------------------------------------------------
import sys

# read a (stripped) VCF from stdin and emit "pos ref alt allele1 allele2",
# with the two genotype allele indices sorted in ascending order
for row in sys.stdin:
    row = row.strip().split()
    if row[0][0] == "#":
        continue
    last = row[-1]
    het_hom = last.split(":")[0].replace("/", "|").split("|")
    p1, p2 = het_hom
    p1 = int(p1)
    p2 = int(p2)
    p1, p2 = (p1, p2) if p1 < p2 else (p2, p1)
    print row[1], row[3], row[4], p1, p2

--------------------------------------------------------------------------------
/helper_scripts/get_train_data_chr22.sh:
--------------------------------------------------------------------------------
python ../helper_scripts/get_SNP_candidates.py --bam_file_fn ../testing_data/chr22/hg38.NA12878-WashU_chr22-18924717-49973797.bam \
                                               --ref_fasta_fn ../testing_data/chr22/chr22.fa \
                                               --pm_count_fn pm_counts_chr22 --ctg_name chr22
python ../helper_scripts/get_alignment_tensor.py --bam_file_fn ../testing_data/chr22/hg38.NA12878-WashU_chr22-18924717-49973797.bam \
                                                 --pm_count_fn pm_counts_chr22 \
                                                 --ref_fasta_fn ../testing_data/chr22/chr22.fa \
                                                 --ctg_name chr22 > aln_tensor_chr22
python ../helper_scripts/get_variant_set.py < ../testing_data/chr22/chr22.vcf > variants_chr22

--------------------------------------------------------------------------------
/helper_scripts/get_train_data_chr21.sh:
--------------------------------------------------------------------------------
python ../helper_scripts/get_SNP_candidates.py --bam_file_fn ../testing_data/chr21/hg38.NA12878-WashU_chr21-14069662-46411975.bam \
                                               --ref_fasta_fn ../testing_data/chr21/chr21.fa \
                                               --pm_count_fn pm_counts_chr21 --ctg_name chr21
python ../helper_scripts/get_alignment_tensor.py --bam_file_fn ../testing_data/chr21/hg38.NA12878-WashU_chr21-14069662-46411975.bam \
                                                 --pm_count_fn pm_counts_chr21 \
                                                 --ctg_name chr21 \
                                                 --ref_fasta_fn ../testing_data/chr21/chr21.fa > aln_tensor_chr21

python ../helper_scripts/get_variant_set.py < ../testing_data/chr21/chr21.vcf > variants_chr21
--------------------------------------------------------------------------------
/variantNet/utils.py:
--------------------------------------------------------------------------------
import intervaltree
import numpy as np
import random

def get_batch(X, Y, size=100):
    # sample a random contiguous batch of training examples
    s = random.randint(0, len(X) - size)
    return X[s:s+size], Y[s:s+size]

def get_aln_array( aln_tensor_fn ):

    X_initial = {}

    with open( aln_tensor_fn ) as f:
        for row in f:
            row = row.strip().split()
            pos = int(row[0])
            ref_seq = row[1]
            ref_seq = ref_seq.upper()

            if ref_seq[7] not in ["A", "C", "G", "T"]:
                continue

            vec = np.reshape(np.array([float(x) for x in row[2:]]), (15, 3, 4))

            vec = np.transpose(vec, axes=(0, 2, 1))
            if sum(vec[7, :, 0]) < 5:  # skip candidate sites covered by fewer than 5 reads
                continue

            # store the 2nd and 3rd matrices as differences from the reference matrix
            vec[:, :, 1] -= vec[:, :, 0]
            vec[:, :, 2] -= vec[:, :, 0]

            X_initial[pos] = vec

    all_pos = sorted(X_initial.keys())

    Xarray = []
    pos_array = []
    for pos in all_pos:
        Xarray.append(X_initial[pos])
        pos_array.append(pos)
    Xarray = np.array(Xarray)

    return Xarray, pos_array

def get_training_array( aln_tensor_fn, variant_set_fn, mask_bed_fn ):
    base2num = dict(zip("ACGT", (0, 1, 2, 3)))

    # interval tree of the high-confidence regions from the BED file
    tree = intervaltree.IntervalTree()
    with open(mask_bed_fn) as f:
        for row in f:
            row = row.strip().split()
            b = int(row[1])
            e = int(row[2])
            tree.addi(b, e, None)

    Y_initial = {}
    with open( variant_set_fn ) as f:
        for row in f:
            row = row.strip().split()
            if row[3] == "0":
                het = True
            else:
                het = False

            pos = int(row[0])
            if len(tree.search(pos)) == 0:
                continue
            base_vec = [0, 0, 0, 0, 0, 0, 0, 0]  # first 4: base vector; last 4: het, hom, non-variant, not-SNP
            if het:
                base_vec[base2num[row[1][0]]] = 0.5
                base_vec[base2num[row[2][0]]] = 0.5
                base_vec[4] = 1.
            else:
                base_vec[base2num[row[2][0]]] = 1
                base_vec[5] = 1.

            if len(row[1]) > 1 or len(row[2]) > 1:  # not a simple SNP case
                base_vec[7] = 1.
                base_vec[4] = 0.
                base_vec[5] = 0.

            Y_initial[pos] = base_vec

    # mark variants that lie within 12 bp of each other as "complicated"
    Y_pos = sorted(Y_initial.keys())
    cpos = Y_pos[0]
    for pos in Y_pos[1:]:
        if abs(pos - cpos) < 12:
            Y_initial[pos][7] = 1
            Y_initial[cpos][7] = 1

            Y_initial[pos][4] = 0
            Y_initial[cpos][4] = 0
            Y_initial[pos][5] = 0
            Y_initial[cpos][5] = 0
        cpos = pos

    X_initial = {}

    with open( aln_tensor_fn ) as f:
        for row in f:
            row = row.strip().split()
            pos = int(row[0])
            if len(tree.search(pos)) == 0:
                continue
            ref_seq = row[1]
            if ref_seq[7] not in ["A", "C", "G", "T"]:
                continue
            vec = np.reshape(np.array([float(x) for x in row[2:]]), (15, 3, 4))

            vec = np.transpose(vec, axes=(0, 2, 1))
            if sum(vec[7, :, 0]) < 5:
                continue

            vec[:, :, 1] -= vec[:, :, 0]
            vec[:, :, 2] -= vec[:, :, 0]

            X_initial[pos] = vec

            # candidate sites without a VCF record are labeled as non-variant
            if pos not in Y_initial:
                base_vec = [0, 0, 0, 0, 0, 0, 0, 0]
                base_vec[base2num[ref_seq[7]]] = 1
                base_vec[6] = 1.
                Y_initial[pos] = base_vec

    all_pos = sorted(X_initial.keys())
    random.shuffle(all_pos)  # shuffle the training examples

    Xarray = []
    Yarray = []
    pos_array = []
    for pos in all_pos:
        Xarray.append(X_initial[pos])
        Yarray.append(Y_initial[pos])
        pos_array.append(pos)
    Xarray = np.array(Xarray)
    Yarray = np.array(Yarray)

    return Xarray, Yarray, pos_array

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
In this repository, we demonstrate using a simple convolutional neural network (implemented with TensorFlow) for variant calling. This is especially useful for DNA sequences with many insertion and deletion errors (e.g. from single-molecule DNA sequencing on PacBio's platform). Insertion and deletion errors cause more alignment slippage, so even for SNP variants the useful information may not lie precisely in the column of the SNP location in the alignments. With a CNN, it is possible to aggregate information from nearby bases, so it can outperform simple pile-up counting for variant calling. This is a simple test to see how well we can do without signal-level information. With the signal-level information and a better alignment model, it should be possible to further improve the performance.


## VariantNET

VariantNet is a small neural network that makes variant calls from aggregated alignment information. Unlike DeepVariant (http://www.biorxiv.org/content/early/2016/12/14/092890), we don't construct pile-up images and send the images to Google's DistBelief for computation. Instead, the alignments in a BAM file are converted to three 15 by 4 matrices for training the network and calling variants.

The first matrix simply encodes the expected reference sequence using one-hot-like encoding. For a candidate variant site, we pad 7 bases on both the right and the left. The number of reads aligned to each reference position is encoded in this first matrix. The 2nd matrix encodes the difference between all the bases observed in the read-reference alignments before a reference location and the expected observations (= the first matrix). The 3rd matrix is similar to the 2nd matrix, except that none of the insertion bases in the reads are counted. (We will show an example below.)

The neural network used for training and calling variants contains two convolution/max-pooling layers. We avoid operations that do not maintain the symmetry of the 4 different bases. For example, the max-pooling layers only pool over different locations and do not mix a subset of the bases, and the convolution filters apply to all 4 bases at the same time. After the two convolution layers, we add three fully connected layers. The output layer contains two groups of outputs. With the first 4 output units, we want to learn the possible bases at the site of interest. For example, if the data indicates the site has the base "C", we train the network to output `[0, 1, 0, 0]`. If a site has a heterozygous variant, for example "A/G", then we want to output `[0.5, 0, 0.5, 0]`. We use a mean-square loss for these 4 units. The 2nd group of output units encodes the variant type. We use a vector of 4 elements to encode the possible scenarios: a call can be "het" (heterozygous), "hom" (homozygous), "non-var" (non-variant), or "c-var" (complicated variant). We use a softmax layer with a cross-entropy loss function for these 4 units.
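
To make the two output groups concrete, here is a minimal sketch of how an 8-element training target is assembled for a simple SNP site, mirroring `get_training_array` in `variantNet/utils.py` (NumPy only; the function name `make_target` is illustrative, not part of the package):

```python
import numpy as np

base2num = dict(zip("ACGT", (0, 1, 2, 3)))

def make_target(ref_base, alt_base, het):
    # first 4 entries: base probabilities; last 4: het, hom, non-var, c-var
    y = np.zeros(8)
    if het:
        y[base2num[ref_base]] = 0.5  # heterozygous: split the mass over the two alleles
        y[base2num[alt_base]] = 0.5
        y[4] = 1.0                   # "het"
    else:
        y[base2num[alt_base]] = 1.0  # homozygous alternative allele
        y[5] = 1.0                   # "hom"
    return y

print(make_target("A", "G", het=True))  # -> [0.5 0.  0.5 0.  1.  0.  0.  0. ]
```

During training, `variantNet/vn.py` applies the mean-square loss to the first four entries of this vector and the softmax cross-entropy loss to the last four.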

## Training and testing data

We take an NA12878 PacBio read dataset generated by WUSTL (https://www.ncbi.nlm.nih.gov//bioproject/PRJNA323611) and align it to GRCh38 with `bwa mem`. We train the NN on a SNP call set generated by the GIAB project (ftp://ftp-trace.ncbi.nlm.nih.gov:/giab/ftp/release/NA12878_HG001/NISTv3.3.2/GRCh38, see also: https://groups.google.com/forum/#!topic/genome-in-a-bottle/s-t5vq8mBlQ). Like all short-read-based call sets, there are many regions in the human genome where the short reads cannot be aligned properly to call SNPs. We only train on those high-confidence regions. However, with the PacBio read length and non-systematic random errors, once we have trained a variant caller, we should be able to apply it to some more difficult regions to call SNPs more comprehensively across a genome.

As a proof of principle, we only train using the calls on chromosome 21 and test on chromosome 22. The IGV screenshot below shows various VariantNet calls within the CYP2D6 region, where there are no high-confidence variant calls from the GIAB project.

![VariantNet Calls for CYP2D6](https://raw.githubusercontent.com/pb-jchin/VariantNET/master/images/CYP2D6_example.jpg "VariantNet Calls for CYP2D6")


This simple work results from an exercise to get my feet wet learning neural networks beyond just knowing the theory. It also shows that a simple neural network, rather than a big one, can already help solve some simple but relatively interesting problems in genomics.

I have not written an independent script to chain all the machinery together. You can see an example in the Jupyter Notebook: https://github.com/pb-jchin/VariantNET/blob/master/jupyter_nb/demo.ipynb. The neural network model is defined in https://github.com/pb-jchin/VariantNET/blob/master/variantNet/vn.py.
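
Until such a driver script exists, the following minimal sketch (assembled from the notebook cells; the file paths and iteration count follow the demo and assume the pre-generated `wd` dataset) shows how the pieces chain together:

```python
import variantNet.utils as utils
import variantNet.vn as vn

# alignment tensors and labels, restricted to the GIAB high-confidence regions
Xarray, Yarray, pos_array = utils.get_training_array(
    "../wd/aln_tensor_chr21",
    "../wd/variants_chr21",
    "../testing_data/chr21/CHROM21_v.3.3.2_highconf_noinconsistent.bed")

vnn = vn.VariantNet()
vnn.init()
for i in range(2401):  # mini-batch training, as in the demo notebook
    Xbatch, Ybatch = utils.get_batch(Xarray[:30000], Yarray[:30000], size=500)
    vnn.train(Xbatch, Ybatch)

# predicted base composition and call type for each candidate site
bases, call_types = vnn.predict(Xarray[30000:40000])
```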

July 13, 2017

Jason Chin

--------------------------------------------------------------------------------
/variantNet/vn.py:
--------------------------------------------------------------------------------
import tensorflow as tf

class VariantNet(object):

    def __init__(self, input_shape = (15, 4, 3),
                       output_shape1 = (4, ),
                       output_shape2 = (4, ),
                       kernel_size1 = (2, 4),
                       kernel_size2 = (3, 4),
                       pool_size1 = (7, 1),
                       pool_size2 = (3, 1),
                       filter_num = 48,
                       hidden_layer_unit_number = 48):
        self.input_shape = input_shape
        self.output_shape1 = output_shape1
        self.output_shape2 = output_shape2
        self.kernel_size1 = kernel_size1
        self.kernel_size2 = kernel_size2
        self.pool_size1 = pool_size1
        self.pool_size2 = pool_size2
        self.filter_num = filter_num
        self.hidden_layer_unit_number = hidden_layer_unit_number
        self.g = tf.Graph()
        self._build_graph()
        self.session = tf.Session(graph = self.g)


    def _build_graph(self):
        with self.g.as_default():
            X_in = tf.placeholder(tf.float32, [None, self.input_shape[0],
                                                     self.input_shape[1],
                                                     self.input_shape[2]])

            Y_out = tf.placeholder(tf.float32, [None, self.output_shape1[0] + self.output_shape2[0]])

            self.X_in = X_in
            self.Y_out = Y_out

            conv1 = tf.layers.conv2d(
                inputs=X_in,
                filters=self.filter_num,
                kernel_size=self.kernel_size1,
                padding="same",
                activation=tf.nn.elu)

            pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=self.pool_size1, strides=1)

            conv2 = tf.layers.conv2d(
                inputs=pool1,
                filters=self.filter_num,
                kernel_size=self.kernel_size2,
                padding="same",
                activation=tf.nn.elu)

            pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=self.pool_size2, strides=1)

            # with stride 1, each max-pooling layer shrinks a dimension by (pool_size - 1)
            flat_size = ( 15 - (self.pool_size1[0] - 1) - (self.pool_size2[0] - 1))
            flat_size *= ( 4 - (self.pool_size1[1] - 1) - (self.pool_size2[1] - 1))
            flat_size *= self.filter_num

            conv2_flat = tf.reshape(pool2, [-1, flat_size])

            unit_num = self.hidden_layer_unit_number
            h1 = tf.layers.dense(inputs=conv2_flat, units=unit_num, activation=tf.nn.elu)
            dropout1 = tf.layers.dropout(inputs=h1, rate=0.50)

            h2 = tf.layers.dense(inputs=dropout1, units=unit_num, activation=tf.nn.elu)
            dropout2 = tf.layers.dropout(inputs=h2, rate=0.50)

            h3 = tf.layers.dense(inputs=dropout2, units=unit_num, activation=tf.nn.elu)
            dropout3 = tf.layers.dropout(inputs=h3, rate=0.50)

            # two output heads: base composition (Y1) and call type (Y2 logits, Y3 softmax)
            Y1 = tf.layers.dense(inputs=dropout3, units=self.output_shape1[0], activation=tf.nn.sigmoid)
            Y2 = tf.layers.dense(inputs=dropout3, units=self.output_shape2[0], activation=tf.nn.elu)
            Y3 = tf.nn.softmax(Y2)

            self.Y1 = Y1
            self.Y3 = Y3

            # mean-square loss on the base head plus cross-entropy on the call-type head
            loss = tf.reduce_sum( tf.pow( Y1 - tf.slice(Y_out, [0, 0], [-1, self.output_shape1[0]] ), 2) ) +\
                   tf.reduce_sum( tf.nn.softmax_cross_entropy_with_logits( logits=Y2,
                                      labels=tf.slice( Y_out, [0, self.output_shape1[0]],
                                                       [-1, self.output_shape2[0]] ) ) )
            self.loss = loss
            learning_rate = 0.0005
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
            training_op = optimizer.minimize(loss)
            self.training_op = training_op
            self.init_op = tf.global_variables_initializer()

    def init(self):
        self.session.run( self.init_op )

    def close(self):
        self.session.close()

    def train(self, batchX, batchY):
        X_in = self.X_in
        Y_out = self.Y_out
        loss, _ = self.session.run( (self.loss, self.training_op), feed_dict={X_in: batchX, Y_out: batchY})
        return loss

    def get_loss(self, batchX, batchY):
        X_in = self.X_in
        Y_out = self.Y_out
        loss = self.session.run( self.loss, feed_dict={X_in: batchX, Y_out: batchY})
        return loss

    def save_parameters(self, fn):
        with self.g.as_default():
            self.saver = tf.train.Saver()
            self.saver.save(self.session, fn)

    def restore_parameters(self, fn):
        with self.g.as_default():
            self.saver = tf.train.Saver()
            self.saver.restore(self.session, fn)

    def predict(self, Xarray):
        with self.g.as_default():
            bases_, type_ = self.session.run( (self.Y1, self.Y3), feed_dict={self.X_in: Xarray})
        return bases_, type_

    def __del__(self):
        self.session.close()

--------------------------------------------------------------------------------
/helper_scripts/get_SNP_candidates.py:
--------------------------------------------------------------------------------
from falcon_kit.FastaReader import FastaReader
import argparse
import os
import re
import shlex
import subprocess
import sys
from math import log


cigar_re = r"(\d+)([MIDNSHP=X])"

def output_count(pos, base_count, ref_base, min_cov, th):

    total_count = 0
    total_count += sum(c[1] for c in base_count)
    if total_count < min_cov:
        return None

    base_count.sort(key = lambda x: -x[1])
    p0 = 1.0 * base_count[0][1] / total_count  # frequency of the most common base
    p1 = 1.0 * base_count[1][1] / total_count  # frequency of the 2nd most common base
    output_line = []
    if (p0 < 1.0 - th and p1 > th) or base_count[0][0] != ref_base:
        output_line = [pos+1, ref_base, total_count]
        output_line.extend( ["%s %d" % x for x in base_count] )
        output_line = " ".join([str(c) for c in output_line])
        return total_count, output_line
    else:
        return None

def make_variant_candidates( args ):

    bam_file_fn = args.bam_file_fn
    pm_count_fn = args.pm_count_fn
    threshold = args.threshold
    min_cov = args.min_cov
    ctg_name = args.ctg_name
    samtools = args.samtools
    ref_fasta_fn = args.ref_fasta_fn

    # assume the ref.fa has only one reference; the name does not matter, we only read the first one
    ref_seq = None
    for r in FastaReader(ref_fasta_fn):
        if r.name != ctg_name:
            continue
        ref_seq = r.sequence

    if ref_seq == None:
        print >> sys.stderr, "Can't get reference sequence"
        sys.exit(1)

    # maybe we should check if the samtools path is valid
    p = subprocess.Popen(shlex.split("%s view %s" % (samtools, bam_file_fn)), stdout=subprocess.PIPE)
    pileup = {}

    pm_count_f = open(pm_count_fn, "w")

    for l in p.stdout:
        l = l.strip().split()
        if l[0][0] == "@":
            continue

        QNAME = l[0]
        RNAME = l[2]

        if RNAME != ctg_name:
            continue

        FLAG = int(l[1])
        POS = int(l[3]) - 1  # make it zero-based to match the sequence index
        CIGAR = l[5]
        SEQ = l[9]
        rp = POS
        qp = 0

        skip_base = 0
        total_aln_pos = 0
        for m in re.finditer(cigar_re, CIGAR):
            adv = int(m.group(1))
            total_aln_pos += adv
            if m.group(2) == "S":
                skip_base += adv

        if 1.0 - 1.0 * skip_base / (total_aln_pos + 1) < 0.50:  # if a read is less than 50% aligned, skip it
            continue

        for m in re.finditer(cigar_re, CIGAR):

            adv = int(m.group(1))

            if m.group(2) == "S":
                qp += adv

            if m.group(2) in ("M", "=", "X"):
                matches = []
                for i in range(adv):
                    matches.append( (rp, SEQ[qp]) )
                    rp += 1
                    qp += 1
                for pos, b in matches:
                    pileup.setdefault(pos, {"A": 0, "C": 0, "G": 0, "T": 0})
                    pileup[pos][b] += 1
            elif m.group(2) == "I":
                for i in range(adv):
                    qp += 1
            elif m.group(2) == "D":
                for i in range(adv):
                    rp += 1

        pos_k = pileup.keys()
        pos_k.sort()

        th = threshold
        for pos in pos_k:
            if pos < POS:  # output pileup information before POS, which is the current head of the reference
                base_count = pileup[pos].items()
                ref_base = ref_seq[pos]
                out = output_count(pos, base_count, ref_base, min_cov, th)
                if out != None:
                    total_count, out_line = out
                    print >> pm_count_f, out_line

                del pileup[pos]

    # for the last positions
    th = threshold
    pos_k = pileup.keys()
    pos_k.sort()
    for pos in pos_k:
        base_count = pileup[pos].items()
        ref_base = ref_seq[pos]
        out = output_count(pos, base_count, ref_base, min_cov, th)
        if out != None:
            total_count, out_line = out
            print >> pm_count_f, out_line

        del pileup[pos]



if __name__ == "__main__":


    parser = argparse.ArgumentParser(description='Generate SNP candidates using alignment pile-up')

    parser.add_argument('--bam_file_fn', type=str, default="input.bam",
                        help="path to the sorted BAM file that contains the alignments, default: input.bam")

    parser.add_argument('--pm_count_fn', type=str, default="pm_count",
                        help="pile-up count output, default: pm_count")

    parser.add_argument('--ref_fasta_fn', type=str, default="ref.fa",
                        help="path to the reference fasta file, default: ref.fa")

    parser.add_argument('--threshold', type=float, default=0.15,
                        help="minimum frequency threshold for the 2nd allele to be considered as a candidate site, default: 0.15")

    parser.add_argument('--min_cov', type=float, default=10,
                        help="minimum coverage for making a variant call, default: 10")

    parser.add_argument('--ctg_name', type=str, default="ctg",
                        help="the reference name, default: ctg")

    parser.add_argument('--samtools', type=str, default="samtools",
                        help="the path to the `samtools` command, default: samtools")



    args = parser.parse_args()

    make_variant_candidates(args)

--------------------------------------------------------------------------------
/helper_scripts/get_alignment_tensor.py:
--------------------------------------------------------------------------------

from falcon_kit.FastaReader import FastaReader
from collections import Counter
import argparse
import logging
import os
import re
import shlex
import subprocess
import sys
import numpy as np
from math import log


cigar_re = r"(\d+)([MIDNSHP=X])"
base2num = dict(zip("ACGT", (0, 1, 2, 3)))

def generate_aln_count_tensor(alns, center, ref_seq):
    # accumulate base counts into a 15 (position) x 3 (matrix) x 4 (base) tensor
    aln_code = np.zeros( (15, 3, 4) )
    for aln in alns:
        for rp, qp, rb, qb in aln:
            if rb not in ("A", "C", "G", "T", "-"):  # "-" marks an insertion in the read
                continue
            if rp - center >= -8 and rp - center < 7 and qb != "-":
                offset = rp - center + 8
if rb != "-": 32 | aln_code[offset][0][ base2num[rb] ] += 1 33 | aln_code[offset][1][ base2num[qb] ] += 1 34 | aln_code[offset][2][ base2num[qb] ] += 1 35 | else: 36 | aln_code[offset][1][ base2num[qb] ] += 1 37 | output_line = [] 38 | output_line.append( "%d %s" % (center, ref_seq[center-8:center+7]) ) 39 | for c1 in np.reshape(aln_code, 15*3*4): 40 | output_line.append("%0.1f" % c1) 41 | return " ".join(output_line) 42 | 43 | def output_aln_tensor(args): 44 | 45 | bam_file_fn = args.bam_file_fn 46 | pm_count_fn = args.pm_count_fn 47 | ctg_name = args.ctg_name 48 | samtools = args.samtools 49 | ref_fasta_fn = args.ref_fasta_fn 50 | 51 | # assume the ref.fa has only one reference, the name does not mattere, we only read the first one 52 | ref_seq = None 53 | for r in FastaReader(ref_fasta_fn): 54 | if r.name != ctg_name: 55 | continue 56 | ref_seq = r.sequence 57 | 58 | if ref_seq == None: 59 | print >> sys.stderr, "Can't get reference sequence" 60 | sys.exit(1) 61 | 62 | 63 | begin2end = {} 64 | with open(pm_count_fn) as f: 65 | for row in f.readlines(): 66 | row = row.strip().split() 67 | pos = int(row[0]) 68 | begin2end[ pos-8 ] = (pos + 8, pos) 69 | 70 | # maybe we should check if the samtools path is valid 71 | p = subprocess.Popen(shlex.split("%s view %s" % (samtools, bam_file_fn) ), stdout=subprocess.PIPE) 72 | 73 | center_to_aln = {} 74 | 75 | for l in p.stdout: 76 | l = l.strip().split() 77 | if l[0][0] == "@": 78 | continue 79 | 80 | QNAME = l[0] 81 | FLAG = int(l[1]) 82 | RNAME = l[2] 83 | POS = int(l[3]) - 1 #make it zero base to match sequence index 84 | CIGAR = l[5] 85 | SEQ = l[9] 86 | rp = POS 87 | qp = 0 88 | 89 | end_to_center = {} 90 | active_set = set() 91 | 92 | for m in re.finditer(cigar_re, CIGAR): 93 | adv = int(m.group(1)) 94 | if m.group(2) == "S": 95 | qp += adv 96 | if m.group(2) in ("M", "=", "X"): 97 | matches = [] 98 | for i in xrange(adv): 99 | matches.append( (rp, SEQ[qp]) ) 100 | 101 | if rp in begin2end: 102 | r_end, r_center = begin2end[rp] 103 | end_to_center[r_end] = r_center 104 | active_set.add(r_center) 105 | center_to_aln.setdefault(r_center, []) 106 | center_to_aln[r_center].append([]) 107 | 108 | for center in list(active_set): 109 | center_to_aln[center][-1].append( (rp, qp, ref_seq[rp], SEQ[qp] ) ) 110 | 111 | if rp in end_to_center: 112 | center = end_to_center[rp] 113 | active_set.remove(center) 114 | 115 | rp += 1 116 | qp += 1 117 | 118 | elif m.group(2) == "I": 119 | for i in range(adv): 120 | for center in list(active_set): 121 | center_to_aln[center][-1].append( (rp, qp, "-", SEQ[qp] )) 122 | qp += 1 123 | 124 | elif m.group(2) == "D": 125 | for i in xrange(adv): 126 | for center in list(active_set): 127 | center_to_aln[center][-1].append( (rp, qp, ref_seq[rp], "-" )) 128 | 129 | if rp in begin2end: 130 | r_end, r_center = begin2end[rp] 131 | end_to_center[r_end] = r_center 132 | active_set.add(r_center) 133 | center_to_aln.setdefault(r_center, []) 134 | center_to_aln[r_center].append([]) 135 | 136 | if rp in end_to_center: 137 | center = end_to_center[rp] 138 | active_set.remove(center) 139 | 140 | rp += 1 141 | 142 | 143 | for center in center_to_aln.keys(): 144 | if center + 8 < POS: 145 | t_line = generate_aln_count_tensor(center_to_aln[center], center, ref_seq) 146 | print t_line 147 | del center_to_aln[center] 148 | 149 | for center in center_to_aln.keys(): 150 | if center + 8 < POS: 151 | t_line = generate_aln_count_tensor(center_to_aln[center], center, ref_seq) 152 | print t_line 153 | 154 | 155 | if __name__ == "__main__": 156 | 
    parser = argparse.ArgumentParser(
        description='Generate a 15x4x3 "tensor" summarizing local alignments from a BAM file and a list of candidate locations' )

    parser.add_argument('--bam_file_fn', type=str, default="input.bam",
                        help="path to the sorted BAM file that contains the alignments, default: input.bam")

    parser.add_argument('--pm_count_fn', type=str, default="pm_counts",
                        help="pile-up count input, default: pm_counts")

    parser.add_argument('--ref_fasta_fn', type=str, default="ref.fa",
                        help="path to the reference fasta file, default: ref.fa")


    parser.add_argument('--ctg_name', type=str, default="ctg",
                        help="the reference name, default: ctg")

    parser.add_argument('--samtools', type=str, default="samtools",
                        help="the path to the `samtools` command, default: samtools")

    args = parser.parse_args()

    output_aln_tensor(args)

--------------------------------------------------------------------------------
/jupyter_nb/demo.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# VariantNet\n",
    "\n",
    "VariantNet is a small neural network that makes variant calls from aggregated alignment information. Unlike DeepVariant (http://www.biorxiv.org/content/early/2016/12/14/092890), we don't construct pile-up images and send the images to Google's DistBelief for computation. Instead, the alignments in a BAM file are converted to three 15 by 4 matrices for training the network and calling variants. \n",
    "\n",
    "The first matrix simply encodes the expected reference sequence using one-hot-like encoding. For a candidate variant site, we pad 7 bases on both the right and the left. The number of reads aligned to each reference position is encoded in this first matrix. The 2nd matrix encodes the difference between all the bases observed in the read-reference alignments before a reference location and the expected observations (= the first matrix). The 3rd matrix is similar to the 2nd matrix, except that none of the insertion bases in the reads are counted. (We will show an example below.)\n",
    "\n",
    "The neural network used for training and calling variants contains two convolution/max-pooling layers. We avoid operations that do not maintain the symmetry of the 4 different bases. For example, the max-pooling layers only pool over different locations and do not mix a subset of the bases, and the convolution filters apply to all 4 bases at the same time. After the two convolution layers, we add three fully connected layers. The output layer contains two groups of outputs. With the first 4 output units, we want to learn the possible bases at the site of interest. For example, if the data indicates the site has the base \"C\", we train the network to output `[0, 1, 0, 0]`. If a site has a heterozygous variant, for example \"A/G\", then we want to output `[0.5, 0, 0.5, 0]`. We use a mean-square loss for these 4 units. The 2nd group of output units encodes the variant type. We use a vector of 4 elements to encode the possible scenarios: a call can be \"het\" (heterozygous), \"hom\" (homozygous), \"non-var\" (non-variant), or \"c-var\" (complicated variant). We use a softmax layer with a cross-entropy loss function for these 4 units.\n",
    "\n",
    "## Training and testing data\n",
    "\n",
    "We take an NA12878 PacBio read dataset generated by WUSTL (https://www.ncbi.nlm.nih.gov//bioproject/PRJNA323611) and align it to GRCh38 with `bwa mem`. We train the NN on a SNP call set generated by the GIAB project (ftp://ftp-trace.ncbi.nlm.nih.gov:/giab/ftp/release/NA12878_HG001/NISTv3.3.2/GRCh38). Like all short-read-based call sets, there are many regions in the human genome where the short reads cannot be aligned properly to call SNPs. We only train on those high-confidence regions. However, with the PacBio read length and non-systematic random errors, once we have trained a variant caller, we should be able to apply it to some more difficult regions to call SNPs more comprehensively across a genome.\n",
    "\n",
    "As a proof of principle, we only train using the calls on chromosome 21 and test on chromosome 22.\n",
    "\n",
    "This simple work results from an exercise to get my feet wet learning neural networks beyond just knowing the theory. It also shows that a simple neural network, rather than a big one, can already help solve some simple but relatively interesting problems in genomics. \n",
    "\n",
    "\n",
    "July 13, 2017\n",
    "\n",
    "Jason Chin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "# for plotting to visualize some data\n",
    "%matplotlib inline\n",
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I have not written a python packaging script; load the NN classes from within the repository\n",
    "import sys\n",
    "from __future__ import print_function\n",
    "sys.path.append('../')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#import variantNet as vn\n",
    "import variantNet.utils as utils\n",
    "import variantNet.vn as vn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# useful for testing\n",
    "#reload(utils)\n",
    "#reload(vn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Shell script example to get testing data from BAM files and (stripped) VCF files.\n",
    "These scripts depend on `falcon_kit.FastaReader` for reading fasta files and `samtools` for parsing alignment records. \n",
\n", 88 | "\n", 89 | "```\n", 90 | "mkdir -p ./wd\n", 91 | "cd ./wd\n", 92 | "python ../helper_scripts/get_SNP_candidates.py --bam_file_fn ../testing_data/chr22/hg38.NA12878-WashU_chr22-18924717-49973797.bam \\\n", 93 | " --ref_fasta_fn ../testing_data/chr22/chr22.fa \\\n", 94 | " --pm_count_fn pm_counts_chr22 --ctg_nam chr22\n", 95 | "python ../helper_scripts/get_alignment_tensor.py --bam_file_fn ../testing_data/chr22/hg38.NA12878-WashU_chr22-18924717-49973797.bam \\\n", 96 | " --pm_count_fn pm_counts_chr22 \\\n", 97 | " --ref_fasta_fn ../testing_data/chr22/chr22.fa \\\n", 98 | " --ctg_name chr22 > aln_tensor_chr22\n", 99 | "python ../helper_scripts/get_variant_set.py < ../testing_data/chr22/chr22.vcf > variants_chr22\n", 100 | "```\n", 101 | "\n", 102 | "You can download the initial alignment files and vcf files from https://www.dropbox.com/s/fwyvu24hdxqv06i/VariantNet_testing_data.tgz and you can get a pre-generated dataset file for training and evaluation from https://www.dropbox.com/s/745m7mhzkj3rx28/VariantNet_wd.tgz. (Not sure how long I will be hosting those files though.)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Training using Chr21 variant calls" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# load the generate alignment \"tensors\" \n", 119 | "# we don't use the variats outside the regions defined in `CHROM21_v.3.3.2_highconf_noinconsistent.bed`\n", 120 | "\n", 121 | "Xarray, Yarray, pos_array = \\\n", 122 | "utils.get_training_array(\"../wd/aln_tensor_chr21\", \n", 123 | " \"../wd/variants_chr21\", \n", 124 | " \"../testing_data/chr21/CHROM21_v.3.3.2_highconf_noinconsistent.bed\" )" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 6, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "(53537, 15, 4, 3)\n", 137 | "(53537, 8)\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "print(Xarray.shape)\n", 143 | "print(Yarray.shape)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "" 155 | ] 156 | }, 157 | "execution_count": 7, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | }, 161 | { 162 | "data": { 163 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAATYAAABtCAYAAAArgk0pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAACEVJREFUeJzt3X+oX3Udx/Hnq7s7dE7uCjdbu8NJ\nyGKMdDLEGkXMjM3M9WdSYiTsn37MGIQrCPqnCYUWFIXoctBYyFwkw5VDByKpta1t7ofpqMy7rm0i\nTi1wP3r3xzmD2929fT+793zP5/Tp9YDL/X6/98v3vO453/u658f3fI4iAjOzkrwndwAzs6a52Mys\nOC42MyuOi83MiuNiM7PiuNjMrDitF5ukVZL+KOmYpHvanv5EJC2UtFvSEUmHJa3Lnek8SQOS/iBp\nR+4sAJLmSNom6UVJRyV9pAOZvl4vt0OStkq6JFOOTZJOSDo05rH3Sdol6eX6+3s7kOl79fI7KOmX\nkubkzjTmZ+slhaQrpjONVotN0gDwY2A1sAS4XdKSNjNM4iywPiKWADcCX+5ILoB1wNHcIcb4IfDr\niPgQcC2Zs0laAHwNWB4RS4EB4HOZ4jwMrBr32D3AkxFxDfBkfT93pl3A0oj4MPASsKEDmZC0EPgU\n8NfpTqDtNbYbgGMR8aeIOA38AljTcoYLRMRoROyrb79N9ce6IG8qkDQMfBp4MHcWAElDwMeBhwAi\n4nREvJk3FQAzgEslzQBmAX/LESIingbeGPfwGmBzfXsz8NncmSLiiYg4W999DhjOnal2P/ANYNpn\nDbRdbAuAV8fcH6EDBTKWpEXAMuD5vEkA+AHVgv5X7iC1q4GTwM/qzeMHJV2WM1BEHAe+T/VffhQ4\nFRFP5Mw0zpURMVrffg24MmeYCXwJ2Jk7hKQ1wPGIONDE6/ngwRiSZgOPAndHxFuZs9wKnIiIvTlz\njDMDuB74SUQsA/5B+5tW/6HeZ7WGqnQ/AFwm6Qs5M00mqvMXO3MOo6RvUe2G2ZI5xyzgm8C3m3rN\ntovtOLBwzP3h+rHsJA1SldqWiNieOw+wArhN0l+oNtlXSvp53kiMACMRcX5tdhtV0eX0SeDPEXEy\nIs4A24GPZs401t8lzQeov5/InAcASV8EbgU+H/lPGP8g1T+mA/X7fRjYJ+n9U33Btovt98A1kq6W\nNJNqJ+9jLWe4gCRR7Tc6GhH35c4DEBEbImI4IhZRzaenIiLrmkhEvAa8Kmlx/dBNwJGMkaDaBL1R\n0qx6Od5Etw62PAbcWd++E/hVxixA9ckEql0ct0XEP3PniYgXImJeRCyq3+8jwPX1+21KWi22eofl\nV4DfUL35HomIw21mmMQK4A6qtaL99dctuUN11FeBLZIOAtcB380Zpl573AbsA16gek8/kCOLpK3A\ns8BiSSOS7gLuBW6W9DLV2uW9Hcj0I+ByYFf9Xv9pBzI1O438a6FmZs3ywQMzK46LzcyK42Izs+K4\n2MysOC42MytOtmKTtDbXtCfjTGm6mAm6mcuZ0jSdKecaW+dmLs6UqouZoJu5nClNMcVmZtYXffmA\n7pAGYh6D//U5pzjHEAM9X2t06KpGMs0/9UrP57SdKcWZ06cYnDnU2vSg97zKMZ+aXH5NSfn9ciy/\nXtrO1OSyO8a7r0fE3F7Pm5EW7eLMY5D7B5p5U2/8WDNnx2zY2dyablOZuqqpedXkfGpy+TWl9PdB\nU5pcdp8591LvlsSbomZWIBebmRXHxWZmxUkqti5eWcrMbDI9i63DV5YyM5tQyhpbJ68sZWY2mZRi\n6/yVpczMxmrsc2z1uV5rAeb25+NxZmZJUtbYkq4sFREPRMTyiFje5qe/zczGSym2Tl5ZysxsMj23\nGSPirKTzV5YaADZ15MpSZmYTStoZFhGPA4/3OYuZWSN85oGZFcfFZmbFcbGZWXFcbGZWnL6MoDt7\nzuK4tmMDRG5cXfaggI0OpFn4vLI0Xfzb++2OT+yNiOW9nuc1NjMrjovNzIrjYjOz4rjYzKw4LjYz\nK07KCLqbJJ2QdKiNQGZm05WyxvYwsKrPOczMGtOz2CLiaeCNFrKYmTXC+9jMrDiNFZuktZL2SNpz\n5vSppl7WzOyiNVZsY4cGH5w51NTLmpldNG+KmllxUj7usRV4FlgsaUTSXf2PZWY2dSnXPLi9jSBm\nZk3xpqiZFcfFZmbFcbGZWXE6P4Kumdl5HkHXzP5vudjMrDguNjMrjovNzIrjYjOz4rjYzKw4KeeK\nLpS0W9IRSYclrWsjmJnZVPU8VxQ4C6yPiH2SLgf2StoVEUf6nM3MbEpShgYfjYh99e23gaPAgn4H\nMzObqovaxyZpEbAMeH6Cn3kEXTPrhORikzQbeBS4OyLeGv9zj6BrZl2RVGySBqlKbUtEbO9vJDOz\n6Uk5KirgIeBoRNzX/0hmZtOTssa2ArgDWClpf/11S59zmZlNWcrQ4M8AaiGLmVkjfOaBmRXHxWZm\nxXGxmVlxUk6psnE27FybO8IFNq7u3lDsTc6n0n+/LuriPE/lNTYzK46LzcyK42Izs+K42MysOC42\nMytOyrmil0j6naQD9Qi632kjmJnZVKV83ONdYGVEvFOP8vGMpJ0R8Vyfs5mZTUnKuaIBvFPfHay/\nop+hzMymI3U8tgFJ+4ETwK6IuGAEXTOzrkgqtog4FxHXAcPADZKWjn+OhwY3s664qKOiEfEmsBtY\nNcHPPDS4mXVCylHRuZLm1LcvBW4GXux3MDOzqUo5Kjof2CxpgKoIH4mIHf2NZWY2dSlHRQ9SXXLP\nzOx/gs88MLPiuNjMrDguNjMrjovNzIqj6oyphl9UOgm80uNpVwCvNz7x6XGmNF3MBN3M5UxpUjNd\nFRFzez2pL8WWQtKeiFieZeKTcKY0XcwE3czlTGmazuRNUTMrjovNzIqTs9i6eG0vZ0rTxUzQzVzO\nlKbRTNn2sZmZ9Ys3Rc2sOC42MyuOi83MiuNiM7PiuNjMrDj/BoyJoSA37bmWAAAAAElFTkSuQmCC\n", 164 | "text/plain": [ 165 | "" 166 | ] 167 | }, 168 | "metadata": {}, 169 | "output_type": "display_data" 170 | }, 171 | { 172 | "data": { 173 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAATYAAABtCAYAAAArgk0pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAACUFJREFUeJzt3WuMHWUdx/Hvb09b2nLtKffdhhJs\nMKRRIA1BSXwBYgoi9Y2JRAhGEt4ogiExoImJbwyJpmqi0RBASGxQUiASBKRBEkICaKlcSotCVKRY\nKPQs5aItbffvixnIWnZ7Znefc57hye+TbPZcJs/8cmb2v3M58x9FBGZmJRnJHcDMLDUXNjMrjgub\nmRXHhc3MiuPCZmbFcWEzs+IMvbBJWi3pr5JelHTdsOc/FUnLJD0saYuk5yRdnTvT+yR1JP1F0r25\nswBIOkrSeknPS9oq6VMtyPSterltlnS7pIWZctwiaYekzZNe60raIOmF+veSFmT6Yb38npF0t6Sj\ncmea9N61kkLS0XOZx1ALm6QO8HPgAuA04BJJpw0zwzT2AddGxGnA2cDXW5IL4Gpga+4Qk/wUeCAi\nPg58kszZJI0C3wRWRcRKoAN8OVOcW4HVB7x2HfBQRKwAHqqf5860AVgZEZ8A/gZc34JMSFoGfA74\n11xnMOwttrOAFyPi7xHxHvAbYM2QM3xIRGyPiE3147ep/lhH86YCSWPA54GbcmcBkHQk8BngZoCI\neC8i3sybCoB5wCJJ84DFwL9zhIiIR4DeAS+vAW6rH98GfDF3poh4MCL21U8fB8ZyZ6r9GPg2MOer\nBoZd2EaBlyc930YLCshkkpYDZwBP5E0CwE+oFvRE7iC1k4HXgV/Vu8c3STo0Z6CIeAX4EdV/+e3A\nroh4MGemAxwXEdvrx68Cx+UMM4WvAffnDiFpDfBKRDydYjyfPJhE0mHAncA1EfFW5iwXATsi4smc\nOQ4wDzgT+EVEnAG8y/B3rf5PfcxqDVXRPRE4VNKlOTNNJ6rrF1tzDaOk71IdhlmXOcdi4DvA91KN\nOezC9gqwbNLzsfq17CTNpypq6yLirtx5gHOAiyX9k2qX/VxJv84biW3Atoh4f2t2PVWhy+mzwD8i\n4vWI2AvcBXw6c6bJXpN0AkD9e0fmPABI+ipwEfCVyH/B+ClU/5iertf3MWCTpONnO+CwC9ufgRWS\nTpa0gOog7z1DzvAhkkR13GhrRKzNnQcgIq6PiLGIWE71Of0xIrJuiUTEq8DLkk6tXzoP2JIxElS7\noGdLWlwvx/No18mWe4DL68eXA7/LmAWovplAdYjj4oj4T+48EfFsRBwbEcvr9X0bcGa9vs3KUAtb\nfcDyG8AfqFa+OyLiuWFmmMY5wGVUW0VP1T8X5g7VUlcB6yQ9A5wO/CBnmHrrcT2wCXiWap2+MUcW\nSbcDjwGnStom6QrgBuB8SS9QbV3e0IJMPwMOBzbU6/ovW5Ap7Tzyb4WamaXlkwdmVhwXNjMrjgub\nmRXHhc3MiuPCZmbFyVbYJF2Za97TcaZm2pgJ2pnLmZpJnSnnFlvrPlycqak2ZoJ25nKmZoopbGZm\nAzGQL+gu6XZjdPTgnVDGez2WdLt9x+rE/iSZ9mle32maZpr3QceXuQmp7zQ7e+Ms7TboTZhwOYY6\nB32/19tJt7u07zhK2JRkz8SCvtPsGn+DI5ccvD/hgpE0y67S/zPv9Xp0G6xTSnRtfNB/nWqaaSTS\nLL8J9d9+appp8+bNb0TEMf2m6//XPgujo2Osv/v3ScY6fP94knF2jqTrFrN04rUk4+ztpGv02pnY\nm2ys3Z00nYjmx3tJxgF4aXealmEnLkp3DXqHdEVymEWkqUP2pbmMdM+8xUnGATjlYyteajKdd0XN\nrDgubGZWHBc2MytOo8LWxjtLmZlNp29ha/GdpczMptRki62Vd5YyM5tOk8LW+jtLmZlNluzkgaQr\nJW2UtHG8N9UtA83MhqNJYWt0Z6mIuDEiVkXEqibf3jczG5Qmha2Vd5YyM5tO30uqImKfpPfvLNUB\nbmnJnaXMzKbU6FrRiLgPuG/AWczMkvCVB2ZWHBc2MyuOC5uZFceFzcyKM5BGkyNMsJD/JhlrV+fg\nHVGbOoJdScaBdA0iUzYF3D1yRLKxFkaaZXfEb9cmGQdg5RcuTTLOLo5PMk5qqdaF+fv3JBkH0v3t\nLSRNw8qZ8BabmRXHhc3MiuPCZmbFcWEzs+K4sJlZcZp00L1F0g5Jm4cRyMxsrppssd0KrB5wDjOz\nZPoWtoh4BHDnSDP7yPAxNjMrzkBag+/sjaca1sxsxpIVtsmtwZd2l6Qa1sxsxrwrambFafJ1j9uB\nx4BTJW2TdMXgY5mZzV6Tex5cMowgZmapeFfUzIrjwmZmxXFhM7PiDKSDbqBkHUGTdd+MNMOkNBIT\nycZaoHSdUycS/b9790tXJRkHYCT2Jxon3WeesgNyqlx7O4ckGQfydL5NxVtsZlYcFzYzK44Lm5kV\nx4XNzIrjwmZmxXFhM7PiNLlWdJmkhyVtkfScpKuHEczMbLaafI9tH3BtRGySdDjwpKQNEbFlwNnM\nzGalSWvw7RGxqX78NrAVGB10MDOz2ZrRMTZJy4EzgCemeO+DDrq9nm+RYGb5NC5skg4D7gSuiYi3\nDnx/cgfdbrebMqOZ2Yw0KmyS5lMVtXURcddgI5mZzU2Ts6ICbga2RsTawUcyM5ubJlts5wCXAedK\neqr+uXDAuczMZq1Ja/BHAQ0hi5lZEr7ywMyK48JmZsVxYTOz4gykNbiIZK2OU7Vf7kzsTTIOwE6O\nTTLOwpHdScYBWEC61uCprH3wpGRjXXXBq8nGaqPdLEozUMIW+CnbzQ+bt9jMrDgubGZWHBc2MyuO\nC5uZFceFzcyK0+Ra0YWS/iTp6bqD7veHEczMbLaafN1jD3BuRLxTd/l4VNL9EfH4gLOZmc1Kk2tF\nA3infjq//kn4bRkzs7Sa9mPrSHoK2AFsiIgPddA1M2uLRoUtIvZHxOnAGHCWpJUHTjO5NfjO3njq\nnGZmjc3orGhEvAk8DKye4r0PWoMv7S5Jlc/MbMaanBU9RtJR9eNFwPnA84MOZmY2W03Oip4A3Cap\nQ1UI74iIewcby8xs9pqcFX2G6pZ7ZmYfCb7ywMyK48JmZsVxYTOz4riwmVlxVF0xlXhQ6XXgpT6T\nHQ28kXzmc+NMzbQxE7QzlzM10zTTSRFxTL+JBlLYmpC0MSJWZZn5NJypmTZmgnbmcqZmUmfyrqiZ\nFceFzcyKk7Ow3Zhx3tNxpmbamAnamcuZmkmaKdsxNjOzQfGuqJkVx4XNzIrjwmZmxXFhM7PiuLCZ\nWXH+Bw5/xjbcspCtAAAAAElFTkSuQmCC\n", 174 | "text/plain": [ 175 | "" 176 | ] 177 | }, 178 | "metadata": {}, 179 | 
"output_type": "display_data" 180 | }, 181 | { 182 | "data": { 183 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAATYAAABtCAYAAAArgk0pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAACUFJREFUeJzt3WuMHWUdx/Hvb09b2nLtKffdhhJs\nMKRRIA1BSXwBYgoi9Y2JRAhGEt4ogiExoImJbwyJpmqi0RBASGxQUiASBKRBEkICaKlcSotCVKRY\nKPQs5aItbffvixnIWnZ7Znefc57hye+TbPZcJs/8cmb2v3M58x9FBGZmJRnJHcDMLDUXNjMrjgub\nmRXHhc3MiuPCZmbFcWEzs+IMvbBJWi3pr5JelHTdsOc/FUnLJD0saYuk5yRdnTvT+yR1JP1F0r25\nswBIOkrSeknPS9oq6VMtyPSterltlnS7pIWZctwiaYekzZNe60raIOmF+veSFmT6Yb38npF0t6Sj\ncmea9N61kkLS0XOZx1ALm6QO8HPgAuA04BJJpw0zwzT2AddGxGnA2cDXW5IL4Gpga+4Qk/wUeCAi\nPg58kszZJI0C3wRWRcRKoAN8OVOcW4HVB7x2HfBQRKwAHqqf5860AVgZEZ8A/gZc34JMSFoGfA74\n11xnMOwttrOAFyPi7xHxHvAbYM2QM3xIRGyPiE3147ep/lhH86YCSWPA54GbcmcBkHQk8BngZoCI\neC8i3sybCoB5wCJJ84DFwL9zhIiIR4DeAS+vAW6rH98GfDF3poh4MCL21U8fB8ZyZ6r9GPg2MOer\nBoZd2EaBlyc930YLCshkkpYDZwBP5E0CwE+oFvRE7iC1k4HXgV/Vu8c3STo0Z6CIeAX4EdV/+e3A\nroh4MGemAxwXEdvrx68Cx+UMM4WvAffnDiFpDfBKRDydYjyfPJhE0mHAncA1EfFW5iwXATsi4smc\nOQ4wDzgT+EVEnAG8y/B3rf5PfcxqDVXRPRE4VNKlOTNNJ6rrF1tzDaOk71IdhlmXOcdi4DvA91KN\nOezC9gqwbNLzsfq17CTNpypq6yLirtx5gHOAiyX9k2qX/VxJv84biW3Atoh4f2t2PVWhy+mzwD8i\n4vWI2AvcBXw6c6bJXpN0AkD9e0fmPABI+ipwEfCVyH/B+ClU/5iertf3MWCTpONnO+CwC9ufgRWS\nTpa0gOog7z1DzvAhkkR13GhrRKzNnQcgIq6PiLGIWE71Of0xIrJuiUTEq8DLkk6tXzoP2JIxElS7\noGdLWlwvx/No18mWe4DL68eXA7/LmAWovplAdYjj4oj4T+48EfFsRBwbEcvr9X0bcGa9vs3KUAtb\nfcDyG8AfqFa+OyLiuWFmmMY5wGVUW0VP1T8X5g7VUlcB6yQ9A5wO/CBnmHrrcT2wCXiWap2+MUcW\nSbcDjwGnStom6QrgBuB8SS9QbV3e0IJMPwMOBzbU6/ovW5Ap7Tzyb4WamaXlkwdmVhwXNjMrjgub\nmRXHhc3MiuPCZmbFyVbYJF2Za97TcaZm2pgJ2pnLmZpJnSnnFlvrPlycqak2ZoJ25nKmZoopbGZm\nAzGQL+gu6XZjdPTgnVDGez2WdLt9x+rE/iSZ9mle32maZpr3QceXuQmp7zQ7e+Ms7TboTZhwOYY6\nB32/19tJt7u07zhK2JRkz8SCvtPsGn+DI5ccvD/hgpE0y67S/zPv9Xp0G6xTSnRtfNB/nWqaaSTS\nLL8J9d9+appp8+bNb0TEMf2m6//XPgujo2Osv/v3ScY6fP94knF2jqTrFrN04rUk4+ztpGv02pnY\nm2ys3Z00nYjmx3tJxgF4aXealmEnLkp3DXqHdEVymEWkqUP2pbmMdM+8xUnGATjlYyteajKdd0XN\nrDgubGZWHBc2MytOo8LWxjtLmZlNp29ha/GdpczMptRki62Vd5YyM5tOk8LW+jtLmZlNluzkgaQr\nJW2UtHG8N9UtA83MhqNJYWt0Z6mIuDEiVkXEqibf3jczG5Qmha2Vd5YyM5tO30uqImKfpPfvLNUB\nbmnJnaXMzKbU6FrRiLgPuG/AWczMkvCVB2ZWHBc2MyuOC5uZFceFzcyKM5BGkyNMsJD/JhlrV+fg\nHVGbOoJdScaBdA0iUzYF3D1yRLKxFkaaZXfEb9cmGQdg5RcuTTLOLo5PMk5qqdaF+fv3JBkH0v3t\nLSRNw8qZ8BabmRXHhc3MiuPCZmbFcWEzs+K4sJlZcZp00L1F0g5Jm4cRyMxsrppssd0KrB5wDjOz\nZPoWtoh4BHDnSDP7yPAxNjMrzkBag+/sjaca1sxsxpIVtsmtwZd2l6Qa1sxsxrwrambFafJ1j9uB\nx4BTJW2TdMXgY5mZzV6Tex5cMowgZmapeFfUzIrjwmZmxXFhM7PiDKSDbqBkHUGTdd+MNMOkNBIT\nycZaoHSdUycS/b9790tXJRkHYCT2Jxon3WeesgNyqlx7O4ckGQfydL5NxVtsZlYcFzYzK44Lm5kV\nx4XNzIrjwmZmxXFhM7PiNLlWdJmkhyVtkfScpKuHEczMbLaafI9tH3BtRGySdDjwpKQNEbFlwNnM\nzGalSWvw7RGxqX78NrAVGB10MDOz2ZrRMTZJy4EzgCemeO+DDrq9nm+RYGb5NC5skg4D7gSuiYi3\nDnx/cgfdbrebMqOZ2Yw0KmyS5lMVtXURcddgI5mZzU2Ts6ICbga2RsTawUcyM5ubJlts5wCXAedK\neqr+uXDAuczMZq1Ja/BHAQ0hi5lZEr7ywMyK48JmZsVxYTOz4gykNbiIZK2OU7Vf7kzsTTIOwE6O\nTTLOwpHdScYBWEC61uCprH3wpGRjXXXBq8nGaqPdLEozUMIW+CnbzQ+bt9jMrDgubGZWHBc2MyuO\nC5uZFceFzcyK0+Ra0YWS/iTp6bqD7veHEczMbLaafN1jD3BuRLxTd/l4VNL9EfH4gLOZmc1Kk2tF\nA3infjq//kn4bRkzs7Sa9mPrSHoK2AFsiIgPddA1M2uLRoUtIvZHxOnAGHCWpJUHTjO5NfjO3njq\nnGZmjc3orGhEvAk8DKye4r0PWoMv7S5Jlc/MbMaanBU9RtJR9eNFwPnA84MOZmY2W03Oip4A3Cap\nQ1UI74iIewcby8xs9pqcFX2G6pZ7ZmYfCb7ywMyK48JmZsVxYTOz4riwmVlxVF0xlXhQ6XXgpT6T\nHQ28kXzmc+NMzbQxE7QzlzM10zTTSRFxTL+JBlLYmpC0MSJWZZn5NJypmTZmgnbmcqZmUmfyrqiZ\nFceFzcyKk7Ow3Zhx3tNxpmbamAnamcuZmkmaKdsxNjOzQfGuqJkVx4XNzIrjwmZmxXFhM7PiuLCZ\nWXH+Bw5/xjbcspCtAAAAAElFTkSuQmCC\n", 
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# here we show what the three matrices look like for a candidate site\n",
    "\n",
    "i = 999\n",
    "figure(figsize=(5, 2))\n",
    "matshow(Xarray[i,:,:,0].transpose(), vmin=0, vmax=50, cmap=cm.coolwarm, fignum=0)\n",
    "figure(figsize=(5, 2))\n",
    "matshow(Xarray[i,:,:,1].transpose(), vmin=-50, vmax=50, cmap=cm.coolwarm, fignum=0)\n",
    "figure(figsize=(5, 2))\n",
    "matshow(Xarray[i,:,:,2].transpose(), vmin=-50, vmax=50, cmap=cm.coolwarm, fignum=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "mkdir -p ../wd/parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From ../variantNet/vn.py:84: softmax_cross_entropy_with_logits (from tensorflow.python.ops.nn_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "\n",
      "Future major versions of TensorFlow will allow gradients to flow\n",
      "into the labels input on backprop by default.\n",
      "\n",
      "See tf.nn.softmax_cross_entropy_with_logits_v2.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# create a VariantNet\n",
    "vnn = vn.VariantNet()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialize the parameters\n",
    "vnn.init()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 train loss: 3.72539306640625 validation loss 2.8897869140625\n",
      "60 train loss: 0.8873594970703125 validation loss 0.92711123046875\n",
      "120 train loss: 0.6507681884765625 validation loss 0.697503271484375\n",
      "180 train loss: 0.549986328125 validation loss 0.58614619140625\n",
      "240 train loss: 0.400969482421875 validation loss 0.501640966796875\n",
      "300 train loss: 0.38352987670898436 validation loss 0.447988525390625\n",
      "360 train loss: 0.3977647399902344 validation loss 0.3904426513671875\n",
      "420 train loss: 0.3853101806640625 validation loss 0.3850858154296875\n",
      "480 train loss: 0.30514312744140626 validation loss 0.3599074462890625\n",
      "540 train loss: 0.3052677612304687 validation loss 0.3359606201171875\n",
      "600 train loss: 0.299947021484375 validation loss 0.3302803955078125\n",
      "660 train loss: 0.30207510375976565 validation loss 0.316009033203125\n",
      "720 train loss: 0.2679033203125 validation loss 0.3506486328125\n",
      "780 train loss: 0.23057183837890624 validation loss 0.3141190185546875\n",
      "840 train loss: 0.268615234375 validation loss 0.303251318359375\n",
      "900 train loss: 0.25584968566894534 validation loss 0.2866251220703125\n",
      "960 train loss: 0.24577947998046876 validation loss 0.291543994140625\n",
      "1020 train loss: 0.21327096557617187 validation loss 0.2886482421875\n",
      "1080 train loss: 0.21391114807128905 validation loss 0.284293701171875\n",
      "1140 train loss: 0.23039756774902342 validation loss 0.281445751953125\n",
      "1200 train loss: 0.20374896240234375 validation loss 0.281747119140625\n",
      "1260 train loss: 0.21570712280273438 validation loss 0.2784237548828125\n",
      "1320 train loss: 0.1649161834716797 validation loss 0.280825830078125\n",
      "1380 train loss: 0.2258702392578125 validation loss 0.2849654541015625\n",
      "1440 train loss: 0.21041897583007813 validation loss 0.2756542724609375\n",
      "1500 train loss: 0.21202040100097655 validation loss 0.2837438232421875\n",
      "1560 train loss: 0.162168701171875 validation loss 0.2728369140625\n",
      "1620 train loss: 0.16351847839355468 validation loss 0.282944921875\n",
      "1680 train loss: 0.17375711059570312 validation loss 0.28183955078125\n",
      "1740 train loss: 0.19061044311523437 validation loss 0.2737459228515625\n",
      "1800 train loss: 0.22648509216308593 validation loss 0.2860446044921875\n",
      "1860 train loss: 0.18516549682617187 validation loss 0.297446142578125\n",
      "1920 train loss: 0.19757418823242187 validation loss 0.281408984375\n",
      "1980 train loss: 0.2553403015136719 validation loss 0.29214912109375\n",
      "2040 train loss: 0.17660171508789063 validation loss 0.2930916015625\n",
      "2100 train loss: 0.18530152893066407 validation loss 0.299178173828125\n",
      "2160 train loss: 0.1738466796875 validation loss 0.293308349609375\n",
      "2220 train loss: 0.14881591796875 validation loss 0.2928556640625\n",
      "2280 train loss: 0.16735308837890625 validation loss 0.29966865234375\n",
      "2340 train loss: 0.14085601806640624 validation loss 0.31744697265625\n",
      "2400 train loss: 0.14231411743164063 validation loss 0.2919728271484375\n"
     ]
    }
   ],
   "source": [
    "# train and save the parameters; we train on the first 30000 SNP sites and validate on another 10000 SNP sites\n",
    "batch_size = 500\n",
    "validation_loss = []\n",
    "for i in range(2401):\n",
    "    Xbatch, Ybatch = utils.get_batch(Xarray[:30000], Yarray[:30000], size=batch_size)\n",
    "    loss = vnn.train(Xbatch, Ybatch)\n",
    "    if i % (len(Xarray[:30000])/batch_size) == 0:\n",
    "        v_loss = vnn.get_loss( Xarray[30000:40000], Yarray[30000:40000] )\n",
    "        print(i, \"train loss:\", loss/batch_size, \"validation loss\", v_loss/10000)\n",
    "        vnn.save_parameters('../wd/parameters/vn.params-%04d' % i)\n",
    "        validation_loss.append( (v_loss, i) )\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1560\n",
      "INFO:tensorflow:Restoring parameters from ../wd/parameters/vn.params-1560\n"
     ]
    }
   ],
   "source": [
    "# pick the parameter set with the smallest validation loss\n",
    "\n",
    "validation_loss.sort()\n",
    "i = validation_loss[0][1]\n",
    "print(i)\n",
    "vnn.restore_parameters('../wd/parameters/vn.params-%04d' % i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Testing using Chr22 variant calls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "Xarray2, Yarray2, pos_array2 = \\\n",
    "utils.get_training_array(\"../wd/aln_tensor_chr22\", \n",
    "                         \"../wd/variants_chr22\", \n",
\"../testing_data/chr22/CHROM22_v.3.3.2_highconf_noinconsistent.bed\" )" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 14, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "base, t = vnn.predict(Xarray2)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 15, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "" 377 | ] 378 | }, 379 | "execution_count": 15, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | }, 383 | { 384 | "data": { 385 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAABPCAYAAACEawagAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAADyhJREFUeJzt3X/sJPVZwPH30+/B0TtSr3C0Anf0\naMEq0hbIpaUpMQgq0BLwj2rBmla0OZs0CgRioE2sJhpjJMUaFXOWQo0IxQOVkFQKlQbRAL3jZ8uv\nXlsKhxQOpBSO5IDz8Y8djuVu9r4z7MzuzO77lRC+Ozs/np1nPrPzuZ3PM5GZSJIkSZIm703TDkCS\nJEmS5pUdMkmSJEmaEjtkkiRJkjQldsgkSZIkaUrskEmSJEnSlNghkyRJkqQpaa1DFhEnR8RDEbE5\nIi5oazsaX0SsjoibI+L+iPhORJxdTN8vIm6MiO8W/3/rtGPVaBGxEBF3RcT1xetDI+L2og1+NSL2\nnnaMKhcRKyJiQ0Q8GBEPRMQHbX/9EBHnFufNb0fElRGxj22vuyLiyxHxVER8e2haaVuLgb8q8nhv\nRBwzvcgFI/P3F8W5896I+JeIWDH03oVF/h6KiJOmE7WgPHdD750XERkRK4vXc9f2WumQRcQC8DfA\nKcARwJkRcUQb21IjXgHOy8wjgGOBzxT5ugD4RmYeDnyjeK3uOht4YOj1nwMXZ+ZhwLPA70wlKlXx\nReDfM/NngfcxyKPtr+Mi4mDg94G1mXkksACcgW2vyy4HTt5l2qi2dgpwePHfOuCSCcWo0S5n9/zd\nCByZme8FHgYuBCiuY84Afr5Y5m+L61NNx+XsnjsiYjXwK8CjQ5Pnru219QvZ+4HNmfn9zHwJuAo4\nvaVtaUyZ+URm3ln8/TyDi8GDGeTsK8VsXwF+dToRajERsQr4CPCl4nUAJwAbilnMX0dFxE8BvwBc\nCpCZL2Xmj7H99cUS4M0RsQRYBjyBba+zMvMW4H93mTyqrZ0O/EMO3AasiIgDJxOpypTlLzO/npmv\nFC9vA1YVf58OXJWZ2zPzB8BmBtenmoIRbQ/gYuAPgByaNndtr60O2cHAY0OvtxTT1HERsQY4Grgd\neHtmPlG89SPg7VMKS4v7SwYntP8rXu8P/HjoS8o22F2HAluBy4pbTr8UEcux/XVeZj4OXMTgX3af\nAJ4DNmHb65tRbc1rmf75beBrxd/mr+Mi4nTg8cy8Z5e35i53FvXQThGxL3ANcE5m/mT4vcxMXv+v\nF+qIiDgVeCozN007Fr0hS4BjgEsy82hgG7vcnmj766ZirNHpDDrVBwHLKbklR/1hW+uviPgcgyEY\nV0w7Fi0uIpYBnwX+cNqxdEFbHbLHgdVDr1cV09RREbEXg87YFZl5bTH5yVd/Ii7+/9S04tMefQg4\nLSIeYXB78AkMxiStKG6jAttgl20BtmTm7cXrDQw6aLa/7vsl4AeZuTUzXwauZdAebXv9MqqteS3T\nExHxW8CpwMeLTjWYv657F4N/zLqnuH5ZBdwZET/NHOaurQ7Zt4DDi0pTezMYVHldS9vSmIrxRpcC\nD2TmF4beug74ZPH3J4F/m3RsWlxmXpiZqzJzDYO29h+Z+XHgZuCjxWzmr6My80fAYxHx7mLSicD9\n2P764FHg2IhYVpxHX82dba9fRrW164BPFBXfjgWeG7q1UR0RESczuGX/tMx8ceit64AzImJpRBzK\noEDEHdOIUbvLzPsy822Zuaa4ftkCHFN8J85d24vX/iGh4RVHfJjBuJYF4MuZ+aetbEhji4jjgP8E\n7uO1MUifZTCO7GrgEOCHwK9nZtmATHVERBwPnJ+Zp0bEOxn8YrYfcBfwm5m5fZrxqVxEHMWgIMve\nwPeBsxj8g5ntr+Mi4o+BjzG4Veou4FMMxjrY9jooIq4EjgdWAk8Cnwf+lZK2VnSy/5rBbagvAmdl\n5sZpxK2BEfm7EFgKPFPMdltmfrqY/3MMxpW9wmA4xtd2Xacmoyx3mXnp0PuPMKhY+/Q8tr3WOmSS\nJEmSpD2zqIckSZIkTYkdMkmSJEmaEjtkkiRJkjQldsgkSZIkaUrskEmSJEnSlLTaIYuIdW2uX+0y\nf/1m/vrL3PWb+esvc9dv5q/f5jl/bf9CNrc7dkaYv34zf/1l7vrN/PWXues389dvc5u/Sh2yiDg5\nIh6KiM0RcUHbQUmSJEnSPFj0wdARsQA8DPwysAX4FnBmZt4/apm9Y2nuw3JeZjt7sbTJeFvzM+99\ncbdpD9+7rPLy21cvL53+nrduLZ1eZ93jxvZGVclf2ede+ti20nmb+Byj9nOZUXFM0qh424pteHs7\nXtjGwr7LG9lenf0+ShfyUVeX215b6hyzTZz3mthe1eWbWsdi3mj+2tr39z17QK046myvzj4qa09N\nxFZHWQzw2rG5a+7GPV7q5KnN78M62jpXL7bvmzCcvybaU1ttYVY0kdPhdWx9ZgcH7L9Qex1l6raP\nOvmrc+37PM8+nZmLnuiqdMg+CPxRZp5UvL4QIDP/bNQyb4n98gNx4mLb7pQb/ufu3aaddNBRlZff\nfPGxpdO/97G/K51eZ93jxtamss992Lm3lc7bxOcYtZ/LjIpjkkbF21ZsbW2vzn4fpQv5qKvLba8t\ndY6hJs57TWyv6vJNraMtbe37d33107XiqLO9OvuorD01EVsdZTHA6HY97vFSJ09tfh/W0dZxX3ff\nj6uJ9tRWW5gVTeS0reOibvuok78617435YZNmbl2sXVWuWXxYOCxoddbimmvExHrImJjRGx8me0V\nVitJkiRJ862xoh6ZuT4z12bm2r7cpihJkiRJ01SlQ/Y4sHro9apimiRJkiRpDFXGkF0GfALYDLyH\nQ
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "<base64-encoded PNG image omitted>"
"iVBORw0KGgoAAAANSUhEUgAAA2QAAABPCAYAAACEawagAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAH8pJREFUeJzt3Xl8VeW1N/DfOicT8xQESQIkEBAE\nK5Qqtfattd6KlYJ9b+XiS+ssVVrFqb1SW623tbbO19bSF6WIdaBe9FrloihOaKuWQRQFAiGQkDDL\nIGOms+4f5yjRtR44pyQkx/y+n48fycpz9nnOfoa9d85+1hZVBRERERERER19keauABERERERUWvF\nCzIiIiIiIqJmwgsyIiIiIiKiZsILMiIiIiIiombCCzIiIiIiIqJmwgsyIiIiIiKiZtJkF2QiMkpE\nSkSkVERuaKr3oSMnIgUi8oqILBeRD0RkciLeVUReFJHVif93ae66UpiIREXkHRGZk/i5UETeTozB\nv4hIVnPXkXwi0llEZovIShFZISJf5vhLDyJyTWLefF9EHheRHI69lktE/iQiW0Tk/QYxd6xJ3H2J\ndnxPRIY3X80JCLbfHYm58z0R+W8R6dzgd1MS7VciImc2T60J8Nuuwe+uExEVkdzEz61u7DXJBZmI\nRAHcD+AsAIMBnCcig5vivahR1AG4TlUHAxgJ4IeJ9roBwEuqWgzgpcTP1HJNBrCiwc+/BXCPqvYH\nsAPAJc1SK0rGfwJ4XlWPA/AFxNuR46+FE5E8AFcBGKGqQwBEAYwHx15L9hCAUZ+JhcbaWQCKE/9N\nBDD1KNWRwh6Cbb8XAQxR1RMArAIwBQAS5zHjARyfeM0fEuen1Dwegm07iEgBgG8CqGgQbnVjr6m+\nITsJQKmqlqlqDYBZAMY20XvREVLVjaq6JPHv3YifDOYh3mYzE8VmAjineWpIhyMi+QDOBvBg4mcB\ncDqA2YkibL8WSkQ6Afg/AKYDgKrWqOpOcPyliwwAbUQkA0BbABvBsddiqeoCANs/Ew6NtbEAHta4\ntwB0FpFjj05NyeO1n6q+oKp1iR/fApCf+PdYALNUtVpV1wIoRfz8lJpBYOwBwD0AfgJAG8Ra3dhr\nqguyPADrG/xcmYhRCycifQEMA/A2gB6qujHxq00AejRTtejw7kV8Qoslfu4GYGeDgxTHYMtVCGAr\ngBmJW04fFJF24Phr8VS1CsCdiP9ldyOAXQAWg2Mv3YTGGs9l0s/FAJ5L/Jvt18KJyFgAVar67md+\n1erajkk96BMi0h7AkwCuVtWPGv5OVRWf/usFtRAiMhrAFlVd3Nx1oX9KBoDhAKaq6jAAe/GZ2xM5\n/lqmxFqjsYhfVPcC0A7OLTmUPjjW0peI3Ij4EoxHm7sudHgi0hbATwHc1Nx1aQma6oKsCkBBg5/z\nEzFqoUQkE/GLsUdV9alEePPHXxEn/r+luepHh/QVAGNEZB3itwefjviapM6J26gAjsGWrBJApaq+\nnfh5NuIXaBx/Ld8ZANaq6lZVrQXwFOLjkWMvvYTGGs9l0oSIXAhgNIAJiYtqgO3X0vVD/I9Z7ybO\nX/IBLBGRnmiFbddUF2QLARQnMk1lIb6o8pkmei86Qon1RtMBrFDVuxv86hkAFyT+fQGAvx7tutHh\nqeoUVc1X1b6Ij7WXVXUCgFcAfDdRjO3XQqnqJgDrRWRgIvQNAMvB8ZcOKgCMFJG2iXn047bj2Esv\nobH2DIDzExnfRgLY1eDWRmohRGQU4rfsj1HVfQ1+9QyA8SKSLSKFiCeI+Edz1JEsVV2mqseoat/E\n+UslgOGJY2KrG3ty8A8JjbxhkW8hvq4lCuBPqnprk7wRHTERORXA6wCW4eAapJ8ivo7sCQC9AZQD\nGKeq3oJMaiFE5DQA16vqaBEpQvwbs64A3gHwPVWtbs76kU9ETkQ8IUsWgDIAFyH+BzOOvxZORG4B\n8G+I3yr1DoBLEV/rwLHXAonI4wBOA5ALYDOAmwE8DWesJS6yf4/4baj7AFykqouao94UF2i/KQCy\nAXyYKPaWql6eKH8j4uvK6hBfjvHcZ7dJR4fXdqo6vcHv1yGesXZbaxx7TXZBRkRERERERIfGpB5E\nRERERETNhBdkREREREREzYQXZERERERERM2EF2RERERERETNhBdkREREREREzaRJL8hEZGJTbp+a\nFtsvvbH90hfbLr2x/dIX2y69sf3SW2tuv6b+hqzV7tjPCbZfemP7pS+2XXpj+6Uvtl16Y/ult1bb\nfkldkInIKBEpEZFSEbmhqStFRERERETUGhz2wdAiEgWwCsC/AKgEsBDAeaq6PPSaLMnRHGmHWj2A\nTMk5+Iuj+BBqycr0f1Fb54b7Dd1jYqXvtUv6/arz/bKDu25242vea2+DIm7ZvCG2blXvO68HUN+1\nrYlFP9zrlpWIfz2uGgMA1Go1MiXbLfOxmp72c2fs99u5uPcWEytb1iFQB38btc77xTL8uuVsPGC3\nW1/vFw7se6/PSlaWX7SmxsSqe/v9IvvDmN1utX09AGi9LRuqW3Xvg+1fv2cvou3j759dsc++X6D9\nEY2a0IEe/meWwO7UHFvnnPLqQGH7OVJ+YL3TfsH+neN8lr373bLFJ9ixs3qZP/aCUvksiY/RcOwJ\n/L5Z18WOdQCIbrd1Do91pw/18rebvcH2oZqeftnQn/qydth+Ud3NL5y93n6O6r7+++VscjpizN/v\n1d0DxwanePYmv19ozBm/mQe3WxPbj6xIm3jZ2lp3Gw3Lf+zAsf44y66w+6Kmlz+39Om61cQqtnV3\ny8ba+HOLN1are7bx67Y5+XnWG0/v7zzGLRvxp0Nkbbb9MDhfOPNCf+dYDxw83teiGpk4eNxz97Mz\nvwFAVpntL6Fzg2M67TKxXR8EDmYBkmOPz9Xd/G2oM8xC83fWhhTOGQL73msTr/0BYLVzriUZ9jgE\nAAgcD9sOisf37ahG2y7x/bJjhz9XZ2106tHe79/YY9u0Js9v06wq//Olwv3cgbns43M1E29v58mI\n8zkAd9pL6ZjVbrBfdt+KwDGnje2zuUUHx8Lu7XXo0DXeh7e9f+jzz09xDpP1Xf12qgs0dXaVnVtC\n+6Kuu3Puu9Vv/93YsU1V/Um44esPVwDASQBKVbUMAERkFoCxAIIXZDnSDiMzR5m41gZm2CaQ0avA\njcc224MVADz9/OsmNibvS0m/35qrv+zGXzvvHjf+rwW2vGT4Jwq3PvuGid143Ffdstu/PdzEusx8\n0y0baet3Vq0OnDg7yn9wkol1X+pf9D57/3+a2Pj+X3fLxgJ12HCB3W8HcgMXgL+2XbR+pz0IAoBk\n+wPfu8jKyO/tlq1bW25iq2842S3b/xF7EpOxar1bNrbHH+Re3Vbd6PfZAVcsNLFIe/9gFenU0cRK\nJvufOWuXf7FQM8hO/sWXlbhl4Zzcxg7Y/XMokmlPZCPt
457 | {
458 | "cell_type": "code",
459 | "execution_count": 17,
460 | "metadata": {},
461 | "outputs": [],
462 | "source": [
463 | "from collections import Counter"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 18,
469 | "metadata": {},
470 | "outputs": [
471 | {
472 | "name": "stdout",
473 | "output_type": "stream",
474 | "text": [
475 | "0 \t20958 \t283 \t287 \t281 \t\n",
476 | "1 \t35 \t9607 \t4 \t45 \t\n",
477 | "2 \t294 \t8 \t11846 \t461 \t\n",
478 | "3 \t667 \t363 \t247 \t1848 \t\n"
479 | ]
480 | }
481 | ],
482 | "source": [
483 | "# confusion matrix: each row = annotated class label, then counts of predicted classes 0-3\n",
484 | "for i in range(4):\n",
485 | "    cnt = Counter(evaluation_data[evaluation_data[:,2]==i,1])\n",
486 | "    print(i, \"\\t\", end=\"\")\n",
487 | "    for j in range(4):\n",
488 | "        print(cnt.get(j, 0), \"\\t\", end=\"\")\n",
489 | "    print()"
490 | ]
491 | },
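492 | {
493 | "cell_type": "markdown",
494 | "metadata": {},
495 | "source": [
496 | "In the summary below, class 0 = het, class 1 = hom, and class 2 = non-variant. For a class $c$, with $\\hat{y}$ the predicted class and $y$ the annotated class at each candidate site, the strict per-class metrics are\n",
497 | "\n",
498 | "$$\\mathrm{recall}(c)=\\frac{\\#\\{\\hat{y}=c,\\ y=c\\}}{\\#\\{y=c\\}}, \\qquad \\mathrm{PPV}(c)=\\frac{\\#\\{\\hat{y}=c,\\ y=c\\}}{\\#\\{\\hat{y}=c\\}}$$\n",
499 | "\n",
500 | "The \"regardless of called variant type\" versions only require that some variant be called: they relax $\\hat{y}=c$ (in the recall numerator) or $y=c$ (in the PPV numerator) to $\\ne 2$."
501 | ]
502 | },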
503 | {
504 | "cell_type": "code",
505 | "execution_count": 19,
506 | "metadata": {},
507 | "outputs": [
508 | {
509 | "name": "stdout",
510 | "output_type": "stream",
511 | "text": [
512 | "Recall rate for het-call (regardless of called variant type): 0.986840295290935\n",
513 | "Recall rate for het-call (called variant type = het): 0.960979412169288\n",
514 | "PPV for het-call (regardless of called variant type): 0.9866083629406942\n",
515 | "PPV for het-call (called variant type = het): 0.9546324132276578\n",
516 | "Recall rate for hom-call (regardless of called variant type): 0.9995872458982561\n",
517 | "Recall rate for hom-call (called variant type = hom): 0.9913321638633784\n",
518 | "PPV for hom-call (regardless of called variant type): 0.99922034889387\n",
519 | "PPV for hom-call (called variant type = hom): 0.936263522073872\n",
520 | "Recall rate for all calls: 0.9844620938628159\n",
521 | "PPV for all calls: 0.9781061692969871\n"
522 | ]
523 | }
524 | ],
525 | "source": [
526 | "ed = evaluation_data\n",
527 | "print(\"Recall rate for het-call (regardless of called variant type):\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]==0))/sum(ed[:,2]==0))\n",
528 | "print(\"Recall rate for het-call (called variant type = het):\", 1.0*sum((ed[:,1]==0) & (ed[:,2]==0))/sum(ed[:,2]==0))\n",
529 | "print(\"PPV for het-call (regardless of called variant type):\", 1.0*sum((ed[:,1]==0) & (ed[:,2]!=2))/sum(ed[:,1]==0))\n",
530 | "print(\"PPV for het-call (called variant type = het):\", 1.0*sum((ed[:,1]==0) & (ed[:,2]==0))/sum(ed[:,1]==0))\n",
531 | "print(\"Recall rate for hom-call (regardless of called variant type):\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]==1))/sum(ed[:,2]==1))\n",
532 | "print(\"Recall rate for hom-call (called variant type = hom):\", 1.0*sum((ed[:,1]==1) & (ed[:,2]==1))/sum(ed[:,2]==1))\n",
533 | "print(\"PPV for hom-call (regardless of called variant type):\", 1.0*sum((ed[:,1]==1) & (ed[:,2]!=2))/sum(ed[:,1]==1))\n",
534 | "print(\"PPV for hom-call (called variant type = hom):\", 1.0*sum((ed[:,1]==1) & (ed[:,2]==1))/sum(ed[:,1]==1))\n",
535 | "print(\"Recall rate for all calls:\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]!=2))/sum(ed[:,2]!=2))\n",
536 | "print(\"PPV for all calls:\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]!=2))/sum(ed[:,1]!=2))"
537 | ]
538 | },
"Recall rate for all calls: 0.9844620938628159\n", 500 | "PPV for all calls: 0.9781061692969871\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "ed = evaluation_data\n", 506 | "print(\"Recall rate for het-call (regardless called variant types):\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]==0))/sum(ed[:,2]==0))\n", 507 | "print(\"Recall rate for het-call (called variant type = het):\", 1.0*sum((ed[:,1]==0) & (ed[:,2]==0))/sum(ed[:,2]==0))\n", 508 | "print\n", 509 | "print(\"PPV for het-call (regardless called variant types):\", 1.0*sum((ed[:,1]==0) & (ed[:,2]!=2))/sum(ed[:,1]==0))\n", 510 | "print(\"PPV for het-call (called variant type = het):\", 1.0*sum((ed[:,1]==0) & (ed[:,2]==0))/sum(ed[:,1]==0))\n", 511 | "print\n", 512 | "print(\"Recall rate for hom-call (regardless called variant types):\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]==1))/sum(ed[:,2]==1))\n", 513 | "print(\"Recall rate for hom-call (called variant type = hom):\", 1.0*sum((ed[:,1]==1) & (ed[:,2]==1))/sum(ed[:,2]==1))\n", 514 | "print\n", 515 | "print(\"PPV for hom-call (regardless called variant types):\", 1.0*sum((ed[:,1]==1) & (ed[:,2]!=2))/sum(ed[:,1]==1))\n", 516 | "print(\"PPV for hom-call (called variant type = hom):\", 1.0*sum((ed[:,1]==1) & (ed[:,2]==1))/sum(ed[:,1]==1))\n", 517 | "print\n", 518 | "print(\"Recall rate for all calls:\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]!=2))/sum(ed[:,2]!=2))\n", 519 | "print(\"PPV for all calls:\", 1.0*sum((ed[:,1]!=2) & (ed[:,2]!=2))/sum(ed[:,1]!=2))" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "We acutally call more variants outside those \"high confidence\" (a short alignment concept) regions, e.g., CYP2D6 regions in chr22." 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 20, 532 | "metadata": {}, 533 | "outputs": [ 534 | { 535 | "name": "stdout", 536 | "output_type": "stream", 537 | "text": [ 538 | "(82618, 15, 4, 3)\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "Xarray3, pos_array3 = utils.get_aln_array(\"../wd/aln_tensor_chr22\")\n", 544 | "print(Xarray3.shape)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 21, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "all_t = []\n", 554 | "for i in range(0,len(Xarray3),10000):\n", 555 | " base, t = vnn.predict(Xarray3[i:i+10000])\n", 556 | " all_t.append(t)\n", 557 | "all_t = np.concatenate(all_t)\n" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 22, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "name": "stdout", 567 | "output_type": "stream", 568 | "text": [ 569 | "Total number of variant from the high-confident short-read call-set: 34850\n", 570 | "Total number of variant calls from our chr22 data: 52817\n" 571 | ] 572 | } 573 | ], 574 | "source": [ 575 | "evaluation_data2 = []\n", 576 | "for pos, predict_v in zip(np.array(pos_array3), all_t):\n", 577 | " evaluation_data2.append( (pos, np.argmax(predict_v)) )\n", 578 | "evaluation_data2 = np.array(evaluation_data2)\n", 579 | "print(\"Total number of variant from the high-confident short-read call-set: \", sum(ed[:,1]!=2))\n", 580 | "print(\"Total number of variant calls from our chr22 data: \", sum(evaluation_data2[:,1] != 2))" 581 | ] 582 | } 583 | ], 584 | "metadata": { 585 | "kernelspec": { 586 | "display_name": "Python 3", 587 | "language": "python", 588 | "name": "python3" 589 | }, 590 | "language_info": { 591 | "codemirror_mode": { 592 | "name": "ipython", 593 | "version": 3 594 | }, 595 | "file_extension": ".py", 
596 | "mimetype": "text/x-python", 597 | "name": "python", 598 | "nbconvert_exporter": "python", 599 | "pygments_lexer": "ipython3", 600 | "version": "3.6.3" 601 | } 602 | }, 603 | "nbformat": 4, 604 | "nbformat_minor": 2 605 | } 606 | --------------------------------------------------------------------------------