├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── ipynb └── pandasVCF_example.ipynb ├── pandasvcf.py ├── requirements.txt ├── setup.py ├── test_data ├── ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz ├── ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz.tbi ├── SWGR_titin.vcf.gz └── SWGR_titin.vcf.gz.tbi ├── variant_annotations.py └── vcf_metadata.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | 56 | # Specific files 57 | /ipynb/pdVCF.ipynb 58 | /ipynb/INDEL_counts.ipynb 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Erick Scott 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of pandasVCF nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pandasVCF 2 | ========= 3 | VCF parser using the Python pandas library for interactive analysis 4 | 5 |

Update: April 13 2018

6 | Updates to Python3 print statements and changed default pandas compression to 'infer' 7 | 8 |
9 | 10 | 11 |

Update: March 9 2016

12 | VCF header parsing requires the tabix -H command which is broken in tabix version 1.2.X Please update your tabix version to 1.3. 13 | 14 |
15 | 16 |

Update: August 21 2015

17 | pandasVCF handles both multi-sample and single-sample VCF files. Please see ipynb/ for usage. pandasVCFmulti and pandasVCFsingle are now depracated. 18 | 19 |
20 | 21 |

Update: February 12 2015

22 | pandasVCFmulti now handles both multi-sample and single-sample VCF files. Please see http://nbviewer.ipython.org/github/erscott/pandasVCF/blob/master/ipynb/multi_sample_ex.ipynb for usage. pdVCFsingle.py is now depracated and will be removed in the near future. 23 | 24 | Command line support will be added in the near future. 25 | 26 |

Update: Nov 10 2014

27 | pdVCFsingle.py can now parse a dataframe with a single individual, either from a multi-sample VCF or a single-sample VCF. Missing genotype calls maked with '.' are dropped when add_variant_annotations are called. 100,000 variants are parsed in ~10sec. 28 | 29 | 30 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erscott/pandasVCF/8c832ddbd811e2425a700af4ad568514a2691577/__init__.py -------------------------------------------------------------------------------- /ipynb/pandasVCF_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandasVCF" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### This example notebook describes simple usage of pandasVCFmulti, a module for parsing VCF files using the pandas library. pandasVCFmulti also handles single sample vcf files. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Libraries" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Populating the interactive namespace from numpy and matplotlib\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "#Import pdVCFsingle package\n", 41 | "%matplotlib inline\n", 42 | "%pylab inline\n", 43 | "import sys\n", 44 | "sys.path.append( '../src/' )\n", 45 | "from pandasvcf import *\n", 46 | "%config InlineBackend.figure_format = 'retina'\n", 47 | "pd.options.mode.chained_assignment = None #supressing the chained assignment warnings" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Example File Path" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "vcf_path = '../test_data/ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz'\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "# Creating Vcf object" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Initiate Vcf object by specifying the sample_id string and the columns the user wants to include for parsing. \n", 80 | "\n", 81 | "###Only the CHROM, POS, REF, ALT, and FORMAT fields are required.\n", 82 | "\n", 83 | "###Some VCF files are quite large and will not fit in memory, therefore the user can specify the chunksize which allows iteration through the VCF. " 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "vcf_chunk = VCF(vcf_path, sample_id='all', cols=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT', 'INFO', 'FILTER'], \\\n", 95 | " chunksize=1000, n_cores=20)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "CPU times: user 38.7 s, sys: 528 ms, total: 39.2 s\n", 110 | "Wall time: 38.9 s\n" 111 | ] 112 | }, 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "0" 117 | ] 118 | }, 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "%time vcf_chunk.get_vcf_df_chunk()\n" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 5, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "\n", 140 | "MultiIndex: 1000 entries, (22, 16050075, A, G) to (22, 16139996, G, T)\n", 141 | "Columns: 2511 entries, CHROM to NA21144\n", 142 | "dtypes: int64(1), object(2510)\n", 143 | "memory usage: 19.2+ MB\n", 144 | "\n", 145 | "2511000 Genotypes read\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "vcf_chunk.df.info()\n", 151 | "print \n", 152 | "print vcf_chunk.df.shape[1] * vcf_chunk.df.shape[0], 'Genotypes read'" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 6, 158 | "metadata": { 159 | "collapsed": false 160 | }, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/html": [ 165 | "
\n", 166 | "\n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | "
CHROMPOSREFALTFILTERINFOFORMATHG00096HG00097HG00099...NA21128NA21129NA21130NA21133NA21135NA21137NA21141NA21142NA21143NA21144
CHROMPOSREFALT
2216050075AG2216050075AGPASSAC=1;AF=0.000199681;AN=5008;NS=2504GT0|00|00|0...0|00|00|00|00|00|00|00|00|00|0
16050115GA2216050115GAPASSAC=32;AF=0.00638978;AN=5008;NS=2504GT0|00|00|0...0|00|00|00|00|00|00|00|00|00|0
16050213CT2216050213CTPASSAC=38;AF=0.00758786;AN=5008;NS=2504GT0|00|00|0...0|00|00|00|00|00|00|00|00|00|0
16050319CT2216050319CTPASSAC=1;AF=0.000199681;AN=5008;NS=2504GT0|00|00|0...0|00|00|00|00|00|00|00|00|00|0
16050527CA2216050527CAPASSAC=1;AF=0.000199681;AN=5008;NS=2504GT0|00|00|0...0|00|00|00|00|00|00|00|00|00|0
\n", 357 | "

5 rows × 2511 columns

\n", 358 | "
" 359 | ], 360 | "text/plain": [ 361 | " CHROM POS REF ALT FILTER \\\n", 362 | "CHROM POS REF ALT \n", 363 | "22 16050075 A G 22 16050075 A G PASS \n", 364 | " 16050115 G A 22 16050115 G A PASS \n", 365 | " 16050213 C T 22 16050213 C T PASS \n", 366 | " 16050319 C T 22 16050319 C T PASS \n", 367 | " 16050527 C A 22 16050527 C A PASS \n", 368 | "\n", 369 | " INFO FORMAT HG00096 \\\n", 370 | "CHROM POS REF ALT \n", 371 | "22 16050075 A G AC=1;AF=0.000199681;AN=5008;NS=2504 GT 0|0 \n", 372 | " 16050115 G A AC=32;AF=0.00638978;AN=5008;NS=2504 GT 0|0 \n", 373 | " 16050213 C T AC=38;AF=0.00758786;AN=5008;NS=2504 GT 0|0 \n", 374 | " 16050319 C T AC=1;AF=0.000199681;AN=5008;NS=2504 GT 0|0 \n", 375 | " 16050527 C A AC=1;AF=0.000199681;AN=5008;NS=2504 GT 0|0 \n", 376 | "\n", 377 | " HG00097 HG00099 ... NA21128 NA21129 NA21130 \\\n", 378 | "CHROM POS REF ALT ... \n", 379 | "22 16050075 A G 0|0 0|0 ... 0|0 0|0 0|0 \n", 380 | " 16050115 G A 0|0 0|0 ... 0|0 0|0 0|0 \n", 381 | " 16050213 C T 0|0 0|0 ... 0|0 0|0 0|0 \n", 382 | " 16050319 C T 0|0 0|0 ... 0|0 0|0 0|0 \n", 383 | " 16050527 C A 0|0 0|0 ... 0|0 0|0 0|0 \n", 384 | "\n", 385 | " NA21133 NA21135 NA21137 NA21141 NA21142 NA21143 NA21144 \n", 386 | "CHROM POS REF ALT \n", 387 | "22 16050075 A G 0|0 0|0 0|0 0|0 0|0 0|0 0|0 \n", 388 | " 16050115 G A 0|0 0|0 0|0 0|0 0|0 0|0 0|0 \n", 389 | " 16050213 C T 0|0 0|0 0|0 0|0 0|0 0|0 0|0 \n", 390 | " 16050319 C T 0|0 0|0 0|0 0|0 0|0 0|0 0|0 \n", 391 | " 16050527 C A 0|0 0|0 0|0 0|0 0|0 0|0 0|0 \n", 392 | "\n", 393 | "[5 rows x 2511 columns]" 394 | ] 395 | }, 396 | "execution_count": 6, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "vcf_chunk.df.head()" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 7, 408 | "metadata": { 409 | "collapsed": false 410 | }, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "False" 416 | ] 417 | }, 418 | "execution_count": 7, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "#checking stopIteration flag\n", 425 | "vcf_chunk.stopIteration" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "### Adding Annotations" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 8, 438 | "metadata": { 439 | "collapsed": false 440 | }, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "CPU times: user 963 ms, sys: 141 ms, total: 1.1 s\n", 447 | "Wall time: 2.47 s\n" 448 | ] 449 | }, 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "0" 454 | ] 455 | }, 456 | "execution_count": 8, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "%time vcf_chunk.add_variant_annotations(inplace=True) #split_columns={'AD':2, 'HQ':2}," 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 9, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "\n", 477 | "MultiIndex: 101737 entries, (22, 16050075, A, G) to (22, 16139996, G, T)\n", 478 | "Data columns (total 15 columns):\n", 479 | "sample_ids 101737 non-null object\n", 480 | "multiallele 101737 non-null int64\n", 481 | "phase 101737 non-null object\n", 482 | "GT1 101737 non-null int64\n", 483 | "GT2 101737 non-null int64\n", 484 | "a1 101737 non-null object\n", 485 | "a2 101737 non-null object\n", 486 | "zygosity 101737 non-null object\n", 487 | "vartype1 101737 non-null object\n", 488 | "vartype2 101737 non-null object\n", 489 | "GT 101737 non-null object\n", 490 | "FORMAT 101737 non-null object\n", 491 | "hom_ref_counts 101737 non-null float64\n", 492 | "INFO 101737 non-null object\n", 493 | "FILTER 101737 non-null object\n", 494 | "dtypes: float64(1), int64(3), object(11)\n", 495 | "memory usage: 12.1+ MB\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "vcf_chunk.df.info()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "### Unstacking the parsed dataframe by sample leads to sparsity due to rare variants" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 10, 513 | "metadata": { 514 | "collapsed": false 515 | }, 516 | "outputs": [ 517 | { 518 | "data": { 519 | "text/html": [ 520 | "
\n", 521 | "\n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | "
multiallele...FILTER
sample_idsHG00096HG00097HG00099HG00100HG00101HG00102HG00103HG00105HG00106HG00107...NA21128NA21129NA21130NA21133NA21135NA21137NA21141NA21142NA21143NA21144
CHROMPOSREFALT
2216139873CTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
16139876CTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
16139887ATNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
16139971AGNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
16139996GTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", 721 | "

5 rows × 35056 columns

\n", 722 | "
" 723 | ], 724 | "text/plain": [ 725 | " multiallele \\\n", 726 | "sample_ids HG00096 HG00097 HG00099 HG00100 HG00101 HG00102 \n", 727 | "CHROM POS REF ALT \n", 728 | "22 16139873 C T NaN NaN NaN NaN NaN NaN \n", 729 | " 16139876 C T NaN NaN NaN NaN NaN NaN \n", 730 | " 16139887 A T NaN NaN NaN NaN NaN NaN \n", 731 | " 16139971 A G NaN NaN NaN NaN NaN NaN \n", 732 | " 16139996 G T NaN NaN NaN NaN NaN NaN \n", 733 | "\n", 734 | " ... FILTER \\\n", 735 | "sample_ids HG00103 HG00105 HG00106 HG00107 ... NA21128 \n", 736 | "CHROM POS REF ALT ... \n", 737 | "22 16139873 C T NaN NaN NaN NaN ... NaN \n", 738 | " 16139876 C T NaN NaN NaN NaN ... NaN \n", 739 | " 16139887 A T NaN NaN NaN NaN ... NaN \n", 740 | " 16139971 A G NaN NaN NaN NaN ... NaN \n", 741 | " 16139996 G T NaN NaN NaN NaN ... NaN \n", 742 | "\n", 743 | " \\\n", 744 | "sample_ids NA21129 NA21130 NA21133 NA21135 NA21137 NA21141 \n", 745 | "CHROM POS REF ALT \n", 746 | "22 16139873 C T NaN NaN NaN NaN NaN NaN \n", 747 | " 16139876 C T NaN NaN NaN NaN NaN NaN \n", 748 | " 16139887 A T NaN NaN NaN NaN NaN NaN \n", 749 | " 16139971 A G NaN NaN NaN NaN NaN NaN \n", 750 | " 16139996 G T NaN NaN NaN NaN NaN NaN \n", 751 | "\n", 752 | " \n", 753 | "sample_ids NA21142 NA21143 NA21144 \n", 754 | "CHROM POS REF ALT \n", 755 | "22 16139873 C T NaN NaN NaN \n", 756 | " 16139876 C T NaN NaN NaN \n", 757 | " 16139887 A T NaN NaN NaN \n", 758 | " 16139971 A G NaN NaN NaN \n", 759 | " 16139996 G T NaN NaN NaN \n", 760 | "\n", 761 | "[5 rows x 35056 columns]" 762 | ] 763 | }, 764 | "execution_count": 10, 765 | "metadata": {}, 766 | "output_type": "execute_result" 767 | } 768 | ], 769 | "source": [ 770 | "#unstack dataframe by sample - QUITE SPARSE DUE TO RARE VARIANTS\n", 771 | "vcf_chunk.df.set_index('sample_ids', append=True).unstack(level=4).tail()" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "## CONVENINCE FUNCTION FOR PARSING AN ENTIRE MULTISAMPLE FILE" 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "metadata": {}, 784 | "source": [ 785 | "###!!! Known Issue: get_whole_file will break if there are duplicate rows for the same genotype." 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 11, 791 | "metadata": { 792 | "collapsed": false 793 | }, 794 | "outputs": [], 795 | "source": [ 796 | "def get_whole_file(vcf_path, sample_ids='all', columns=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'], \\\n", 797 | " add_variant_annotations=True, split_columns='', chunksize=5000, inplace=True, n_cores=1):\n", 798 | " '''\n", 799 | " This function will parse the whole multi-sample vcf file\n", 800 | " and return a dataframe.\n", 801 | " \n", 802 | " Note using multiple cores with add_variant_annotations will be \n", 803 | " very memory intensive as the parsed dataframe is copied to each process.\n", 804 | " '''\n", 805 | " \n", 806 | " vcf_df_obj = Vcf(vcf_path, sample_id=sample_ids, cols=columns, chunksize=chunksize, n_cores=n_cores) #initiate object\n", 807 | " stopIteration = False #initiating stopIteration flag\n", 808 | " data = [] #aggregation df list\n", 809 | " \n", 810 | " while stopIteration == False:\n", 811 | "\n", 812 | " vcf_df_obj.get_vcf_df_chunk() #retrieving df chunk\n", 813 | " if vcf_df_obj.stopIteration == True: break #checking for end of file\n", 814 | " \n", 815 | " if add_variant_annotations: \n", 816 | " vcf_df_obj.add_variant_annotations(split_columns=split_columns, inplace=inplace) #parsing df and adding annotations\n", 817 | " if inplace:\n", 818 | " data.append(vcf_df_obj.df)\n", 819 | " else:\n", 820 | " data.append(vcf_df_obj.df_annot) #aggregating annotation data\n", 821 | " else:\n", 822 | " vcf_df_obj.append(vcf_df_obj.df)\n", 823 | "\n", 824 | " df = pd.concat(data)\n", 825 | " return df" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": 12, 831 | "metadata": { 832 | "collapsed": false 833 | }, 834 | "outputs": [ 835 | { 836 | "name": "stdout", 837 | "output_type": "stream", 838 | "text": [ 839 | "End of File Reached\n", 840 | "CPU times: user 1min 31s, sys: 2.53 s, total: 1min 34s\n", 841 | "Wall time: 1min 40s\n" 842 | ] 843 | } 844 | ], 845 | "source": [ 846 | "%time master_df = get_whole_file(vcf_path, sample_ids='all', \\\n", 847 | " columns=['#CHROM', 'POS', 'REF', 'ALT','FORMAT', 'INFO'], \\\n", 848 | " chunksize=5000, n_cores=20)\n", 849 | "\n" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": 13, 855 | "metadata": { 856 | "collapsed": false 857 | }, 858 | "outputs": [ 859 | { 860 | "data": { 861 | "text/plain": [ 862 | "" 863 | ] 864 | }, 865 | "execution_count": 13, 866 | "metadata": {}, 867 | "output_type": "execute_result" 868 | }, 869 | { 870 | "data": { 871 | "image/png": [ 872 | "iVBORw0KGgoAAAANSUhEUgAAAuUAAAI6CAYAAACAUjYEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n", 873 | "AAAWJQAAFiUBSVIk8AAAIABJREFUeJzt3X20bXdZH/rvY4LYUAy+oNGikmAMthigxRKJQjANpXrv\n", 874 | "aIq03FYISPEP4GIojrQ3mDb7cI2txWtQhNFL24A3WM3ti+gdao0WUpFA7Qs1hY4AJUlLGilRILyF\n", 875 | "Sslz/1hrNzs7Z59zcvY8+zfXnp/PGGuss9dca51nj+wn57t/6zefWd0dAABgnC8ZXQAAACydUA4A\n", 876 | "AIMJ5QAAMJhQDgAAgwnlAAAwmFAOAACDCeUAADCYUA4AAIMJ5QAAMJhQDgAAgwnlAAAwmFAOAACD\n", 877 | "CeUAADDYxoTyqvq6qvrZqvpYVd1bVe+vqqePrgsAAPbr9NEFnIiqelSSdyX5rSTfk+TuJOck+djI\n", 878 | "ugAAYArV3aNrOK6q+rEk39Xd3zW6FgAAmNqmbF+5NMnvVNUNVfXfquq9VfXy0UUBAMAUNiWUn5Pk\n", 879 | "ZUn+U5JnJfmpJH9HMAcA4DDYlO0rf5jkd7r7O3c8dk2Sv9Ddf3xcZQAAsH+TrJRX1XOr6vVV9c6q\n", 880 | "+lRV3VdV1x/nNY+pquuq6q6q+nxV3V5V165P6tztriT/cddjtyb5xinqBwCAkaaavnJVkvOTfDrJ\n", 881 | "nUken2TPJfiqelySm5M8OsnbsgrYT01yeZJnV9WF3f3xHS951/o9d/qWJHdMVD8AAAwz1Z7yVyY5\n", 882 | "t7vPTPLSE3j+G7MK5K/o7ud096u7++Ik1yY5L8k1u55/bZILqurVVfXNVfUXk7wiyRsmqh8AAIaZ\n", 883 | "fE95VV2U5O1J3trdlx3l+OOSfCjJ7d39uF3H/miSj2a1yv613f25Hce+J8mPZRXa/3OSn+nun5m0\n", 884 | "eAAAGGDExYOeub6/cfeB7v5MVb0rySVJLsgq3G8f+9Ukv3ogFQIAwAEaEcrPW99/cI/jH8oqlJ+b\n", 885 | "HaF8P6pq/iNmAAA4FLq7HuprRswpP3N9f88ex7cfP9oUFgAAOHRGrJQPczK/tTBeVbX/djCG/oMx\n", 886 | "9N5m2s/ujBEr5dsr4WfucXz78U8eQC0AADDciFB+6/r+vD2On7u+32vP+Umrqt5x25r6/QEAWJaq\n", 887 | "2trOl/t5nxHbV96xvr+k1p/NbB+oqkcmuTDJZ5O8Z+q/2MdAAABMqbu3kmwlG7Z9pbtvy2oc4tlJ\n", 888 | "Xr7r8JEkZyS5vrvvPejaAABghEkuHlRVlya5dP3lWUmeleS2JL+9fuzu7r5ix/PPSXJzkq9J8ktZ\n", 889 | "bWl5apKLknwgydO6+xP7Luz+v68TK+WbyskuMI7+gzH03mbaT+acKpRfneTqrK7E+YBD6/s7uvuc\n", 890 | "Xa95TJLXJHl2kq9KcleSX0xypLv3Gpd4svUd7Zs8sv64gZnzPyYYR//BGHpvc6zPU7x652PDQvnc\n", 891 | "WSnfbFW15RcoGEP/wRh6bzMNXymfO6EcAIBTbT+Zc8RIRAAAYIdFhXJzygEAmNJUc8oXFcq7u3bc\n", 892 | "tkbXw4nxCxSMo/9gDL23Obp7aztf7ud97Cln9pyBDuPoPxhD720me8o57I6MLgAWTP/BGHpvYayU\n", 893 | "AwDABKyUnyAnegIAMKWpTvS0Ug4AABOwUg4AABtMKAcAgMGEcmbP/n8YR//BGHpveewpZ/bMaoVx\n", 894 | "9B+Mofc2034y5+nTlzNfu86KPXJYruq537N9N8Fh/x79j5cZMysZxtB7G2L9qcbV+34fK+Wb77AH\n", 895 | "1iU4rD+bALAkVspJkpz/2ueNLoGH6JYrbhhdAgAwA070BACAwYRyAAAYTCgHAIDBhHJm76M3vm90\n", 896 | "CbBYZiXDGHpveYRyZu9jv/H+0SXAku17zBdwUvTewiwqlFdV77htja6HE/M1l/yJ0SXAkpmVDGPo\n", 897 | "vQ1RVVvb+XJf72NO+ebb/v6MRNw82yMRD+vPJgAsyX4y56JWygEAYI6EcgAAGEwoBwCAwYRyAAAY\n", 898 | "TChn9swph3FMqoIx9N7yCOXMnjnlMJRZyTCG3lsYoZzZM6cchjIrGcbQewuzqFDu4kGb6axnPWF0\n", 899 | "CbBY3b01ugZYIr23Oaa6eNDpUxW0CVygBQCAKa1/gdpK7r940MlYVCgHmNJ+V0UYz2INMBeL2r4C\n", 900 | "AABzZKUcYJ/Of+3zRpfAQ3TLFTeMLgHgAayUM3vmlMM4+g/GMJBieYRyZs+cchhH/8Ew5pQvjFDO\n", 901 | "7JlTDuPoPxjGnPKFEcqZPXPKYRz9B2OYU748QjkAAAwmlAMAwGBCOQAADCaUAwDAYIsK5VXVO25b\n", 902 | "o+vhxJiTDOPoPxhDTtkcVbW1nS/38z6LCuXdXTtuW6Pr4cSYkwzj6D8YxpzyDdHdW9v5cj/vs6hQ\n", 903 | "zmYyJxnG0X8wjDnlCyOUM3vmJMM4+g/G8In+8gjlAAAwmFAOAACDCeUAADCYUA4AAIMJ5cyeOckw\n", 904 | "jv6DMcwpXx6hnNkzJxnG0X8wjDnlCyOUM3vmJMM4+g+GMad8YTYmlK8vYXrfrttdo+vi1DMnGcbR\n", 905 | "fzCGOeXLc/roAh6iW5NctOPrLw6qAwAAJrNpofyL3f2x0UUAAMCUNmb7yto5VfVfq+q2qvr5qjp7\n", 906 | "dEEAALBfmxTK35PkhUn+bJIfTHJWkpur6iuHVgUAAPu0MaG8u/95d/+T7n5fd/+LJN+bVf0vHFwa\n", 907 | "p5g5yTCO/oMxzClfnslCeVU9t6peX1XvrKpPraejXH+c1zymqq6rqruq6vNVdXtVXVtVjzre39fd\n", 908 | "n0vy/iTfPNX3wDyZkwzj6D8YxpzyhZnyRM+rkpyf5NNJ7kzy+CS915Or6nFJbk7y6CRvy2qyylOT\n", 909 | "XJ7k2VV1YXd//Biv/7Ik35rk7VN9A8yTOckwjv6DYcwpX5gpt6+8Msm53X1mkpeewPPfmFUgf0V3\n", 910 | "P6e7X93dFye5Nsl5Sa7Z+eSq+omqenpVnV1VT03yT5L8kSQ/O+H3wAyZkwzj6D8Yw5zy5ZkslHf3\n", 911 | "Td394fWXdaznrlfJL0lye3e/Ydfhq5N8Lsnzq+qMHY//sSQ/n9WK+j9Ncm+SC7r7I1PUDwAAo4ya\n", 912 | "U/7M9f2Nuw9092eq6l1ZhfYLst6e0t1/+eDKAwCAgzNq+sp56/sP7nH8Q+v7c6f8S6uqj3G7aX2/\n", 913 | "tcdrt+Z8PEluueKGPSclfPTG9zk+4+Ojf34c39/x0T8/jus/xx13/OCO73jsQbejvceJqu59vf7o\n", 914 | "b1p1UVYr3G/t7suOcvxNSV6S5CXdfd1Rjl+T5MokV3b3j09QTydJdx9zW82m2v7+zn/t80aXwkN0\n", 915 | "yxU3JDm8P5uHnd7bXHoPOBX2kzk3Zk45y2VOMoyj/2CMvVZuObxGhfJ71vdn7nF8+/FPHkAtzJw5\n", 916 | "yTCO/oNhzClfmFGh/Nb1/Xl7HN/eS77XnvOTsmvfz9aU782pY04yjKP/YBhzyjfEzj3m+3mfUdNX\n", 917 | "3rG+v6SqqndsbK+qRya5MMlnk7xnyr/U3sHNZE4yjKP/YAxzyjfH+r/VVnL/nvKTMWSlvLtvy2oc\n", 918 | "4tlJXr7r8JEkZyS5vrvvPejaAADgoE22Ul5Vlya5dP3lWev7p1XVW9Z/vru7r9jxkpcluTnJT1fV\n", 919 | "xVltaXlqkouSfCDJj0xVGwAAzNmUK+VPTHJZkhdkdeGfzmol/LL17ft2Pnm9Wv6UJG/JKoy/av38\n", 920 | "12V1pc5PTFhbEnvKAQCY1uz2lHf3kTzEkxK6+84kL56qhhP4++wpBwBgMhu9pxweCnOSYRz9B2P4\n", 921 | "RH95hHJmz5xkGEf/wTDmlC/MokK5PeWbyZxkGEf/wTDmlG+I2e0p3wT2lG8mc5JhHP0HY5hTvjns\n", 922 | "KQcAgENCKAcAgMGEcgAAGGxRodyJngAATGmqEz0XFcq7u3bctkbXw4kxJxnG0X8whsXDzdHdW9v5\n", 923 | "cj/vs6hQzmYyJxnG0X8wjDnlCyOUM3vmJMM4+g+GMad8YYRyZs+cZBhH/8EYttkuj1AOAACDLSqU\n", 924 | "m74CAMCUppq+cvpUBW2C/Z4VCwAAO623Gm0lqwXgk32fRa2UAwDAHAnlzJ45yTCO/oMxbLNdHqGc\n", 925 | "2TMnGcbRfzCMOeULI5Qze+Ykwzj6D4Yxp3xhhHJmz5xkGEf/wRjmlC/PokK5kYgAAEzJSMSTYCQi\n", 926 | "AABTMhIRAAAOCaEcAAAGE8qZPXOSYRz9B2M49215hHJmz5xkGEf/wTDmlC+MUM7smZMM4+g/GMac\n", 927 | "8oURypk9c5JhHP0HY5hTvjxCOQAADLaoUO7iQQAATMnFg06CiwcBADAlFw8CAIBDQihn9sxJhnH0\n", 928 | "H4xhm+3yCOXMnjnJMI7+g2HMKV8YoZzZMycZxtF/MIw55QsjlDN75iTDOPoPxjCnfHmEcgAAGEwo\n", 929 | "BwCAwYRyAAAYTCgHAIDBhHJmz5xkGEf/wRjmlC+PUM7smZMM4+g/GMac8oVZVCivqt5x2xpdDyfG\n", 930 | "nGQYR//BMOaUb4iq2trOl/t5n9OnKmgTdHeNroGHzpxkGEf/wRjmlG+O9X+rrWS1AHyy77OolXIA\n", 931 | "AJgjoRwAAAYTygEAYDChHAAABhPKmT1zkmEc/QdjmBK3PEI5s2dOMoyj/2AYc8oXRihn9sxJhnH0\n", 932 | "HwxjTvnCCOXMnjnJMI7+gzHMKV8eoRwAAAbbuFBeVVdW1X1V9frRtQAAwBQ2KpRX1QVJfjDJLUlO\n", 933 | "+jKmAAAwJxsTyqvqzCRvTfIDST4xuBwAAJjMxoTyJG9K8o+7+18mqdHFcHDMSYZx9B+MYU758mxE\n", 934 | "KK+qH0xyTpKr1g/ZurIg5iTDOPoPhjGnfGFOH13A8VTVeUmuSfKd3f3F7YdjtXwxzEmGcfQfDGNO\n", 935 | "+cJMslJeVc+tqtdX1Tur6lPr6SjXH+c1j6mq66rqrqr6fFXdXlXXVtWjdj31O5J8dZL3V9UXquoL\n", 936 | "SZ6e5GVV9YdV9bApvgfmy5xkGEf/wRjmlC/PVCvlVyU5P8mnk9yZ5PE5xhaTqnpckpuTPDrJ25Lc\n", 937 | "muSpSS5P8uyqurC7P75++i8m+Z2dL0/y5iQfTPJj3f2Fib4HAAAYYqpQ/sokH+nuD1fVM5K84zjP\n", 938 | "f2NWgfwV3f2G7Qer6v9K8tey2q7y0iTp7nuS3LPzxVX1uSSf6O7/OFH9AAAwzCTbV7r7pu7+8PrL\n", 939 | "Y+71Xq+SX5Lk9p2BfO3qJJ9L8vyqOuNYf2Wc7AkAwCExYvrKM9f3N+4+0N2fSfKuJI9IcsFeb9Dd\n", 940 | "z+zuHzo15QEAwMEaEcrPW99/cI/jH1rfn3sAtbABzEmGcfQfjGFO+fKMCOVnru/v2eP49uO7p7Ds\n", 941 | "W1X1MW43re+39njt1pyPJ8ktV9yw5z+gH73xfRt7/GO/8f5Z1zfF8dE/P47v7/jonx/9p/8cP5TH\n", 942 | "r555fYs9vuOxB92O9h4nqrqn3ZpdVRcleXuSt3b3ZUc5/qYkL0nyku6+7ijHr0lyZZIru/vHJ6qp\n", 943 | "k6S7D+Vs8+3v7/zXPm90KafER29836Edy3bLFTckObw/m4fdYe+95PD2n95j7qpqy1jEzbOfzDli\n", 944 | "pXx7JfzMPY5vP/7JA6iFDXAYAwFsCv0HYwjkyzMilN+6vj9vj+Pbe8n32nN+0nZ9xLA19fsDALAs\n", 945 | "O7ez7Od9pppT/lBszzC/pKqqd+yfqapHJrkwyWeTvGfqv9jHlAAATGn9qcZWcv/2lZNx4Cvl3X1b\n", 946 | "VuMQz07y8l2HjyQ5I8n13X3vQdcGAAAjTLJSXlWXJrl0/eVZ6/unVdVb1n++u7uv2PGSlyW5OclP\n", 947 | "V9XFWW1peWqSi5J8IMmPTFEXAABsgqlWyp+Y5LIkL8jqap2d1Ur4Zevb9+188nq1/ClJ3pJVGH/V\n", 948 | "+vmvS3JBd39ioroewJ7yzWROMoyj/2AMOWVzTLWnfJJQ3t1HuvtLuvu0XbcvWd/OOcpr7uzuF3f3\n", 949 | "13f3w7v77O5+VXfvNb98ijprx23rVP09TOtjv/H+0SXAYuk/GObq0QVwYrp7aztf7ud9RkxfgYfk\n", 950 | "ay75E6NLgMXSfzDMkdEFcLCEcmbPnGQYR//BGD7RX55FhXJ7ygEAmNImzykfxpxyAACmtLFzygEA\n", 951 | "gAcSygEAYDChnNkzJxnG0X8whnPflmdRodyJnpvJnGQYR//BMOaUbwgnep4EJ3puJnOSYRz9B8OY\n", 952 | "U74hnOjJYpiTDOPoPxjDnPLlEcoBAGAwoRwAAAYTygEAYLBFhXLTVwAAmNJU01cWFcq7u3bctkbX\n", 953 | "w4kxJxnG0X8whsXDzdHdW9v5cj/vs6hQzmYyJxnG0X8wjDnlCyOUM3vmJMM4+g+GMad8YYRyZs+c\n", 954 | "ZBhH/8EYttkuj1AOAACDCeUAADDYokK5kYgAAExpqpGIp09V0CbY76gaAADYab3/fytZLQCf7Pss\n", 955 | "aqWczWROMoyj/2AMn+gvj1DO7JmTDOPoPxjGnPKFEcqZPXOSYRz9B8OYU74wQjmzZ04yjKP/YAxz\n", 956 | "ypdHKAcAgMGEcgAAGEwoBwCAwYRyAAAYbFGh3BU9N5M5yTCO/oMx5JTNMdUVPRcVyru7dty2RtfD\n", 957 | "iTEnGcbRfzCMOeUboru3tvPlft5nUaGczWROMoyj/2AYc8oXRihn9sxJhnH0H4zhE/3lEcoBAGAw\n", 958 | "oRwAAAYTygEAYDChHAAABhPKmT1zkmEc/QdjmFO+PEI5s2dOMoyj/2AYc8oXRihn9sxJhnH0Hwxj\n", 959 | "TvnCCOXMnjnJMI7+gzHMKV8eoRwAAAZbVCivqt5x2xpdDwAAm62qtrbz5X7e5/SpCtoE3V2jawAA\n", 960 | "4PBYbzXaSlYLwCf7PotaKQcAgDkSypk9c5JhHP0HY9hmuzxCObNnTjKMo/9gGHPKF0YoZ/bMSYZx\n", 961 | "9B8MY075wgjlzJ45yTCO/oMxzClfHqEcAAAGE8oBAGAwoRwAAAYTygEAYLCNCeVV9fKq+t2qumd9\n", 962 | "u7mqvmd0XZx65iTDOPoPxjCnfHk2JpQn+UiSv57kyUn+VJK3J3lbVT1xaFWccuYkwzj6D4Yxp3xh\n", 963 | "Th9dwInq7l/e9dBVVfXSJH86ye8OKIkDYk4yjKP/YBhzyhdmY0L5TlV1WpK/mOTLkvzW4HI4xcxJ\n", 964 | "hnH0H4xhTvnybFQor6pvS/LuJA9Pcm+Sv9TdHxhbFQAA7M8m7SlPkluTnJ/VlpWfSfILVfWUsSUB\n", 965 | "AMD+TBbKq+q5VfX6qnpnVX2qqu6rquuP85rHVNV1VXVXVX2+qm6vqmur6lFHe353f6G7b+vu93b3\n", 966 | "q5O8J8nLp/oeAABghCm3r1yV1Sr2p5PcmeTxSXqvJ1fV45LcnOTRSd6W1Sr4U5NcnuTZVXVhd3/8\n", 967 | "OH/nadm81X4AAHiAKQPtK5Oc291nJnnpCTz/jVkF8ld093O6+9XdfXGSa5Ocl+SanU+uqr9TVd9Z\n", 968 | "VY+tqm+rqr+d5BlJ3jrh98AMmZMM4+g/GMOc8uWZLJR3903d/eH1l3Ws565XyS9Jcnt3v2HX4auT\n", 969 | "fC7J86vqjB2Pf21WAfzWJL+Z1azyZ3f3b0xRP/NlTjKMo/9gGHPKF2bU9JVnru9v3H2guz9TVe/K\n", 970 | "KrRfkNVFgtLdP3Bw5TEn5iTDOPoPhjGnfGFG7cc+b33/wT2Of2h9f+6Uf2lV9TFuN63vt/Z47dac\n", 971 | "jyfJLVfcsOdHzR+98X0be/ysZz1h1vVNcXz0z4/j+zs++udH/+k/xw/f8e055XOtb8nHdzz2oNvR\n", 972 | "3uNEVfe+Xn/0N626KKsV7rd292VHOf6mJC9J8pLuvu4ox69JcmWSK7v7xyeop5Oku4+5rWZTbX9/\n", 973 | "57/2eaNL4SG65Yobkhzen83DTu9tLr0HnAr7yZwmlwAAwGCjQvk96/sz9zi+/fgnD6AWAAAYalQo\n", 974 | "v3V9f94ex7f3ku+15/yk7Nr3szXlewMAsDw795jv531GhfJ3rO8vqaoH7LmpqkcmuTDJZ7O6Yudk\n", 975 | "urt23LamfG9OHXOSYRz9B2NYPNwc3b21nS/38z5DQnl335bVOMSzk7x81+EjSc5Icn1333vQtTE/\n", 976 | "5iTDOPoPhjGnfGEmm1NeVZcmuXT95Vnr+6dV1VvWf767u6/Y8ZKXJbk5yU9X1cVZbWl5apKLknwg\n", 977 | "yY9MVduOGnd+rHDEavlmMCcZxtF/MIw55Rti/anGvn+JmvLiQU9MclmS7eDbWa2En7P++o4k/zOU\n", 978 | "d/dtVfWUJK9J8uwk35PkriSvyyowb58MOhmjrzbTWc96wugSYLH0H4xh4XBzrP9bbSUPWgB+SCYL\n", 979 | "5d19JA/xt7ruvjPJi6eqAQAANpE55QAAMJhQDgAAgy0qlJtTDgDAlDZ9TvkQ5pRvJnOSYRz9B2NY\n", 980 | "PNwcGz2nHB4Kc5JhHP0Hw5hTvjBCObNnTjKMo/9gGHPKF0YoZ/bMSYZx9B+MYZvt8iwqlDvREwCA\n", 981 | "KU11oueUV/ScPVf0BABgSlNd0XNRK+UAADBHQjkAAAwmlDN75iTDOPoPxnDu2/II5cyeOckwjv6D\n", 982 | "YcwpX5hFhXLTVzaTOckwjv6DYcwp3xCmr5wE01c2kznJMI7+gzHMKd8cpq8AAMAhIZQDAMBgQjkA\n", 983 | "AAwmlAMAwGBCObNnTjKMo/9gDFPilmdRodxIxM1kTjKMo/9gGHPKN4SRiCfBSMTNZE4yjKP/YBhz\n", 984 | "yjeEkYgshjnJMI7+gzHMKV8eoRwAAAYTygEAYDChHAAABhPKAQBgMKGc2TMnGcbRfzCG0c3LI5Qz\n", 985 | "e+Ykwzj6D4Yxp3xhFhXKXTxoM5mTDOPoPxjGnPIN4eJBJ8HFgzaTOckwjv6DMcwp3xwuHgQAAIeE\n", 986 | "UA4AAIMJ5QAAMJhQDgAAgwnlzJ45yTCO/oMxTIlbHqGc2TMnGcbRfzCMOeULI5Qze+Ykwzj6D4Yx\n", 987 | "p3xhhHJmz5xkGEf/wRjmlC+PUA4AAIMJ5QAAMJhQDgAAgy0qlFdV77htja4HAIDNVlVb2/lyP++z\n", 988 | "qFDe3bXjtjW6Hk6MOckwjv6DMSwebo7u3trOl/t5n0WFcjaTOckwjv6DYcwpXxihnNkzJxnG0X8w\n", 989 | "jDnlCyOUM3vmJMM4+g/GsM12eYRyAAAYTCgHAIDBhHIAABhMKAcAgMGEcmbPnGQYR//BGOaUL49Q\n", 990 | "zuyZkwzj6D8YxpzyhdmYUF5VV1bVv66qe6rqY1X1y1VlgO4CmJMM4+g/GMac8oXZmFCe5BlJfibJ\n", 991 | "dyT57iT/I8lvVtVXDK2KU86cZBhH/8EY5pQvz+mjCzhR3f3snV9X1QuS3JPkaUl+ZUhRAAAwgU1a\n", 992 | "Kd/ty7Oq/xOjCwEAgP3Y5FD+U0nem+TdowsBAID92JjtKztV1U9mtW3lO7u7R9cDAAD7MdlKeVU9\n", 993 | "t6peX1XvrKpPVdV9VXX9cV7zmKq6rqruqqrPV9XtVXVtVT3qGK+5Nsnzknx3d98xVf3MlznJMI7+\n", 994 | "gzHMKV+eKbevXJXk5UnOT3Ln+rE9V7Gr6nFJ/m2SFyV5T5KfTHJbksuTvLuqvvIor/mp3B/IPzhh\n", 995 | "7cyYOckwjv6DYcwpX5gpQ/krk5zb3WcmeekJPP+NSR6d5BXd/ZzufnV3X5zk2iTnJblm55Or6g1Z\n", 996 | "BfjvT3JPVZ21vj1iwu+BGTInGcbRfzCMOeULM1ko7+6buvvD6y/rWM9dr5JfkuT27n7DrsNXJ/lc\n", 997 | "kudX1Rk7Hn9pkj+a5F8kuWvH7YcnKJ8ZMycZxtF/MIY55csz6kTPZ67vb9x9oLs/U1Xvyiq0X5Dk\n", 998 | "7evHN3lSDAAA7GlU0D1vfb/XvvAPre/PPYBaAABgqFGh/Mz1/T17HN9+fM8pLCejqvoYt5vW91t7\n", 999 | "vHZrzseT5JYrbthzUsJHb3yf4zM+Pvrnx/H9HR/98+O4/nPccccP7viOxx50O9p7nKg6FWO+q+qi\n", 1000 | "rLadvLW7LzvK8TcleUmSl3T3dUc5fk2SK5Nc2d0/PkE9nSTdfcy97ptq+/s7/7XPG10KD9EtV9yQ\n", 1001 | "5PD+bB52em9z6T3gVNhP5hy1Ur69En7mHse3H//kAdTCzJmTDOPoPxhjr5VbDq9RofzW9f15exzf\n", 1002 | "3ks+6SzyXR8xbE353pw65iTDOPoPhjGnfEPs3M6yn/cZNX3lHev7S6qqescemqp6ZJILk3w2q4sK\n", 1003 | "TcbHlJvJnGQYR//BMOaUb4j1+Mqt5P7tKydjyEp5d9+W1TjEs7O6CuhOR5KckeT67r73oGtjfsxJ\n", 1004 | "hnH0H4xhTvnyTLZSXlWXJrl0/eVZ6/unVdVb1n++u7uv2PGSlyW5OclPV9XFWW1peWqSi5J8IMmP\n", 1005 | "TFUbAADM2ZQr5U9MclmSF2R14Z/OaiX8svXt+3Y+eb1a/pQkb8kqjL9q/fzXJbmguz8xYW1J7CkH\n", 1006 | "AGBas9tT3t1H8hD3P3X3nUlePFUNJ/D32VMOABtuv+GHeTgsuWyj95QDAAD3GzV9BU7YR298n5PN\n", 1007 | "YBD9x5wd5gt3Hebe2754Fw+0qJVye8o3kznJMI7+gzH03uaY3Z7yTXBY9i4tjTnJMI7+gzH03uaw\n", 1008 | "p5zFOKwf38Em0H8wht5bHqEcAAAGE8oBAGCwRYVyJ3oCADAlJ3qeBCd6AgAwJSd6shgfvfF9o0uA\n", 1009 | "xdJ/MIbeWx6hnNkzqxXG0X8wht5bHqGc2TOrFcbRfzCG3lseoZzZM6sVxtF/MIbeW55FhXLTVwAA\n", 1010 | "mJLpKyfB9BUAAKZk+goAABwSQjkAAAwmlDN7ZrXCOPoPxtB7yyOUM3tmtcI4+g/G0HvLI5Qze2a1\n", 1011 | "wjj6D8bQe8uzqFBuJOJmMqsVxtF/MIbe2xxGIp4EIxEBAJiSkYgAAHBICOUAADCYUA4AAIMJ5cye\n", 1012 | "Wa0wjv6DMfTe8gjlzJ5ZrTCO/oMx9N7yCOXMnlmtMI7+gzH03vII5cyeWa0wjv6DMfTe8iwqlLt4\n", 1013 | "EAAAU3LxoJPg4kEAAEzJxYMAAOCQEMoBAGAwoZzZM6sVxtF/MIbeWx6hnNkzqxXG0X8wht5bHqGc\n", 1014 | "2TOrFcbRfzCG3lseoZzZM6sVxtF/MIbeWx6hHAAABhPKAQBgMKEcAAAGE8oBAGAwoZzZM6sVxtF/\n", 1015 | "MIbeWx6hnNkzqxXG0X8wht5bnkWF8qrqHbet0fVwYsxqhXH0H4yh9zZHVW1t58v9vM/pUxW0Cbq7\n", 1016 | "RtfAQ2dWK4yj/2AMvbc5unsryVayWgA+2fdZ1Eo5AADMkVAOAACDCeUAADCYUA4AAIMJ5cyeWa0w\n", 1017 | "jv6DMfTQo1UDAAAVc0lEQVTe8gjlzJ5ZrTCO/oMx9N7yCOXMnlmtMI7+gzH03vII5cyeWa0wjv6D\n", 1018 | "MfTe8mxMKK+qp1fVL1fVnVV1X1W9cHRNAAAwhY0J5UkekeSWJJcnuTfJvi5lCgAAc3H66AJOVHf/\n", 1019 | "WpJfS5KqesvYagAAYDqbtFIOAACHklDO7JnVCuPoPxhD7y2PUM7smdUK4+g/GEPvLc9kobyqnltV\n", 1020 | "r6+qd1bVp9YTUq4/zmseU1XXVdVdVfX5qrq9qq6tqkdNVRebz6xWGEf/wRh6b3mmPNHzqiTnJ/l0\n", 1021 | "kjuTPD7HmJBSVY9LcnOSRyd5W5Jbkzw1q+kqz66qC7v74xPWx4YyqxXG0X8wht5bnim3r7wyybnd\n", 1022 | "fWaSl57A89+YVSB/RXc/p7tf3d0XJ7k2yXlJrtn55Kp6RFU9qaqetK77m9Zff8OE3wMAABy4yUJ5\n", 1023 | "d9/U3R9ef1nHeu56lfySJLd39xt2Hb46yeeSPL+qztjx+Lcn+Xfr25clObL+85EJygcAgGFGnej5\n", 1024 | "zPX9jbsPdPdnkrwrq4sFXbDj8Zu6+0vWt9N2/PnFB1MyAACcGqNC+Xnr+w/ucfxD6/tzD6AWAAAY\n", 1025 | "alQoP3N9f88ex7cfn3QKS1X1MW43re+39njt1pyPJ8ktV9yw51zTj974vo09/tEb3zfr+qY4Pvrn\n", 1026 | "x/H9HR/986P/9N+Sj4/++TlVx7e/nmt9Ux0f/fNzMsd3PPag21G/yRNU3ft6/dHftOqiJG9P8tbu\n", 1027 | "vuwox9+U5CVJXtLd1x3l+DVJrkxyZXf/+AT1dJJ09zH3um+q7e/v/Nc+b3Qpp8QtV9xwqL+35PD+\n", 1028 | "bB52h733ksPbf3pvs+m9zXaY+28/mXPUSvn2SviZexzffvyTB1ALM2dWK4yj/2AMvbc8o0L5rev7\n", 1029 | "8/Y4vr2XfK895ydl10cMW1O+N6eOWa0wjv6DMfTe5ti5nWU/7zPlxYMeines7y+pquode2iq6pFJ\n", 1030 | "Lkzy2STvmfIvPYwfkwAAME53byXZSu7fvnIyhqyUd/dtWY1DPDvJy3cdPpLkjCTXd/e9B10bAAAc\n", 1031 | "tMlWyqvq0iSXrr88a33/tKp6y/rPd3f3FTte8rIkNyf56aq6OKstLU9NclGSDyT5kalqAwCAOZty\n", 1032 | "pfyJSS5L8oKsrtbZWa2EX7a+fd/OJ69Xy5+S5C1ZhfFXrZ//uiQXdPcnJqwtiT3lAABMa6o95ZOF\n", 1033 | "8u4+suNqmztv21fePOcor7mzu1/c3V/f3Q/v7rO7+1Xdvdf88v3WWDtuW6fi72B6e802BU49/Qdj\n", 1034 | "6L3N0d1b2/lyP+8zavoKnLCP/cb7R5cAi6X/YAy9tzxCObNnViuMo/9gDL23PIsK5faUbyazWmEc\n", 1035 | "/Qdj6L3NselzyocwpxwAgClt9JxyAADgfkI5AAAMJpQDAMBgiwrlTvTcTGa1wjj6D8bQe5tjdhcP\n", 1036 | "2gQuHrSZzGqFcfQfjKH3NoeLB7EYZrXCOPoPxtB7yyOUM3tmtcI4+g/G0HvLI5QDAMBgQjkAAAy2\n", 1037 | "qFBu+goAAFOaavrK6VMVtAn2e1YsAADstJ7ot5WsFoBP9n0WtVLOZjKrFcbRfzCG3lseoZzZM6sV\n", 1038 | "xtF/MIbeWx6hnNkzqxXG0X8wht5bHqGc2TOrFcbRfzCG3lseoRwAAAYTygEAYLBFhXJzygEAmJI5\n", 1039 | "5SfBnHIAAKZkTjmLYVYrjKP/YAy9tzxCObNnViuMo/9gDL23PEI5s2dWK4yj/2AMvbc8QjmzZ1Yr\n", 1040 | "jKP/YAy9tzxCOQAADCaUAwDAYEI5AAAMtqhQ7uJBAABMaaqLBy0qlHd37bhtja6HE2NWK4yj/2AM\n", 1041 | "vbc5untrO1/u530WFcrZTGa1wjj6D8bQe8sjlDN7ZrXCOPoPxtB7yyOUM3tmtcI4+g/G0HvLI5QD\n", 1042 | "AMBgQjkAAAwmlAMAwGBCOQAADCaUM3tmtcI4+g/G0HvLI5Qze2a1wjj6D8bQe8sjlDN7ZrXCOPoP\n", 1043 | "xtB7yyOUM3tmtcI4+g/G0HvLs6hQXlW947Y1uh4AADZbVW1t58v9vM/pUxW0Cbq7RtcAAMDh0d1b\n", 1044 | "SbaS1QLwyb7PolbKAQBgjoRyAAAYTChn9sxqhXH0H4yh95ZHKGf2zGqFcfQfjKH3lkcoZ/bMaoVx\n", 1045 | "9B+MofeWRyhn9sxqhXH0H4yh95ZHKAcAgMGEcgAAGEwoBwCAwTYqlFfVy6rq9qq6t6r+TVV95+ia\n", 1046 | "AABgvzYmlFfV85K8LsmPJnlSkpuT/FpVfcPQwjjlzGqFcfQfjKH3lmdjQnmSVyV5c3f/w+7+QHf/\n", 1047 | "UJLfS/LSwXVxipnVCuPoPxhD7y3PRoTyqvrSJH8yyY27Dt2Y5GkHXxEH6YxzHj26BFgs/Qdj6L3l\n", 1048 | "2YhQnuSrk5yW5L/tevxjSc46+HI4SJ+77e7RJcBi6T8YQ+8tz6aEcgAAOLQmC+VV9dyqen1VvbOq\n", 1049 | "PlVV91XV9cd5zWOq6rqququqPr+erHJtVT1q11N/P8kXk3ztrse/Nqt95QAAsLGmXCm/KsnLk5yf\n", 1050 | "5M71Y73Xk6vqcUn+bZIXJXlPkp9McluSy5O8u6q+cvu53f2H6+c+a9fbXJLVFBYAANhYU4byVyY5\n", 1051 | "t7vPzIlNRHljkkcneUV3P6e7X93dFye5Nsl5Sa7Z9fyfTPKiqvqrVfWtVfVTWe0n/3vTfQsAAHDw\n", 1052 | "Jgvl3X1Td394/WUd67nrVfJLktze3W/YdfjqJJ9L8vyqOmPH+/+/WQX/q5K8N6upK9/T3R+Z6FsA\n", 1053 | "AIAhqnvPHSYn/6ZVFyV5e5K3dvdlRzn+kiRvSvJ/d/eDVtWr6tezCu1/prvfPkE903+TAABwFN19\n", 1054 | "zAXqoxk1feW89f0H9zj+ofX9uQdQCwAADHX6oL/3zPX9PXsc33589xSWk3Iyv60AAMBBMaccAAAG\n", 1055 | "GxXKt1fCz9zj+PbjnzyAWgAAYKhRofzW9f15exzf3ku+155zAAA4NEaF8nes7y+pqgfs966qRya5\n", 1056 | "MMlns7qoEAAAHGpDQnl335bkxiRnZ3UV0J2OJDkjyfXdfe9B1wYAAAdtsjnlVXVpkkvXX56V5FlJ\n", 1057 | "bkvy2+vH7u7uK3Y8/5wkNyf5miS/lNWWlqcmuSjJB5I8rbs/MUlxAAAwY1OG8quzuhrn7jfc3p5y\n", 1058 | "R3efs+s1j0nymiTPTvJVSe5K8otJjnT3XuMSAQDgUDklV/QEYHNU1TcmuedYiyFV9eVJHtXd/+Xg\n", 1059 | "KgNYDnPKGaaqPlFVf33H11dX1dNH1gQLdUeSy4/znB9KcvupLwWW40T+3auq76qqv3VQNTGOUM5I\n", 1060 | "Zyb5sh1fX53VOQXAPLk6MkzrRP7de8b6eRxyQjkjfSzJY0YXAZyQr81qVC1wsB6WB5+vxyF0+ugC\n", 1061 | "WLR3J7msqu5L8nvrxy7aNbr+qLr7NaeyMDjsquqFWf1Dv91wT6qqy47y1NOSfFOSFyT5DwdUHnC/\n", 1062 | "Jyf5/dFFcOo50ZNhqurcJG9L8q0P9bXd7VMe2If1L8MPxeeSfF93//qpqAeWoqrekftXvi/K6pyO\n", 1063 | "O47y1NOSfEOSxyb5+e7+/lNfHSMJ5QxVVadldRGpr09yU5KfXd+OqbtvOqWFwSFXVS/a8eV1WV0v\n", 1064 | "4peO8tQvJvmDJDd39ycPoDQ41B7CL8SdVe/9ZpLLu/vuU1cVcyCUMxvr/1Ed6e4jo2uBJamqm5K8\n", 1065 | "ubuP+wsxMB3/7rGTUA4AMMD6E6v3dvfvjq6F8YRyZqmqvjWrveaP6O7rR9cDAHAqCeXMSlU9Ock/\n", 1066 | "yOps8yTp7j5tfeyiJL+a5H/r7l8eUyFsvqq6PSc5Yq27z5m4HFiM/Vwgr7t/a8pamB+hnNmoqm9J\n", 1067 | "8jtZnXH+D5J8S5I/tz1ppaq+JMlHkvxmd79wWKGw4arqjpN8aXf32VPWAktyElOPtv3PBSoOL3PK\n", 1068 | "mZOrkzw8yVO6+/1VtZXkz20f7O77qurdSb59UH1wKHT3Y0fXAAt1stfYsIK6AEI5c3Jxkn/W3e8/\n", 1069 | "xnM+kuTPHFA9ADCZ7t4aXQPz5QIszMlXZBW6j6WyWk0HADg0rJQzJx9L8s3Hec4fz/GDO3ASqurL\n", 1070 | "stoe9vXZ45ff7v5/DrQogIUQypmTf5HkL1fV47v71t0Hq+rbs9ri8sYDrwwOuar6q0n+blafWO2l\n", 1071 | "kwjlMKH1EIPnJnlWkj+WvX8h/u6DrIuDZ/oKs1FVj0/y75J8JquTPp+U5AeTnJ/k6evHvizJ+d39\n", 1072 | "n0fVCYdNVT07q3Gj70/y5iQ/keSXspqG9IyswsI/SfIrrvoJ06mqhyf5tSQXHe+525PIOLz8B2Y2\n", 1073 | "1qvjz0nysCRvyCqQJ8ktSX5m/fhfEMhhcj+c5ONJLuzun1w/9t7u/tvd/eysevE5ST48qkA4pP5G\n", 1074 | "VoH8R5N89fqxI1mtmP+VrLZr/kJW//5xyFkpZ3aq6iuSXJbkO5J8VZJ7krw7yZu7++Mja4PDqKr+\n", 1075 | "IMkvd/cPrL++L8lrdk6KqKp/meTedUgHJlBV/yHJH3b3n1p/fV+Sre5+zfrrc5L8+6z68SfGVcpB\n", 1076 | "sKec2aiqq5Pc1t3XJ/mp9Q049R6R5K4dX38+yZfves6/SfIDB1YRLMPjkvz9HV93dqyKd/dtVfUr\n", 1077 | "SV6Y1bYyDjHbV5iTH0nybaOLgAX6b0kevePrjyY5b9dzvjwWcmBqX8jql+Btn8kDezFJ/ktW4Z1D\n", 1078 | "TihnTu7Kg1fngFPv/XlgCP+tJBdX1dOTpKq+LclfWj8PmM5/zWr/+LYPZrV1c6cnZXXOB4ecUM6c\n", 1079 | "/LMkf6aq/sjoQmBhfjXJhVX19euvX5vkviQ3VdXdSX43ySOzOhkNmM5vJ3najq9/Mcm3VdU/rKrv\n", 1080 | "raqfSHJJkptGFMfBcqIns1FVj0zyjqxWBH64u//D4JJgEarqYVmdVP3x7v7D9WMXJLkqqwt63Z7k\n", 1081 | "dd396+OqhMOnqp6Z5K8neVl3315Vj0jy9qwu4rXtPyX57u6+c0SNHByhnNmoqtuTfGmSr8vqZJfP\n", 1082 | "Z3WVzwf9kHb3OQdbHQCceutfkv987v+F+P/r7s+NrYqDIJQzG1V1R1YBvI7z1O7us099RQAAB0Mo\n", 1083 | "B+ABquryJJf7RAoOVlV9U5Jv6u7fGl0LB8+JngDs9hVJHju6CFigH8jq3CoWSChntqrqm7ZHsgHA\n", 1084 | "QhxvCyeHlFDOnFkxAAAWQShn7qwYALAkTvZbKJdMBmC3t2U1ig04eBajFsr0FWarqraS/M3uPm10\n", 1085 | "LQAAp5LtK8ydFQMADqWquvp4Aw2q6ruq6m8dVE2MY6UcgFTVNyT5a0memOQxSR52tOeZXQ7Tqar7\n", 1086 | "kmx192uO8ZyrkhzxqfHhZ6Wc2bBiAGNU1UVJPpjklUm+K8kjsvr3YffNJ1dw8B4WJ38ughM9mZOr\n", 1087 | "s/ofz7GuZPaM9fP2XFUAHrLXZhW6L0vyj7r7vsH1APd7cpLfH10Ep55QzqaxYgDTe0KSX+jut44u\n", 1088 | "BA67qnpHHvjv2IvWn1btdlqSb8jq6ro/f+orYzShnE1jxQCm98kkfzC6CFiIZ+z6+rHr226dVV/+\n", 1089 | "QlZbyzjkhHKGsmIAs/AreXBQAE6B7v6f5/OtT/Q80t1HBpbETJi+wlDr/yGdiO0Vg99Mcnl3333q\n", 1090 | "qoJlqapHJ/lXSf55kiu6+7ODS4JFqKoXJXlvd//u6FoYTyhnNqwYwDhV9fgk78nqU6kPJrnnaM/r\n", 1091 | "7u8+yLoAlsL2FebkxUneO7oIWJqqekKSm5J8+fqhJ4+rBpalqk5L8vIk35/kW5M8YnsmeVU9OckP\n", 1092 | "Jnldd39wXJUcBCvlAAtXVTcmuTircaM/m+T3uvt/jK0KDr+q+tKsto1dlNUWzT9M8nXb+86r6iuS\n", 1093 | "/F6Sv9vdrtFxyLl4ELNSVadV1Q9V1b+qqk9V1Rd3HHtyVb2xqr5lZI1wCF2Q5Be7+0e7+yMCORyY\n", 1094 | "K7IK5EeSnJXk7+882N2fSPLOJM868Mo4cEI5s7FeMfiNJK9Lck6ST+eBVxC8I6stLs8/8OLgcPtC\n", 1095 | "kttHFwEL9P1Jbu7uI939xT2ec3uSbzzAmhhEKGdOrBjAGO9I8qdHFwELdHaSdx/nOR9P8lUHUAuD\n", 1096 | "CeXMiRUDGONvJPnjVXVlVdVxnw1M5b8nedRxnvMNWV3gi0PO9BXm5OysLmJyLFYMYHpXJXlfkmuS\n", 1097 | "vKSq/n32Hon44oMsDA659yZ5VlU9vLv/++6DVXVmkj+b46+mcwgI5cyJFQMY44U7/nz2+rYXoRym\n", 1098 | "86YkP5fk56rqAb21nrxyXZKvTPL3BtTGARPKmRMrBjDGOaMLgCXq7p+vqkuSvCjJ/5r1olNV/Zsk\n", 1099 | "T0jypUne2N3H+xSZQ0AoZ06sGMAA3X3H6Bpgqbr7xVX1W0kuT/LE9cN/Msn7k/xkd795WHEcKBcP\n", 1100 | "Ylaq6rqsVgy+kNWKwaOT/Ls8cMXgfx9WIACcIlV1RpKvSHJPd39mdD0cLKGc2amqF+WBKwaJFQM4\n", 1101 | "5arqO5K8JMmTsjq/456sfim+rrtvHlkbwGEnlDNbVgzg4FTVNUmu3ONwZ3WZ772OAyepqi7K6jod\n", 1102 | "357Vv3m7x1VXku7u0w64NA6YUA6wcFX1F5PckOQ/J/k/k7w9yUeTfF2SZyb5m1ldH+CvdPcNo+qE\n", 1103 | "w6aqvjfJL2UVxD+yvv2Pozy1u/uZB1kbB08oZ1asGMDBW59k9i1Jvq277z7K8a/OagvZB7r76Qdd\n", 1104 | "HxxWVfWvszpn6s93942j62Es01eYjaOsGHwwe6wYHGRdsABPTHL90QJ5knT371fVP07ygoMtCw69\n", 1105 | "JyS5QSAnEcqZl62spq5YMYCDdXqSzx7nOZ9L8rADqAWW5LNJ/mB0EczD7q0BMJIVAxjjtiT/S1Ud\n", 1106 | "9d+E9eN/LsmHD7QqOPx+M8l3jC6CeRDKmRMrBjDGzyX51iS/XFXfsvNAVX1zkn+a5E8k+UcDaoPD\n", 1107 | "7P9I8riq+ptVVaOLYSwnejIbVfULSb6xu582uhZYkqp6eJJfT/L0JPcluSvJ7yU5K8ljsjrB+reT\n", 1108 | "XNLd/31UnbDpqurNefB5UY9NclGSO5L8+6wunPcg3f3ioz3O4SGUMxtV9dgk/yrJzyT50fbDCQem\n", 1109 | "qr40yQ8neXGSx+049OEk1yX5ie7+woja4LCoqvtO9rXdbXfDISeUM4wVA5inqnpkkjOzunDXp0fX\n", 1110 | "A4fFevHppHT3HZMVwiwJ5QxjxQAAYMVIREY6Z3QBwIoLdwGMJZQzjI/iYB5cuAtgPNtXABbOpb4B\n", 1111 | "xhPKARauqu7N6sJdLxpdC8BSOVkOABfuAhhMKAfApb4BBhPKAXCpb4DB7CkHWBgX7gKYH6EcYGFc\n", 1112 | "uAtgfswpB1geF+4CmBkr5QAAMJiPIQEAYDChHAAABhPKAQBgMKEcAAAGE8oBAGAwoRwAAAYTygEA\n", 1113 | "YDChHAAABhPKAQBgMKEcAAAGE8oBAGAwoRwAAAb7/wHpqjSRyoLpQgAAAABJRU5ErkJggg==\n" 1114 | ], 1115 | "text/plain": [ 1116 | "" 1117 | ] 1118 | }, 1119 | "metadata": { 1120 | "image/png": { 1121 | "height": 285, 1122 | "width": 370 1123 | } 1124 | }, 1125 | "output_type": "display_data" 1126 | } 1127 | ], 1128 | "source": [ 1129 | "master_df.zygosity.value_counts().plot(kind='bar', log=True, grid=True, color='seagreen')" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "code", 1134 | "execution_count": 14, 1135 | "metadata": { 1136 | "collapsed": false 1137 | }, 1138 | "outputs": [ 1139 | { 1140 | "data": { 1141 | "text/plain": [ 1142 | "" 1143 | ] 1144 | }, 1145 | "execution_count": 14, 1146 | "metadata": {}, 1147 | "output_type": "execute_result" 1148 | }, 1149 | { 1150 | "data": { 1151 | "image/png": [ 1152 | "iVBORw0KGgoAAAANSUhEUgAAAuUAAAIRCAYAAADtOcPxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\n", 1153 | "AAAWJQAAFiUBSVIk8AAAIABJREFUeJzt3X/UrWdZH/jvJbHSIAZUhDpYTSiNOk7wBzYICkEmlGHa\n", 1154 | "mYylpa2YKoMzAxSh2LQNsIY3dqK1VMMPoS0zxdjQ0XSmC+ua1jF2IEoJjNW6zAIXPyShJUaEDhB+\n", 1155 | "hUrlmj/2fuXl9bznnOz3Oe9977M/n7X22mfve+99rsP+5nCd572f66nuDgAAMM4XjC4AAAB2naYc\n", 1156 | "AAAG05QDAMBgmnIAABhMUw4AAINpygEAYDBNOQAADKYpBwCAwTTlAAAwmKYcAAAG05QDAMBgmnIA\n", 1157 | "ABhMUw4AAINtTVNeVX+sqn6qqj5YVfdW1Tuq6vGj6wIAgOO6YHQBZ6OqHpTkLUl+OclTk3woySVJ\n", 1158 | "PjiyLgAAWEJ19+gazqiqfjjJd3T3d4yuBQAAlrYt21euSvIrVXVzVf1uVf16VT13dFEAALCEbWnK\n", 1159 | "L0nynCS/leTJSV6R5O9ozAEAOB9sy/aV30vyK9397Qeeuz7Jf9fdXz+uMgAAOL5FjpRX1dOq6lVV\n", 1160 | "9eaq+lhVfbaqbjrDex5eVa+rqrur6tNVdWdV3bA+qfOwu5P85qHn3pnkjy9RPwAAjLTU9JWXJLks\n", 1161 | "yceT3JXka5MceQi+qh6R5LYkD0nys1k12JcneX6Sp1TV47r7wwfe8pb1Zx70J5O8b6H6AQBgmKX2\n", 1162 | "lL8gySO7+6Ikzz6L178mq4b8ed39Xd39ou5+UpIbklya5PpDr78hyWOq6kVV9Seq6s8neV6SVy9U\n", 1163 | "PwAADLP4nvKquiLJG5O8vruvPsX6I5K8J8md3f2IQ2tfnOQDWR1lf2h3f+rA2lOT/HBWTfu/S/IT\n", 1164 | "3f0TixYPAAADjLh40BPX97ccXujuT1TVW5JcmeQxWTX3+2v/Msm/PJEKAQDgBI1oyi9d37/7iPX3\n", 1165 | "ZNWUPzIHmvLjqKr5R8wAAHBe6O66r+8ZMaf8ovX9PUes7z9/qiksAABw3hlxpHyYTf7VwqlVVfvf\n", 1166 | "kxnJJrOSTWYmn8s4zu6MEUfK94+EX3TE+v7zHz2BWgAAYLgRTfk71/eXHrH+yPX9UXvOAQDgvDKi\n", 1167 | "KX/T+v7Kqvq8H5NU1QOTPC7JJ5O8benfuKr6wG1v6c8HAGC3VNXefn95nM858aa8u+/IahzixUme\n", 1168 | "e2j5uiQXJrmpu+89B793HbjtLf35AADslu7e2+8vj/M5i1w8qKquSnLV+uHDkjw5yR1J/vX6uQ91\n", 1169 | "9zUHXn9JktuSfEWSf57VlpbLk1yR5F1JHtvdHzl2YZ/7/TpxoueSnBDCrGSTWckmM5PPZRyn51yq\n", 1170 | "KX9pkpdmdSXOz1ta37+vuy859J6HJ/mhJE9J8mVJ7k7yhiTXdfdR4xI3rU9TvjD/8TIr2WRWssnM\n", 1171 | "5HMZx+k5F9m+0t3XdfcXdPf9Dt2+YH275BTvuau7n9ndX9ndX9TdF3f3C5duyA+yp3xR140uAI4g\n", 1172 | "m8xKNpmZfG5oqT3lixwpn50j5QAAnGvDj5QDAACb05QDAMBgO9WU21MOAMCStnZO+UjmlC/HP2qY\n", 1173 | "lWwyK9lkZvK5uanmlM/OiZ7LMzqJWckms5JNZiafy3CiJyMYncSsZJNZySYzk8/BHCkHAIAFOFJ+\n", 1174 | "lpzoCQDAklw86D5wpBwAgHPNkXIAANhimnIAABhMU85G7MlnVrLJrGSTmcnnePaUsxHzTJmVbDIr\n", 1175 | "2WRm8rmM4/ScFyxfzrwOnRV73WxX9TzuWbsnbRvq9RfMTjJrl1nJJjOTzw2tf8rw0mN/jiPl89iG\n", 1176 | "JnfbzP6dAwDnD0fKzzOXvezpo0vYerdfc/PoEgAAzpoTPQEAYDBNOQAADKYpBwCAwTTlbOQDt7x9\n", 1177 | "dAlwSmbtMivZZGbyOd5ONeVV1Qdue6Pr2WYf/MV3jC4BjnLssVRwjsgmM5PPDVXV3n5/eZzP2anp\n", 1178 | "K8bjLecrrvzPR5cARzFrl1nJJjOTzw2tr3uzlxxvvLU55RPZr9NIxOPbH4k4+3cOAJw/jtNz7tT2\n", 1179 | "FQAAmJGmHAAABtOUAwDAYJpyAAAYTFPORswpZ1bGnTIr2WRm8jmeppyNmFPOxMzaZVayyczkc7Cd\n", 1180 | "aspdPGg55pQzMbN2mZVsMjP53NBSFw8yp3wi5pQvx5xyAOCkmVMOAABbTFMOAACDacoBAGAwTTkA\n", 1181 | "AAymKWcj5pTvlkOTi9wWuo3+XjlZpn4xM/kcT1PORswpB7jPzIFmZvI52AWjC2A7mVO+m7ZhXOcH\n", 1182 | "bnl7Hvbkbxhdxmntj+xk55gDzczkczBHytnI7E0Pu0s2mVV3742uAY4in+NpygEAYDBNOQAADKYp\n", 1183 | "BwCAwTTlAAAw2E415YdmBO+NrmebmVPOrGSTWfn/HWYmn5urqr0lrj+xU015d9eB297oeraZOeXM\n", 1184 | "SjaZmDnQzEw+N9Tde/v95XE+Z6eacpZjTjmzkk0mZg40M5PPwTTlbMQsaGYlm8zKT2iZmXyOpykH\n", 1185 | "AIDBNOUAADCYphwAAAbTlAMAwGCacjZiFjSzkk1mZQ40M5PP8TTlbMQsaGYlm0zMHGhmJp+DbU1T\n", 1186 | "vr5a0mcP3e4eXdeuMguaWckmEzMHmpnJ52AXjC7gPnpnkisOPP79QXXsPLOgmZVsMitzoJmZfI63\n", 1187 | "bU3573f3B0cXAQAAS9qa7Strl1TVb1fVHVX101V18eiCAADguLapKX9bkr+S5E8n+f4kD0tyW1V9\n", 1188 | "6dCqAADgmLZm+0p3/98HHr69qt6a5M6sGvUbxlQFAADHt9iR8qp6WlW9qqreXFUfW09HuekM73l4\n", 1189 | "Vb2uqu6uqk9X1Z1VdUNVPehMv193fyrJO5L8iaX+DJw9s6CZlWwyK3OgmZl8jrfk9pWXJHluksuS\n", 1190 | "3LV+ro96cVU9IsmvJfnerLam/HiSO5I8P8lbz7Qtparun+TrkvzOcQvnvjMLmlnJJhMzB5qZyedg\n", 1191 | "SzblL0jyyO6+KMmzz+L1r0nykCTP6+7v6u4XdfeTstqKcmmS6w++uKr+XlU9vqourqrLk/yfSf5o\n", 1192 | "kp9a8M/AWTILmlnJJhMzB5qZyedgizXl3X1rd793/bBO99r1UfIrk9zZ3a8+tPzSJJ9K8oyquvDA\n", 1193 | "8/9Zkp/Oalb5P0tyb5LHdPf7l6if+8YsaGYlm8zKHGhmJp/jjTrR84nr+1sOL3T3J6rqLVk17Y9J\n", 1194 | "8sb183/p5MoDAICTM2ok4qXr+3cfsf6e9f0jT6AWAAAYalRTftH6/p4j1vefP+MUlvuiqvo0t1vX\n", 1195 | "93tHvHfvXK+frvYP3PL23H7NzUdOlrB++vWT+P7O5/V9s36/274++vu1bt26detnv37guT90O9Vn\n", 1196 | "nK3qPtb7T/2hVVdkte3k9d199SnWX5vkWUme1d2vO8X69UmuTXJtd//oAvV0knT3afe6j7Zf52Uv\n", 1197 | "e/roUrbe7dfcnGT+73xbyOay5BPg/HScnnPUkfL9I+EXHbG+//xHT6AWNmAWNLOSTWZ11JE4mIF8\n", 1198 | "jjeqKX/n+v7SI9b395Ifted8I4d+xLC35GfvGrOgmZVsMjFzoJmZfG7o4HaW43zOqOkrb1rfX1lV\n", 1199 | "1Qf20FTVA5M8Lskns7qo0GL8qHg5ZkEzK9lkYuZAMzP53NB6nORe8rntK5sYcqS8u+/IahzixVld\n", 1200 | "BfSg65JcmOSm7r73pGvj7JgFzaxkk1mZA83M5HO8xY6UV9VVSa5aP3zY+v6xVXXj+tcf6u5rDrzl\n", 1201 | "OUluS/LKqnpSVltaLk9yRZJ3JXnxUrUBAMDMljxS/qgkVyf5nqwu/NNZHQm/en37cwdfvD5a/ugk\n", 1202 | "N2bVjL9w/fqXZ3Wlzo8sWFsSe8oBAFjWdHvKu/u63Mf9SN19V5JnLlXDWfx+9pQDALCYrd5TDgAA\n", 1203 | "fI6mnI2YBc2sZJNZ2TbJzORzvJ1qyu0pX45Z0MxKNpmYOdDMTD43NN2e8m1gT/lyzIJmVrLJxMyB\n", 1204 | "ZmbyuSF7yhnKLGhmJZvMyhxoZiaf42nKAQBgME05AAAMtlNNuRM9AQBYkhM9N+BETwAAluRET4Yy\n", 1205 | "C5pZySaz8hNaZiaf42nK2YhZ0MxKNpmYOdDMTD4H05SzEbOgmZVsMjFzoJmZfA6mKWcjZkEzK9lk\n", 1206 | "VuZAMzP5HG+nmnLTVwAAWJLpKxswfQUAgCWZvgIAAOcJTTkAAAymKWcjZkEzK9lkVs5lYmbyOZ6m\n", 1207 | "nI2YBc2sZJOJmQPNzORzME05GzELmlnJJhMzB5qZyedgmnI2YhY0s5JNZmUONDOTz/F2qik3pxwA\n", 1208 | "gCWZU74Bc8oBAFiSOeUAAHCe0JQDAMBgmnI2YhY0s5JNZuVcJmYmn+NpytmIWdDMSjaZmDnQzEw+\n", 1209 | "B9OUsxGzoJmVbDIxc6CZmXwOpilnI2ZBMyvZZFbmQDMz+RxPUw4AAIPtVFPu4kEAACzJxYM24OJB\n", 1210 | "AAAsycWDAADgPKEpZyNmQTMr2WRWtk0yM/kcT1PORsyCZlayycTMgWZm8jmYppyNmAXNrGSTiZkD\n", 1211 | "zczkczBNORsxC5pZySazMgeamcnneJpyAAAYTFMOAACDacoBAGAwTTkAAAymKWcjZkEzK9lkVuZA\n", 1212 | "MzP5HE9TzkbMgmZWssnEzIFmZvI52AWjCzhJVdUHHl5n/M/mzIJmVrK5ew793T61bam1u2t0DZw4\n", 1213 | "c8o3tP4pw7H/UbNTTbm/ZJZjFjSzkk2A+86Bys2t/7fbS473D++dasoBOH9d9rKnjy5h691+zc2j\n", 1214 | "S4CdZU85AAAMpikHAIDBNOUAADCYppyNmAXNrGSTWckmMzOnfDxNORsxC5pZySazkk0mZ075YJpy\n", 1215 | "NmIWNLOSTWYlm0zOnPLBNOVsxCxoZiWbzEo2mZk55eNtXVNeVddW1Wer6lWjawEAgCVsVVNeVY9J\n", 1216 | "8v1Jbk+yFZcqBgCAM9mapryqLkry+iTfl+Qjg8sBAIDFbE1TnuS1Sf6P7v6lJDW6GAAAWMpWNOVV\n", 1217 | "9f1JLknykvVTtq4MZt4us5JNZiWbzMyc8vGmb8qr6tIk1yf57u7+/f2n42j5UObtMivZZFayyeTM\n", 1218 | "KR9skaa8qp5WVa+qqjdX1cfW01FuOsN7Hl5Vr6uqu6vq01V1Z1XdUFUPOvTSb0vy5UneUVWfqarP\n", 1219 | "JHl8kudU1e9V1Rcu8WfgvjFvl1nJJrOSTSZnTvlgFyz0OS9JclmSjye5K8nX5jRbTKrqEUluS/KQ\n", 1220 | "JD+b5J1JLk/y/CRPqarHdfeH1y9/Q5JfOfj2JD+Z5N1Jfri7P7PQn4H7wLxdZiWbzEo2mZk55eMt\n", 1221 | "1ZS/IMn7u/u9VfWEJG86w+tfk1VD/rzufvX+k1X1Y0n+WlbbVZ6dJN19T5J7Dr65qj6V5CPd/ZsL\n", 1222 | "1Q8AAMMssn2lu2/t7veuH552r/f6KPmVSe482JCvvTTJp5I8o6ouPN1vGSd7AgBwnhhxoucT1/e3\n", 1223 | "HF7o7k8keUuSByR5zFEf0N1P7O4fODflAQDAyRrRlF+6vn/3EevvWd8/8gRqAQCA4UY05Ret7+85\n", 1224 | "Yn3/+cNTWI6tqvo0t1vX93tHvHfvXK+frvYP3PL23H7NzUfOuT3p9cOvm62+w07i+zuf1/fN+v0e\n", 1225 | "XD/Va2aq71RGf7/bvr5v1u93f/23/v4bp65PPnd+/dbJ65tm/cBzf+h2qs84W9W97NbsqroiyRuT\n", 1226 | "vL67rz7F+muTPCvJs7r7dadYvz7JtUmu7e4fXaimTpLunnq2+X6dl73s6aNLOaPbr7l56jpvv+bm\n", 1227 | "JPN/59tCNpcln8valnzKJjOrqva9H99xes4RR8r3j4RfdMT6/vMfPYFa2JB5u8xKNpmVbDI5c8oH\n", 1228 | "G9GUv3N9f+kR6/t7yY/ac84EzNtlVrLJrGSTmZlTPt6Ipnx/hvmVVfV5h/ar6oFJHpfkk0netvRv\n", 1229 | "fGjfz97Snw8AwG45uMf8OJ9z4k15d9+R1TjEi5M899DydUkuTHJTd997Dn7vOnDbW/rzAQDYLd29\n", 1230 | "t99fHudzFrmiZ1VdleSq9cOHre8fW1U3rn/9oe6+5sBbnpPktiSvrKonZbWl5fIkVyR5V5IXL1EX\n", 1231 | "AABsg0Wa8iSPSnJ1PneVzc7qSPgl68fvS/IHTXl331FVj07yQ0mekuSpSe5O8vIk13X3UeMSAQDg\n", 1232 | "vLPI9pXuvq67v6C773fo9gXr2yWneM9d3f3M7v7K7v6i7r64u194Lhtye8qXc9Q8WxhNNpmVbO6e\n", 1233 | "o2ZZux3vNvp7PWxr95SPZE/5cj74i+8YXQKckmwyK9mE89NUe8rZPebtMivZZFayubtmv2hUsvpJ\n", 1234 | "zuxjO/cvbnW+2qkj5Sxn9v9w2V2yyaxkk5nJ53g71ZTbUw4AwJKW2lO+U9tXjrvXBwAADlqfp7iX\n", 1235 | "rA4Ab/o5O3WkHAAAZqQpBwCAwTTlbMS8XWYlm8xKNpmZfI63U025Ez2XY94us5JNZiWbzEw+N+dE\n", 1236 | "zw040XM55u0yK9lkVrLJzORzc070ZCjzTJmVbDIr2WRm8jmephwAAAbTlAMAwGCacgAAGGynmnLT\n", 1237 | "VwAAWNJS01d2qinv7jpw2xtdzzYzz5RZySazkk1mJp+b6+69/f7yOJ+zU005yzHPlFnJJrOSTWYm\n", 1238 | "n+NpytmIeabMSjaZlWwyM/kcT1PORswzZVayyaxkk5nJ53iacgAAGExTDgAAg+1UU24kIgAAS1pq\n", 1239 | "JOIFSxW0DY47qgYAAA5aj9neS1YHgDf9nJ06Us5yzDNlVrLJrGSTmcnneJpyNmKeKbOSTWYlm8xM\n", 1240 | "PsfTlLMR80yZlWwyK9lkZvI5nqacjZhnyqxkk1nJJjOTz/E05QAAMJimHAAABtOUAwDAYDvVlLt4\n", 1241 | "EAAAS1rq4kE71ZR3dx247Y2uZ5uZZ8qsZJNZySYzk8/Ndffefn95nM/Zqaac5Zhnyqxkk1nJJjOT\n", 1242 | "z/E05WzEPFNmJZvMSjaZmXyOpylnI+aZMivZZFayyczkczxNOQAADKYpBwCAwTTlAAAwmKYcAAAG\n", 1243 | "05SzEfNMmZVsMivZZGbyOZ6mnI2YZ8qsZJNZySYzk8/xNOVsxDxTZiWbzEo2mZl8jqcpZyPmmTIr\n", 1244 | "2WRWssnM5HO8nWrKq6oP3PZG1wMAwHarqr39/vI4n3PBUgVtg+6u0TUAAHD+6O69JHvJ6gDwpp+z\n", 1245 | "U0fKAQBgRppyAAAYTFPORswzZVayyaxkk5nJ53iacjZinimzkk1mJZvMTD7H05SzEfNMmZVsMivZ\n", 1246 | "ZGbyOZ6mnI2YZ8qsZJNZySYzk8/xNOUAADCYphwAAAbTlAMAwGBb05RX1XOr6jeq6p717baqeuro\n", 1247 | "ugAA4Li2pilP8v4kfyPJNyX5liRvTPKzVfWooVXtKPNMmZVsMivZZGbyOd7WNOXd/XPd/QvdfUd3\n", 1248 | "/1Z3vyTJx5P8qdG17SLzTJmVbDIr2WRm8jneBaML2ERV3S/Jn09y/yS/PLicnWSeKbOSTWYlm8xM\n", 1249 | "Psfbqqa8qv6LJG9N8kVJ7k3yF7r7XWOr2k3mmTIr2WRWssnM5HO8rdm+svbOJJdltWXlJ5L8TFU9\n", 1250 | "emxJAABwPIs15VX1tKp6VVW9uao+VlWfraqbzvCeh1fV66rq7qr6dFXdWVU3VNWDTvX67v7Mek/5\n", 1251 | "r3f3i5K8Lclzl/ozAADACEtuX3lJVkexP57kriRfm6SPenFVPSLJbUkekuRnszoKfnmS5yd5SlU9\n", 1252 | "rrs/fIbf837ZvqP9AADweZZsyl+Q5P3d/d6qekKSN53h9a/JqiF/Xne/ev/JqvqxJH8tyfVJnn3g\n", 1253 | "+b+T5P/KquF/YJK/nOQJSZ6y4J8BAABO3GJHmbv71u5+7/phne6166PkVya582BDvvbSJJ9K8oyq\n", 1254 | "uvDA8w9N8vqsjqj/q6xmlT+lu39xifq5b8wzZVayyaxkk5nJ53ijtn48cX1/y+GF7v5EkrckeUCS\n", 1255 | "xxx4/vu6+2u6+/7d/dDufrKGfBzzTJmVbDIr2WRm8jneqKb80vX9u49Yf8/6/pEnUAsbMM+UWckm\n", 1256 | "s5JNZiaf441qyi9a399zxPr+86ecwrKpqurT3G5d3+8d8d69c71+uto/cMvbc/s1Nx/546WTXj88\n", 1257 | "z3S2+g47ie/vfF7fN+v3e3D9VLN2Z6rvVEZ/v9u+vm/W73d//XR1z1CffJ67fM78/crnfV8/8Nwf\n", 1258 | "up2yyLNU3cd6/6k/tOqKJG9M8vruvvoU669N8qwkz+ru151i/fok1ya5trt/dIF6Okm6+7R73Ufb\n", 1259 | "r/Oylz19dClbb/8vmNm/820hm8uSz2XJ53Jkc1myuaxtyOdxes5RR8r3j4RfdMT6/vMfPYFaAABg\n", 1260 | "qFFN+TvX95cesb6/l/yoPecbOfQjhr0lPxsAgN1zcDvLcT5nyTnl98X+DPMrq6r6wB6aqnpgkscl\n", 1261 | "+WRWV+xczMw/7gAAYPt0916SveRz21c2MeRIeXffkdU4xIuTPPfQ8nVJLkxyU3ffe9K1cXbMM2VW\n", 1262 | "ssmsZJOZyed4izXlVXVVVd1YVTcm+Vvrpx+7/1xVvezQW56T5INJXllVb6iqH6mqN2Z1ZdB3JXnx\n", 1263 | "UrWxPPNMmZVsMivZZGbyOd6SR8ofleTqJN+T1dU6O6sj4Vevb3/u4IvXR8sfneTGJJcneeH69S9P\n", 1264 | "8pju/siCtSWxp3xJ5pkyK9lkVrLJzORzc9PtKe/u67LaenJf3nNXkmcuVcNZ/H72lC/kVLOgYQay\n", 1265 | "yaxkk5nJ5+a2ek85AADwOZpyAAAYbKeacnvKAQBY0nR7yreBPeUAACzJnnKGMs+UWckms5JNZiaf\n", 1266 | "42nK2Yh5psxKNpmVbDIz+RxPU85GzDNlVrLJrGSTmcnneDvVlDvRcznmmTIr2WRWssnM5HNzTvTc\n", 1267 | "gBM9AQBYkhM9AQDgPKEpBwCAwTTlAAAwmKacjZhnyqxkk1nJJjOTz/F2qik3fWU55pkyK9lkVrLJ\n", 1268 | "zORzc6avbMD0leWYZ8qsZJNZySYzk8/Nmb7CUOaZMivZZFayyczkczxNOQAADKYpBwCAwTTlAAAw\n", 1269 | "mKYcAAAG26mm3EjE5Zhnyqxkk1nJJjOTz80tNRJxp5ry7q4Dt73R9Wwz80yZlWwyK9lkZvK5ue7e\n", 1270 | "2+8vj/M5O9WUsxzzTJmVbDIr2WRm8jmeppyNmGfKrGSTWckmM5PP8TTlAAAwmKYcAAAG05QDAMBg\n", 1271 | "mnIAABhMU85GzDNlVrLJrGSTmcnneJpyNmKeKbOSTWYlm8xMPsfbqabcFT2XY54ps5JNZiWbzEw+\n", 1272 | "N7fUFT0vWKqgbXDcKy3xOeaZMivZZFayyczkc3Prq8TvJasDwJt+zk4dKQcAgBlpygEAYDBNOQAA\n", 1273 | "DKYpBwCAwTTlbMQ8U2Ylm8xKNpmZfI6nKWcj5pkyK9lkVrLJzORzPE05GzHPlFnJJrOSTWYmn+Np\n", 1274 | "ytmIeabMSjaZlWwyM/kcT1MOAACDacoBAGAwTTkAAAy2U015VfWB297oegAA2G5VtbffXx7nc3aq\n", 1275 | "Ke/uOnDbG13PNjPPlFnJJrOSTWYmn5vr7r39/vI4n7NTTTnLMc+UWckms5JNZiaf42nK2Yh5psxK\n", 1276 | "NpmVbDIz+RxPU85GzDNlVrLJrGSTmcnneJpyAAAYTFMOAACDacoBAGAwTTkAAAymKWcj5pkyK9lk\n", 1277 | "VrLJzORzvK1pyqvq2qr6N1V1T1V9sKp+rqrM7xnEPFNmJZvMSjaZmXyOtzVNeZInJPmJJN+W5DuT\n", 1278 | "/Kck/6qqHjy0qh1lnimzkk1mJZvMTD7Hu2B0AWeru59y8HFVfU+Se5I8Nsm/GFLUDjPPlFnJJrOS\n", 1279 | "TWYmn+Nt05Hyw74kq/o/MroQAAA4jm1uyl+R5NeTvHV0IQAAcBxbs33loKr68ay2rXx7d/foegAA\n", 1280 | "4DgWO1JeVU+rqldV1Zur6mNV9dmquukM73l4Vb2uqu6uqk9X1Z1VdUNVPeg077khydOTfGd3v2+p\n", 1281 | "+gEAYJQlt6+8JMlzk1yW5K71c0cexa6qRyT5tSTfm+RtSX48yR1Jnp/krVX1pad4zyvyuYb83QvW\n", 1282 | "zn1knimzkk1mJZvMTD7HW7Ipf0GSR3b3RUmefRavf02ShyR5Xnd/V3e/qLuflOSGJJcmuf7gi6vq\n", 1283 | "1Vk18N+d5J6qetj69oAF/wycJfNMmZVsMivZZGbyOd5iTXl339rd710/rNO9dn2U/Mokd3b3qw8t\n", 1284 | "vzTJp5I8o6ouPPD8s5N8cZL/J8ndB24/uED53EfmmTIr2WRWssnM5HO8USd6PnF9f8vhhe7+RFW9\n", 1285 | "Jaum/TFJ3rh+fpsnxZx3zDNlVrLJrGSTmcnneKMa3UvX90ftC3/P+v6RJ1ALAAAMNaopv2h9f88R\n", 1286 | "6/vPHzmFZRNV1ae53bq+3zvivXvnev10tX/glrfn9mtuPvJEDOunXz+J7+98Xt836/e77eujv99t\n", 1287 | "X9836/e77eujv99tX08y9fe77esjvt8Dz/2h2ymLPEt1LsZ8V9UVWW07eX13X32K9dcmeVaSZ3X3\n", 1288 | "606xfn2Sa5Nc290/ukA9nSTdfdq97qPt13nZy54+upStd/s1NyeZ/zvfFrK5LPlclnwuRzaXJZvL\n", 1289 | "2oZ8HqfnHHWkfP9I+EVHrO8//9ETqAUAAIYa1ZS/c31/6RHr+3vJzSKflHmmzEo2mZVsMjP5HG9U\n", 1290 | "U/6m9f2VVfV5h/er6oFJHpfkk1ldVGgxh/b97C352bvGPFNmJZvMSjaZmXxu7uAe8+N8zpCmvLvv\n", 1291 | "yGoc4sVZXQX0oOuSXJjkpu6+d+Hftw7c9pb87F1jnimzkk1mJZvMTD431917+/3lcT5nsTnlVXVV\n", 1292 | "kqvWDx+2vn9sVd24/vWHuvuaA295TpLbkryyqp6U1ZaWy5NckeRdSV68VG0szzxTZiWbzEo2mZl8\n", 1293 | "jrfkkfJHJbk6yfdkdeGfzupI+NXr2587+OL10fJHJ7kxq2b8hevXvzzJY7r7IwvWlsT2FQAAlrXU\n", 1294 | "9pXFjpR393VZbT25L++5K8kzl6rhLH6/aUfoAACwfdZboveSz41E3IRL1wMAwGCacgAAGExTzkbM\n", 1295 | "M2VWssmsZJOZyed4O9WUO9FzOeaZMivZZFayyczkc3PTnei5DZzouRzzTJmVbDIr2WRm8rk5J3oy\n", 1296 | "lHmmzEo2mZVsMjP5HE9TDgAAg2nKAQBgsJ1qyp3oCQDAkpzouQEnegIAsCQnejKUeabMSjaZlWwy\n", 1297 | "M/kcT1OIUC0NAAAUlklEQVTORswzZVayyaxkk5nJ53iacjZinimzkk1mJZvMTD7H26mm3ImeyzHP\n", 1298 | "lFnJJrOSTWYmn5tzoucGnOgJAMCSnOgJAADnCU05AAAMpikHAIDBNOVsxDxTZiWbzEo2mZl8jqcp\n", 1299 | "ZyPmmTIr2WRWssnM5HM8TTkbMc+UWckms5JNZiaf4+1UU25O+XLMM2VWssmsZJOZyefmzCnfgDnl\n", 1300 | "AAAsyZxyAAA4T2jKAQBgME05AAAMpilnI+aZMivZZFayyczkczxNORsxz5RZySazkk1mJp/jacrZ\n", 1301 | "iHmmzEo2mZVsMjP5HE9TzkbMM2VWssmsZJOZyed4O9WUu3gQAABLcvGgDbh4EAAAS3LxIAAAOE9o\n", 1302 | "ygEAYDBNORsxz5RZySazkk1mJp/jacrZiHmmzEo2mZVsMjP5HE9TzkbMM2VWssmsZJOZyed4mnI2\n", 1303 | "Yp4ps5JNZiWbzEw+x9OUAwDAYJpyAAAYTFMOAACDacoBAGAwTTkbMc+UWckms5JNZiaf4+1UU15V\n", 1304 | "feC2N7qebWaeKbOSTWYlm8xMPjdXVXv7/eVxPueCpQraBt1do2s4X5hnyqxkk1nJJjOTz811916S\n", 1305 | "vWR1AHjTz9mpI+UsxzxTZiWbzEo2mZl8jqcpBwCAwTTlAAAwmKYcAAAG05QDAMBgmnI2Yp4ps5JN\n", 1306 | "ZiWbzEw+x9OUsxHzTJmVbDIr2WRm8jmeppyNmGfKrGSTWckmM5PP8bamKa+qx1fVz1XVXVX12ar6\n", 1307 | "K6Nr2mXmmTIr2WRWssnM5HO8rWnKkzwgye1Jnp/k3iTHupQpAADM4oLRBZyt7v75JD+fJFV149hq\n", 1308 | "AABgOdt0pBwAAM5LmnIAABhMU85GzDNlVrLJrGSTmcnneIs15VX1tKp6VVW9uao+tp6QctMZ3vPw\n", 1309 | "qnpdVd1dVZ+uqjur6oaqetBSdXFumGfKrGSTWckmM5PP8ZY80fMlSS5L8vEkdyX52pxmQkpVPSLJ\n", 1310 | "bUkekuRnk7wzyeVZTVd5SlU9rrs/vGB9LMg8U2Ylm8xKNpmZfI635PaVFyR5ZHdflOTZZ/H612TV\n", 1311 | "kD+vu7+ru1/U3U9KckOSS5Ncf/DFVfWAqvrGqvrGdd1fvX78VQv+GThL5pkyK9lkVrLJzORzvMWa\n", 1312 | "8u6+tbvfu35Yp3vt+ij5lUnu7O5XH1p+aZJPJXlGVV144PlvTfJv17f7J7lu/evrFigfAACGGXWi\n", 1313 | "5xPX97ccXujuTyR5S1YXC3rMgedv7e4vWN/ud+DXzzyZkgEA4NwY1ZRfur5/9xHr71nfP/IEagEA\n", 1314 | "gKFGNeUXre/vOWJ9//lFp7BUVZ/mduv6fu+I9+6d6/XT1f6BW96e26+5+ciRRdZPv34S39/5vL5v\n", 1315 | "1u9329dHf7/bvr5v1u9329dHf7/bvp5k6u9329dHfL8HnvtDt1MWeZaq+1jvP/WHVl2R5I1JXt/d\n", 1316 | "V59i/bVJnpXkWd39ulOsX5/k2iTXdvePLlBPJ0l3n3av+2j7dV72sqePLuWMPnDL26c+KeT2a25O\n", 1317 | "Mv93vi1kc1nyuaxtyads7p5tyWYin0s5Ts856kj5/pHwi45Y33/+oydQCxswz5RZySazkk1mJp/j\n", 1318 | "jWrK37m+v/SI9f295EftOWcw80yZlWwyK9lkZvI53qim/E3r+yur6vMO71fVA5M8Lsknk7xtyd/0\n", 1319 | "0L6fvSU/e9fM/iMudpdsMivZZGbyubmDe8yP8zlDmvLuviOrcYgXJ3nuoeXrklyY5Kbuvnfh37cO\n", 1320 | "3PaW/GwAAHZPd+/t95fH+ZwLliqoqq5KctX64cPW94+tqhvXv/5Qd19z4C3PSXJbkldW1ZOy2tJy\n", 1321 | "eZIrkrwryYuXqg0AAGa2WFOe5FFJrk6yf+i+szoSfsn68fuS/EFT3t13VNWjk/xQkqckeWqSu5O8\n", 1322 | "PMl13X3UuEQAADivLLZ9pbuvO3C1zYO3/StvXnKK99zV3c/s7q/s7i/q7ou7+4XnqiG3pxwAgCVt\n", 1323 | "9Z7yUewpX85RQ/xhNNlkVrLJzORzc0vtKd+pppzlmGfKrGSTWckmM5PP8TTlbMQ8U2Ylm8xKNpmZ\n", 1324 | "fI63U025PeXLMc+UWckms5JNZiafm1tqT/mS01emd9y9PgAAcND6PMW9ZHUAeNPP2akj5QAAMCNN\n", 1325 | "OQAADKYpBwCAwXaqKXei53LMM2VWssmsZJOZyefmXDxoAy4etBzzTJmVbDIr2WRm8rk5Fw9iKPNM\n", 1326 | "mZVsMivZZGbyOZ6mnI2YZ8qsZJNZySYzk8/xNOUAADCYphwAAAbbqabc9BUAAJa01PSVC5YqaBsc\n", 1327 | "96xYAAA4aD3Rby9ZHQDe9HN26kg5yzHPlFnJJrOSTWYmn+NpytmIeabMSjaZlWwyM/kcT1PORswz\n", 1328 | "ZVayyaxkk5nJ53iacjZinimzkk1mJZvMTD7H05QDAMBgO9WUG4kIAMCSjETcgJGIAAAsyUhEAAA4\n", 1329 | "T2jK2Yh5psxKNpmVbDIz+RxPU85GzDNlVrLJrGSTmcnneJpyNmKeKbOSTWYlm8xMPsfTlLMR80yZ\n", 1330 | "lWwyK9lkZvI5nqYcAAAG05QDAMBgO9WUu3gQAABLcvGgDbh4EAAAS3LxIIYyz5RZySazkk1mJp/j\n", 1331 | "acrZiHmmzEo2mZVsMjP5HE9TzkbMM2VWssmsZJOZyed4mnI2Yp4ps5JNZiWbzEw+x9OUAwDAYJpy\n", 1332 | "AAAYTFMOAACDacoBAGAwTTkbMc+UWckms5JNZiaf42nK2Yh5psxKNpmVbDIz+RxPU85GzDNlVrLJ\n", 1333 | "rGSTmcnneJpyNmKeKbOSTWYlm8xMPsfbqaa8qvrAbW90PQAAbLeq2tvvL4/zORcsVdA26O4aXQMA\n", 1334 | "AOeP7t5LspesDgBv+jk7daQcAABmpCkHAIDBNOVsxDxTZiWbzEo2mZl8jqcpZyPmmTIr2WRWssnM\n", 1335 | "5HM8TTkbMc+UWckms5JNZiaf42nK2Yh5psxKNpmVbDIz+RxPUw4AAINpygEAYLCtasqr6jlVdWdV\n", 1336 | "3VtVv1pV3z66JgAAOK6tacqr6ulJXp7kf0nyjUluS/LzVfVVQwsDAIBj2pqmPMkLk/xkd/+j7n5X\n", 1337 | "d/9Akt9J8uzBde0k80yZlWwyK9lkZvI53lY05VX1R5J8c5JbDi3dkuSxJ18R5pkyK9lkVrLJzORz\n", 1338 | "vK1oypN8eZL7JfndQ89/MMnDTr4cLrzkIaNLgFOSTWYlm8xMPsfblqacyXzqjg+NLgFOSTaZlWwy\n", 1339 | "M/kcb7GmvKqeVlWvqqo3V9XHquqzVXXTGd7z8Kp6XVXdXVWfXk9WuaGqHnTopf8hye8neeih5x+a\n", 1340 | "1b5yAADYWkseKX9JkucmuSzJXevn+qgXV9Ujkvxaku9N8rYkP57kjiTPT/LWqvrS/dd29++tX/vk\n", 1341 | "Qx9zZVZTWAAAYGst2ZS/IMkju/uinN1ElNckeUiS53X3d3X3i7r7SUluSHJpkusPvf7Hk3xvVf33\n", 1342 | "VfV1VfWKrPaT/4Pl/ggAAHDyFmvKu/vW7n7v+mGd7rXro+RXJrmzu199aPmlST6V5BlVdeGBz/+n\n", 1343 | "WTX+L0ny61lNXXlqd79/oT8CAAAMMepEzyeu7w+POEx3fyLJW5I8IMljDq39/e6+uLvv393f2t3/\n", 1344 | "+tyXCgAA51Z1H7nte/MPrboiyRuTvL67rz7F+suS/GCSH+zuG06x/hNJnpPk2d39DxeoZ/k/JAAA\n", 1345 | "nEJ3n3bXyKmMOlJ+0fr+niPW958/PIUFAADOOxeMLuAkbPKvFQAAOCmjjpTvHwm/6Ij1/ec/egK1\n", 1346 | "AADAUKOa8neu7y89Yv2R6/t3n0AtAAAw1Kim/E3r+yur6vO2llTVA5M8Lskns7qoEAAAnNeGNOXd\n", 1347 | "fUdW4xAvzuoqoAddl+TCJDd1970nXRsAAJy0xUYiVtVVSa5aP3xYkicnuSPJ/izxD3X3NQdef0mS\n", 1348 | "25J8RZJ/ntWWlsuTXJHkXUke290fWaQ4AACY2JJN+Uuzuhrn4Q/c357yvu6+5NB7Hp7kh5I8JcmX\n", 1349 | "Jbk7yRuSXNfdR41LBACA88o5uXgQAABw9kad6Alw1qrqDVX1Fw48fnxVffXImgC2TVVdUFUXnmLI\n", 1350 | "xpOq6hVV9SNVdfGo+nadI+Wc0fo/0O9J8o1ZzZC/J8mvJ3l9d985sjZ2Q1V9Nsled//QqR7DSFX1\n", 1351 | "pvzhrZtnpbu/c+Fy4EhVdUOSZyd56P424ar6i0n+9wMv+3CSb+ru9w8ocaftxBU92VxV/fUk1yf5\n", 1352 | "wkNLVyV5SVW9qLt/7OQrY8d8LMmXjC4CjvCE0QXAWXp8klsPnbf30qwu1vgDWQ3q+JEkP5jkBSdf\n", 1353 | "3m7TlHOkqvpLSf5uko8keWWSW5P8blb/0V6R1X/Af7eqfru7f2ZQmeyG30zyl6rqV5P8zvq5r6mq\n", 1354 | "x5/pjd39y+e0MnZed9sKyrb4qqwm3yVJquoRWV3I8W939+vXzz0hyZ8eU95us32FI60boEuSfHN3\n", 1355 | "v+8U6xcn+bdJ3tvdjz7h8tghVfXkrEanftF9fGt39/3OQUkAW6eqPp3kx7r7xevHVye5Mcnjuvut\n", 1356 | "6+f+TpK/2t1fPKzQHeVIOafz9Un+8aka8iTp7jur6p9mtd8czpnuvqWqvi7Jf5nkK5PsJfml9e20\n", 1357 | "bz3HpQFskw9kdeHGfU9Kcm+SXz3w3Bcn+U8nWRQrmnJO5xNZbV05nY8m+fgJ1MKOW//j8H9Lkqra\n", 1358 | "y2pf5HUja4KjVNX9srpi9Xcn+bokD9j/qU1VfVOS70/y8u5+97gq2UFvTfLfVNWfTfLpJE9L8sbu\n", 1359 | "/syB13xNkt8eUNvOsw+O0/mFnGZf2Xqk0pVJbjmximDlO7P6kStMp6r+SJJfTPLyrLYAfjyfu5Be\n", 1360 | "krwvyTOTPOPEi2PX/UiS+2W1HfAXkvyRrIY5JEmq6v5JviPJ/zukuh2nKed0/maSB1fVzxyeCb1+\n", 1361 | "/NNJHpzkmhHFsbu6+9bu/nf7j6vqwVX1VSNrggOuyepk+OuyOjH+fz242N0fSfLmJE8+8crYad19\n", 1362 | "e5LLk9ywvj22u9924CXfnORNWf3/OyfMiZ4caT1798FJLstqf9m/z2r6ykOT/PGstj/dntVM089j\n", 1363 | "9i7nWlU9MKum57uTPCQHTuqsqsuzGvP1ku7+t+OqZBdV1W8m+XB3f/v68V6S//nglJaqem2SP9Pd\n", 1364 | "XzmmSmA29pRzOgdn716Q1Y9hLzn0mstOrhxYqaqLkrwlq5ORfyPJf8hq3+6+t2f1I9i/nNWEIDhJ\n", 1365 | "Fyf5F2d4zYeTfNkJ1AJsCU05RzJ7l4m9OKuG/Pu6+6f2j0TuL3b3J6vql7Paew4n7T8medAZXvNV\n", 1366 | "WZ0oDyeuqr41ybdm9dPwU46NdcXkk6cpB7bRdyW5pbt/6jSv+XdJzM9nhF9P8uSq+qLu/o+HF9c/\n", 1367 | "6fnTWU3CgBNTVV+S5A1JnngWL9eUnzBNObCNHp7kn53hNZ/ImY9Wwrnw2iT/JMk/qapnHlyoqgcn\n", 1368 | "eV2SL03yDwbUxm57WVYN+ZuT/GSSu2Im+TQ05ZxRVT00ybfk9D/m+scnWhS77hNJvuIMr/marPaa\n", 1369 | "w4nq7p+uqiuTfG+SP5v1NpX1VZK/IasxdK/p7jPtO4el/bdZ/STnO7v790cXw+fTlHOkqvrCJP8w\n", 1370 | "ydU5/fjMTqIp5yT9SpI/U1Vf0t0fO7xYVX8syVNz5pPt4Jzo7meuz2t4fpJHrZ/+5iTvSPLj3f2T\n", 1371 | "w4pjl12U5CYN+Zw05ZzO387qSM97s/pR7FE/5jJXk5P2iiQ/n+RfVtX350AGq+rrs5oL/UeTvHJM\n", 1372 | "eZB0941JbqyqC7P6SeM93f2JsVWx434rq7HGTMicco5UVf8+yb1Jvqm7PzW6Hjioql6a1SzyJPlM\n", 1373 | "ki/Maszcl66f+1vd/XdH1AYwo6r6H5P8aJJv6O67RtfD59OUc6Sq+nRW+x5fOLoWOKiqHp/Vpcsf\n", 1374 | "lOQHknxbVjOf78lqosUN3f3GcRWyS6rqztz3nxhWVhe8OnztBzhn1lfjflmSP5XVdJVfzRGjObv7\n", 1375 | "359gaURTzmlU1XuS/FJ3P2t0LXBQVf1+kn/Y3c8ZXQtU1ftO8fQXJvlj619/NquTjr88nzs/53eS\n", 1376 | "/F53X3zOC4S1qvrsWb70D66QzMmxp5zT+ckkf7WqHtTdLnLBTP6/rLZWwXDd/TUHH6/nkP9iVrPy\n", 1377 | "r03ylu7+T1V1QZJvT/IjWTXnV55wqXC2QxkcsR3AkXKOVFX3S/LTWV0y+m8m+dVTTbqAk1ZVNyf5\n", 1378 | "4939baNrgcOq6lVJnpLVvt1TXTzo/knenuTnu/t5J10fMCdNOUc6xY+5ThWW/X2RfszFiamqP5nk\n", 1379 | "bUlek+S67v7M4JLgD1TVXUl+prv/+mle82NJnt7dDz+5yoCZ2b7C6fzyWb7Ov+w4addmdaTxRUme\n", 1380 | "WVW/keQDOUUWu/uZh5+Dc+zLcub/f/3CrPaYwzlTVT+Z1d+L13b37x54fEb+7jx5jpQDW+c+nKyU\n", 1381 | "7j7dha9gcVX1jqwu0vINpzofp6oenNU/Ku/p7q8/6frYHQf+rvza7n63vzvn5kg591lVfXmS70jy\n", 1382 | "qST/ypXBGMAYOWb297O6cNWvVNX1SX4pye8meViSJyR5cVaTWX54WIXsiv2/K+869JgJOVLOkarq\n", 1383 | "2Vld0fO/6u4Pr5/7liS/kM9doOVXkzyxuz85pEiACVXVK5Lsn8R58P9oa33/qu5+/slWBcxMU86R\n", 1384 | "qurWJH+0uy8/8Nwbkzw+yY1ZXar3v07yN7r7742oEWBWVfXYJN+X5Juz2s5yT5JfS3Jjd982sjZg\n", 1385 | "PppyjlRVv53kX3T3/7B+/JCsTqb7Rwee+5Uk9+vubxlXKQDAdrOJn9P5siQfPPD4cVn96PUNB557\n", 1386 | "c5KvOcGaAADOO5pyTucj+fyRXY/P6nLRB3/s2knuf5JFAQCcbzTlnM5vJvmzVfXlVfWgJH8xyb/p\n", 1387 | "7nsOvOars9rSAgDAhjTlnM4rshrb9f6sxik9LKsrKB70mCS/ccJ1AQCcVzTlHKm7fy7J/5TVEfN3\n", 1388 | "JfnB7r5pf72qnpjkgVmNSAQAYEOmrwAAwGCOlAMAwGCacgAAGExTDgAAg2nKAQBgME05AAAMpikH\n", 1389 | "AIDBNOUAADCYphwAAAbTlAMAwGCacgAAGExTDgAAg2nKAQBgsP8fnpzhChO+CfYAAAAASUVORK5C\n", 1390 | "YII=\n" 1391 | ], 1392 | "text/plain": [ 1393 | "" 1394 | ] 1395 | }, 1396 | "metadata": { 1397 | "image/png": { 1398 | "height": 264, 1399 | "width": 370 1400 | } 1401 | }, 1402 | "output_type": "display_data" 1403 | } 1404 | ], 1405 | "source": [ 1406 | "master_df.vartype2.value_counts().plot(kind='bar', log=True, grid=True, color='seagreen')" 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": 15, 1412 | "metadata": { 1413 | "collapsed": false 1414 | }, 1415 | "outputs": [ 1416 | { 1417 | "data": { 1418 | "text/plain": [ 1419 | "snp 483986\n", 1420 | "ref 244660\n", 1421 | "del 19604\n", 1422 | "ins 7551\n", 1423 | "dtype: int64" 1424 | ] 1425 | }, 1426 | "execution_count": 15, 1427 | "metadata": {}, 1428 | "output_type": "execute_result" 1429 | } 1430 | ], 1431 | "source": [ 1432 | "master_df.vartype2.value_counts()" 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "code", 1437 | "execution_count": 16, 1438 | "metadata": { 1439 | "collapsed": false 1440 | }, 1441 | "outputs": [ 1442 | { 1443 | "data": { 1444 | "text/plain": [ 1445 | "755801" 1446 | ] 1447 | }, 1448 | "execution_count": 16, 1449 | "metadata": {}, 1450 | "output_type": "execute_result" 1451 | } 1452 | ], 1453 | "source": [ 1454 | "len(master_df)" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": 17, 1460 | "metadata": { 1461 | "collapsed": false 1462 | }, 1463 | "outputs": [ 1464 | { 1465 | "data": { 1466 | "text/html": [ 1467 | "
\n", 1468 | "\n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | " \n", 1814 | " \n", 1815 | " \n", 1816 | " \n", 1817 | " \n", 1818 | " \n", 1819 | " \n", 1820 | " \n", 1821 | " \n", 1822 | " \n", 1823 | " \n", 1824 | " \n", 1825 | " \n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | "
sample_idsmultiallelephaseGT1GT2a1a2zygosityvartype1vartype2GTFORMAThom_ref_countsINFO
CHROMPOSREFALT
2216050075AGHG037700|01AGhet-refrefsnp0|1GT2503AC=1;AF=0.000199681;AN=5008;NS=2504
16050115GAHG013630|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG023340|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG023430|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG025740|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG030520|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG033540|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG034320|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG034730|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA185160|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA188580|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA188740|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA190270|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA191210|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA191370|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA197070|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
ANA199840|01GAhet-refrefsnp0|1GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG024970|10AGhet-refsnpref1|0GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG025360|10AGhet-refsnpref1|0GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
AHG026230|10AGhet-refsnpref1|0GT2472AC=32;AF=0.00638978;AN=5008;NS=2504
\n", 1859 | "
" 1860 | ], 1861 | "text/plain": [ 1862 | " sample_ids multiallele phase GT1 GT2 a1 a2 zygosity \\\n", 1863 | "CHROM POS REF ALT \n", 1864 | "22 16050075 A G HG03770 0 | 0 1 A G het-ref \n", 1865 | " 16050115 G A HG01363 0 | 0 1 G A het-ref \n", 1866 | " A HG02334 0 | 0 1 G A het-ref \n", 1867 | " A HG02343 0 | 0 1 G A het-ref \n", 1868 | " A HG02574 0 | 0 1 G A het-ref \n", 1869 | " A HG03052 0 | 0 1 G A het-ref \n", 1870 | " A HG03354 0 | 0 1 G A het-ref \n", 1871 | " A HG03432 0 | 0 1 G A het-ref \n", 1872 | " A HG03473 0 | 0 1 G A het-ref \n", 1873 | " A NA18516 0 | 0 1 G A het-ref \n", 1874 | " A NA18858 0 | 0 1 G A het-ref \n", 1875 | " A NA18874 0 | 0 1 G A het-ref \n", 1876 | " A NA19027 0 | 0 1 G A het-ref \n", 1877 | " A NA19121 0 | 0 1 G A het-ref \n", 1878 | " A NA19137 0 | 0 1 G A het-ref \n", 1879 | " A NA19707 0 | 0 1 G A het-ref \n", 1880 | " A NA19984 0 | 0 1 G A het-ref \n", 1881 | " A HG02497 0 | 1 0 A G het-ref \n", 1882 | " A HG02536 0 | 1 0 A G het-ref \n", 1883 | " A HG02623 0 | 1 0 A G het-ref \n", 1884 | "\n", 1885 | " vartype1 vartype2 GT FORMAT hom_ref_counts \\\n", 1886 | "CHROM POS REF ALT \n", 1887 | "22 16050075 A G ref snp 0|1 GT 2503 \n", 1888 | " 16050115 G A ref snp 0|1 GT 2472 \n", 1889 | " A ref snp 0|1 GT 2472 \n", 1890 | " A ref snp 0|1 GT 2472 \n", 1891 | " A ref snp 0|1 GT 2472 \n", 1892 | " A ref snp 0|1 GT 2472 \n", 1893 | " A ref snp 0|1 GT 2472 \n", 1894 | " A ref snp 0|1 GT 2472 \n", 1895 | " A ref snp 0|1 GT 2472 \n", 1896 | " A ref snp 0|1 GT 2472 \n", 1897 | " A ref snp 0|1 GT 2472 \n", 1898 | " A ref snp 0|1 GT 2472 \n", 1899 | " A ref snp 0|1 GT 2472 \n", 1900 | " A ref snp 0|1 GT 2472 \n", 1901 | " A ref snp 0|1 GT 2472 \n", 1902 | " A ref snp 0|1 GT 2472 \n", 1903 | " A ref snp 0|1 GT 2472 \n", 1904 | " A snp ref 1|0 GT 2472 \n", 1905 | " A snp ref 1|0 GT 2472 \n", 1906 | " A snp ref 1|0 GT 2472 \n", 1907 | "\n", 1908 | " INFO \n", 1909 | "CHROM POS REF ALT \n", 1910 | "22 16050075 A G AC=1;AF=0.000199681;AN=5008;NS=2504 \n", 1911 | " 16050115 G A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1912 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1913 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1914 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1915 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1916 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1917 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1918 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1919 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1920 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1921 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1922 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1923 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1924 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1925 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1926 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1927 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1928 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 \n", 1929 | " A AC=32;AF=0.00638978;AN=5008;NS=2504 " 1930 | ] 1931 | }, 1932 | "execution_count": 17, 1933 | "metadata": {}, 1934 | "output_type": "execute_result" 1935 | } 1936 | ], 1937 | "source": [ 1938 | "master_df.head(20)" 1939 | ] 1940 | }, 1941 | { 1942 | "cell_type": "code", 1943 | "execution_count": 18, 1944 | "metadata": { 1945 | "collapsed": false 1946 | }, 1947 | "outputs": [ 1948 | { 1949 | "name": "stdout", 1950 | "output_type": "stream", 1951 | "text": [ 1952 | "\n", 1953 | "MultiIndex: 755801 entries, (22, 16050075, A, G) to (22, 16644712, G, C)\n", 1954 | "Data columns (total 14 columns):\n", 1955 | "sample_ids 755801 non-null object\n", 1956 | "multiallele 755801 non-null int64\n", 1957 | "phase 755801 non-null object\n", 1958 | "GT1 755801 non-null int64\n", 1959 | "GT2 755801 non-null int64\n", 1960 | "a1 755801 non-null object\n", 1961 | "a2 755801 non-null object\n", 1962 | "zygosity 755801 non-null object\n", 1963 | "vartype1 755801 non-null object\n", 1964 | "vartype2 755801 non-null object\n", 1965 | "GT 755801 non-null object\n", 1966 | "FORMAT 755801 non-null object\n", 1967 | "hom_ref_counts 755801 non-null float64\n", 1968 | "INFO 755801 non-null object\n", 1969 | "dtypes: float64(1), int64(3), object(10)\n", 1970 | "memory usage: 84.4+ MB\n" 1971 | ] 1972 | } 1973 | ], 1974 | "source": [ 1975 | "master_df.info()" 1976 | ] 1977 | } 1978 | ], 1979 | "metadata": { 1980 | "kernelspec": { 1981 | "display_name": "Python 2", 1982 | "language": "python", 1983 | "name": "python2" 1984 | }, 1985 | "language_info": { 1986 | "codemirror_mode": { 1987 | "name": "ipython", 1988 | "version": 2 1989 | }, 1990 | "file_extension": ".py", 1991 | "mimetype": "text/x-python", 1992 | "name": "python", 1993 | "nbconvert_exporter": "python", 1994 | "pygments_lexer": "ipython2", 1995 | "version": "2.7.10" 1996 | } 1997 | }, 1998 | "nbformat": 4, 1999 | "nbformat_minor": 0 2000 | } 2001 | -------------------------------------------------------------------------------- /pandasvcf.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from variant_annotations import process_variant_annotations, mp_variant_annotations 4 | from vcf_metadata import VCFMetadata 5 | 6 | 7 | class VCF(object): 8 | """Loads in a vcf file, aware of gzipped files. 9 | 10 | 11 | Parameters 12 | -------------------------------------- 13 | filename: str, required 14 | path to vcf file 15 | 16 | sample_id: str or list, default='all' 17 | specifies the sample column ids to read and parse 18 | 19 | 'all' means all sample columns 20 | 21 | can use a str (e.g. 'NA12878') 22 | or 23 | can use a list (e.g. ['NA12878', 'NA12877'] 24 | 25 | 26 | cols: list, default ['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'] 27 | specifies the VCF column names, EXCEPT SAMPLE COLS, to read and parse 28 | 29 | Must include ['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'] 30 | 31 | Additional columns such as QUAL, FILTER, INFO will be accepted 32 | e.g. ['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT', 'INFO', 'QUAL'] 33 | 34 | chunksize: int, default=5000 35 | specifies the number of VCF lines to read and parse in 1 chunk 36 | 37 | Note using a large chunksize with large n_cores requires LOTS OF RAM 38 | 39 | requires ~40 seconds to parse 1000 rows with 2500 samples 40 | 41 | 42 | Methods 43 | ----------------------------------------- 44 | get_vcf_df_chunk 45 | returns VCF pandasDF with chunksize 46 | 47 | 48 | add_variant_annotations 49 | Annotates each variant 50 | See docstring for details 51 | 52 | 53 | 54 | Returns VCF Obj with following attributes 55 | ----------------------------------------- 56 | header_df: pandas df 57 | VCF header as a pandas df 58 | 59 | samples: list 60 | sample column IDs 61 | 62 | all_columns: list 63 | all sample column IDs in VCF 64 | 65 | vcf_chunks: pandas.io.parsers.TextFileReader chunk 66 | VCF chunk 67 | Access to chunk provided by get_vcf_df_chunk() 68 | 69 | df: pandas DF 70 | Index: CHROM, POS, REF, ALT 71 | Columns: CHROM, POS, REF, ALT, SAMPLE(S) +/- {QUAL, FILTER, INFO if specified} 72 | 73 | 74 | """ 75 | 76 | def __init__(self, filename, sample_id='all', 77 | cols=['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'], 78 | chunksize=5000): 79 | 80 | # Header 81 | header_parsed = VCFMetadata(filename) 82 | # header parsed into key/values dataframe 83 | self.header_df = self.get_header_df(header_parsed.header) 84 | # Sample IDs 85 | self.samples = list(self.header_df.loc['SampleIDs'])[0] 86 | self.sample_id = self.get_sample_ids(sample_id) 87 | 88 | self.set_cols(cols) 89 | 90 | self.set_dtypes() 91 | 92 | # Open pandas chunk object (TextReader) 93 | self.chunksize = chunksize 94 | self.vcf_chunks = pd.read_csv(filename, sep="\t", 95 | compression=header_parsed.compression, 96 | skiprows=(len(self.header_df) - 2), 97 | usecols=self.usecols, 98 | chunksize=chunksize, 99 | dtype=self.vcf_dtypes) 100 | 101 | 102 | def get_header_df(self, header_txt): 103 | """Parses header into pandas DataFrame""" 104 | try: 105 | key_value_header = [i.replace('##', '').replace( 106 | '\n', '').split('=', 1) for i in header_txt if '##' in i] 107 | key_value_header.append( 108 | ['SampleIDs', header_txt[-1].rstrip('\n').split('\t')[9:]]) 109 | key_value_header.append( 110 | ['ColumnHeader', header_txt[-1].rstrip('\n').split('\t')]) 111 | header_df = pd.DataFrame.from_records(key_value_header) 112 | header_df.set_index(0, inplace=True) 113 | header_df.index.name = 'header_keys' 114 | header_df.columns = ['header_values'] 115 | return header_df 116 | except IndexError: 117 | print("VCF header parsing failed, " 118 | "this may be due to the use of " 119 | "tabix version 1.2.x, please upgrade to tabix 1.3 or greater") 120 | return 121 | 122 | def get_sample_ids(self, sample_id): 123 | """ 124 | Identifies and stores sample_id(s) 125 | """ 126 | if sample_id == 'all': 127 | return self.samples[:] 128 | else: 129 | if type(sample_id) == str: 130 | return [sample_id] 131 | else: 132 | return sample_id 133 | 134 | def set_cols(self, cols): 135 | # Columns 136 | self.all_columns = list(self.header_df.loc['ColumnHeader'])[0] 137 | self.FORMAT = self.all_columns[8] 138 | 139 | assert len(set(cols) & set(['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT'])) > 4, "cols requires the following columns: ['#CHROM', 'POS', 'REF', 'ALT', 'FORMAT']" 140 | self.cols = cols 141 | if len(cols) > 0: # columns specified 142 | self.usecols = [c for c in self.all_columns if c in cols] 143 | 144 | if len(self.sample_id) > 0: 145 | self.usecols.extend(self.sample_id) 146 | # print self.usecols 147 | else: 148 | assert False, 'no sample IDs' 149 | else: # columns not specified 150 | self.usecols = [s for s in self.cols if s not in self.samples] 151 | self.usecols.extend(self.sample_id) 152 | 153 | def set_dtypes(self): 154 | self.vcf_dtypes = {'CHROM':'category', 155 | 'POS':'int32', 156 | 'REF':'category', 157 | 'ALT':'category', 158 | 'FORMAT':'category', 159 | 'QUAL':'int8', 160 | 'FILTER':'category'} 161 | 162 | def get_vcf_df_chunk(self): 163 | """ 164 | This function iterates through the VCF files using the user-defined 165 | chunksize (default = 5000 lines). 166 | """ 167 | try: 168 | self.df = self.vcf_chunks.get_chunk() 169 | self.stopIteration = False 170 | except StopIteration: 171 | self.stopIteration = True 172 | print("End of File Reached") 173 | # self.df = None 174 | return 1 175 | self.df.drop_duplicates(inplace=True) # dropping duplicate rows 176 | self.df.columns = [c.replace('#', '') for c in self.usecols] 177 | self.df['CHROM'] = self.df['CHROM'].astype(str).str.replace('chr', '').astype('category') 178 | self.df.set_index( 179 | ['CHROM', 'POS', 'REF', 'ALT'], inplace=True, drop=False) 180 | 181 | self.df_bytes = self.df.values.nbytes + \ 182 | self.df.index.nbytes + self.df.columns.nbytes 183 | 184 | return 0 185 | 186 | def add_variant_annotations(self, split_columns='', verbose=False, 187 | inplace=False, drop_hom_ref=True, 188 | n_cores=1): 189 | """ 190 | This function adds the following annotations for each variant: 191 | multiallele, phase, a1, a2, GT1, GT2, vartype1, vartype2, zygosity, 192 | and parsed FORMAT values, see below for additional information. 193 | 194 | Parameters 195 | -------------- 196 | 197 | split_columns: dict, optional 198 | key:FORMAT id value:#fields expected 199 | e.g. {'AD':2} indicates Allelic Depth should be 200 | split into 2 columns. 201 | 202 | drop_hom_ref: bool, default=True 203 | This will drop homozygous reference genotype calls from 204 | the long dataframe. As most calls in a multisample vcf 205 | are homozygous reference, this will reduce memory requirements 206 | dramatically. 207 | 208 | verbose: bool, default=False 209 | This will describe how many missing variants were dropped 210 | 211 | inplace: bool, default=False 212 | This will replace the sample_id column with parsed columns, 213 | and drop the FORMAT field. If True, this will create an 214 | additional dataframe, df_annot, to the VCF object composed of 215 | the parsed columns (memory intensive) 216 | 217 | n_cores: int, default=1 218 | specifies the number of cpus to use during variantAnnotation 219 | 220 | Note using a large chunksize with large n_cores requires LOTS OF RAM 221 | 222 | Output 223 | -------------- 224 | This function adds the following annotations to each variant: 225 | 226 | multiallele: {0,1} 0=biallele 1=multiallelic 227 | 228 | phase: {'/', '|'} /=unphased, |=phased 229 | 230 | a1: DNA base representation of allele1 call, e.g. A 231 | a2: DNA base representation of allele2 call, e.g. A 232 | 233 | GT1: numeric representation of allele1 call, e.g. 0 234 | GT2: numeric representation of allele2 call, e.g. 1 235 | 236 | vartype1: {snp, mnp, ins, del, indel or SV} variant type of first allele 237 | vartype2: {snp, mnp, ins, del, indel or SV} variant type of second allele 238 | 239 | zygosity: {het-ref, hom-ref, alt-ref, het-miss, hom-miss} 240 | 241 | FORMAT values: any values associated with the genotype calls are 242 | added as additional columns, split_columns are further 243 | split by ',' into individual columns 244 | 245 | """ 246 | 247 | if self.stopIteration: 248 | print('End of File Reached') 249 | return 1 250 | 251 | self.drop_hom_ref = drop_hom_ref 252 | 253 | df_vcf_cols = self.df[list(set(self.df.columns) 254 | - {'CHROM', 'POS', 'REF', 'ALT', 'FORMAT'} 255 | - set(self.sample_id))] 256 | 257 | self.df = self.df.reset_index(drop=True) 258 | 259 | if n_cores==1: 260 | if inplace: 261 | self.df = process_variant_annotations(self.df, 262 | split_columns=split_columns, 263 | sample_id=self.sample_id, 264 | drop_hom_ref=drop_hom_ref) 265 | # joining QUAL, FILTER, and/or INFO columns 266 | else: 267 | self.df_annot = process_variant_annotations(self.df, 268 | split_columns=split_columns, 269 | sample_id=self.sample_id, 270 | drop_hom_ref=drop_hom_ref) 271 | else: 272 | if inplace: 273 | self.df = mp_variant_annotations(self.df, 274 | n_cores=n_cores, 275 | df_split_cols=split_columns, 276 | df_sampleid=self.sample_id, 277 | drop_hom_ref=drop_hom_ref) 278 | else: 279 | self.df_annot = mp_variant_annotations(self.df, 280 | n_cores=n_cores, 281 | df_split_cols=split_columns, 282 | df_sampleid=self.sample_id, 283 | drop_hom_ref=drop_hom_ref) 284 | if inplace: 285 | self.df = self.df.set_index(['CHROM', 'POS', 'REF', 'ALT']) 286 | else: 287 | self.df_annot = self.df_annot.set_index(['CHROM', 'POS', 'REF', 'ALT']) 288 | return 0 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastparquet==0.2.1 2 | numpy==1.15.4 3 | pandas==0.23.4 4 | pyarrow==0.11.1 5 | pyfaidx==0.5.5.2 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='pandasvcf', 5 | version="0.5.1", 6 | py_modules=['pandasvcf'] 7 | ) 8 | -------------------------------------------------------------------------------- /test_data/ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erscott/pandasVCF/8c832ddbd811e2425a700af4ad568514a2691577/test_data/ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz -------------------------------------------------------------------------------- /test_data/ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erscott/pandasVCF/8c832ddbd811e2425a700af4ad568514a2691577/test_data/ALL.chr22.phase3_shapeit2_mvncall_integrated_v4.20130502.genotypes_10k.vcf.gz.tbi -------------------------------------------------------------------------------- /test_data/SWGR_titin.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erscott/pandasVCF/8c832ddbd811e2425a700af4ad568514a2691577/test_data/SWGR_titin.vcf.gz -------------------------------------------------------------------------------- /test_data/SWGR_titin.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/erscott/pandasVCF/8c832ddbd811e2425a700af4ad568514a2691577/test_data/SWGR_titin.vcf.gz.tbi -------------------------------------------------------------------------------- /variant_annotations.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | The following methods generate annotations for each 4 | VCF DNA variant 5 | """ 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from functools import partial 10 | import multiprocessing as mp 11 | import gc 12 | 13 | 14 | 15 | def add_allelic_bases(df, sample_col, single_sample_vcf=True): 16 | """This function returns the base call for each biallelic base 17 | 10X faster than previous iterations 18 | """ 19 | haploid_chromosomes = ['X', 'chrX', 'Y', 'chrY', 'M', 'chrM'] 20 | 21 | def vector_GT_alleles(ref_alt_gt): 22 | '''Retrieve GT1, GT2, a1, a2''' 23 | 24 | def get_phase(genotype): 25 | """Returns phase from genotype""" 26 | if "|" in genotype: 27 | return "|" 28 | if "/" in genotype: 29 | return "/" 30 | else: 31 | return '-' 32 | 33 | REF, ALT, GT = ref_alt_gt 34 | phase = get_phase(GT) 35 | bases = [REF] + str(ALT).split(',') 36 | gt1, gt2 = np.NaN, np.NaN 37 | a1, a2 = '.', '.' 38 | GT = str(GT).split(phase) 39 | 40 | if len(GT) == 2: #Diploid 41 | gt1, gt2 = GT 42 | 43 | if gt1 == ".": 44 | a1 = "." 45 | else: 46 | a1 = bases[int(gt1)] 47 | 48 | if gt2 == ".": 49 | a2 = "." 50 | else: 51 | a2 = bases[int(gt2)] 52 | 53 | if len(GT) == 1: #Haploid 54 | gt1 = GT[0] 55 | a1 = bases[int(gt1)] 56 | 57 | return (gt1, gt2, a1, a2, phase) 58 | 59 | df = df.assign(**{'GT1':-1, 'GT2':-1, 'a1':'', 'a2':'', 'phase':''}) 60 | df.loc[:, ['GT1', 'GT2', 'a1', 'a2', 'phase']] = [i for i in map(vector_GT_alleles, df[['REF','ALT', sample_col]].values)] 61 | return df 62 | 63 | 64 | def zygosity_fast(df): 65 | """ 66 | This function quickly assigns zygosity states to 67 | each variant using set logic 68 | """ 69 | 70 | def check_empty_df(df, zygosity): 71 | 72 | try: 73 | df.loc[:, 'zygosity'] = zygosity 74 | return df 75 | except ValueError: 76 | if len(df) == 0: 77 | return pd.DataFrame() 78 | else: 79 | assert False 80 | 81 | 82 | df_hom_ref = df[(df['a1'] == df['REF']) & (df['a2'] == df['REF'])].copy() 83 | if len(df_hom_ref) > 0: 84 | df_hom_ref.loc[:, 'zygosity'] = 'hom-ref' 85 | else: 86 | df_hom_ref = check_empty_df(df_hom_ref, 'hom-ref') 87 | 88 | df_hom_miss = df[(df['a1'] == '.') & (df['a2'] == '.')].copy() 89 | if len(df_hom_miss) > 0: 90 | df_hom_miss = check_empty_df(df_hom_miss, 'hom-miss') 91 | #df_hom_miss.loc[:, 'zygosity'] = 'hom-miss' 92 | 93 | df_het_miss = df[(df['a1'] == '.') | (df['a2'] == '.')].copy() 94 | if len(df_het_miss) > 0: 95 | df_het_miss = check_empty_df(df_het_miss, 'het-miss') 96 | #df_het_miss.loc[:, 'zygosity'] = 'het-miss' 97 | 98 | df_not_miss = df.drop(set(df_hom_miss.index) | 99 | set(df_het_miss.index)).copy() 100 | 101 | df_het_alt = df_not_miss[((df_not_miss['a1'] != df_not_miss['REF']) & 102 | (df_not_miss['a2'] != df_not_miss['REF'])) & 103 | (df_not_miss['a1'] != df_not_miss['a2'])].copy() 104 | df_het_alt = check_empty_df(df_het_alt, 'het-alt') 105 | #df_het_alt.loc[:, 'zygosity'] = 'het-alt' 106 | 107 | df_hom_alt = df_not_miss[(((df_not_miss['a1'] != df_not_miss['REF']) & 108 | (df_not_miss['a2'] != df_not_miss['REF']))) & 109 | (df_not_miss['a1'] == df_not_miss['a2'])].copy() 110 | df_hom_alt = check_empty_df(df_hom_alt, 'hom-alt') 111 | #df_hom_alt.loc[:, 'zygosity'] = 'hom-alt' 112 | 113 | df_het_ref = df_not_miss[((df_not_miss['a1'] == df_not_miss['REF']) & 114 | (df_not_miss['a2'] != df_not_miss['REF'])) | 115 | ((df_not_miss['a1'] != df_not_miss['REF']) & 116 | (df_not_miss['a2'] == df_not_miss['REF']))].copy() 117 | df_het_ref = check_empty_df(df_het_ref, 'het-ref') 118 | #df_het_ref.loc[:, 'zygosity'] = 'het-ref' 119 | 120 | df_zygosity = pd.concat([df_hom_ref, df_hom_miss, 121 | df_het_miss, df_het_ref, 122 | df_het_alt, df_hom_alt]) 123 | 124 | df_zygosity.loc[:, 'zygosity'] = df_zygosity['zygosity'].astype('category') 125 | # return df_zygosity 126 | assert len(df_zygosity) == len(df) 127 | return df_zygosity 128 | 129 | 130 | def vartype_map(ref_alt_bases): 131 | """ 132 | This function assigns the following vartypes to the 133 | allele specified by allele_base_col: snp, mnp, ins, del, indel or SV 134 | """ 135 | ref, alt = str(ref_alt_bases[0]), str(ref_alt_bases[-1]) 136 | len_diff = len(ref) - len(alt) 137 | 138 | if ref == alt: 139 | return 'ref' # Orderd by frequency of the variant to reduce complexity 140 | 141 | if len_diff == 0: 142 | base_diff = [nt for i, nt in enumerate(alt) if ref[i] != alt[i]] 143 | if len(base_diff) == 1: 144 | return 'snp' 145 | else: 146 | return 'mnp' 147 | 148 | if len_diff > 0: 149 | base_diff = [nt for i, nt in enumerate(alt) if ref[i] != alt[i]] 150 | if len(base_diff) > 0: 151 | return 'indel' 152 | else: 153 | return 'del' 154 | 155 | if len_diff < 0: 156 | # base_diff = [nt for i,nt in enumerate(ref) if ref[i] != alt[i]] 157 | return 'ins' 158 | 159 | # elif is_sv(ref,alt): return 'sv' 160 | 161 | else: 162 | return 'indel or SV' 163 | 164 | 165 | def get_hom_ref_counts(df): 166 | """ 167 | This function calculates the number of homozygous reference variant 168 | calls in a dataframe assuming the df is indexed on: 169 | ['CHROM', 'POS', 'REF', 'ALT'] in that order. 170 | 171 | Also assumes the homozygous reference values are ascribed: 172 | 0|0 , 0/0 , 0 173 | """ 174 | if 'hom-ref' in df.zygosity.value_counts().index: 175 | hom_ref = df.groupby(['CHROM', 'POS', 'REF', 'ALT'])['zygosity'].value_counts().xs('hom-ref',level=4) 176 | hom_ref = pd.DataFrame(hom_ref).reset_index() 177 | hom_ref = hom_ref.rename(columns={'zygosity':'hom_ref_counts'}) 178 | return hom_ref 179 | else: 180 | return pd.DataFrame() 181 | 182 | 183 | def parse_single_genotype_data(df, sample_id, split_cols=''): 184 | """ 185 | This function parses the genotype sample column and left joins to 186 | the df object. 187 | 188 | split_cols is a dictionary specifying the name of the column and the number 189 | of values expected for that column, e.g. {'AD':2, 'PL':3} 190 | """ 191 | 192 | # genotypes grouped by FORMAT variant annotations 193 | genotypes = df.groupby(by='FORMAT') 194 | 195 | # Iterate through genotype groups, dropping missing calls 196 | master_df = [] 197 | for name, group in genotypes: 198 | temp_group = group[sample_id].astype(str) # group of interest 199 | # del temp_group['FORMAT'] #remove the format column 200 | # replace . with none, allows stack to remove null columns, space 201 | # savings 202 | temp_group = temp_group.replace(to_replace='.', value='') 203 | 204 | temp_group_data = pd.DataFrame.from_records( 205 | list(temp_group.str.split(':'))) 206 | temp_group_data.index = temp_group.index 207 | temp_group_data.columns = name.split(':') 208 | temp_group_data = temp_group_data.replace(to_replace='.', value='') 209 | 210 | master_df.append(temp_group_data) 211 | 212 | # Concatenating all genotype groups 213 | sample_df = pd.concat(master_df) 214 | sample_df.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids'] 215 | 216 | # spliting user-defined columns 217 | if split_cols != '': 218 | for col in split_cols: 219 | for i in range(0, split_cols[col]): 220 | sample_df[col + '_' + str(i)] = sample_df[col].str.split(',').str[i] 221 | del sample_df[col] 222 | return sample_df 223 | 224 | 225 | 226 | def process_variant_annotations(df_vars_split_cols_sample_id_drop_hom_ref): 227 | """ 228 | This function stacks a pandas vcf dataframe and adds annotations for 229 | each genotype 230 | 231 | This function adds the following annotations to each variant: 232 | 233 | multiallele: {0,1} 0=biallele 1=multiallelic 234 | 235 | phase: {'/', '|'} /=unphased, |=phased 236 | 237 | a1: DNA base representation of allele1 call, e.g. A 238 | a2: DNA base representation of allele2 call, e.g. A 239 | 240 | GT1: numeric representation of allele1 call, e.g. 0 241 | GT2: numeric representation of allele2 call, e.g. 1 242 | 243 | vartype1: {snp, mnp, ins, del, indel or SV} variant type of first allele 244 | vartype2: {snp, mnp, ins, del, indel or SV} variant type of second allele 245 | 246 | zygosity: {het-ref, hom-ref, alt-ref, het-miss, hom-miss} 247 | 248 | FORMAT values: any values associated with the genotype calls are 249 | added as additional columns, split_columns are further 250 | split by ',' into individual columns 251 | """ 252 | df_vars, split_columns, sample_id, drop_hom_ref = df_vars_split_cols_sample_id_drop_hom_ref 253 | 254 | df_groups = df_vars.groupby('FORMAT') 255 | 256 | parsed_df = [] 257 | # iterate through different FORMAT types 258 | for format, df_format in df_groups: 259 | 260 | # dropping missing ALT alleles 261 | df_format = df_format[df_format['ALT'] != '.'] 262 | df_format = df_format[sample_id] # only consider sample columns 263 | # replacing missing calls with None 264 | df_format = df_format.replace(to_replace='.', value=np.NaN) 265 | 266 | # stacks sample calls and drops none calls 267 | df_format = pd.DataFrame( 268 | df_format.stack(), columns=['sample_genotypes']) 269 | 270 | if len(df_format) < 1: # occurs when all calls are empty 271 | continue 272 | 273 | # SAVE QUALITY INFORMATION SEPARETELY TO AVOID ANNOTATION PROCESSING 274 | # IDENTICAL GENOTYPE CALLS (DIFFERENT QUALITY DOESNT MATTER) 275 | if format.count(':') > 0: 276 | # qual df, setting aside for later joining 277 | df_qual = pd.DataFrame(list(df_format['sample_genotypes'].str.split(':')), 278 | index=df_format.index) 279 | # print df_format.head(), format.split(':') 280 | df_qual.columns = format.split(':') # setting quality column names 281 | # setting index names for joining with df_format later 282 | df_qual.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids'] 283 | # setting just the GT calls 284 | df_format['sample_genotypes'] = df_qual[format.split(':')[0]] 285 | # removing from df_qual to avoid joining problems with df_format 286 | # after add_annotations 287 | del df_qual['GT'] 288 | 289 | # DROPPING MISSING CALLS 290 | df_format = df_format[(df_format['sample_genotypes'] != './.') & 291 | (df_format['sample_genotypes'] != '.|.') & 292 | (df_format['sample_genotypes'] != '.')] 293 | 294 | # SETTING INDICES 295 | # setting index names 296 | df_format.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids'] 297 | df_format.reset_index(inplace=True) 298 | 299 | # ONLY NEED TO PASS UNIQUE GENOTYPE CALLS DF TO get_vcf_annotations, 300 | # then broadcast back to df_format 301 | df_annotations = df_format.drop_duplicates(subset=['CHROM', 'POS', 302 | 'REF', 'ALT', 303 | 'sample_genotypes']) 304 | df_annotations['FORMAT'] = format.split(':')[0] # setting format id 305 | df_annotations.set_index(['CHROM', 'POS', 'REF', 306 | 'ALT', 'sample_genotypes'], 307 | drop=False, inplace=True) 308 | # getting annotations 309 | df_annotations = get_vcf_annotations(df_annotations, 310 | 'sample_genotypes', 311 | split_columns=split_columns) 312 | 313 | # SETTING INDICES AGAIN 314 | if len(df_annotations) < 1: 315 | continue # continue if no variants within this FORMAT category 316 | df_format.set_index(['CHROM', 'POS', 'REF', 317 | 'ALT', 'sample_genotypes'], 318 | drop=True, inplace=True) 319 | df_annotations.index.names = ['CHROM', 'POS', 'REF', 320 | 'ALT', 'sample_genotypes'] 321 | df_format = df_format.join(df_annotations) 322 | 323 | # df_format.set_index('sample_ids', drop=True, inplace=True, append=True) 324 | df_format['FORMAT'] = format 325 | df_format.reset_index(level=4, inplace=True, drop=False) 326 | 327 | if drop_hom_ref: 328 | hom_ref_counts = get_hom_ref_counts(df_format) 329 | hom_ref_counts.name = 'hom_ref_counts' 330 | # dropping all homozygous reference variants 331 | df_format = df_format[df_format['zygosity'] != 'hom-ref'] 332 | df_format = df_format.join(hom_ref_counts) 333 | df_format['hom_ref_counts'].fillna(value=0, inplace=True) 334 | 335 | del df_format['sample_genotypes'] 336 | df_format.set_index('sample_ids', inplace=True, append=True, drop=True) 337 | 338 | # JOINING QUAL INFO BACK TO DF 339 | if format.count(':') > 0 and len(df_qual) > 0: 340 | df_format = df_format.join(df_qual, how='left') 341 | pass 342 | 343 | # SPLITTING GENOTYPE QUALITY COLUMNS 344 | if split_columns != '': 345 | for col in split_columns: 346 | split_col_names = [col + '_' + str(n) 347 | for n in range(0, split_columns[col])] 348 | df_format = df_format.join(pd.DataFrame(list(df_format[col].str.split(',').str[:len(split_col_names)]), 349 | index=df_format.index, 350 | columns=split_col_names)) 351 | del df_format[col] 352 | 353 | parsed_df.append(df_format) 354 | 355 | if len(parsed_df) > 0: 356 | df_annot = pd.concat(parsed_df) 357 | # reseting sample_ids from index 358 | df_annot.reset_index('sample_ids', drop=False, inplace=True) 359 | return df_annot 360 | else: 361 | print('No Annotations generated, please check for excessive missing values') 362 | return df_vars 363 | 364 | 365 | def df_split(df, split_level): 366 | """ 367 | Splits pandas dataframe into roughly 368 | equal sizes 369 | 370 | Parameters 371 | --------------- 372 | df: pandas df, required 373 | VCF pandas dataframe 374 | 375 | split_level: int, required 376 | Specifies the number of chunks to split df into 377 | 378 | """ 379 | row_count = len(df) 380 | split_size = int(row_count / split_level) 381 | split_df = [] 382 | for n, i in enumerate(range(0, row_count, split_size)): 383 | if n + 1 == split_level: 384 | split_df.append(df.ix[df.index[i:]]) 385 | break 386 | else: 387 | split_df.append(df.ix[df.index[i: i + split_size]]) 388 | return split_df 389 | 390 | 391 | def mp_variant_annotations(df_mp, df_split_cols='', df_sampleid='all', 392 | drop_hom_ref=True, n_cores=1): 393 | """ 394 | Multiprocessing variant annotations 395 | 396 | see variantAnnotations.process_variant_annotations for description of annotations 397 | 398 | 399 | This function coordinates the annotation of variants using the 400 | multiprocessing library. 401 | 402 | Parameters 403 | --------------- 404 | df_mp: pandas df, required 405 | VCF DataFrame 406 | 407 | df_split_cols: dict, optional 408 | key:FORMAT id value:#fields expected 409 | e.g. {'AD':2} indicates Allelic Depth should be 410 | split into 2 columns. 411 | 412 | df_sampleid: list, required 413 | list of sample_ids, can be 'all' 414 | 415 | drop_hom_ref: bool, optional 416 | specifies whether to drop all homozygous reference 417 | variants from dataframe. 418 | FALSE REQUIRES LARGE MEMORY FOOTPRINT 419 | 420 | n_cores: int, optional 421 | Number of multiprocessing jobs to start. 422 | Be careful as memory is copied to each process, RAM intensive 423 | """ 424 | from functools import partial 425 | import multiprocessing as mp 426 | import gc 427 | 428 | print('starting multiprocessing') 429 | pool = mp.Pool(int(n_cores)) 430 | # tasks = np.array_split(df_mp.copy(), int(n_cores)) #breaks with older 431 | # pandas/numpy 432 | dfs = df_split(df_mp.copy(), int(n_cores)) 433 | 434 | mp_process = partial(process_variant_annotations, sample_id=df_sampleid, 435 | split_columns=df_split_cols, drop_hom_ref=drop_hom_ref) 436 | 437 | results = [] 438 | del df_mp 439 | gc.collect() 440 | r = pool.map_async(mp_process, \ 441 | dfs, callback=results.append) 442 | r.wait() 443 | pool.close() 444 | pool.join() 445 | pool.terminate() 446 | 447 | print('multiprocessing complete') 448 | res_df = pd.concat([df for df in results[0] if len(df) > 0]) 449 | 450 | cat_cols = ['vartype1', 'vartype2', 'a1', 'a2', \ 451 | 'GT1', 'GT2', 'GT','sample_ids', 'zygosity'] 452 | res_df.loc[:, cat_cols] = res_df[cat_cols].astype('category') 453 | return res_df 454 | 455 | 456 | def get_vcf_annotations(df, sample_name, split_columns='', drop_hom_ref=True): 457 | """ 458 | This function adds the following annotations for each variant: 459 | multiallele, phase, a1, a2, GT1, GT2, vartype1, vartype2, zygosity, 460 | and parsed FORMAT values, see below for additional information. 461 | 462 | Parameters 463 | -------------- 464 | sample_name: str, required 465 | sample column header id, e.g. NA12878 466 | 467 | split_columns: dict, optional 468 | key:FORMAT id value:#fields expected 469 | e.g. {'AD':2} indicates Allelic Depth should be 470 | split into 2 columns. 471 | 472 | drop_hom_ref: bool, optional 473 | specifies whether to drop all homozygous reference 474 | variants from dataframe. 475 | FALSE REQUIRES LARGE MEMORY FOOTPRINT 476 | 477 | Output 478 | -------------- 479 | This function adds the following annotations to each variant: 480 | 481 | multiallele: {0,1} 0=biallele 1=multiallelic 482 | 483 | phase: {'/', '|'} /=unphased, |=phased 484 | 485 | a1: DNA base representation of allele1 call, e.g. A 486 | a2: DNA base representation of allele2 call, e.g. A 487 | 488 | GT1: numeric representation of allele1 call, e.g. 0 489 | GT2: numeric representation of allele2 call, e.g. 1 490 | 491 | vartype1: {snp, mnp, ins, del, indel or SV} variant type of first allele 492 | vartype2: {snp, mnp, ins, del, indel or SV} variant type of second allele 493 | 494 | zygosity: {het-ref, hom-ref, alt-ref, het-miss, hom-miss} 495 | 496 | FORMAT values: any values associated with the genotype calls are 497 | added as additional columns, split_columns are further 498 | split by ',' into individual columns 499 | 500 | 501 | 502 | """ 503 | 504 | df.loc[:, 'multiallele'] = df.ALT.str.count(',') 505 | multidf = df[df['multiallele'] > 0] 506 | 507 | while len(df) > 0: 508 | 509 | 510 | df = add_allelic_bases(df, sample_name) 511 | 512 | df = zygosity_fast(df) 513 | 514 | 515 | df.loc[:, 'vartype1'] = [vtype for vtype in map(vartype_map, df[['REF', 'a1']].values)] 516 | df.loc[:, 'vartype2'] = [vtype for vtype in map(vartype_map, df[['REF', 'a2']].values)] 517 | 518 | cat_cols = ['vartype1', 'vartype2', 'a1', 'a2', 'GT1', 'GT2', 'sample_genotypes', 'phase'] 519 | for c in cat_cols: 520 | df.loc[:, c] = df[c].astype('category') 521 | 522 | 523 | df.loc[:, 'GT'] = df['sample_genotypes'].astype('category') 524 | del df[sample_name] 525 | if 'FORMAT' in df.columns: 526 | del df['FORMAT'] 527 | 528 | 529 | return df 530 | 531 | return pd.DataFrame() 532 | 533 | 534 | def process_variant_annotations(df_vars, sample_id='all', split_columns='', drop_hom_ref=False): 535 | """ 536 | This function stacks a pandas vcf dataframe and adds annotations for 537 | each genotype 538 | 539 | This function adds the following annotations to each variant: 540 | 541 | multiallele: {0,1} 0=biallele 1=multiallelic 542 | 543 | phase: {'/', '|'} /=unphased, |=phased 544 | 545 | a1: DNA base representation of allele1 call, e.g. A 546 | a2: DNA base representation of allele2 call, e.g. A 547 | 548 | GT1: numeric representation of allele1 call, e.g. 0 549 | GT2: numeric representation of allele2 call, e.g. 1 550 | 551 | vartype1: {snp, mnp, ins, del, indel or SV} variant type of first allele 552 | vartype2: {snp, mnp, ins, del, indel or SV} variant type of second allele 553 | 554 | zygosity: {het-ref, hom-ref, alt-ref, het-miss, hom-miss} 555 | 556 | FORMAT values: any values associated with the genotype calls are 557 | added as additional columns, split_columns are further 558 | split by ',' into individual columns 559 | """ 560 | #df_vars, split_columns, sample_id, drop_hom_ref = df_vars_split_cols_sample_id_drop_hom_ref 561 | 562 | 563 | def _format_preprocess(df, sample_id): 564 | """ 565 | Handles stacking the wide dataframe into a long dataframe of sample genotypes 566 | """ 567 | # dropping missing ALT alleles 568 | df = df[df['ALT'] != '.'] 569 | df = df[['CHROM', 'POS', 'REF', 'ALT'] + sample_id] # only consider sample columns 570 | # replacing missing calls with None 571 | df = df.replace(to_replace='.', value=np.NaN) 572 | 573 | # stacks sample calls and drops none calls 574 | 575 | df = df.set_index(['CHROM', 'POS', 'REF', 'ALT']) 576 | 577 | df = pd.DataFrame(df.stack(), 578 | columns=['sample_genotypes']) 579 | df.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids'] 580 | 581 | return df.reset_index() 582 | 583 | 584 | def _sampleid_preprocess(df): 585 | """ 586 | Identifies sample columns 587 | """ 588 | s_ids = set(df.columns) - set(['CHROM', 'POS', 'REF', 'ALT', \ 589 | 'ID', 'QUAL', 'FILTER', 'INFO','FORMAT']) 590 | return list(s_ids) 591 | 592 | 593 | def _qual_preprocess(df, form): 594 | """ 595 | Creates dataframe with non-GT data for each genotype call, 596 | often quality data 597 | """ 598 | # qual df, setting aside for later joining 599 | df_qual = pd.DataFrame(list(df['sample_genotypes'].str.split(':')), 600 | df.index) 601 | df_qual.columns = form.split(':') # setting quality column names 602 | df_qual = df[['CHROM', 'POS', 'REF', 'ALT', 'sample_ids']].join(df_qual) 603 | # print df_format.head(), format.split(':') 604 | 605 | # setting index names for joining with df_format later 606 | # setting just the GT calls 607 | df.loc[:, 'sample_genotypes'] = df_qual[form.split(':')[0]] 608 | # removing from df_qual to avoid joining problems with df_format 609 | # after add_annotations 610 | #del df_qual['GT'] 611 | return df_qual 612 | 613 | 614 | def _missing_preprocess(df): 615 | """ 616 | Filters dataframe for missing values 617 | """ 618 | df_nonmissing = df[(df['sample_genotypes'] != './.') & 619 | (df['sample_genotypes'] != '.|.') & 620 | (df['sample_genotypes'] != '.')] 621 | 622 | return df_nonmissing 623 | 624 | 625 | def _coordinate_variant_annotation(df_format, sample_id): 626 | """ 627 | Coordinates variant annotations, hom-ref counting and dropping, 628 | and stacking into a tidy df 629 | 630 | Parameters 631 | -------------- 632 | df_format: pandas DataFrame, required 633 | pd.DataFrame containing CHROM, POS, REF, ALT, sample genotype cols 634 | 635 | sample_id: list, required 636 | list of sample genotype columns 637 | 638 | 639 | Output 640 | -------------- 641 | This function produces the df_annot dataframe containing the following columns: 642 | CHROM, POS, REF, ALT, GT, GT1, GT2, a1, a2, multiallele, phase, zygosity, 643 | vartype1, vartype2, FORMAT, hom_ref_counts 644 | """ 645 | 646 | df_format = _format_preprocess(df_format, sample_id) 647 | 648 | if len(df_format) < 1: # occurs when all calls are empty 649 | return pd.DataFrame() 650 | 651 | # SAVE QUALITY INFORMATION SEPARETELY TO AVOID ANNOTATION PROCESSING 652 | # IDENTICAL GENOTYPE CALLS (DIFFERENT QUALITY DOESNT MATTER) 653 | if form.count(':') > 0: 654 | df_qual = _qual_preprocess(df_format, form) 655 | 656 | # DROPPING MISSING CALLS 657 | df_format = _missing_preprocess(df_format) 658 | 659 | 660 | # SETTING INDICES 661 | # setting index names 662 | # df_format.index.names = ['CHROM', 'POS', 'REF', 'ALT', 'sample_ids'] 663 | # df_format = df_format.reset_index() 664 | 665 | # ONLY NEED TO PASS UNIQUE GENOTYPE CALLS DF TO get_vcf_annotations, 666 | # then broadcast back to df_format 667 | annot_cols = ['CHROM', 'POS','REF', 'ALT','sample_genotypes'] 668 | df_annotations = df_format[annot_cols].drop_duplicates(subset=annot_cols) 669 | 670 | df_annotations.loc[:, 'FORMAT'] = form.split(':')[0] # setting format id 671 | 672 | # get variant annotations 673 | df_annotations = get_vcf_annotations(df_annotations, 674 | 'sample_genotypes', 675 | split_columns=split_columns) 676 | 677 | # BROADCASTING VARIANT ANNOTATIONS BACK TO SAMPLE GENOTYPE DF 678 | if len(df_annotations) < 1: 679 | return pd.DataFrame() # continue if no variants within this FORMAT category 680 | 681 | df_format.rename(columns={'sample_genotypes':'GT'}, inplace=True) 682 | df_format.loc[:, 'GT'] = df_format['GT'].astype('category') 683 | df_format = df_format.merge(df_annotations, how='left', 684 | left_on = ['CHROM', 'POS', 'REF', 'ALT','GT'], 685 | right_on = ['CHROM', 'POS', 'REF', 'ALT','GT']) 686 | 687 | del df_annotations 688 | gc.collect() 689 | # df_format.set_index('sample_ids', drop=True, inplace=True, append=True) 690 | df_format.loc[:, 'FORMAT'] = form 691 | 692 | 693 | # DROPPING HOMOZYGOUS REFERENCE VARIANTS IF SPECIFIED BY USER 694 | hom_ref_counts = get_hom_ref_counts(df_format) 695 | if len(hom_ref_counts) > 0: 696 | df_format = df_format.merge(hom_ref_counts, how='left', \ 697 | left_on=['CHROM', 'POS','REF','ALT'], \ 698 | right_on=['CHROM', 'POS','REF','ALT']) 699 | df_format['hom_ref_counts'].fillna(value=0, inplace=True) 700 | df_format.loc[:, 'hom_ref_counts'] = df_format['hom_ref_counts'].astype(np.uint8) 701 | else: 702 | df_format.loc[:, 'hom_ref_counts'] = -1 703 | 704 | if drop_hom_ref: 705 | # dropping all homozygous reference variants 706 | df_format = df_format[df_format['zygosity'] != 'hom-ref'] 707 | 708 | # JOINING QUAL INFO BACK TO DF, IF NON-GT FIELDS IN SAMPLE COLUMNS 709 | if form.count(':') > 0 and len(df_qual) > 0: 710 | df_format = df_format.merge(df_qual, how='left', 711 | left_on = ['CHROM', 'POS', 'REF', 'ALT', 'GT', 'sample_ids'], 712 | right_on = ['CHROM', 'POS', 'REF', 'ALT', 'GT', 'sample_ids']) 713 | del df_qual 714 | gc.collect() 715 | pass 716 | 717 | # SPLITTING GENOTYPE QUALITY COLUMNS, IF SPECIFIED BY USER 718 | if split_columns != '': 719 | for col in split_columns: 720 | if split_columns[col] > 1: #only parse split_columns with more than expected 1 column 721 | split_col_names = [col + '_' + str(n) for n in range(0, split_columns[col])] 722 | try: 723 | split_col_df = pd.DataFrame(list(df_format[col].str.split(',') \ 724 | .str[:len(split_col_names)]), 725 | index=df_format.index, 726 | columns=split_col_names) 727 | df_format = df_format.join(split_col_df) 728 | del df_format[col] 729 | except AssertionError: 730 | print('{} has incorrect column number, '.format(col) + \ 731 | 'please check split_cols value. ' + \ 732 | 'Leaving {} unparsed.'.format(col)) 733 | print() 734 | else: 735 | continue 736 | return df_format 737 | 738 | 739 | if '#CHROM' in df_vars.columns: 740 | df_vars = df_vars.rename(columns={'#CHROM':'CHROM'}).set_index(['CHROM', 'POS', 'REF', 'ALT'],drop=False) 741 | 742 | if sample_id == 'all': 743 | sample_id = _sampleid_preprocess(df_vars) 744 | 745 | df_groups = df_vars.groupby('FORMAT') 746 | 747 | parsed_df = [] 748 | # iterate through different FORMAT types 749 | for form, df_format in df_groups: 750 | 751 | df_format = _coordinate_variant_annotation(df_format, sample_id) 752 | 753 | parsed_df.append(df_format) 754 | 755 | if len(parsed_df) > 0: 756 | df_annot = pd.concat(parsed_df) 757 | 758 | for c in ['sample_ids', 'FORMAT']: 759 | df_annot.loc[:, c] = df_annot[c].astype('category') 760 | df_annot.loc[:, ['multiallele']] = df_annot[['multiallele']].astype(np.uint8) 761 | if 'hom_ref_counts' not in df_annot.columns: 762 | df_annot.loc[:, 'hom_ref_counts'] = -1 763 | df_annot.loc[:, 'hom_ref_counts'] = df_annot['hom_ref_counts'].astype(np.uint8) 764 | 765 | return df_annot 766 | else: 767 | print('No Annotations generated, please check for excessive missing values') 768 | return pd.DataFrame() 769 | -------------------------------------------------------------------------------- /vcf_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class VCFMetadata(object): 5 | """This class parses a VCF header into a pandas dataframe object. 6 | 7 | It recognizes gzip and uncompressed file formats. 8 | This function assumes the header does not extent past 5000 lines. 9 | """ 10 | 11 | def __init__(self, filename): 12 | if filename.endswith('.gz'): 13 | self.compression = 'gzip' 14 | if filename + '.tbi' in os.listdir(os.path.split(filename)[0]): 15 | header_lines = os.popen('tabix -H ' + filename).readlines() 16 | self.header = [l.replace('#CHROM', 'CHROM') 17 | for l in header_lines if l.startswith('#')] 18 | os.system('tabix -p vcf ' + filename) 19 | header_lines = os.popen('tabix -H ' + filename).readlines() 20 | self.header = [l for l in header_lines if l.startswith('#')] 21 | 22 | else: 23 | self.compression = 'infer' 24 | header_lines = os.popen('head -5000 ' + filename).readlines() 25 | self.header = [l for l in header_lines if l.startswith('#')] 26 | --------------------------------------------------------------------------------