├── genome.png ├── nosce.png ├── seriously.png ├── DNA_articles.png ├── output_34_0.png ├── result.csv ├── README.md └── DNA.ipynb /genome.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorarjohns/DNA_pandas_selenium/HEAD/genome.png -------------------------------------------------------------------------------- /nosce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorarjohns/DNA_pandas_selenium/HEAD/nosce.png -------------------------------------------------------------------------------- /seriously.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorarjohns/DNA_pandas_selenium/HEAD/seriously.png -------------------------------------------------------------------------------- /DNA_articles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorarjohns/DNA_pandas_selenium/HEAD/DNA_articles.png -------------------------------------------------------------------------------- /output_34_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorarjohns/DNA_pandas_selenium/HEAD/output_34_0.png -------------------------------------------------------------------------------- /result.csv: -------------------------------------------------------------------------------- 1 | ,Magnitude,Repute,Summary 2 | Rs661(A;A),9,Bad,"early onset Alzheimer's disease" 3 | Rs6647(T;T),0,Good,"Normal; two copies of Pi-M1V allele" 4 | Rs6647(C;C),0,Good,"Normal; two copies of Pi-M1A allele" 5 | Rs1303(T;T),0,Good,"common in clinvar" 6 | Rs28929471(G;G),0,Good,"common in complete genomics" 7 | Rs28929474(G;G),0,Good,"common in complete genomics" 8 | Rs28929474(A;A),5.5,Bad,"Alpha-1 Antitrypsin Deficiency" 9 | Rs17580(A;A),0,Good,"common in complete 
genomics" 10 | Rs17580(T;T),2.5,Bad,"a slightly reduced functionality form of Alpha-1 Antitrypsin Deficiency" 11 | Rs28931568(G;G),0,Good,"common in complete genomics" 12 | Rs28931568(A;A),4,Bad,"high risk of emphysema due to Alpha 1-Antitrypsin Deficiency" 13 | Rs28931569(T;T),0,Good,"common in complete genomics" 14 | Rs28931569(C;C),4,,"high risk of emphysema" 15 | Rs28931570(C;C),0,Good,"common in complete genomics" 16 | Rs28929473(A;A),0,Good,"common in complete genomics" 17 | Rs28929473(T;T),0,, 18 | Rs28931572(T;T),0,Good,"common in complete genomics" 19 | Rs28931572(A;A),0,, 20 | Rs769455(C;C),0,Good,"common in clinvar" 21 | Rs769452(T;T),0,Good,"common in clinvar" 22 | Rs28931576(A;A),0,Good,"common in clinvar" 23 | Rs28931578(G;G),0,Good,"common in clinvar" 24 | Rs28931579(A;A),0,Good,"common in clinvar" 25 | Rs28931580(A;A),0,Good,"common in clinvar" 26 | Rs4987076(G;G),0,Good,"common in clinvar" 27 | Rs28929478(G;G),0,Good,"common in clinvar" 28 | Rs28929478(A;A),0,Good,"common on affy axiom data" 29 | Rs28931581(C;C),0,Good,"common in clinvar" 30 | Rs28931582(T;T),0,Good,"common in clinvar" 31 | Rs28929479(T;T),0,Good,"common in clinvar" 32 | Rs28931586(T;T),0,Good,"common in clinvar" 33 | Rs1801252(A;A),0,Good,None 34 | Rs1801252(G;G),3,, 35 | Rs1042714(C;C),0,Good,normal 36 | Rs1800888(C;C),0,Good,normal 37 | Rs1800888(T;T),2.5,Bad,"increased risk of coronary artery disease" 38 | Rs4994(T;T),0,Good,normal 39 | Rs4994(C;C),2,Bad,"2x higher risk in certain women for cardiac events; associated with elite endurance performance in men" 40 | Rs11276(G;G),0,Good,"common in clinvar" 41 | Rs28362459(T;T),0,Good,"common in clinvar" 42 | Rs8192466(C;C),0,Good,common 43 | Rs8192466(T;T),4,,uncertain 44 | Rs28897672(T;T),0,Good,normal 45 | Rs4986852(G;G),0,Good,normal 46 | Rs4986852(A;A),2,,"predisposition to breast cancer?" 47 | Rs1800709(C;C),0,Good,normal 48 | Rs1800709(T;T),2,,"predisposition to breast cancer?" 
49 | Rs28931588(G;G),0,Good,"common in complete genomics" 50 | Rs28931589(G;G),0,Good,"common in complete genomics" 51 | Rs28931590(A;A),0,Good,"common in complete genomics" 52 | Rs28931590(T;T),2.5,Bad,"ACUTE MYELOID LEUKEMIA" 53 | Rs2303790(A;A),0,Good,"common in clinvar" 54 | Rs5882(G;G),2.1,Good,"Longer lifespan, 0.28x lower risk of dementia, 0.31x lower risk of Alzheimer's." 55 | Rs5882(A;A),2,Bad,"Faster aging. Increased risk for Dementia. Less good cholesterol." 56 | Rs28931591(C;C),0,Good,"common in clinvar" 57 | Rs2230199(C;C),0,Good,common 58 | Rs2230199(G;G),2,Bad,"2.5x+ risk of ARMD" 59 | Rs1047286(C;C),0,Good,"common in clinvar" 60 | Rs1047286(T;T),1.7,Bad,"1.7x increased risk for age-related macular degeneration" 61 | Rs28929485(C;C),0,Good,"common in clinvar" 62 | Rs28929485(T;T),6,Bad,"Deafness and keratitis; possible dominant or recessive" 63 | Rs28931592(A;A),0,Good,"common in clinvar" 64 | Rs28931592(T;T),0,, 65 | Rs28931593(G;G),0,Good,"common in clinvar" 66 | Rs28931594(G;G),0,Good,"common in clinvar" 67 | Rs28931595(G;G),0,Good,"common in clinvar" 68 | Rs28931600(G;G),0,Good,"common in clinvar" 69 | Rs28931601(G;G),0,Good,"common in clinvar" 70 | Rs11547328(C;C),0,Good,"common in complete genomics" 71 | Rs231775(A;A),1,Good,"no increased risk of autoimmune thyroid disease" 72 | Rs28931606(T;T),0,Good,"common in clinvar" 73 | Rs28931607(G;G),0,Good,"common in clinvar" 74 | Rs28931608(G;G),0,Good,"common in clinvar" 75 | Rs4986893(G;G),0,Good,normal 76 | Rs4986893(A;A),2.1,,"poor metabolizer of several commonly prescribed drugs" 77 | Rs28399504(A;A),0,Good,normal 78 | Rs28399504(G;G),2.5,Bad,"poor metabolizer" 79 | Rs28931609(C;C),0,Good,"common in clinvar" 80 | Rs28931610(C;C),0,Good,"common in clinvar" 81 | Rs28931610(T;T),6,Bad,"carrier of a skin fragility/woolly hair syndrome allele" 82 | Rs28931611(T;T),0,Good,"common in clinvar" 83 | Rs28931612(G;G),0,Good,"common in complete genomics" 84 | Rs11570255(G;G),0,Good,"common in complete 
genomics" 85 | Rs11570351(G;G),0,Good,"common on affy axiom data" 86 | Rs5352(G;G),0,Good,common 87 | Rs5352(A;A),3.5,Bad,"Hirschsprung disease?" 88 | Rs2234922(A;A),0,Good,"common in clinvar" 89 | Rs460897(G;G),0,Good,"common in complete genomics" 90 | Rs28929498(A;A),0,Good,"common in clinvar" 91 | Rs28931614(G;G),0,Good,"common in clinvar" 92 | Rs28933068(C;C),0,Good,"common in clinvar" 93 | Rs28931615(C;C),0,Good,"common in clinvar" 94 | Rs4647924(C;C),0,Good,"common in clinvar" 95 | Rs28928868(G;G),0,Good,"common in clinvar" 96 | Rs6165(G;G),0,Good,"common in clinvar" 97 | Rs6166(G;G),1,Bad,"Females slightly more likely to be sterile" 98 | Rs6166(A;A),0,Good,common/normal 99 | Rs28928870(C;C),0,Good,"common in clinvar" 100 | Rs28928871(G;G),0,Good,"common in clinvar" 101 | Rs17855739(G;G),0,Good,"common in clinvar" 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Know Thyself: Using Data Science to Explore Your Own Genome 2 | ## DNA analysis with pandas and Selenium 3 | 4 |
5 | Geminus’s flaps,  https://exhibitions.lib.cam.ac.uk/vesalius/artifacts/geminuss-flaps1/
   6 | 7 |
"Nosce te ipsum" ("know thyself"), an ancient maxim frequently associated with gory anatomical pop-up books of yore.
8 |
9 | 10 | [Image from the University of Cambridge](https://exhibitions.lib.cam.ac.uk/vesalius/artifacts/geminuss-flaps1/) 11 | 12 | [23andme](https://www.23andme.com/) once offered me a free DNA and ancestry test kit if I participated in one of their clinical studies. In exchange for a cheek swab and baring my guts and soul in a score of questionnaires, I got my genome sequenced and gained access to myriad reports on where my ancestors were likely from, whom else on the site I might be related to, and what health conditions and traits I probably have inherited. 13 | 14 |
15 | Bunion and Flat Feet Report, 23andme 16 |
Seriously?
17 |
18 | 19 | 23andme already provides an overwhelming amount of consumer-ready infographics and tools, but I knew I could do more with the data. The intrepid may download their raw genetic data if they dare, so of course I poured it into pandas to [see what I could make of it](https://github.com/lorarjohns/DNA_pandas_selenium). 20 | 21 | 22 | ```python 23 | %matplotlib inline 24 | import seaborn as sns 25 | sns.set_style('darkgrid') 26 | sns.color_palette('Spectral') 27 | import matplotlib.pyplot as plt 28 | ``` 29 | 30 | 31 | ```python 32 | import numpy as np 33 | import requests 34 | import pandas as pd 35 | 36 | import re 37 | ``` 38 | 39 | 40 | ```python 41 | from selenium import webdriver 42 | from selenium.webdriver.support.ui import WebDriverWait 43 | ``` 44 | 45 | ## Importing my DNA into pandas and exploring the genome 46 | 47 | Looking at the .txt file, I could see that I was missing some genotype values, denoted with '--'. 48 | 49 | Most of the chromosomes are ints, but three are X, Y, and MT (for 'mitochondrial'). I needed to specify the data type properly so that pandas wouldn't throw an error when it found mixed data in the input. 50 | 51 | The other columns were fairly straightforward. I also wanted pandas to ignore the prefatory comments at the beginning of the file that consisted of lines beginning with an octothorpe. 
52 | 53 | The arguments I needed to pass, therefore, were: 54 | * separator (tab-delimited) 55 | * dtype (as a dict) 56 | * na_values ('--') *(n.b.: I decided against this in the end to avoid dealing with more NaNs)* 57 | * comment ('#') 58 | 59 | ![img](/genome.png) 60 | 61 | 62 | ```python 63 | data = pd.read_csv('genome.txt', sep='\t', dtype={'rsid':'str', 'chromosome':'object', 'position':'int', 'genotype':'str'}, comment='#') 64 | 65 | 66 | ``` 67 | 68 | 69 | ```python 70 | print(data) 71 | ``` 72 | 73 | rsid chromosome position genotype 74 | 0 rs548049170 1 69869 TT 75 | 1 rs13328684 1 74792 -- 76 | 2 rs9283150 1 565508 AA 77 | 3 i713426 1 726912 -- 78 | 4 rs116587930 1 727841 GG 79 | 5 rs3131972 1 752721 AG 80 | 6 rs12184325 1 754105 CC 81 | 82 | ... ... ... ... ... 83 | 84 | [638531 rows x 4 columns] 85 | 86 | 87 | A quick note on the column names: 88 | 89 | * rsid stands for Reference SNP cluster ID. It identifies unique SNPs. 90 | 91 | * SNPs are Single Nucleotide Polymorphisms ('snips'), locations in the genome that vary between individuals. They can influence disease risk and drug effects, tell you about your ancestry, and predict aspects of how you look and act. 92 | 93 | * All humans have almost the same sequence of 3 billion DNA bases (A,C,G, or T) distributed between their 23 pairs of chromosomes. But at certain locations, some differences exist that researchers have declared meaningful, for medical or other reasons (like genealogy). 94 | 95 | I started to navigate my new DataFrame with basic exploratory data analysis and data cleaning. 96 | 97 | 98 | ```python 99 | # Read the data into a pandas DataFrame and do some EDA 100 | df = pd.DataFrame(data) 101 | ``` 102 | 103 | 104 | ```python 105 | df.head(25) 106 | ``` 107 | 108 |
109 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 |
rsidchromosomepositiongenotype
0rs548049170169869TT
1rs13328684174792--
2rs92831501565508AA
3i7134261726912--
4rs1165879301727841GG
5rs31319721752721AG
6rs121843251754105CC
7rs125676391756268AA
8rs1145251171759036GG
9rs121248191776546AA
10rs121274251794332GG
11rs793739281801536TT
12rs728888531815421--
13rs75383051824398AA
14rs284446991830181AA
15i7134491830731--
16rs1164527381834830GG
17rs726318871835092TT
18rs286786931838665TT
19rs49703821840753CT
20rs44756911846808CC
21rs726318891851390GG
22rs75377561854250AA
23rs133029821861808GG
24rs3767477911863130AA
310 |
311 | 312 | 313 | 314 | 315 | ```python 316 | df.isna().any() 317 | ``` 318 | 319 | 320 | 321 | 322 | rsid False 323 | chromosome False 324 | position False 325 | genotype False 326 | dtype: bool 327 | 328 | 329 | 330 | 331 | ```python 332 | df.nunique() 333 | ``` 334 | 335 | 336 | 337 | 338 | rsid 638531 339 | chromosome 25 340 | position 634934 341 | genotype 20 342 | dtype: int64 343 | 344 | 345 | 346 | ```python 347 | # How many SNPs am I missing by not 348 | # having a Y chromosome? 349 | Y_chromosome = df[df.chromosome == 'Y'] 350 | ``` 351 | 352 | 353 | ```python 354 | len(Y_chromosome) 355 | ``` 356 | 357 | 358 | 359 | 360 | 3733 361 | 362 | 363 | I converted the letter chromosomes to numbers, cast them to ints, and created a dictionary to translate them back later so that I could better manipulate the data. 364 | 365 | 366 | ```python 367 | df['chromosome'].unique() 368 | ``` 369 | 370 | 371 | 372 | 373 | array(['1', '2', '3', '4', '5', '6', '7', '8', 374 | '9', '10', '11', '12','13', '14', '15', 375 | '16', '17', '18', '19', '20', '21', '22', 376 | 'X','MT'], dtype=object) 377 | 378 | 379 | 380 | 381 | ```python 382 | df['chromosome'] = df['chromosome'].apply(lambda x: 383 | re.sub(r'X', r'23', x)) 384 | df['chromosome'] = df['chromosome'].apply(lambda x: 385 | re.sub(r'MT', r'24', x)) 386 | ``` 387 | 388 | 389 | ```python 390 | df['chromosome'] = df['chromosome'].apply(lambda x: 391 | int(x)) 392 | ``` 393 | 394 | 395 | ```python 396 | chromosome_dict = {1:'1', 2:'2', 3:'3', 4:'4', 5:'5', 397 | 6:'6', 7:'7', 8:'8', 9:'9', 10:'10', 398 | 11:'11', 12:'12', 13:'13', 14:'14', 399 | 15:'15', 16:'16', 17:'17', 18:'18', 400 | 19:'19', 20:'20', 21:'21', 22:'22', 401 | 23:'X', 24:'MT'} 402 | ``` 403 | 404 | 405 | ```python 406 | print(chromosome_dict) 407 | display(df.info()) 408 | ``` 409 | 410 | {1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 411 | 8: '8', 9: '9', 10: '10', 11: '11', 12: '12', 13: '13', 412 | 14: '14', 15: '15', 16: '16', 17: 
'17', 18: '18', 413 | 19: '19', 20: '20', 21: '21', 22: '22', 23: 'X', 414 | 24: 'MT'} 415 | 416 | 417 | Int64Index: 634798 entries, 0 to 638530 418 | 419 | Data columns (total 4 columns): 420 | 421 | rsid 634798 non-null object 422 | chromosome 634798 non-null int64 423 | position 634798 non-null int64 424 | genotype 634798 non-null object 425 | 426 | dtypes: int64(2), object(2) 427 | memory usage: 24.2+ MB 428 | 429 | None 430 | 431 | 432 | There were 16,005 genotypes that I simply lacked: 433 | 434 | 435 | ```python 436 | genotype_na = df[df.genotype == '--'] 437 | len(genotype_na) 438 | ``` 439 | 16005 440 | 441 | 442 | ### Some visualizations 443 | 444 | 445 | ```python 446 | df[df.chromosome == 1].info() 447 | ``` 448 | 449 | 450 | Int64Index: 49514 entries, 0 to 49513 451 | Data columns (total 4 columns): 452 | rsid 49514 non-null object 453 | chromosome 49514 non-null int64 454 | position 49514 non-null int64 455 | genotype 49514 non-null object 456 | dtypes: int64(2), object(2) 457 | memory usage: 1.9+ MB 458 | 459 | 460 | ```python 461 | # Remove that pesky whitespace from the column name 462 | df.rename({' rsid': 'rsid'}, axis='columns', inplace=True) 463 | ``` 464 | 465 | How many SNPs are there per chromosome? 466 | 467 | 468 | ```python 469 | # We can do this manually with a for loop . . . 470 | x = [] 471 | y = [] 472 | for k in chromosome_dict: 473 | x.append(k) 474 | y.append(len(df[df.chromosome == k])) 475 | rsid_per_chromosome = dict(zip(x,y)) 476 | ``` 477 | 478 | ```python 479 | rsid_per_chromosome 480 | ``` 481 | 482 | {1: 49514, 483 | 2: 51775, 484 | 3: 43024, 485 | 4: 39474, 486 | 5: 37032, 487 | 6: 44023, 488 | 7: 34357, 489 | 8: 31683, 490 | 9: 26446, 491 | 10: 30525, 492 | 11: 30942, 493 | 12: 29432, 494 | 13: 22080, 495 | 14: 19961, 496 | 15: 19006, 497 | 16: 20397, 498 | 17: 19401, 499 | 18: 17675, 500 | 19: 14917, 501 | 20: 14781, 502 | 21: 8607, 503 | 22: 8915, 504 | 23: 16530, 505 | 24: 4301} 506 | 507 | 508 | ```python 509 | # . . 
. but pandas makes it a lot easier! 510 | rsid_per_chromosome_series = df.groupby('chromosome')['rsid'].count() 511 | rsid_per_chromosome_series.columns = ['chromosome', 'count'] 512 | ``` 513 | 514 | ```python 515 | rsid_per_chromosome_series.plot.barh(figsize=(16,9), fontsize=15) 516 | plt.show() 517 | ``` 518 | 519 |
520 | Graph of rsid per chromosome 521 |
rsid per chromosome
522 |
523 | 524 | 525 | ## Getting data on SNPs from SNPedia 526 | 527 | To acquire more information about my DNA, I pulled files from [SNPedia](https://www.snpedia.com/index.php/SNPedia), a wiki investigating human genetics that gathers extensive data and cites peer-reviewed scientific publications. SNPedia catalogues common, reproducible SNPs (or ones found in meta-analyses or studies of at least 500 patients), or those with other historic or medical significance. 528 | 529 | The columns are: 530 | 531 | * Unnamed: 0 (actually the SNP name with the genotype appended, e.g. `Rs661(A;A)`) 532 | * Magnitude (a subjective measure of interest) 533 | * Repute (a subjective measure of whether the genotype is "good" or "bad" to have based on research, and blank for things like ancestry and eye color) 534 | * Summary (a narrative description) 535 | 536 | 537 | ```python 538 | snp_df = pd.read_csv('result.csv') 539 | snp_df.head() 540 | ``` 541 | 542 |
543 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 |
Unnamed: 0MagnitudeReputeSummary
0Rs661(A;A)9.0Badearly onset Alzheimer's disease
1Rs6647(T;T)0.0GoodNormal; two copies of Pi-M1V allele
2Rs6647(C;C)0.0GoodNormal; two copies of Pi-M1A allele
3Rs1303(T;T)0.0Goodcommon in clinvar
4Rs28929471(G;G)0.0Goodcommon in complete genomics
604 |
605 | 606 | ### Fun with regular expressions 607 | 608 | To align with my original DataFrame, I created a genotype column and used regex to separate out the genotype, which was stitched onto the end of the SNP name. 609 | 610 | 611 | ```python 612 | snp_df['genotype'] = snp_df['Unnamed: 0'].apply(lambda x: 613 | re.sub(r'.*([AGCT]);([AGCT])\)', r'\1\2', x)) 614 | ``` 615 | 616 | 617 | ```python 618 | snp_df.head() 619 | ``` 620 | 621 | 622 |
623 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 |
Unnamed: 0MagnitudeReputeSummarygenotype
0Rs661(A;A)9.0Badearly onset Alzheimer's diseaseAA
1Rs6647(T;T)0.0GoodNormal; two copies of Pi-M1V alleleTT
2Rs6647(C;C)0.0GoodNormal; two copies of Pi-M1A alleleCC
3Rs1303(T;T)0.0Goodcommon in clinvarTT
4Rs28929471(G;G)0.0Goodcommon in complete genomicsGG
690 |
691 | 692 | 693 | 694 | For consistency's sake, I renamed the columns to match my original DataFrame and made sure the rsids were all lower-case. 695 | 696 | 697 | ```python 698 | new_cols = ['rsid', 'magnitude', 'repute', 699 | 'summary', 'genotype'] 700 | snp_df.columns = new_cols 701 | ``` 702 | 703 | I used regex to clean up the rsid a little more, too (because I will take any excuse to use more regex). 704 | 705 | 706 | ```python 707 | snp_df['rsid'] = snp_df['rsid'].map(lambda x : x.lower()) 708 | snp_df['rsid'] = snp_df['rsid'].map(lambda x : 709 | re.sub(r'([a-z]{1,}[\d]+)\([agct];[agct]\)', 710 | r'\1', x)) 711 | ``` 712 | 713 | 714 | ```python 715 | snp_df.head() 716 | ``` 717 | 718 |
719 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 |
rsidmagnitudereputesummarygenotype
0rs6619.0Badearly onset Alzheimer's diseaseAA
1rs66470.0GoodNormal; two copies of Pi-M1V alleleTT
2rs66470.0GoodNormal; two copies of Pi-M1A alleleCC
3rs13030.0Goodcommon in clinvarTT
4rs289294710.0Goodcommon in complete genomicsGG
786 |
787 | 788 | 789 | 790 | ```python 791 | snp_df.info() 792 | ``` 793 | 794 | 795 | RangeIndex: 100 entries, 0 to 99 796 | Data columns (total 5 columns): 797 | rsid 100 non-null object 798 | magnitude 100 non-null float64 799 | repute 91 non-null object 800 | summary 96 non-null object 801 | genotype 100 non-null object 802 | dtypes: float64(1), object(4) 803 | memory usage: 4.0+ KB 804 | 805 | I overwrote the null reputes and summaries. 806 | 807 | ```python 808 | null_repute = snp_df[snp_df['repute'].isnull()] 809 | null_summaries = snp_df[snp_df['summary'].isnull()] 810 | null_repute_and_summaries = pd.concat([null_repute,null_summaries]).drop_duplicates().reset_index(drop=True) 811 | display(null_repute_and_summaries) 812 | ``` 813 | 814 | 815 |
816 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 853 | 854 | 855 | 856 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 |
rsidmagnitudereputesummarygenotype
0rs289315694.0NaNhigh risk of emphysemaCC
1rs289294730.0NaNNaNTT
2rs289315720.0NaNNaNAA
3rs18012523.0NaNNaNGG
4rs81924664.0NaNuncertainTT
5rs49868522.0NaNpredisposition to breast cancer?AA
6rs18007092.0NaNpredisposition to breast cancer?TT
7rs289315920.0NaNNaNTT
8rs49868932.1NaNpoor metabolizer of several commonly prescribe...AA
915 |
916 | 917 | 918 | 919 | ```python 920 | snp_df['repute'].fillna(value='Neutral', inplace=True) 921 | snp_df['summary'].fillna(value='None', inplace=True) 922 | ``` 923 | 924 | 925 | ```python 926 | # No no NaNette 927 | snp_df.isna().any() 928 | ``` 929 | 930 | 931 | 932 | 933 | rsid False 934 | magnitude False 935 | repute False 936 | summary False 937 | genotype False 938 | dtype: bool 939 | 940 | 941 | 942 | # Merging my data with SNPedia 943 | 944 | Appropriately enough, I did an inner join of the SNPedia DataFrame on my DNA to see what data, if any, it had on my particular genotypes. 945 | 946 | 947 | ```python 948 | new_df = snp_df.merge(df, how='inner', on=['rsid', 'genotype'], suffixes=('_SNPedia', '_myDNA')) 949 | ``` 950 | 951 | 952 | ```python 953 | new_df.head(20) 954 | ``` 955 | 956 | 957 | 958 | 959 |
960 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | 1018 | 1019 | 1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | 1027 | 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 1034 | 1035 | 1036 | 1037 | 1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1044 | 1045 | 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 | 1091 | 1092 | 1093 | 1094 | 1095 | 1096 | 1097 | 1098 | 1099 | 1100 | 1101 | 1102 | 1103 | 1104 | 1105 | 1106 | 1107 | 1108 | 1109 | 1110 | 1111 | 1112 | 1113 | 1114 | 1115 | 1116 | 1117 | 1118 | 1119 | 1120 | 1121 | 1122 | 1123 | 1124 | 1125 | 1126 | 1127 | 1128 |
rsidmagnitudereputesummarygenotypechromosomeposition
0rs13030.0Goodcommon in clinvarTT1494844843
1rs175802.5Bada slightly reduced functionality form of Alpha...TT1494847262
2rs289315800.0Goodcommon in clinvarAA1250344783
3rs10427140.0GoodnormalCC5148206473
4rs18008880.0GoodnormalCC5148206885
5rs23037900.0Goodcommon in clinvarAA1657017292
6rs58822.0BadFaster aging. Increased risk for Dementia. Les...AA1657016092
7rs22301992.0Bad2.5x+ risk of ARMDGG196718387
8rs289316080.0Goodcommon in clinvarGG775614497
9rs49868930.0GoodnormalGG1096540410
10rs283995040.0GoodnormalAA1096522463
11rs22349220.0Goodcommon in clinvarAA1226026406
12rs289316140.0Goodcommon in clinvarGG41806119
13rs289330680.0Goodcommon in clinvarCC41807371
1129 |
1130 | 1131 | 1132 | 1133 | ### What's hiding in there? 1134 | 1135 | 1136 | ```python 1137 | # Create a DataFrame for some subsets of genes 1138 | good_genes = new_df[new_df.repute == 'Good'] 1139 | bad_genes = new_df[new_df.repute == 'Bad'] 1140 | interesting_genes = new_df[new_df.magnitude > 4] # 4 is the threshold for "worth your time" given by SNPedia 1141 | ``` 1142 | 1143 | I have plenty of "good" genotypes, but none with a nonzero magnitude. 1144 | 1145 | 1146 | ```python 1147 | good_genes 1148 | ``` 1149 | 1150 | 1151 | 1152 | 1153 |
1154 | 1167 | 1168 | 1169 | 1170 | 1171 | 1172 | 1173 | 1174 | 1175 | 1176 | 1177 | 1178 | 1179 | 1180 | 1181 | 1182 | 1183 | 1184 | 1185 | 1186 | 1187 | 1188 | 1189 | 1190 | 1191 | 1192 | 1193 | 1194 | 1195 | 1196 | 1197 | 1198 | 1199 | 1200 | 1201 | 1202 | 1203 | 1204 | 1205 | 1206 | 1207 | 1208 | 1209 | 1210 | 1211 | 1212 | 1213 | 1214 | 1215 | 1216 | 1217 | 1218 | 1219 | 1220 | 1221 | 1222 | 1223 | 1224 | 1225 | 1226 | 1227 | 1228 | 1229 | 1230 | 1231 | 1232 | 1233 | 1234 | 1235 | 1236 | 1237 | 1238 | 1239 | 1240 | 1241 | 1242 | 1243 | 1244 | 1245 | 1246 | 1247 | 1248 | 1249 | 1250 | 1251 | 1252 | 1253 | 1254 | 1255 | 1256 | 1257 | 1258 | 1259 | 1260 | 1261 | 1262 | 1263 | 1264 | 1265 | 1266 | 1267 | 1268 | 1269 | 1270 | 1271 | 1272 | 1273 | 1274 | 1275 | 1276 | 1277 | 1278 | 1279 | 1280 | 1281 | 1282 | 1283 | 1284 | 1285 | 1286 | 1287 | 1288 | 1289 | 1290 | 1291 | 1292 |
rsidmagnitudereputesummarygenotypechromosomeposition
0rs13030.0Goodcommon in clinvarTT1494844843
2rs289315800.0Goodcommon in clinvarAA1250344783
3rs10427140.0GoodnormalCC5148206473
4rs18008880.0GoodnormalCC5148206885
5rs23037900.0Goodcommon in clinvarAA1657017292
8rs289316080.0Goodcommon in clinvarGG775614497
9rs49868930.0GoodnormalGG1096540410
10rs283995040.0GoodnormalAA1096522463
11rs22349220.0Goodcommon in clinvarAA1226026406
12rs289316140.0Goodcommon in clinvarGG41806119
13rs289330680.0Goodcommon in clinvarCC41807371
1293 |
1294 | 1295 | 1296 | 1297 | I have three "bad" genotypes with a nonzero magnitude. 1298 | 1299 | 1300 | ```python 1301 | bad_genes 1302 | ``` 1303 | 1304 | 1305 | 1306 | 1307 |
1308 | 1321 | 1322 | 1323 | 1324 | 1325 | 1326 | 1327 | 1328 | 1329 | 1330 | 1331 | 1332 | 1333 | 1334 | 1335 | 1336 | 1337 | 1338 | 1339 | 1340 | 1341 | 1342 | 1343 | 1344 | 1345 | 1346 | 1347 | 1348 | 1349 | 1350 | 1351 | 1352 | 1353 | 1354 | 1355 | 1356 | 1357 | 1358 | 1359 | 1360 | 1361 | 1362 | 1363 | 1364 | 1365 | 1366 |
rsidmagnitudereputesummarygenotypechromosomeposition
1rs175802.5Bada slightly reduced functionality form of Alpha...TT1494847262
6rs58822.0BadFaster aging. Increased risk for Dementia. Les...AA1657016092
7rs22301992.0Bad2.5x+ risk of ARMDGG196718387
1367 |
1368 | 1369 | 1370 | 1371 | Sadly, I had no "interesting" genotypes above the threshold of 4, although hearteningly I did possess some slightly interesting bad ones. 1372 | 1373 | 1374 | # Scrape relevant articles with Selenium 1375 | 1376 | I decided I might like to read up on my bad genetics, so I used Selenium to scrape the abstracts of some scientific papers from PubMed. 1377 | 1378 | 1379 | ```python 1380 | # Get the base URL from SNPedia 1381 | base_url = 'https://www.snpedia.com/index.php/' 1382 | ``` 1383 | 1384 | 1385 | ```python 1386 | # Create URLs for each gene that I want to study 1387 | gene_urls = [base_url + rsid for rsid in bad_genes['rsid']] 1388 | 1389 | ``` 1390 | 1391 | ```python 1392 | # Initialize Selenium 1393 | browser = webdriver.Chrome() 1394 | ``` 1395 | 1396 | 1397 | ```python 1398 | import time 1399 | ``` 1400 | 1401 | 1402 | ```python 1403 | # Write a function to visit the SNPedia URLs, click through to PubMed, 1404 | # and retrieve the info on the articles for each gene 1405 | 1406 | def scrape_abstracts(urls): 1407 | 1408 | rsid_list = [] 1409 | all_article_title = [] 1410 | all_article_citation = [] 1411 | all_article_authors = [] 1412 | all_article_abstract = [] 1413 | all_article_links = [] 1414 | 1415 | for url in urls: 1416 | link_urls = [] 1417 | browser.get(url) #load url 1418 | rsid = browser.find_element_by_css_selector('.firstHeading').text 1419 | links_elements = browser.find_elements_by_partial_link_text('PMID') 1420 | 1421 | # get the URLs to the PubMed pages 1422 | for link in links_elements: 1423 | link_urls.append(link.get_attribute('href')) 1424 | 1425 | # follow each link element to PubMed 1426 | for element in link_urls: 1427 | browser.get(element) 1428 | time.sleep(2) 1429 | article_title = browser.find_element_by_xpath("//div[@class='cit']/../h1").text 1430 | article_citation = browser.find_element_by_class_name('cit').text 1431 | article_authors = browser.find_element_by_class_name('auths').text 1432 | 
article_abstract = browser.find_element_by_class_name('abstr').text 1433 | 1434 | rsid_list.append(rsid) 1435 | all_article_title.append(article_title) 1436 | all_article_citation.append(article_citation) 1437 | all_article_authors.append(article_authors) 1438 | all_article_abstract.append(article_abstract) 1439 | all_article_links.append(element) 1440 | 1441 | # store the information 1442 | df = pd.DataFrame() 1443 | df['rsid'] = rsid_list 1444 | df['article_title'] = all_article_title 1445 | df['article_citation'] = all_article_citation 1446 | df['article_authors'] = all_article_authors 1447 | df['article_abstract'] = all_article_abstract 1448 | df['link'] = all_article_links 1449 | 1450 | df = df.drop_duplicates() 1451 | 1452 | df.index = range(len(df.index)) 1453 | 1454 | return df 1455 | ``` 1456 | 1457 | 1458 | ```python 1459 | abstracts_df = scrape_abstracts(gene_urls) 1460 | ``` 1461 | 1462 | For later hypochondriacal perusal, I exported my findings, complete with abstracts and hyperlinks, to a CSV file using the pandas DataFrame.to_csv method. 1463 | 1464 | 1465 | ```python 1466 | # DataFrame to CSV 1467 | export_csv = abstracts_df.to_csv(r'/Users/lorajohns/Documents/Python/DNA/DNA_articles.csv') 1468 | ``` 1469 | 1470 | ## Reading up on the medical literature 1471 | 1472 |
1473 | Formatted DNA_articles.csv in Numbers 1474 |
DNA_articles.csv in Numbers
1475 |
1476 | 1477 | Now I have a handy CSV file, nicely formatted, with citations to scientific articles analyzing and describing my probably un-problematic, but probationally proditory genotypes. Python provides prodigious tools to engage in literal introspection that the sawbones of old could never have imagined. 1478 | -------------------------------------------------------------------------------- /DNA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Know Thyself: Using Data Science to Explore Your Own Genome\n", 8 | "## DNA analysis with pandas and Selenium" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "[23andme](https://www.23andme.com/) once offered me a free DNA and ancestry test kit if I participated in one of their clinical studies. In exchange for my genetic data and a bunch of questionnaires, I got my genome sequenced and gained access to myriad reports on where my ancestors were likely from, what health conditions and traits I probably have inherited, and who else on the site I might be related to.\n", 16 | "\n", 17 | "While 23andme provides an overwhelming amount of consumer-ready infographics and tools, I wondered what else I could do with the data. I knew I could download my raw genetic data from the site in a text file, so I decided to pour it into pandas and see what I could make of it. 
" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "%matplotlib inline\n", 27 | "import seaborn as sns\n", 28 | "sns.set_style('darkgrid')\n", 29 | "sns.color_palette('Spectral')\n", 30 | "import matplotlib.pyplot as plt" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import numpy as np\n", 40 | "import requests\n", 41 | "import pandas as pd\n", 42 | "\n", 43 | "import re" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from selenium import webdriver\n", 53 | "from selenium.webdriver.support.ui import WebDriverWait" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Importing my DNA into pandas and exploring the genome\n", 61 | "\n", 62 | "Looking at the .txt file, I could see that I was missing some genotype values, which were denoted with '--'. \n", 63 | "\n", 64 | "While most of the chromosomes are ints, there is an X and a Y (which I may convert later to a number for analytic purposes), but for now I need to make sure to specify the data type properly so that pandas doesn't throw an error when it sees mixed data in the input. \n", 65 | "\n", 66 | "The other columns are fairly straightforward. 
I also want pandas to ignore the prefatory comments at the beginning of the file that consist of lines beginning with an octothorpe.\n", 67 | "\n", 68 | "The arguments I need to pass are:\n", 69 | "* separator (tab-delimited)\n", 70 | "* dtype (as a dict)\n", 71 | "* na_values ('--'); note that the call below omits this argument, so missing genotypes remain the literal string '--' rather than NaN\n", 72 | "* comment ('#')\n", 73 | "\n", 74 | "![img](/genome.png)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "data = pd.read_csv('genome.txt', sep='\\t', dtype={'rsid':'str', 'chromosome':'object', 'position':'int', 'genotype':'str'}, comment='#')\n", 84 | "\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | " rsid chromosome position genotype\n", 97 | "0 rs548049170 1 69869 TT\n", 98 | "1 rs13328684 1 74792 --\n", 99 | "2 rs9283150 1 565508 AA\n", 100 | "3 i713426 1 726912 --\n", 101 | "4 rs116587930 1 727841 GG\n", 102 | "5 rs3131972 1 752721 AG\n", 103 | "6 rs12184325 1 754105 CC\n", 104 | "7 rs12567639 1 756268 AA\n", 105 | "8 rs114525117 1 759036 GG\n", 106 | "9 rs12124819 1 776546 AA\n", 107 | "10 rs12127425 1 794332 GG\n", 108 | "11 rs79373928 1 801536 TT\n", 109 | "12 rs72888853 1 815421 --\n", 110 | "13 rs7538305 1 824398 AA\n", 111 | "14 rs28444699 1 830181 AA\n", 112 | "15 i713449 1 830731 --\n", 113 | "16 rs116452738 1 834830 GG\n", 114 | "17 rs72631887 1 835092 TT\n", 115 | "18 rs28678693 1 838665 TT\n", 116 | "19 rs4970382 1 840753 CT\n", 117 | "20 rs4475691 1 846808 CC\n", 118 | "21 rs72631889 1 851390 GG\n", 119 | "22 rs7537756 1 854250 AA\n", 120 | "23 rs13302982 1 861808 GG\n", 121 | "24 rs376747791 1 863130 AA\n", 122 | "25 rs2880024 1 866893 CC\n", 123 | "26 rs13302914 1 868404 TT\n", 124 | "27 rs76723341 1 872952 CC\n", 125 | "28 rs2272757 1 881627 AA\n", 126 | "29 rs35471880 1 881918 GG\n", 127 | "... ... ... ... 
...\n", 128 | "638501 i702582 MT 16399 A\n", 129 | "638502 i3001912 MT 16428 G\n", 130 | "638503 i705509 MT 16428 G\n", 131 | "638504 i3001918 MT 16463 A\n", 132 | "638505 i703465 MT 16463 A\n", 133 | "638506 i3001919 MT 16465 C\n", 134 | "638507 i702453 MT 16465 C\n", 135 | "638508 i4000619 MT 16468 T\n", 136 | "638509 i704237 MT 16468 T\n", 137 | "638510 i3001920 MT 16470 G\n", 138 | "638511 i703422 MT 16470 G\n", 139 | "638512 i3001921 MT 16471 --\n", 140 | "638513 i702337 MT 16471 --\n", 141 | "638514 i3001925 MT 16482 A\n", 142 | "638515 i708027 MT 16482 A\n", 143 | "638516 i3001926 MT 16483 G\n", 144 | "638517 i704927 MT 16483 G\n", 145 | "638518 i4000691 MT 16488 C\n", 146 | "638519 i706404 MT 16488 C\n", 147 | "638520 i706772 MT 16494 C\n", 148 | "638521 i3001927 MT 16497 A\n", 149 | "638522 i705833 MT 16497 A\n", 150 | "638523 i4000690 MT 16518 G\n", 151 | "638524 i701050 MT 16518 G\n", 152 | "638525 i701374 MT 16523 A\n", 153 | "638526 i4000693 MT 16524 A\n", 154 | "638527 i704756 MT 16524 A\n", 155 | "638528 i705255 MT 16525 A\n", 156 | "638529 i4000757 MT 16526 G\n", 157 | "638530 i701671 MT 16526 G\n", 158 | "\n", 159 | "[638531 rows x 4 columns]\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "print(data)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "A quick note on the column names:\n", 172 | "\n", 173 | "* rsid stands for Reference SNP cluster ID. It identifies unique SNPs.\n", 174 | "\n", 175 | "* SNPs are Single Nucleotide Polymorphisms ('snips'), locations in the genome that are known to vary between individuals. They can influence disease risk, drug efficacy and side-effects, tell you about your ancestry, and predict aspects of how you look and act.\n", 176 | "\n", 177 | "* All humans have almost the same sequence of 3 billion DNA bases (A,C,G, or T) distributed between their 23 pairs of chromosomes. 
But at certain locations there are differences that have been reported to be meaningful, either medically or for other reasons (such as for genealogy). SNPedia catalogues SNPs that have significant medical consequences, are common, are reproducible (or found in meta-analyses or studies of at least 500 patients), or have other historic or medical significance.\n", 178 | "\n", 179 | "I started off by navigating my new data frame with some basic exploratory data analysis and data cleaning." 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# read_csv already returned a DataFrame; wrap it as df and do some EDA\n", 189 | "df = pd.DataFrame(data)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 7, 195 | "metadata": { 196 | "scrolled": true 197 | }, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/html": [ 202 | "
\n", 203 | "\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " 
\n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | "
rsidchromosomepositiongenotype
0rs548049170169869TT
1rs13328684174792--
2rs92831501565508AA
3i7134261726912--
4rs1165879301727841GG
5rs31319721752721AG
6rs121843251754105CC
7rs125676391756268AA
8rs1145251171759036GG
9rs121248191776546AA
10rs121274251794332GG
11rs793739281801536TT
12rs728888531815421--
13rs75383051824398AA
14rs284446991830181AA
15i7134491830731--
16rs1164527381834830GG
17rs726318871835092TT
18rs286786931838665TT
19rs49703821840753CT
20rs44756911846808CC
21rs726318891851390GG
22rs75377561854250AA
23rs133029821861808GG
24rs3767477911863130AA
\n", 404 | "
" 405 | ], 406 | "text/plain": [ 407 | " rsid chromosome position genotype\n", 408 | "0 rs548049170 1 69869 TT\n", 409 | "1 rs13328684 1 74792 --\n", 410 | "2 rs9283150 1 565508 AA\n", 411 | "3 i713426 1 726912 --\n", 412 | "4 rs116587930 1 727841 GG\n", 413 | "5 rs3131972 1 752721 AG\n", 414 | "6 rs12184325 1 754105 CC\n", 415 | "7 rs12567639 1 756268 AA\n", 416 | "8 rs114525117 1 759036 GG\n", 417 | "9 rs12124819 1 776546 AA\n", 418 | "10 rs12127425 1 794332 GG\n", 419 | "11 rs79373928 1 801536 TT\n", 420 | "12 rs72888853 1 815421 --\n", 421 | "13 rs7538305 1 824398 AA\n", 422 | "14 rs28444699 1 830181 AA\n", 423 | "15 i713449 1 830731 --\n", 424 | "16 rs116452738 1 834830 GG\n", 425 | "17 rs72631887 1 835092 TT\n", 426 | "18 rs28678693 1 838665 TT\n", 427 | "19 rs4970382 1 840753 CT\n", 428 | "20 rs4475691 1 846808 CC\n", 429 | "21 rs72631889 1 851390 GG\n", 430 | "22 rs7537756 1 854250 AA\n", 431 | "23 rs13302982 1 861808 GG\n", 432 | "24 rs376747791 1 863130 AA" 433 | ] 434 | }, 435 | "execution_count": 7, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "df.head(25)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 8, 447 | "metadata": {}, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/plain": [ 452 | " rsid False\n", 453 | "chromosome False\n", 454 | "position False\n", 455 | "genotype False\n", 456 | "dtype: bool" 457 | ] 458 | }, 459 | "execution_count": 8, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "df.isna().any()" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 9, 471 | "metadata": { 472 | "scrolled": true 473 | }, 474 | "outputs": [ 475 | { 476 | "data": { 477 | "text/plain": [ 478 | " rsid 638531\n", 479 | "chromosome 25\n", 480 | "position 634934\n", 481 | "genotype 20\n", 482 | "dtype: int64" 483 | ] 484 | }, 485 | "execution_count": 9, 486 | "metadata": {}, 487 | 
"output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "df.nunique()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 10, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "name": "stdout", 501 | "output_type": "stream", 502 | "text": [ 503 | "\n", 504 | "RangeIndex: 638531 entries, 0 to 638530\n", 505 | "Data columns (total 4 columns):\n", 506 | " rsid 638531 non-null object\n", 507 | "chromosome 638531 non-null object\n", 508 | "position 638531 non-null int64\n", 509 | "genotype 638531 non-null object\n", 510 | "dtypes: int64(1), object(3)\n", 511 | "memory usage: 19.5+ MB\n" 512 | ] 513 | } 514 | ], 515 | "source": [ 516 | "df.info()" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 11, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/html": [ 527 | "
\n", 528 | "\n", 541 | "\n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | "
rsidchromosomepositiongenotype
449i605996712526746AA
2816i6052145111009679GG
5325i6012699119992513CC
5339i6059797120020994TT
5791i6058167121795388GG
\n", 589 | "
" 590 | ], 591 | "text/plain": [ 592 | " rsid chromosome position genotype\n", 593 | "449 i6059967 1 2526746 AA\n", 594 | "2816 i6052145 1 11009679 GG\n", 595 | "5325 i6012699 1 19992513 CC\n", 596 | "5339 i6059797 1 20020994 TT\n", 597 | "5791 i6058167 1 21795388 GG" 598 | ] 599 | }, 600 | "metadata": {}, 601 | "output_type": "display_data" 602 | }, 603 | { 604 | "name": "stdout", 605 | "output_type": "stream", 606 | "text": [ 607 | "\n", 608 | "Int64Index: 3597 entries, 449 to 638530\n", 609 | "Data columns (total 4 columns):\n", 610 | " rsid 3597 non-null object\n", 611 | "chromosome 3597 non-null object\n", 612 | "position 3597 non-null int64\n", 613 | "genotype 3597 non-null object\n", 614 | "dtypes: int64(1), object(3)\n", 615 | "memory usage: 140.5+ KB\n" 616 | ] 617 | }, 618 | { 619 | "data": { 620 | "text/plain": [ 621 | "None" 622 | ] 623 | }, 624 | "metadata": {}, 625 | "output_type": "display_data" 626 | } 627 | ], 628 | "source": [ 629 | "duplicates = df[df.duplicated(subset='position')]\n", 630 | "display(duplicates.head())\n", 631 | "display(duplicates.info())" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 12, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "# How many SNPs am I missing by not having a Y chromosome?\n", 641 | "Y_chromosome = df[df.chromosome == 'Y']" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 13, 647 | "metadata": { 648 | "scrolled": true 649 | }, 650 | "outputs": [ 651 | { 652 | "data": { 653 | "text/plain": [ 654 | "3733" 655 | ] 656 | }, 657 | "execution_count": 13, 658 | "metadata": {}, 659 | "output_type": "execute_result" 660 | } 661 | ], 662 | "source": [ 663 | "len(Y_chromosome)" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "Since I don't have this chromosome, I'll just drop it from the DataFrame." 
671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 14, 676 | "metadata": {}, 677 | "outputs": [ 678 | { 679 | "name": "stdout", 680 | "output_type": "stream", 681 | "text": [ 682 | "\n", 683 | "Int64Index: 634798 entries, 0 to 638530\n", 684 | "Data columns (total 4 columns):\n", 685 | " rsid 634798 non-null object\n", 686 | "chromosome 634798 non-null object\n", 687 | "position 634798 non-null int64\n", 688 | "genotype 634798 non-null object\n", 689 | "dtypes: int64(1), object(3)\n", 690 | "memory usage: 24.2+ MB\n" 691 | ] 692 | } 693 | ], 694 | "source": [ 695 | "df = df[df.chromosome != 'Y']\n", 696 | "df.info()" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "Most of the chromosomes are numeric; only X, Y, and mitochondrial are characters. I'll convert them to numbers, cast them to ints, and create a dictionary to translate back later so that the data will be more manipulable." 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 15, 709 | "metadata": { 710 | "scrolled": true 711 | }, 712 | "outputs": [ 713 | { 714 | "data": { 715 | "text/plain": [ 716 | "array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',\n", 717 | " '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X',\n", 718 | " 'MT'], dtype=object)" 719 | ] 720 | }, 721 | "execution_count": 15, 722 | "metadata": {}, 723 | "output_type": "execute_result" 724 | } 725 | ], 726 | "source": [ 727 | "df['chromosome'].unique()" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 16, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "df['chromosome'] = df['chromosome'].apply(lambda x: re.sub(r'X', r'23', x))\n", 737 | "df['chromosome'] = df['chromosome'].apply(lambda x: re.sub(r'MT', r'24', x))" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 17, 743 | "metadata": {}, 744 | "outputs": [], 745 | "source": [ 746 | 
"df['chromosome'] = df['chromosome'].apply(lambda x: int(x))" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 18, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "chromosome_dict = {1:'1', 2:'2', 3:'3', 4:'4', 5:'5', 6:'6', 7:'7', 8:'8', 9:'9', 10:'10', 11:'11', 12:'12', 13:'13', \n", 756 | " 14:'14', 15:'15', 16:'16', 17:'17', 18:'18', 19:'19', 20:'20', 21:'21', 22:'22', 23:'X', 24:'MT'}" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": 19, 762 | "metadata": {}, 763 | "outputs": [ 764 | { 765 | "name": "stdout", 766 | "output_type": "stream", 767 | "text": [ 768 | "{1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '10', 11: '11', 12: '12', 13: '13', 14: '14', 15: '15', 16: '16', 17: '17', 18: '18', 19: '19', 20: '20', 21: '21', 22: '22', 23: 'X', 24: 'MT'}\n", 769 | "\n", 770 | "Int64Index: 634798 entries, 0 to 638530\n", 771 | "Data columns (total 4 columns):\n", 772 | " rsid 634798 non-null object\n", 773 | "chromosome 634798 non-null int64\n", 774 | "position 634798 non-null int64\n", 775 | "genotype 634798 non-null object\n", 776 | "dtypes: int64(2), object(2)\n", 777 | "memory usage: 24.2+ MB\n" 778 | ] 779 | } 780 | ], 781 | "source": [ 782 | "print(chromosome_dict)\n", 783 | "df.info()" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "There are 16,005 genotypes that I simply do not have:" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 20, 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "data": { 800 | "text/plain": [ 801 | "16005" 802 | ] 803 | }, 804 | "execution_count": 20, 805 | "metadata": {}, 806 | "output_type": "execute_result" 807 | } 808 | ], 809 | "source": [ 810 | "genotype_na = df[df.genotype == '--']\n", 811 | "len(genotype_na)" 812 | ] 813 | }, 814 | { 815 | "cell_type": "markdown", 816 | "metadata": {}, 817 | "source": [ 818 | "### Some visualizations" 
819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 21, 824 | "metadata": { 825 | "scrolled": true 826 | }, 827 | "outputs": [ 828 | { 829 | "name": "stdout", 830 | "output_type": "stream", 831 | "text": [ 832 | "\n", 833 | "Int64Index: 49514 entries, 0 to 49513\n", 834 | "Data columns (total 4 columns):\n", 835 | " rsid 49514 non-null object\n", 836 | "chromosome 49514 non-null int64\n", 837 | "position 49514 non-null int64\n", 838 | "genotype 49514 non-null object\n", 839 | "dtypes: int64(2), object(2)\n", 840 | "memory usage: 1.9+ MB\n" 841 | ] 842 | } 843 | ], 844 | "source": [ 845 | "df[df.chromosome == 1].info()" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": 22, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [ 854 | "# Remove that pesky whitespace from the column name\n", 855 | "df.rename({' rsid': 'rsid'}, axis='columns', inplace=True)" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "How many SNPs are there per chromosome?" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 23, 868 | "metadata": {}, 869 | "outputs": [], 870 | "source": [ 871 | "# We can do this manually with a for loop . . 
.\n", 872 | "x = []\n", 873 | "y = []\n", 874 | "for k in chromosome_dict:\n", 875 | " x.append(k)\n", 876 | " y.append(len(df[df.chromosome == k]))\n", 877 | "rsid_per_chromosome = dict(zip(x,y)) " 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 24, 883 | "metadata": { 884 | "scrolled": false 885 | }, 886 | "outputs": [ 887 | { 888 | "data": { 889 | "text/plain": [ 890 | "{1: 49514,\n", 891 | " 2: 51775,\n", 892 | " 3: 43024,\n", 893 | " 4: 39474,\n", 894 | " 5: 37032,\n", 895 | " 6: 44023,\n", 896 | " 7: 34357,\n", 897 | " 8: 31683,\n", 898 | " 9: 26446,\n", 899 | " 10: 30525,\n", 900 | " 11: 30942,\n", 901 | " 12: 29432,\n", 902 | " 13: 22080,\n", 903 | " 14: 19961,\n", 904 | " 15: 19006,\n", 905 | " 16: 20397,\n", 906 | " 17: 19401,\n", 907 | " 18: 17675,\n", 908 | " 19: 14917,\n", 909 | " 20: 14781,\n", 910 | " 21: 8607,\n", 911 | " 22: 8915,\n", 912 | " 23: 16530,\n", 913 | " 24: 4301}" 914 | ] 915 | }, 916 | "execution_count": 24, 917 | "metadata": {}, 918 | "output_type": "execute_result" 919 | } 920 | ], 921 | "source": [ 922 | "rsid_per_chromosome" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 25, 928 | "metadata": {}, 929 | "outputs": [], 930 | "source": [ 931 | "# . . . but pandas makes it a lot easier!\n", 932 | "rsid_per_chromosome_series = df.groupby('chromosome')['rsid'].count()\n", 933 | "rsid_per_chromosome_series.columns = ['chromosome', 'count']" 934 | ] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": 26, 939 | "metadata": {}, 940 | "outputs": [ 941 | { 942 | "data": { 943 | "image/png": "\n", 944 | "text/plain": [ 945 | "
" 946 | ] 947 | }, 948 | "metadata": { 949 | "needs_background": "light" 950 | }, 951 | "output_type": "display_data" 952 | } 953 | ], 954 | "source": [ 955 | "rsid_per_chromosome_series.plot.barh(figsize=(16,9), fontsize=15)\n", 956 | "plt.show()" 957 | ] 958 | }, 959 | { 960 | "cell_type": "markdown", 961 | "metadata": {}, 962 | "source": [ 963 | "## Getting data on SNPs from SNPedia\n", 964 | "\n", 965 | "To get some more info about my DNA, I pulled some information on clinically significant SNPs from SNPedia.\n", 966 | "\n", 967 | "The columns are:\n", 968 | "\n", 969 | "* Unnamed: 0 (actually the SNP name)\n", 970 | "* Magnitude (a subjective measure of interest)\n", 971 | "* Repute (a subjective measure of whether the genotype is \"good\" or \"bad\" to have based on research, and blank for things like ancestry and eye color)\n", 972 | "* Summary (a narrative description)" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": 27, 978 | "metadata": {}, 979 | "outputs": [ 980 | { 981 | "data": { 982 | "text/html": [ 983 | "
\n", 984 | "\n", 997 | "\n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | "
Unnamed: 0MagnitudeReputeSummary
0Rs661(A;A)9.0Badearly onset Alzheimer's disease
1Rs6647(T;T)0.0GoodNormal; two copies of Pi-M1V allele
2Rs6647(C;C)0.0GoodNormal; two copies of Pi-M1A allele
3Rs1303(T;T)0.0Goodcommon in clinvar
4Rs28929471(G;G)0.0Goodcommon in complete genomics
\n", 1045 | "
" 1046 | ], 1047 | "text/plain": [ 1048 | " Unnamed: 0 Magnitude Repute Summary\n", 1049 | "0 Rs661(A;A) 9.0 Bad early onset Alzheimer's disease\n", 1050 | "1 Rs6647(T;T) 0.0 Good Normal; two copies of Pi-M1V allele\n", 1051 | "2 Rs6647(C;C) 0.0 Good Normal; two copies of Pi-M1A allele\n", 1052 | "3 Rs1303(T;T) 0.0 Good common in clinvar\n", 1053 | "4 Rs28929471(G;G) 0.0 Good common in complete genomics" 1054 | ] 1055 | }, 1056 | "execution_count": 27, 1057 | "metadata": {}, 1058 | "output_type": "execute_result" 1059 | } 1060 | ], 1061 | "source": [ 1062 | "snp_df = pd.read_csv('result.csv')\n", 1063 | "snp_df.head()" 1064 | ] 1065 | }, 1066 | { 1067 | "cell_type": "markdown", 1068 | "metadata": {}, 1069 | "source": [ 1070 | "To match up with my original DataFrame, I'll create a genotype column and use regex to separate out the genotype, which is tacked onto the end of the SNP." 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "code", 1075 | "execution_count": 28, 1076 | "metadata": {}, 1077 | "outputs": [], 1078 | "source": [ 1079 | "snp_df['genotype'] = snp_df['Unnamed: 0'].apply(lambda x: re.sub(r'.*([AGCT]);([AGCT])\\)', r'\\1\\2', x))" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "code", 1084 | "execution_count": 29, 1085 | "metadata": {}, 1086 | "outputs": [ 1087 | { 1088 | "data": { 1089 | "text/html": [ 1090 | "
\n", 1091 | "\n", 1104 | "\n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | "
Unnamed: 0MagnitudeReputeSummarygenotype
0Rs661(A;A)9.0Badearly onset Alzheimer's diseaseAA
1Rs6647(T;T)0.0GoodNormal; two copies of Pi-M1V alleleTT
2Rs6647(C;C)0.0GoodNormal; two copies of Pi-M1A alleleCC
3Rs1303(T;T)0.0Goodcommon in clinvarTT
4Rs28929471(G;G)0.0Goodcommon in complete genomicsGG
\n", 1158 | "
" 1159 | ], 1160 | "text/plain": [ 1161 | " Unnamed: 0 Magnitude Repute Summary \\\n", 1162 | "0 Rs661(A;A) 9.0 Bad early onset Alzheimer's disease \n", 1163 | "1 Rs6647(T;T) 0.0 Good Normal; two copies of Pi-M1V allele \n", 1164 | "2 Rs6647(C;C) 0.0 Good Normal; two copies of Pi-M1A allele \n", 1165 | "3 Rs1303(T;T) 0.0 Good common in clinvar \n", 1166 | "4 Rs28929471(G;G) 0.0 Good common in complete genomics \n", 1167 | "\n", 1168 | " genotype \n", 1169 | "0 AA \n", 1170 | "1 TT \n", 1171 | "2 CC \n", 1172 | "3 TT \n", 1173 | "4 GG " 1174 | ] 1175 | }, 1176 | "execution_count": 29, 1177 | "metadata": {}, 1178 | "output_type": "execute_result" 1179 | } 1180 | ], 1181 | "source": [ 1182 | "snp_df.head()" 1183 | ] 1184 | }, 1185 | { 1186 | "cell_type": "markdown", 1187 | "metadata": {}, 1188 | "source": [ 1189 | "For consistency's sake, I renamed the columns to match my original DataFrame and made sure the rsids were all lower-case." 1190 | ] 1191 | }, 1192 | { 1193 | "cell_type": "code", 1194 | "execution_count": 30, 1195 | "metadata": {}, 1196 | "outputs": [], 1197 | "source": [ 1198 | "new_cols = ['rsid', 'magnitude', 'repute', 'summary', 'genotype']\n", 1199 | "snp_df.columns = new_cols" 1200 | ] 1201 | }, 1202 | { 1203 | "cell_type": "markdown", 1204 | "metadata": {}, 1205 | "source": [ 1206 | "I'll use regex to clean up the rsid a little more, too." 1207 | ] 1208 | }, 1209 | { 1210 | "cell_type": "code", 1211 | "execution_count": 31, 1212 | "metadata": {}, 1213 | "outputs": [], 1214 | "source": [ 1215 | "snp_df['rsid'] = snp_df['rsid'].map(lambda x : x.lower())\n", 1216 | "snp_df['rsid'] = snp_df['rsid'].map(lambda x : re.sub(r'([a-z]{1,}[\\d]+)\\([agct];[agct]\\)', r'\\1', x))" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "code", 1221 | "execution_count": 32, 1222 | "metadata": { 1223 | "scrolled": false 1224 | }, 1225 | "outputs": [ 1226 | { 1227 | "data": { 1228 | "text/html": [ 1229 | "
\n", 1230 | "\n", 1243 | "\n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | "
rsidmagnitudereputesummarygenotype
0rs6619.0Badearly onset Alzheimer's diseaseAA
1rs66470.0GoodNormal; two copies of Pi-M1V alleleTT
2rs66470.0GoodNormal; two copies of Pi-M1A alleleCC
3rs13030.0Goodcommon in clinvarTT
4rs289294710.0Goodcommon in complete genomicsGG
\n", 1297 | "
" 1298 | ], 1299 | "text/plain": [ 1300 | " rsid magnitude repute summary genotype\n", 1301 | "0 rs661 9.0 Bad early onset Alzheimer's disease AA\n", 1302 | "1 rs6647 0.0 Good Normal; two copies of Pi-M1V allele TT\n", 1303 | "2 rs6647 0.0 Good Normal; two copies of Pi-M1A allele CC\n", 1304 | "3 rs1303 0.0 Good common in clinvar TT\n", 1305 | "4 rs28929471 0.0 Good common in complete genomics GG" 1306 | ] 1307 | }, 1308 | "execution_count": 32, 1309 | "metadata": {}, 1310 | "output_type": "execute_result" 1311 | } 1312 | ], 1313 | "source": [ 1314 | "snp_df.head()" 1315 | ] 1316 | }, 1317 | { 1318 | "cell_type": "code", 1319 | "execution_count": 33, 1320 | "metadata": { 1321 | "scrolled": true 1322 | }, 1323 | "outputs": [ 1324 | { 1325 | "name": "stdout", 1326 | "output_type": "stream", 1327 | "text": [ 1328 | "\n", 1329 | "RangeIndex: 100 entries, 0 to 99\n", 1330 | "Data columns (total 5 columns):\n", 1331 | "rsid 100 non-null object\n", 1332 | "magnitude 100 non-null float64\n", 1333 | "repute 91 non-null object\n", 1334 | "summary 96 non-null object\n", 1335 | "genotype 100 non-null object\n", 1336 | "dtypes: float64(1), object(4)\n", 1337 | "memory usage: 4.0+ KB\n" 1338 | ] 1339 | } 1340 | ], 1341 | "source": [ 1342 | "snp_df.info()" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "markdown", 1347 | "metadata": {}, 1348 | "source": [ 1349 | "Let's see what's going on with the null reputes and summaries and overwrite them if it's appropriate. (In this case, the answer seems to be yes.)" 1350 | ] 1351 | }, 1352 | { 1353 | "cell_type": "code", 1354 | "execution_count": 34, 1355 | "metadata": {}, 1356 | "outputs": [ 1357 | { 1358 | "data": { 1359 | "text/html": [ 1360 | "
\n", 1361 | "\n", 1374 | "\n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | "
rsidmagnitudereputesummarygenotype
0rs289315694.0NaNhigh risk of emphysemaCC
1rs289294730.0NaNNaNTT
2rs289315720.0NaNNaNAA
3rs18012523.0NaNNaNGG
4rs81924664.0NaNuncertainTT
5rs49868522.0NaNpredisposition to breast cancer?AA
6rs18007092.0NaNpredisposition to breast cancer?TT
7rs289315920.0NaNNaNTT
8rs49868932.1NaNpoor metabolizer of several commonly prescribe...AA
\n", 1460 | "
" 1461 | ], 1462 | "text/plain": [ 1463 | " rsid magnitude repute \\\n", 1464 | "0 rs28931569 4.0 NaN \n", 1465 | "1 rs28929473 0.0 NaN \n", 1466 | "2 rs28931572 0.0 NaN \n", 1467 | "3 rs1801252 3.0 NaN \n", 1468 | "4 rs8192466 4.0 NaN \n", 1469 | "5 rs4986852 2.0 NaN \n", 1470 | "6 rs1800709 2.0 NaN \n", 1471 | "7 rs28931592 0.0 NaN \n", 1472 | "8 rs4986893 2.1 NaN \n", 1473 | "\n", 1474 | " summary genotype \n", 1475 | "0 high risk of emphysema CC \n", 1476 | "1 NaN TT \n", 1477 | "2 NaN AA \n", 1478 | "3 NaN GG \n", 1479 | "4 uncertain TT \n", 1480 | "5 predisposition to breast cancer? AA \n", 1481 | "6 predisposition to breast cancer? TT \n", 1482 | "7 NaN TT \n", 1483 | "8 poor metabolizer of several commonly prescribe... AA " 1484 | ] 1485 | }, 1486 | "metadata": {}, 1487 | "output_type": "display_data" 1488 | } 1489 | ], 1490 | "source": [ 1491 | "null_repute = snp_df[snp_df['repute'].isnull()]\n", 1492 | "null_summaries = snp_df[snp_df['summary'].isnull()]\n", 1493 | "null_repute_and_summaries = pd.concat([null_repute,null_summaries]).drop_duplicates().reset_index(drop=True)\n", 1494 | "display(null_repute_and_summaries)" 1495 | ] 1496 | }, 1497 | { 1498 | "cell_type": "code", 1499 | "execution_count": 35, 1500 | "metadata": {}, 1501 | "outputs": [], 1502 | "source": [ 1503 | "snp_df['repute'].fillna(value='Neutral', inplace=True)\n", 1504 | "snp_df['summary'].fillna(value='None', inplace=True)" 1505 | ] 1506 | }, 1507 | { 1508 | "cell_type": "code", 1509 | "execution_count": 36, 1510 | "metadata": {}, 1511 | "outputs": [ 1512 | { 1513 | "data": { 1514 | "text/plain": [ 1515 | "rsid False\n", 1516 | "magnitude False\n", 1517 | "repute False\n", 1518 | "summary False\n", 1519 | "genotype False\n", 1520 | "dtype: bool" 1521 | ] 1522 | }, 1523 | "execution_count": 36, 1524 | "metadata": {}, 1525 | "output_type": "execute_result" 1526 | } 1527 | ], 1528 | "source": [ 1529 | "# No no NaNette\n", 1530 | "snp_df.isna().any()" 1531 | ] 1532 | }, 1533 | { 1534 | 
"cell_type": "markdown", 1535 | "metadata": {}, 1536 | "source": [ 1537 | "# Merging my data with SNPedia\n", 1538 | "\n", 1539 | "Here, I've done an inner join of the SNPedia DataFrame with my own DNA data (matching on both `rsid` and `genotype`) to see what information, if any, SNPedia has on my particular genotypes." 1540 | ] 1541 | }, 1542 | { 1543 | "cell_type": "code", 1544 | "execution_count": 37, 1545 | "metadata": {}, 1546 | "outputs": [], 1547 | "source": [ 1548 | "new_df = snp_df.merge(df, how='inner', on=['rsid', 'genotype'], suffixes=('_SNPedia', '_myDNA'))" 1549 | ] 1550 | }, 1551 | { 1552 | "cell_type": "code", 1553 | "execution_count": 38, 1554 | "metadata": {}, 1555 | "outputs": [ 1556 | { 1557 | "data": { 1558 | "text/html": [ 1559 | "
\n", 1560 | "\n", 1573 | "\n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " 
\n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | "
rsidmagnitudereputesummarygenotypechromosomeposition
0rs13030.0Goodcommon in clinvarTT1494844843
1rs175802.5Bada slightly reduced functionality form of Alpha...TT1494847262
2rs289315800.0Goodcommon in clinvarAA1250344783
3rs10427140.0GoodnormalCC5148206473
4rs18008880.0GoodnormalCC5148206885
5rs23037900.0Goodcommon in clinvarAA1657017292
6rs58822.0BadFaster aging. Increased risk for Dementia. Les...AA1657016092
7rs22301992.0Bad2.5x+ risk of ARMDGG196718387
8rs289316080.0Goodcommon in clinvarGG775614497
9rs49868930.0GoodnormalGG1096540410
10rs283995040.0GoodnormalAA1096522463
11rs22349220.0Goodcommon in clinvarAA1226026406
12rs289316140.0Goodcommon in clinvarGG41806119
13rs289330680.0Goodcommon in clinvarCC41807371
\n", 1729 | "
" 1730 | ], 1731 | "text/plain": [ 1732 | " rsid magnitude repute \\\n", 1733 | "0 rs1303 0.0 Good \n", 1734 | "1 rs17580 2.5 Bad \n", 1735 | "2 rs28931580 0.0 Good \n", 1736 | "3 rs1042714 0.0 Good \n", 1737 | "4 rs1800888 0.0 Good \n", 1738 | "5 rs2303790 0.0 Good \n", 1739 | "6 rs5882 2.0 Bad \n", 1740 | "7 rs2230199 2.0 Bad \n", 1741 | "8 rs28931608 0.0 Good \n", 1742 | "9 rs4986893 0.0 Good \n", 1743 | "10 rs28399504 0.0 Good \n", 1744 | "11 rs2234922 0.0 Good \n", 1745 | "12 rs28931614 0.0 Good \n", 1746 | "13 rs28933068 0.0 Good \n", 1747 | "\n", 1748 | " summary genotype chromosome \\\n", 1749 | "0 common in clinvar TT 14 \n", 1750 | "1 a slightly reduced functionality form of Alpha... TT 14 \n", 1751 | "2 common in clinvar AA 12 \n", 1752 | "3 normal CC 5 \n", 1753 | "4 normal CC 5 \n", 1754 | "5 common in clinvar AA 16 \n", 1755 | "6 Faster aging. Increased risk for Dementia. Les... AA 16 \n", 1756 | "7 2.5x+ risk of ARMD GG 19 \n", 1757 | "8 common in clinvar GG 7 \n", 1758 | "9 normal GG 10 \n", 1759 | "10 normal AA 10 \n", 1760 | "11 common in clinvar AA 1 \n", 1761 | "12 common in clinvar GG 4 \n", 1762 | "13 common in clinvar CC 4 \n", 1763 | "\n", 1764 | " position \n", 1765 | "0 94844843 \n", 1766 | "1 94847262 \n", 1767 | "2 50344783 \n", 1768 | "3 148206473 \n", 1769 | "4 148206885 \n", 1770 | "5 57017292 \n", 1771 | "6 57016092 \n", 1772 | "7 6718387 \n", 1773 | "8 75614497 \n", 1774 | "9 96540410 \n", 1775 | "10 96522463 \n", 1776 | "11 226026406 \n", 1777 | "12 1806119 \n", 1778 | "13 1807371 " 1779 | ] 1780 | }, 1781 | "execution_count": 38, 1782 | "metadata": {}, 1783 | "output_type": "execute_result" 1784 | } 1785 | ], 1786 | "source": [ 1787 | "new_df.head(20)" 1788 | ] 1789 | }, 1790 | { 1791 | "cell_type": "markdown", 1792 | "metadata": {}, 1793 | "source": [ 1794 | "### What's hiding in there?" 
1795 | ] 1796 | }, 1797 | { 1798 | "cell_type": "code", 1799 | "execution_count": 39, 1800 | "metadata": {}, 1801 | "outputs": [], 1802 | "source": [ 1803 | "# Create a DataFrame for some subsets of genes\n", 1804 | "good_genes = new_df[new_df.repute == 'Good']\n", 1805 | "bad_genes = new_df[new_df.repute == 'Bad']\n", 1806 | "interesting_genes = new_df[new_df.magnitude > 4] # 4 is the threshold for \"worth your time\" given by SNPedia" 1807 | ] 1808 | }, 1809 | { 1810 | "cell_type": "markdown", 1811 | "metadata": {}, 1812 | "source": [ 1813 | "I have plenty of \"good\" genotypes, but none with a nonzero magnitude." 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "code", 1818 | "execution_count": 40, 1819 | "metadata": { 1820 | "scrolled": true 1821 | }, 1822 | "outputs": [ 1823 | { 1824 | "data": { 1825 | "text/html": [ 1826 | "
\n", 1827 | "\n", 1840 | "\n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | " \n", 1881 | " \n", 1882 | " \n", 1883 | " \n", 1884 | " \n", 1885 | " \n", 1886 | " \n", 1887 | " \n", 1888 | " \n", 1889 | " \n", 1890 | " \n", 1891 | " \n", 1892 | " \n", 1893 | " \n", 1894 | " \n", 1895 | " \n", 1896 | " \n", 1897 | " \n", 1898 | " \n", 1899 | " \n", 1900 | " \n", 1901 | " \n", 1902 | " \n", 1903 | " \n", 1904 | " \n", 1905 | " \n", 1906 | " \n", 1907 | " \n", 1908 | " \n", 1909 | " \n", 1910 | " \n", 1911 | " \n", 1912 | " \n", 1913 | " \n", 1914 | " \n", 1915 | " \n", 1916 | " \n", 1917 | " \n", 1918 | " \n", 1919 | " \n", 1920 | " \n", 1921 | " \n", 1922 | " \n", 1923 | " \n", 1924 | " \n", 1925 | " \n", 1926 | " \n", 1927 | " \n", 1928 | " \n", 1929 | " \n", 1930 | " \n", 1931 | " \n", 1932 | " \n", 1933 | " \n", 1934 | " \n", 1935 | " \n", 1936 | " \n", 1937 | " \n", 1938 | " \n", 1939 | " \n", 1940 | " \n", 1941 | " \n", 1942 | " \n", 1943 | " \n", 1944 | " \n", 1945 | " \n", 1946 | " \n", 1947 | " \n", 1948 | " \n", 1949 | " \n", 1950 | " \n", 1951 | " \n", 1952 | " \n", 1953 | " \n", 1954 | " \n", 1955 | " \n", 1956 | " \n", 1957 | " \n", 1958 | " \n", 1959 | " \n", 1960 | " \n", 1961 | " \n", 1962 | " \n", 1963 | " \n", 1964 | " \n", 1965 | "
rsidmagnitudereputesummarygenotypechromosomeposition
0rs13030.0Goodcommon in clinvarTT1494844843
2rs289315800.0Goodcommon in clinvarAA1250344783
3rs10427140.0GoodnormalCC5148206473
4rs18008880.0GoodnormalCC5148206885
5rs23037900.0Goodcommon in clinvarAA1657017292
8rs289316080.0Goodcommon in clinvarGG775614497
9rs49868930.0GoodnormalGG1096540410
10rs283995040.0GoodnormalAA1096522463
11rs22349220.0Goodcommon in clinvarAA1226026406
12rs289316140.0Goodcommon in clinvarGG41806119
13rs289330680.0Goodcommon in clinvarCC41807371
\n", 1966 | "
" 1967 | ], 1968 | "text/plain": [ 1969 | " rsid magnitude repute summary genotype chromosome \\\n", 1970 | "0 rs1303 0.0 Good common in clinvar TT 14 \n", 1971 | "2 rs28931580 0.0 Good common in clinvar AA 12 \n", 1972 | "3 rs1042714 0.0 Good normal CC 5 \n", 1973 | "4 rs1800888 0.0 Good normal CC 5 \n", 1974 | "5 rs2303790 0.0 Good common in clinvar AA 16 \n", 1975 | "8 rs28931608 0.0 Good common in clinvar GG 7 \n", 1976 | "9 rs4986893 0.0 Good normal GG 10 \n", 1977 | "10 rs28399504 0.0 Good normal AA 10 \n", 1978 | "11 rs2234922 0.0 Good common in clinvar AA 1 \n", 1979 | "12 rs28931614 0.0 Good common in clinvar GG 4 \n", 1980 | "13 rs28933068 0.0 Good common in clinvar CC 4 \n", 1981 | "\n", 1982 | " position \n", 1983 | "0 94844843 \n", 1984 | "2 50344783 \n", 1985 | "3 148206473 \n", 1986 | "4 148206885 \n", 1987 | "5 57017292 \n", 1988 | "8 75614497 \n", 1989 | "9 96540410 \n", 1990 | "10 96522463 \n", 1991 | "11 226026406 \n", 1992 | "12 1806119 \n", 1993 | "13 1807371 " 1994 | ] 1995 | }, 1996 | "execution_count": 40, 1997 | "metadata": {}, 1998 | "output_type": "execute_result" 1999 | } 2000 | ], 2001 | "source": [ 2002 | "good_genes" 2003 | ] 2004 | }, 2005 | { 2006 | "cell_type": "markdown", 2007 | "metadata": {}, 2008 | "source": [ 2009 | "I have three \"bad\" genotypes with a nonzero magnitude." 2010 | ] 2011 | }, 2012 | { 2013 | "cell_type": "code", 2014 | "execution_count": 41, 2015 | "metadata": {}, 2016 | "outputs": [ 2017 | { 2018 | "data": { 2019 | "text/html": [ 2020 | "
\n", 2021 | "\n", 2034 | "\n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | " \n", 2063 | " \n", 2064 | " \n", 2065 | " \n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | "
rsidmagnitudereputesummarygenotypechromosomeposition
1rs175802.5Bada slightly reduced functionality form of Alpha...TT1494847262
6rs58822.0BadFaster aging. Increased risk for Dementia. Les...AA1657016092
7rs22301992.0Bad2.5x+ risk of ARMDGG196718387
\n", 2080 | "
" 2081 | ], 2082 | "text/plain": [ 2083 | " rsid magnitude repute \\\n", 2084 | "1 rs17580 2.5 Bad \n", 2085 | "6 rs5882 2.0 Bad \n", 2086 | "7 rs2230199 2.0 Bad \n", 2087 | "\n", 2088 | " summary genotype chromosome \\\n", 2089 | "1 a slightly reduced functionality form of Alpha... TT 14 \n", 2090 | "6 Faster aging. Increased risk for Dementia. Les... AA 16 \n", 2091 | "7 2.5x+ risk of ARMD GG 19 \n", 2092 | "\n", 2093 | " position \n", 2094 | "1 94847262 \n", 2095 | "6 57016092 \n", 2096 | "7 6718387 " 2097 | ] 2098 | }, 2099 | "execution_count": 41, 2100 | "metadata": {}, 2101 | "output_type": "execute_result" 2102 | } 2103 | ], 2104 | "source": [ 2105 | "bad_genes" 2106 | ] 2107 | }, 2108 | { 2109 | "cell_type": "markdown", 2110 | "metadata": {}, 2111 | "source": [ 2112 | "Sadly, I have no \"interesting\" genotypes above the threshold of 4, although I have some slightly interesting bad ones." 2113 | ] 2114 | }, 2115 | { 2116 | "cell_type": "code", 2117 | "execution_count": 42, 2118 | "metadata": {}, 2119 | "outputs": [ 2120 | { 2121 | "data": { 2122 | "text/html": [ 2123 | "
\n", 2124 | "\n", 2137 | "\n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | "
rsidmagnitudereputesummarygenotypechromosomeposition
\n", 2153 | "
" 2154 | ], 2155 | "text/plain": [ 2156 | "Empty DataFrame\n", 2157 | "Columns: [rsid, magnitude, repute, summary, genotype, chromosome, position]\n", 2158 | "Index: []" 2159 | ] 2160 | }, 2161 | "execution_count": 42, 2162 | "metadata": {}, 2163 | "output_type": "execute_result" 2164 | } 2165 | ], 2166 | "source": [ 2167 | "interesting_genes" 2168 | ] 2169 | }, 2170 | { 2171 | "cell_type": "markdown", 2172 | "metadata": {}, 2173 | "source": [ 2174 | "# Scrape relevant articles with Selenium\n", 2175 | "\n", 2176 | "Now I'd like to read up on my bad genetics, so I'll use Selenium to grab the abstracts of some scientific papers for me from PubMed." 2177 | ] 2178 | }, 2179 | { 2180 | "cell_type": "code", 2181 | "execution_count": 43, 2182 | "metadata": {}, 2183 | "outputs": [], 2184 | "source": [ 2185 | "# Get the base URL from SNPedia\n", 2186 | "base_url = 'https://www.snpedia.com/index.php/'" 2187 | ] 2188 | }, 2189 | { 2190 | "cell_type": "code", 2191 | "execution_count": 44, 2192 | "metadata": {}, 2193 | "outputs": [ 2194 | { 2195 | "name": "stdout", 2196 | "output_type": "stream", 2197 | "text": [ 2198 | "https://www.snpedia.com/index.php/rs17580 \n", 2199 | "\n", 2200 | "https://www.snpedia.com/index.php/rs5882 \n", 2201 | "\n", 2202 | "https://www.snpedia.com/index.php/rs2230199 \n", 2203 | "\n" 2204 | ] 2205 | } 2206 | ], 2207 | "source": [ 2208 | "# Create URLs for each gene that I want to study\n", 2209 | "gene_urls = [base_url + rsid for rsid in bad_genes['rsid']]\n", 2210 | "for url in gene_urls:\n", 2211 | " print(url, '\\n')" 2212 | ] 2213 | }, 2214 | { 2215 | "cell_type": "code", 2216 | "execution_count": 45, 2217 | "metadata": {}, 2218 | "outputs": [], 2219 | "source": [ 2220 | "# Initialize Selenium\n", 2221 | "browser = webdriver.Chrome()" 2222 | ] 2223 | }, 2224 | { 2225 | "cell_type": "code", 2226 | "execution_count": 46, 2227 | "metadata": {}, 2228 | "outputs": [], 2229 | "source": [ 2230 | "import time" 2231 | ] 2232 | }, 2233 | { 2234 | 
"cell_type": "code", 2235 | "execution_count": 47, 2236 | "metadata": { 2237 | "scrolled": false 2238 | }, 2239 | "outputs": [], 2240 | "source": [ 2241 | "# Write a function to visit the SNPedia URLs, click through to PubMed, \n", 2242 | "# and retrieve the info on the articles for each gene\n", 2243 | "\n", 2244 | "def scrape_abstracts(urls):\n", 2245 | " \n", 2246 | " #all_df = pd.DataFrame()\n", 2247 | " rsid_list = []\n", 2248 | " all_article_links = []\n", 2249 | " all_article_title = []\n", 2250 | " all_article_citation = []\n", 2251 | " all_article_authors = []\n", 2252 | " all_article_abstract = []\n", 2253 | " \n", 2254 | " for url in urls:\n", 2255 | " browser.get(url) #load url\n", 2256 | " rsid = browser.find_element_by_css_selector('.firstHeading').text\n", 2257 | " links_elements = browser.find_elements_by_partial_link_text('PMID')\n", 2258 | " link_urls = []\n", 2259 | " \n", 2260 | " for link in links_elements:\n", 2261 | " link_urls.append(link.get_attribute('href')) # get the URLs to the PubMed pages\n", 2262 | " \n", 2263 | " for element in link_urls:\n", 2264 | " browser.get(element) # follow each link element to PubMed\n", 2265 | " time.sleep(.8)\n", 2266 | " article_title = browser.find_element_by_xpath(\"//div[@class='cit']/../h1\").text\n", 2267 | " article_citation = browser.find_element_by_class_name('cit').text\n", 2268 | " article_authors = browser.find_element_by_class_name('auths').text\n", 2269 | " article_abstract = browser.find_element_by_class_name('abstr').text\n", 2270 | "\n", 2271 | " rsid_list.append(rsid)\n", 2272 | " all_article_title.append(article_title)\n", 2273 | " all_article_citation.append(article_citation)\n", 2274 | " all_article_authors.append(article_authors)\n", 2275 | " all_article_abstract.append(article_abstract)\n", 2276 | " all_article_links.append(element)\n", 2277 | " \n", 2278 | " print(len(rsid_list) == len(link_urls) == len(all_article_title) == len(all_article_citation) == len(all_article_authors) == 
len(all_article_abstract))\n", 2279 | " \n", 2280 | " df = pd.DataFrame() # store the information\n", 2281 | " df['rsid'] = rsid_list\n", 2282 | " df['article_title'] = all_article_title\n", 2283 | " df['article_citation'] = all_article_citation\n", 2284 | " df['article_authors'] = all_article_authors\n", 2285 | " df['article_abstract'] = all_article_abstract\n", 2286 | " df['link'] = all_article_links\n", 2287 | " df = df.drop_duplicates()\n", 2288 | "\n", 2289 | " df.index = range(len(df.index))\n", 2290 | " \n", 2291 | " return df" 2292 | ] 2293 | }, 2294 | { 2295 | "cell_type": "code", 2296 | "execution_count": 52, 2297 | "metadata": {}, 2298 | "outputs": [ 2299 | { 2300 | "name": "stdout", 2301 | "output_type": "stream", 2302 | "text": [ 2303 | "True\n", 2304 | "False\n", 2305 | "False\n" 2306 | ] 2307 | } 2308 | ], 2309 | "source": [ 2310 | "abstracts_df = scrape_abstracts(gene_urls)" 2311 | ] 2312 | }, 2313 | { 2314 | "cell_type": "code", 2315 | "execution_count": 55, 2316 | "metadata": {}, 2317 | "outputs": [ 2318 | { 2319 | "data": { 2320 | "text/html": [ 2321 | "
\n", 2322 | "\n", 2335 | "\n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " \n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | " \n", 2431 | " \n", 2432 | " \n", 2433 | " \n", 2434 | " \n", 2435 | " \n", 2436 | " \n", 2437 | " \n", 2438 | " \n", 2439 | " \n", 2440 | " \n", 2441 | " \n", 2442 | " \n", 2443 | " \n", 2444 | " \n", 2445 | " \n", 2446 | " \n", 2447 | " \n", 2448 | " \n", 2449 | " \n", 2450 | " \n", 2451 | " \n", 2452 | " \n", 2453 | " \n", 2454 | " \n", 2455 | " \n", 2456 | " \n", 2457 | " \n", 2458 | " \n", 2459 | " \n", 2460 | " \n", 2461 | " \n", 2462 | " \n", 2463 | " \n", 2464 | " \n", 2465 | " \n", 2466 | " \n", 2467 | " \n", 2468 | " \n", 2469 | " \n", 2470 | " \n", 2471 | " \n", 2472 | " \n", 2473 | " \n", 2474 | " \n", 2475 | " \n", 2476 | " 
\n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2488 | " \n", 2489 | " \n", 2490 | " \n", 2491 | " \n", 2492 | " \n", 2493 | " \n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2502 | " \n", 2503 | " \n", 2504 | " \n", 2505 | " \n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | " \n", 2512 | " \n", 2513 | " \n", 2514 | " \n", 2515 | " \n", 2516 | " \n", 2517 | " \n", 2518 | " \n", 2519 | " \n", 2520 | " \n", 2521 | " \n", 2522 | " \n", 2523 | " \n", 2524 | " \n", 2525 | " \n", 2526 | " \n", 2527 | " \n", 2528 | " \n", 2529 | " \n", 2530 | " \n", 2531 | " \n", 2532 | " \n", 2533 | " \n", 2534 | " \n", 2535 | " \n", 2536 | " \n", 2537 | " \n", 2538 | " \n", 2539 | " \n", 2540 | " \n", 2541 | " \n", 2542 | " \n", 2543 | " \n", 2544 | " \n", 2545 | " \n", 2546 | " \n", 2547 | " \n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | " \n", 2559 | " \n", 2560 | " \n", 2561 | " \n", 2562 | " \n", 2563 | " \n", 2564 | " \n", 2565 | " \n", 2566 | " \n", 2567 | " \n", 2568 | " \n", 2569 | " \n", 2570 | " \n", 2571 | " \n", 2572 | " \n", 2573 | " \n", 2574 | " \n", 2575 | " \n", 2576 | " \n", 2577 | " \n", 2578 | " \n", 2579 | " \n", 2580 | " \n", 2581 | " \n", 2582 | " \n", 2583 | " \n", 2584 | " \n", 2585 | " \n", 2586 | " \n", 2587 | " \n", 2588 | " \n", 2589 | " \n", 2590 | " \n", 2591 | " \n", 2592 | " \n", 2593 | " \n", 2594 | " \n", 2595 | " \n", 2596 | " \n", 2597 | " \n", 2598 | " \n", 2599 | " \n", 2600 | " \n", 2601 | " \n", 2602 | " \n", 2603 | " \n", 2604 | " \n", 2605 | " \n", 2606 | " \n", 2607 | " \n", 2608 | " \n", 2609 | " \n", 2610 | " \n", 2611 | " \n", 2612 | " \n", 2613 | " \n", 2614 | " \n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | " \n", 2619 | 
" \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | " \n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | " \n", 2654 | " \n", 2655 | " \n", 2656 | " \n", 2657 | " \n", 2658 | " \n", 2659 | " \n", 2660 | " \n", 2661 | " \n", 2662 | " \n", 2663 | " \n", 2664 | " \n", 2665 | " \n", 2666 | " \n", 2667 | " \n", 2668 | " \n", 2669 | " \n", 2670 | " \n", 2671 | " \n", 2672 | " \n", 2673 | " \n", 2674 | " \n", 2675 | " \n", 2676 | " \n", 2677 | " \n", 2678 | " \n", 2679 | " \n", 2680 | " \n", 2681 | " \n", 2682 | " \n", 2683 | " \n", 2684 | " \n", 2685 | " \n", 2686 | " \n", 2687 | " \n", 2688 | " \n", 2689 | " \n", 2690 | " \n", 2691 | " \n", 2692 | " \n", 2693 | " \n", 2694 | " \n", 2695 | " \n", 2696 | " \n", 2697 | " \n", 2698 | " \n", 2699 | " \n", 2700 | " \n", 2701 | " \n", 2702 | " \n", 2703 | " \n", 2704 | " \n", 2705 | " \n", 2706 | " \n", 2707 | " \n", 2708 | " \n", 2709 | " \n", 2710 | " \n", 2711 | " \n", 2712 | " \n", 2713 | " \n", 2714 | " \n", 2715 | " \n", 2716 | " \n", 2717 | " \n", 2718 | " \n", 2719 | " \n", 2720 | " \n", 2721 | " \n", 2722 | " \n", 2723 | " \n", 2724 | " \n", 2725 | " \n", 2726 | " \n", 2727 | " \n", 2728 | " \n", 2729 | " \n", 2730 | " \n", 2731 | " \n", 2732 | " \n", 2733 | " \n", 2734 | " \n", 2735 | " \n", 2736 | " \n", 2737 | " \n", 2738 | " \n", 2739 | " \n", 2740 | " \n", 2741 | " \n", 2742 | " \n", 2743 | " \n", 2744 | " \n", 2745 | " \n", 2746 | " \n", 2747 | " \n", 2748 | " \n", 2749 | " \n", 2750 | " \n", 2751 | " \n", 2752 | " \n", 2753 | " \n", 2754 | " \n", 2755 | " \n", 2756 | " \n", 2757 | " \n", 2758 | " \n", 2759 | " \n", 2760 | " \n", 2761 | " \n", 2762 
| " \n", 2763 | " \n", 2764 | " \n", 2765 | " \n", 2766 | " \n", 2767 | " \n", 2768 | " \n", 2769 | " \n", 2770 | " \n", 2771 | " \n", 2772 | " \n", 2773 | " \n", 2774 | " \n", 2775 | " \n", 2776 | " \n", 2777 | " \n", 2778 | " \n", 2779 | " \n", 2780 | " \n", 2781 | " \n", 2782 | " \n", 2783 | " \n", 2784 | " \n", 2785 | " \n", 2786 | " \n", 2787 | " \n", 2788 | " \n", 2789 | " \n", 2790 | " \n", 2791 | " \n", 2792 | " \n", 2793 | " \n", 2794 | " \n", 2795 | " \n", 2796 | " \n", 2797 | " \n", 2798 | " \n", 2799 | " \n", 2800 | " \n", 2801 | " \n", 2802 | " \n", 2803 | " \n", 2804 | " \n", 2805 | " \n", 2806 | " \n", 2807 | " \n", 2808 | " \n", 2809 | " \n", 2810 | " \n", 2811 | " \n", 2812 | " \n", 2813 | " \n", 2814 | " \n", 2815 | " \n", 2816 | " \n", 2817 | " \n", 2818 | " \n", 2819 | " \n", 2820 | " \n", 2821 | " \n", 2822 | " \n", 2823 | " \n", 2824 | " \n", 2825 | " \n", 2826 | " \n", 2827 | " \n", 2828 | " \n", 2829 | " \n", 2830 | " \n", 2831 | " \n", 2832 | " \n", 2833 | " \n", 2834 | " \n", 2835 | " \n", 2836 | " \n", 2837 | " \n", 2838 | " \n", 2839 | " \n", 2840 | " \n", 2841 | " \n", 2842 | " \n", 2843 | " \n", 2844 | " \n", 2845 | " \n", 2846 | " \n", 2847 | " \n", 2848 | " \n", 2849 | " \n", 2850 | " \n", 2851 | " \n", 2852 | " \n", 2853 | " \n", 2854 | " \n", 2855 | " \n", 2856 | " \n", 2857 | " \n", 2858 | " \n", 2859 | " \n", 2860 | " \n", 2861 | " \n", 2862 | " \n", 2863 | " \n", 2864 | " \n", 2865 | " \n", 2866 | " \n", 2867 | " \n", 2868 | " \n", 2869 | " \n", 2870 | " \n", 2871 | " \n", 2872 | " \n", 2873 | " \n", 2874 | " \n", 2875 | " \n", 2876 | " \n", 2877 | " \n", 2878 | " \n", 2879 | " \n", 2880 | " \n", 2881 | " \n", 2882 | " \n", 2883 | " \n", 2884 | " \n", 2885 | " \n", 2886 | " \n", 2887 | " \n", 2888 | " \n", 2889 | " \n", 2890 | " \n", 2891 | " \n", 2892 | " \n", 2893 | " \n", 2894 | " \n", 2895 | " \n", 2896 | " \n", 2897 | " \n", 2898 | "
rsidarticle_titlearticle_citationarticle_authorsarticle_abstractlink
0rs17580Heterozygosity for the alpha1-antitrypsin Z al...Aliment Pharmacol Ther. 2011 Feb;33(3):389-94....Mihalache F1, Höblinger A, Grünhage F, Krawczy...Abstract\\nBACKGROUND:\\nAlpha1-antitrypsin (α1A...https://www.ncbi.nlm.nih.gov/pubmed/21138453?d...
1rs17580Genetic polymorphisms and susceptibility to lu...J Negat Results Biomed. 2006 Apr 11;5:5.Lee PL1, West C, Crain K, Wang L.Abstract\\nSusceptibility to infection by bacte...https://www.ncbi.nlm.nih.gov/pubmed/16608528?d...
2rs17580Prevalence of genetic polymorphisms in the pro...BMC Gastroenterol. 2010 Feb 20;10:22. doi: 10....Kok KF1, te Morsche RH, van Oijen MG, Drenth JP.Abstract\\nBACKGROUND:\\nAlpha-1 antitrypsin (A1...https://www.ncbi.nlm.nih.gov/pubmed/20170533?d...
3rs17580Serum levels and genotype distribution of α1-a...Thorax. 2012 Aug;67(8):669-74. doi: 10.1136/th...Ferrarotti I1, Thun GA, Zorzetto M, Ottaviani ...Abstract\\nRATIONALE:\\nα1-Antitrypsin (AAT) def...https://www.ncbi.nlm.nih.gov/pubmed/22426792?d...
4rs17580Molecular abnormality of PI S variant of human...Am J Hum Genet. 1977 May;29(3):233-9.Yoshida A, Ewing C, Wessels M, Lieberman J, Ga...Abstract\\nAlpha1-antitrypsin variant protein w...https://www.ncbi.nlm.nih.gov/pubmed/301355?dop...
5rs17580Alpha 1-antitrypsin deficiency caused by the a...J Clin Invest. 1989 Apr;83(4):1144-52.Curiel D1, Brantly M, Curiel E, Stier L, Cryst...Abstract\\nalpha 1-Antitrypsin (alpha 1AT) defi...https://www.ncbi.nlm.nih.gov/pubmed/2539391?do...
6rs17580SERPINA1 PiZ and PiS heterozygotes and lung fu...PLoS One. 2012;7(8):e42728. doi: 10.1371/journ...Thun GA1, Ferrarotti I, Imboden M, Rochat T, G...Abstract\\nBACKGROUND:\\nSevere alpha1-antitryps...https://www.ncbi.nlm.nih.gov/pubmed/22912729?d...
7rs17580Prevalence of alpha-1 antitrypsin high-risk va...Arch Bronconeumol. 2015 Feb;51(2):80-85. doi: ...Pérez-Rubio G1, Jiménez-Valverde LO2, Ramírez-...Abstract\\nINTRODUCTION:\\nChronic obstructive p...https://www.ncbi.nlm.nih.gov/pubmed/25454901?d...
8rs5882Association of a functional polymorphism in th...JAMA. 2010 Jan 13;303(2):150-8. doi: 10.1001/j...Sanders AE1, Wang C, Katz M, Derby CA, Barzila...Abstract\\nCONTEXT:\\nPolymorphisms in the chole...https://www.ncbi.nlm.nih.gov/pubmed/20068209?d...
9rs5882Common genetic variation in multiple metabolic...J Lipid Res. 2010 Dec;51(12):3524-32. doi: 10....Peloso GM1, Demissie S, Collins D, Mirel DB, G...Abstract\\nA low level of HDL-C is the most com...https://www.ncbi.nlm.nih.gov/pubmed/20855565?d...
10rs5882The CETP I405V polymorphism is associated with...Aging Cell. 2012 Apr;11(2):228-33. doi: 10.111...Yu L1, Shulman JM, Chibnik L, Leurgans S, Schn...Abstract\\nThe cholesteryl ester transfer prote...https://www.ncbi.nlm.nih.gov/pubmed/22122979?d...
11rs5882Cholesteryl Ester Transfer Protein (CETP) poly...PLoS One. 2012;7(3):e31930. doi: 10.1371/journ...Papp AC1, Pinsonneault JK, Wang D, Newman LC, ...Abstract\\nPolymorphisms in and around the Chol...https://www.ncbi.nlm.nih.gov/pubmed/22403620?d...
12rs5882Association testing by DNA pooling: an effecti...Proc Natl Acad Sci U S A. 2002 Dec 24;99(26):1...Bansal A1, van den Boom D, Kammerer S, Honisch...Abstract\\nWith an ever-increasing resource of ...https://www.ncbi.nlm.nih.gov/pubmed/12475937?d...
13rs5882Direct molecular haplotyping of long-range gen...Proc Natl Acad Sci U S A. 2003 Jun 24;100(13):...Ding C1, Cantor CR.Abstract\\nHaplotypes, combinations of several ...https://www.ncbi.nlm.nih.gov/pubmed/12802015?d...
14rs5882Effects of cholesterol ester transfer protein ...Atherosclerosis. 2008 Jan;196(1):455-60. Epub ...Terán-García M1, Després JP, Tremblay A, Bouch...Abstract\\nCholesterol ester transfer protein (...https://www.ncbi.nlm.nih.gov/pubmed/17196207?d...
15rs5882No association of CETP genotype with cognitive...Neurosci Lett. 2007 Jun 13;420(2):189-92. Epub...Johnson W1, Harris SE, Collins P, Starr JM, Wh...Abstract\\nA cholesteryl ester transfer protein...https://www.ncbi.nlm.nih.gov/pubmed/17531380?d...
16rs5882Lack of replication of genetic associations wi...Biogerontology. 2008 Apr;9(2):85-92. Epub 2007...Novelli V1, Viviani Anselmi C, Roncarati R, Gu...Abstract\\nThe exceptional longevity of centena...https://www.ncbi.nlm.nih.gov/pubmed/18034366?d...
17rs5882A hierarchical and modular approach to the dis...BMC Genet. 2008 Jan 14;9:6. doi: 10.1186/1471-...Sebastiani P1, Zhao Z, Abad-Grau MM, Riva A, H...Abstract\\nBACKGROUND:\\nOne of the challenges o...https://www.ncbi.nlm.nih.gov/pubmed/18194558?d...
18rs5882New application of intelligent agents in spora...BMC Bioinformatics. 2008 May 30;9:254. doi: 10...Penco S1, Buscema M, Patrosso MC, Marocchi A, ...Abstract\\nBACKGROUND:\\nFew genetic factors pre...https://www.ncbi.nlm.nih.gov/pubmed/18513389?d...
19rs5882Cholesterol ester transfer protein, interleuki...Am J Cardiol. 2008 Jun 15;101(12):1683-8. doi:...Enquobahrie DA1, Smith NL, Bis JC, Carty CL, R...Abstract\\nVariations in candidate genes partic...https://www.ncbi.nlm.nih.gov/pubmed/18549840?d...
20rs5882Cholesteryl ester transfer protein (CETP) gene...Ann Hum Genet. 2008 Nov;72(Pt 6):732-41. doi: ...Meiner V1, Friedlander Y, Milo H, Sharon N, Be...Abstract\\nAlthough Cholesteryl Ester Transfer ...https://www.ncbi.nlm.nih.gov/pubmed/18637884?d...
21rs5882Multiple genetic variants along candidate path...J Lipid Res. 2008 Dec;49(12):2582-9. doi: 10.1...Lu Y1, Dollé ME, Imholz S, van 't Slot R, Vers...Abstract\\nThe known genetic variants determini...https://www.ncbi.nlm.nih.gov/pubmed/18660489?d...
22rs5882Genetic-epidemiological evidence on genes asso...Exp Gerontol. 2009 Mar;44(3):136-60. doi: 10.1...Boes E1, Coassin S, Kollerits B, Heid IM, Kron...Abstract\\nHigh-density lipoprotein (HDL) parti...https://www.ncbi.nlm.nih.gov/pubmed/19041386?d...
23rs5882A meta-analysis of candidate gene polymorphism...Stroke. 2009 Mar;40(3):683-95. doi: 10.1161/ST...Wang X1, Cheng S, Brophy VH, Erlich HA, Mannha...Abstract\\nBACKGROUND AND PURPOSE:\\nIschemic st...https://www.ncbi.nlm.nih.gov/pubmed/19131662?d...
24rs5882Genetic risk factors in recurrent venous throm...Clin Chim Acta. 2009 Apr;402(1-2):189-92.Zee RY1, Bubes V, Shrivastava S, Ridker PM, Gl...Abstract\\nBACKGROUND:\\nRecurrent venous thromb...https://www.ncbi.nlm.nih.gov/pubmed/19263529?d...
25rs5882TagSNP transferability and relative loss of va...J Biomed Sci. 2009 Aug 14;16:73. doi: 10.1186/...Lins TC1, Abreu BS, Pereira RW.Abstract\\nBACKGROUND:\\nThe application of a su...https://www.ncbi.nlm.nih.gov/pubmed/19682379?d...
26rs5882Genetic loci associated with plasma concentrat...Circ Cardiovasc Genet. 2008 Oct;1(1):21-30. do...Chasman DI1, Paré G, Zee RY, Parker AN, Cook N...Abstract\\nBACKGROUND:\\nGenome-wide genetic ass...https://www.ncbi.nlm.nih.gov/pubmed/19802338?d...
27rs5882Polymorphism in the CETP gene region, HDL chol...Circ Cardiovasc Genet. 2009 Feb;2(1):26-33. do...Ridker PM1, Paré G, Parker AN, Zee RY, Miletic...Abstract\\nBACKGROUND:\\nRecent trial data have ...https://www.ncbi.nlm.nih.gov/pubmed/20031564?d...
28rs5882Associations between common genetic polymorphi...Atherosclerosis. 2011 May;216(1):166-9. doi: 1...Legry V1, Bokor S, Beghin L, Galfo M, Gonzalez...Abstract\\nOBJECTIVE:\\nGenetic variability in t...https://www.ncbi.nlm.nih.gov/pubmed/21316679?d...
29rs5882Gender and single nucleotide polymorphisms in ...J Nutr. 2012 Sep;142(9):1764-71. doi: 10.3945/...Clifford AJ1, Chen K, McWade L, Rincon G, Kim ...Abstract\\nUsing linear regression models, we s...https://www.ncbi.nlm.nih.gov/pubmed/22833659?d...
.....................
58rs2230199A meta-analysis of candidate gene polymorphism...Stroke. 2009 Mar;40(3):683-95. doi: 10.1161/ST...Wang X1, Cheng S, Brophy VH, Erlich HA, Mannha...Abstract\\nBACKGROUND AND PURPOSE:\\nIschemic st...https://www.ncbi.nlm.nih.gov/pubmed/19131662?d...
59rs2230199Common variation in the SERPING1 gene is not a...Mol Vis. 2009;15:200-7. Epub 2009 Jan 23.Park KH1, Ryu E, Tosakulwong N, Wu Y, Edwards AO.Abstract\\nPURPOSE:\\nCommon genetic variation i...https://www.ncbi.nlm.nih.gov/pubmed/19169411?d...
60rs2230199Assessing susceptibility to age-related macula...Mol Cell Proteomics. 2009 Jun;8(6):1338-49. do...Gu J1, Pauer GJ, Yue X, Narendra U, Sturgill G...Abstract\\nAge-related macular degeneration (AM...https://www.ncbi.nlm.nih.gov/pubmed/19202148?d...
61rs2230199Multilocus analysis of age-related macular deg...Eur J Hum Genet. 2009 Sep;17(9):1190-9. doi: 1...Bergeron-Sawitzke J1, Gold B, Olsh A, Schlotte...Abstract\\nAge-related macular degeneration (AM...https://www.ncbi.nlm.nih.gov/pubmed/19259132?d...
62rs2230199Genetic risk factors in recurrent venous throm...Clin Chim Acta. 2009 Apr;402(1-2):189-92.Zee RY1, Bubes V, Shrivastava S, Ridker PM, Gl...Abstract\\nBACKGROUND:\\nRecurrent venous thromb...https://www.ncbi.nlm.nih.gov/pubmed/19263529?d...
63rs2230199Association of 77 polymorphisms in 52 candidat...J Hypertens. 2009 Mar;27(3):476-83.Conen D1, Cheng S, Steiner LL, Buring JE, Ridk...Abstract\\nOBJECTIVE:\\nGenetic risk factors for...https://www.ncbi.nlm.nih.gov/pubmed/19330901?d...
64rs2230199Single nucleotide polymorphisms of the tenomod...Mol Vis. 2009;15:762-70. Epub 2009 Apr 15.Tolppanen AM1, Nevalainen T, Kolehmainen M, Se...Abstract\\nPURPOSE:\\nTenomodulin (TNMD) is loca...https://www.ncbi.nlm.nih.gov/pubmed/19381347?d...
65rs2230199A candidate gene association study of 77 polym...J Pain. 2009 Jul;10(7):759-66. doi: 10.1016/j....Schürks M1, Kurth T, Buring JE, Zee RY.Abstract\\nPopulation-based studies have establ...https://www.ncbi.nlm.nih.gov/pubmed/19559392?d...
66rs2230199Plasma complement components and activation fr...Invest Ophthalmol Vis Sci. 2009 Dec;50(12):581...Reynolds R1, Hartnett ME, Atkinson JP, Giclas ...Abstract\\nPURPOSE:\\nSeveral genes encoding com...https://www.ncbi.nlm.nih.gov/pubmed/19661236?d...
67rs2230199CFH, C3 and ARMS2 are significant risk loci fo...PLoS One. 2009 Oct 12;4(10):e7418. doi: 10.137...Scholl HP1, Fleckenstein M, Fritsche LG, Schmi...Abstract\\nBACKGROUND:\\nAge-related macular deg...https://www.ncbi.nlm.nih.gov/pubmed/19823576?d...
68rs2230199Complement component 3 polymorphisms interact ...Am J Clin Nutr. 2009 Dec;90(6):1665-73. doi: 1...Phillips CM1, Goumidi L, Bertrais S, Ferguson ...Abstract\\nBACKGROUND:\\nComplement component 3 ...https://www.ncbi.nlm.nih.gov/pubmed/19828715?d...
69rs2230199Genetic analysis of typical wet-type age-relat...J Ocul Biol Dis Infor. 2009 Dec 22;2(4):164-175.Goto A, Akahori M, Okamoto H, Minami M, Terauc...Abstract\\nAge-related macular degeneration (AM...https://www.ncbi.nlm.nih.gov/pubmed/20157352?d...
70rs2230199R102G polymorphism of the C3 gene associated w...Mol Vis. 2010 Jul 15;16:1324-30.Zerbib J1, Richard F, Puche N, Leveziel N, Coh...Abstract\\nPURPOSE:\\nMajor genetic factors for ...https://www.ncbi.nlm.nih.gov/pubmed/20664795?d...
71rs2230199Assessing susceptibility to age-related macula...Arch Ophthalmol. 2011 Mar;129(3):344-51. doi: ...Chen Y1, Zeng J, Zhao C, Wang K, Trood E, Bueh...Abstract\\nOBJECTIVES:\\nTo evaluate the indepen...https://www.ncbi.nlm.nih.gov/pubmed/21402993?d...
72rs2230199Using genetic variation and environmental risk...PLoS One. 2011 Mar 24;6(3):e17784. doi: 10.137...Spencer KL1, Olson LM, Schnetz-Boutaud N, Gall...Abstract\\nA major goal of personalized medicin...https://www.ncbi.nlm.nih.gov/pubmed/21455292?d...
73rs2230199Association of polymorphisms in C2, CFB and C3...Exp Eye Res. 2012 Mar;96(1):42-7. doi: 10.1016...Kim SJ1, Lee SJ, Kim NR, Chin HS.Abstract\\nThis study was to investigate the as...https://www.ncbi.nlm.nih.gov/pubmed/22273503?d...
74rs2230199Complement C3 gene polymorphism in renal trans...Gene. 2012 May 1;498(2):254-8. doi: 10.1016/j....Bazyar N1, Azarpira N, Khatami SR, Galehdari H...Abstract\\nThe C3 component of complement has d...https://www.ncbi.nlm.nih.gov/pubmed/22361228?d...
75rs2230199Heritability and genome-wide association study...Ophthalmology. 2012 Sep;119(9):1874-85. doi: 1...Sobrin L1, Ripke S, Yu Y, Fagerness J, Bhangal...Abstract\\nPURPOSE:\\nTo investigate whether the...https://www.ncbi.nlm.nih.gov/pubmed/22705344?d...
76rs2230199Pharmacogenetics for genes associated with age...Ophthalmology. 2013 Mar;120(3):593-599. doi: 1...Hagstrom SA1, Ying GS2, Pauer GJT3, Sturgill-S...Abstract\\nPURPOSE:\\nTo evaluate the pharmacoge...https://www.ncbi.nlm.nih.gov/pubmed/23337555?d...
77rs2230199Genetic influences on the outcome of anti-vasc...Ophthalmology. 2013 Aug;120(8):1641-8. doi: 10...Abedi F1, Wickremasinghe S, Richardson AJ, Isl...Abstract\\nPURPOSE:\\nTo determine the associati...https://www.ncbi.nlm.nih.gov/pubmed/23582991?d...
78rs2230199Seven new loci associated with age-related mac...Nat Genet. 2013 Apr;45(4):433-9, 439e1-2. doi:...Fritsche LG1, Chen W, Schu M, Yaspan BL, Yu Y,...Abstract\\nAge-related macular degeneration (AM...https://www.ncbi.nlm.nih.gov/pubmed/23455636?d...
79rs2230199Complement alternative pathway genetic variati...Clin Exp Immunol. 2013 Nov;174(2):326-34. doi:...Kraivong R1, Vasanawathana S, Limpitikul W, Ma...Abstract\\nDengue disease is a mosquito-borne i...https://www.ncbi.nlm.nih.gov/pubmed/23919682?d...
80rs2230199Common polymorphisms in the complement system ...J Infect. 2013 Mar;66(3):255-62. doi: 10.1016/...Adriani KS1, Brouwer MC, Geldhoff M, Baas F, Z...Abstract\\nOBJECTIVE:\\nRisk factors for suscept...https://www.ncbi.nlm.nih.gov/pubmed/23068452?d...
81rs2230199Association between polymorphisms of complemen...Invest Ophthalmol Vis Sci. 2013 Jan 7;54(1):17...Wu L1, Tao Q, Chen W, Wang Z, Song Y, Sheng S,...Abstract\\nPURPOSE:\\nWe assessed the associatio...https://www.ncbi.nlm.nih.gov/pubmed/23233260?d...
82rs2230199Nonsynonymous single nucleotide polymorphisms ...Gene. 2015 May 1;561(2):249-55. doi: 10.1016/j...Qian-Qian Y1, Yong Y2, Jing Z1, Xin B1, Tian-H...Abstract\\nNonsynonymous single nucleotide poly...https://www.ncbi.nlm.nih.gov/pubmed/25688879?d...
83rs2230199Impact of the common genetic associations of a...PLoS One. 2014 Mar 27;9(3):e93459. doi: 10.137...Ristau T1, Paun C2, Ersoy L1, Hahn M3, Lechant...Abstract\\nAge-related macular degeneration (AM...https://www.ncbi.nlm.nih.gov/pubmed/24675670?d...
84rs2230199The genetic variant rs4073 A→T of the Interleu...Acta Ophthalmol. 2015 Dec;93(8):726-33. doi: 1...Hautamäki A1, Seitsonen S1, Holopainen JM1, Mo...Abstract\\nPURPOSE:\\nTo study the association o...https://www.ncbi.nlm.nih.gov/pubmed/26154559?d...
85rs2230199Association between a functional genetic polym...Genet Mol Res. 2015 Oct 16;14(4):12567-76. doi...Zhang MX1, Zhao XF2, Ren YC1, Geng TT3, Yang H...Abstract\\nThe association between the rs223019...https://www.ncbi.nlm.nih.gov/pubmed/26505407?d...
86rs2230199Single-Nucleotide Polymorphisms Associated Wit...JAMA Ophthalmol. 2016 Jun 1;134(6):674-81. doi...Maguire MG1, Ying GS1, Jaffe GJ2, Toth CA2, Da...Abstract\\nIMPORTANCE:\\nSingle-nucleotide polym...https://www.ncbi.nlm.nih.gov/pubmed/27099955?d...
87rs2230199Effect of Risk Alleles in CFH, C3, and VEGFA o...Klin Monbl Augenheilkd. 2016 Apr;233(4):465-70...Habibi I1, Kort F2, Sfar I1, Chebil A2, Bourao...Abstract\\nPURPOSE:\\nThe aim of this pharmacoge...https://www.ncbi.nlm.nih.gov/pubmed/27116510?d...
\n", 2899 | "

88 rows × 6 columns

\n", 2900 | "
" 2901 | ], 2902 | "text/plain": [ 2903 | " rsid article_title \\\n", 2904 | "0 rs17580 Heterozygosity for the alpha1-antitrypsin Z al... \n", 2905 | "1 rs17580 Genetic polymorphisms and susceptibility to lu... \n", 2906 | "2 rs17580 Prevalence of genetic polymorphisms in the pro... \n", 2907 | "3 rs17580 Serum levels and genotype distribution of α1-a... \n", 2908 | "4 rs17580 Molecular abnormality of PI S variant of human... \n", 2909 | "5 rs17580 Alpha 1-antitrypsin deficiency caused by the a... \n", 2910 | "6 rs17580 SERPINA1 PiZ and PiS heterozygotes and lung fu... \n", 2911 | "7 rs17580 Prevalence of alpha-1 antitrypsin high-risk va... \n", 2912 | "8 rs5882 Association of a functional polymorphism in th... \n", 2913 | "9 rs5882 Common genetic variation in multiple metabolic... \n", 2914 | "10 rs5882 The CETP I405V polymorphism is associated with... \n", 2915 | "11 rs5882 Cholesteryl Ester Transfer Protein (CETP) poly... \n", 2916 | "12 rs5882 Association testing by DNA pooling: an effecti... \n", 2917 | "13 rs5882 Direct molecular haplotyping of long-range gen... \n", 2918 | "14 rs5882 Effects of cholesterol ester transfer protein ... \n", 2919 | "15 rs5882 No association of CETP genotype with cognitive... \n", 2920 | "16 rs5882 Lack of replication of genetic associations wi... \n", 2921 | "17 rs5882 A hierarchical and modular approach to the dis... \n", 2922 | "18 rs5882 New application of intelligent agents in spora... \n", 2923 | "19 rs5882 Cholesterol ester transfer protein, interleuki... \n", 2924 | "20 rs5882 Cholesteryl ester transfer protein (CETP) gene... \n", 2925 | "21 rs5882 Multiple genetic variants along candidate path... \n", 2926 | "22 rs5882 Genetic-epidemiological evidence on genes asso... \n", 2927 | "23 rs5882 A meta-analysis of candidate gene polymorphism... \n", 2928 | "24 rs5882 Genetic risk factors in recurrent venous throm... \n", 2929 | "25 rs5882 TagSNP transferability and relative loss of va... 
\n", 2930 | "26 rs5882 Genetic loci associated with plasma concentrat... \n", 2931 | "27 rs5882 Polymorphism in the CETP gene region, HDL chol... \n", 2932 | "28 rs5882 Associations between common genetic polymorphi... \n", 2933 | "29 rs5882 Gender and single nucleotide polymorphisms in ... \n", 2934 | ".. ... ... \n", 2935 | "58 rs2230199 A meta-analysis of candidate gene polymorphism... \n", 2936 | "59 rs2230199 Common variation in the SERPING1 gene is not a... \n", 2937 | "60 rs2230199 Assessing susceptibility to age-related macula... \n", 2938 | "61 rs2230199 Multilocus analysis of age-related macular deg... \n", 2939 | "62 rs2230199 Genetic risk factors in recurrent venous throm... \n", 2940 | "63 rs2230199 Association of 77 polymorphisms in 52 candidat... \n", 2941 | "64 rs2230199 Single nucleotide polymorphisms of the tenomod... \n", 2942 | "65 rs2230199 A candidate gene association study of 77 polym... \n", 2943 | "66 rs2230199 Plasma complement components and activation fr... \n", 2944 | "67 rs2230199 CFH, C3 and ARMS2 are significant risk loci fo... \n", 2945 | "68 rs2230199 Complement component 3 polymorphisms interact ... \n", 2946 | "69 rs2230199 Genetic analysis of typical wet-type age-relat... \n", 2947 | "70 rs2230199 R102G polymorphism of the C3 gene associated w... \n", 2948 | "71 rs2230199 Assessing susceptibility to age-related macula... \n", 2949 | "72 rs2230199 Using genetic variation and environmental risk... \n", 2950 | "73 rs2230199 Association of polymorphisms in C2, CFB and C3... \n", 2951 | "74 rs2230199 Complement C3 gene polymorphism in renal trans... \n", 2952 | "75 rs2230199 Heritability and genome-wide association study... \n", 2953 | "76 rs2230199 Pharmacogenetics for genes associated with age... \n", 2954 | "77 rs2230199 Genetic influences on the outcome of anti-vasc... \n", 2955 | "78 rs2230199 Seven new loci associated with age-related mac... \n", 2956 | "79 rs2230199 Complement alternative pathway genetic variati... 
\n", 2957 | "80 rs2230199 Common polymorphisms in the complement system ... \n", 2958 | "81 rs2230199 Association between polymorphisms of complemen... \n", 2959 | "82 rs2230199 Nonsynonymous single nucleotide polymorphisms ... \n", 2960 | "83 rs2230199 Impact of the common genetic associations of a... \n", 2961 | "84 rs2230199 The genetic variant rs4073 A→T of the Interleu... \n", 2962 | "85 rs2230199 Association between a functional genetic polym... \n", 2963 | "86 rs2230199 Single-Nucleotide Polymorphisms Associated Wit... \n", 2964 | "87 rs2230199 Effect of Risk Alleles in CFH, C3, and VEGFA o... \n", 2965 | "\n", 2966 | " article_citation \\\n", 2967 | "0 Aliment Pharmacol Ther. 2011 Feb;33(3):389-94.... \n", 2968 | "1 J Negat Results Biomed. 2006 Apr 11;5:5. \n", 2969 | "2 BMC Gastroenterol. 2010 Feb 20;10:22. doi: 10.... \n", 2970 | "3 Thorax. 2012 Aug;67(8):669-74. doi: 10.1136/th... \n", 2971 | "4 Am J Hum Genet. 1977 May;29(3):233-9. \n", 2972 | "5 J Clin Invest. 1989 Apr;83(4):1144-52. \n", 2973 | "6 PLoS One. 2012;7(8):e42728. doi: 10.1371/journ... \n", 2974 | "7 Arch Bronconeumol. 2015 Feb;51(2):80-85. doi: ... \n", 2975 | "8 JAMA. 2010 Jan 13;303(2):150-8. doi: 10.1001/j... \n", 2976 | "9 J Lipid Res. 2010 Dec;51(12):3524-32. doi: 10.... \n", 2977 | "10 Aging Cell. 2012 Apr;11(2):228-33. doi: 10.111... \n", 2978 | "11 PLoS One. 2012;7(3):e31930. doi: 10.1371/journ... \n", 2979 | "12 Proc Natl Acad Sci U S A. 2002 Dec 24;99(26):1... \n", 2980 | "13 Proc Natl Acad Sci U S A. 2003 Jun 24;100(13):... \n", 2981 | "14 Atherosclerosis. 2008 Jan;196(1):455-60. Epub ... \n", 2982 | "15 Neurosci Lett. 2007 Jun 13;420(2):189-92. Epub... \n", 2983 | "16 Biogerontology. 2008 Apr;9(2):85-92. Epub 2007... \n", 2984 | "17 BMC Genet. 2008 Jan 14;9:6. doi: 10.1186/1471-... \n", 2985 | "18 BMC Bioinformatics. 2008 May 30;9:254. doi: 10... \n", 2986 | "19 Am J Cardiol. 2008 Jun 15;101(12):1683-8. doi:... \n", 2987 | "20 Ann Hum Genet. 2008 Nov;72(Pt 6):732-41. doi: ... 
\n", 2988 | "21 J Lipid Res. 2008 Dec;49(12):2582-9. doi: 10.1... \n", 2989 | "22 Exp Gerontol. 2009 Mar;44(3):136-60. doi: 10.1... \n", 2990 | "23 Stroke. 2009 Mar;40(3):683-95. doi: 10.1161/ST... \n", 2991 | "24 Clin Chim Acta. 2009 Apr;402(1-2):189-92. \n", 2992 | "25 J Biomed Sci. 2009 Aug 14;16:73. doi: 10.1186/... \n", 2993 | "26 Circ Cardiovasc Genet. 2008 Oct;1(1):21-30. do... \n", 2994 | "27 Circ Cardiovasc Genet. 2009 Feb;2(1):26-33. do... \n", 2995 | "28 Atherosclerosis. 2011 May;216(1):166-9. doi: 1... \n", 2996 | "29 J Nutr. 2012 Sep;142(9):1764-71. doi: 10.3945/... \n", 2997 | ".. ... \n", 2998 | "58 Stroke. 2009 Mar;40(3):683-95. doi: 10.1161/ST... \n", 2999 | "59 Mol Vis. 2009;15:200-7. Epub 2009 Jan 23. \n", 3000 | "60 Mol Cell Proteomics. 2009 Jun;8(6):1338-49. do... \n", 3001 | "61 Eur J Hum Genet. 2009 Sep;17(9):1190-9. doi: 1... \n", 3002 | "62 Clin Chim Acta. 2009 Apr;402(1-2):189-92. \n", 3003 | "63 J Hypertens. 2009 Mar;27(3):476-83. \n", 3004 | "64 Mol Vis. 2009;15:762-70. Epub 2009 Apr 15. \n", 3005 | "65 J Pain. 2009 Jul;10(7):759-66. doi: 10.1016/j.... \n", 3006 | "66 Invest Ophthalmol Vis Sci. 2009 Dec;50(12):581... \n", 3007 | "67 PLoS One. 2009 Oct 12;4(10):e7418. doi: 10.137... \n", 3008 | "68 Am J Clin Nutr. 2009 Dec;90(6):1665-73. doi: 1... \n", 3009 | "69 J Ocul Biol Dis Infor. 2009 Dec 22;2(4):164-175. \n", 3010 | "70 Mol Vis. 2010 Jul 15;16:1324-30. \n", 3011 | "71 Arch Ophthalmol. 2011 Mar;129(3):344-51. doi: ... \n", 3012 | "72 PLoS One. 2011 Mar 24;6(3):e17784. doi: 10.137... \n", 3013 | "73 Exp Eye Res. 2012 Mar;96(1):42-7. doi: 10.1016... \n", 3014 | "74 Gene. 2012 May 1;498(2):254-8. doi: 10.1016/j.... \n", 3015 | "75 Ophthalmology. 2012 Sep;119(9):1874-85. doi: 1... \n", 3016 | "76 Ophthalmology. 2013 Mar;120(3):593-599. doi: 1... \n", 3017 | "77 Ophthalmology. 2013 Aug;120(8):1641-8. doi: 10... \n", 3018 | "78 Nat Genet. 2013 Apr;45(4):433-9, 439e1-2. doi:... \n", 3019 | "79 Clin Exp Immunol. 2013 Nov;174(2):326-34. 
doi:... \n", 3020 | "80 J Infect. 2013 Mar;66(3):255-62. doi: 10.1016/... \n", 3021 | "81 Invest Ophthalmol Vis Sci. 2013 Jan 7;54(1):17... \n", 3022 | "82 Gene. 2015 May 1;561(2):249-55. doi: 10.1016/j... \n", 3023 | "83 PLoS One. 2014 Mar 27;9(3):e93459. doi: 10.137... \n", 3024 | "84 Acta Ophthalmol. 2015 Dec;93(8):726-33. doi: 1... \n", 3025 | "85 Genet Mol Res. 2015 Oct 16;14(4):12567-76. doi... \n", 3026 | "86 JAMA Ophthalmol. 2016 Jun 1;134(6):674-81. doi... \n", 3027 | "87 Klin Monbl Augenheilkd. 2016 Apr;233(4):465-70... \n", 3028 | "\n", 3029 | " article_authors \\\n", 3030 | "0 Mihalache F1, Höblinger A, Grünhage F, Krawczy... \n", 3031 | "1 Lee PL1, West C, Crain K, Wang L. \n", 3032 | "2 Kok KF1, te Morsche RH, van Oijen MG, Drenth JP. \n", 3033 | "3 Ferrarotti I1, Thun GA, Zorzetto M, Ottaviani ... \n", 3034 | "4 Yoshida A, Ewing C, Wessels M, Lieberman J, Ga... \n", 3035 | "5 Curiel D1, Brantly M, Curiel E, Stier L, Cryst... \n", 3036 | "6 Thun GA1, Ferrarotti I, Imboden M, Rochat T, G... \n", 3037 | "7 Pérez-Rubio G1, Jiménez-Valverde LO2, Ramírez-... \n", 3038 | "8 Sanders AE1, Wang C, Katz M, Derby CA, Barzila... \n", 3039 | "9 Peloso GM1, Demissie S, Collins D, Mirel DB, G... \n", 3040 | "10 Yu L1, Shulman JM, Chibnik L, Leurgans S, Schn... \n", 3041 | "11 Papp AC1, Pinsonneault JK, Wang D, Newman LC, ... \n", 3042 | "12 Bansal A1, van den Boom D, Kammerer S, Honisch... \n", 3043 | "13 Ding C1, Cantor CR. \n", 3044 | "14 Terán-García M1, Després JP, Tremblay A, Bouch... \n", 3045 | "15 Johnson W1, Harris SE, Collins P, Starr JM, Wh... \n", 3046 | "16 Novelli V1, Viviani Anselmi C, Roncarati R, Gu... \n", 3047 | "17 Sebastiani P1, Zhao Z, Abad-Grau MM, Riva A, H... \n", 3048 | "18 Penco S1, Buscema M, Patrosso MC, Marocchi A, ... \n", 3049 | "19 Enquobahrie DA1, Smith NL, Bis JC, Carty CL, R... \n", 3050 | "20 Meiner V1, Friedlander Y, Milo H, Sharon N, Be... \n", 3051 | "21 Lu Y1, Dollé ME, Imholz S, van 't Slot R, Vers... 
\n", 3052 | "22 Boes E1, Coassin S, Kollerits B, Heid IM, Kron... \n", 3053 | "23 Wang X1, Cheng S, Brophy VH, Erlich HA, Mannha... \n", 3054 | "24 Zee RY1, Bubes V, Shrivastava S, Ridker PM, Gl... \n", 3055 | "25 Lins TC1, Abreu BS, Pereira RW. \n", 3056 | "26 Chasman DI1, Paré G, Zee RY, Parker AN, Cook N... \n", 3057 | "27 Ridker PM1, Paré G, Parker AN, Zee RY, Miletic... \n", 3058 | "28 Legry V1, Bokor S, Beghin L, Galfo M, Gonzalez... \n", 3059 | "29 Clifford AJ1, Chen K, McWade L, Rincon G, Kim ... \n", 3060 | ".. ... \n", 3061 | "58 Wang X1, Cheng S, Brophy VH, Erlich HA, Mannha... \n", 3062 | "59 Park KH1, Ryu E, Tosakulwong N, Wu Y, Edwards AO. \n", 3063 | "60 Gu J1, Pauer GJ, Yue X, Narendra U, Sturgill G... \n", 3064 | "61 Bergeron-Sawitzke J1, Gold B, Olsh A, Schlotte... \n", 3065 | "62 Zee RY1, Bubes V, Shrivastava S, Ridker PM, Gl... \n", 3066 | "63 Conen D1, Cheng S, Steiner LL, Buring JE, Ridk... \n", 3067 | "64 Tolppanen AM1, Nevalainen T, Kolehmainen M, Se... \n", 3068 | "65 Schürks M1, Kurth T, Buring JE, Zee RY. \n", 3069 | "66 Reynolds R1, Hartnett ME, Atkinson JP, Giclas ... \n", 3070 | "67 Scholl HP1, Fleckenstein M, Fritsche LG, Schmi... \n", 3071 | "68 Phillips CM1, Goumidi L, Bertrais S, Ferguson ... \n", 3072 | "69 Goto A, Akahori M, Okamoto H, Minami M, Terauc... \n", 3073 | "70 Zerbib J1, Richard F, Puche N, Leveziel N, Coh... \n", 3074 | "71 Chen Y1, Zeng J, Zhao C, Wang K, Trood E, Bueh... \n", 3075 | "72 Spencer KL1, Olson LM, Schnetz-Boutaud N, Gall... \n", 3076 | "73 Kim SJ1, Lee SJ, Kim NR, Chin HS. \n", 3077 | "74 Bazyar N1, Azarpira N, Khatami SR, Galehdari H... \n", 3078 | "75 Sobrin L1, Ripke S, Yu Y, Fagerness J, Bhangal... \n", 3079 | "76 Hagstrom SA1, Ying GS2, Pauer GJT3, Sturgill-S... \n", 3080 | "77 Abedi F1, Wickremasinghe S, Richardson AJ, Isl... \n", 3081 | "78 Fritsche LG1, Chen W, Schu M, Yaspan BL, Yu Y,... \n", 3082 | "79 Kraivong R1, Vasanawathana S, Limpitikul W, Ma... 
\n", 3083 | "80 Adriani KS1, Brouwer MC, Geldhoff M, Baas F, Z... \n", 3084 | "81 Wu L1, Tao Q, Chen W, Wang Z, Song Y, Sheng S,... \n", 3085 | "82 Qian-Qian Y1, Yong Y2, Jing Z1, Xin B1, Tian-H... \n", 3086 | "83 Ristau T1, Paun C2, Ersoy L1, Hahn M3, Lechant... \n", 3087 | "84 Hautamäki A1, Seitsonen S1, Holopainen JM1, Mo... \n", 3088 | "85 Zhang MX1, Zhao XF2, Ren YC1, Geng TT3, Yang H... \n", 3089 | "86 Maguire MG1, Ying GS1, Jaffe GJ2, Toth CA2, Da... \n", 3090 | "87 Habibi I1, Kort F2, Sfar I1, Chebil A2, Bourao... \n", 3091 | "\n", 3092 | " article_abstract \\\n", 3093 | "0 Abstract\\nBACKGROUND:\\nAlpha1-antitrypsin (α1A... \n", 3094 | "1 Abstract\\nSusceptibility to infection by bacte... \n", 3095 | "2 Abstract\\nBACKGROUND:\\nAlpha-1 antitrypsin (A1... \n", 3096 | "3 Abstract\\nRATIONALE:\\nα1-Antitrypsin (AAT) def... \n", 3097 | "4 Abstract\\nAlpha1-antitrypsin variant protein w... \n", 3098 | "5 Abstract\\nalpha 1-Antitrypsin (alpha 1AT) defi... \n", 3099 | "6 Abstract\\nBACKGROUND:\\nSevere alpha1-antitryps... \n", 3100 | "7 Abstract\\nINTRODUCTION:\\nChronic obstructive p... \n", 3101 | "8 Abstract\\nCONTEXT:\\nPolymorphisms in the chole... \n", 3102 | "9 Abstract\\nA low level of HDL-C is the most com... \n", 3103 | "10 Abstract\\nThe cholesteryl ester transfer prote... \n", 3104 | "11 Abstract\\nPolymorphisms in and around the Chol... \n", 3105 | "12 Abstract\\nWith an ever-increasing resource of ... \n", 3106 | "13 Abstract\\nHaplotypes, combinations of several ... \n", 3107 | "14 Abstract\\nCholesterol ester transfer protein (... \n", 3108 | "15 Abstract\\nA cholesteryl ester transfer protein... \n", 3109 | "16 Abstract\\nThe exceptional longevity of centena... \n", 3110 | "17 Abstract\\nBACKGROUND:\\nOne of the challenges o... \n", 3111 | "18 Abstract\\nBACKGROUND:\\nFew genetic factors pre... \n", 3112 | "19 Abstract\\nVariations in candidate genes partic... \n", 3113 | "20 Abstract\\nAlthough Cholesteryl Ester Transfer ... 
\n", 3114 | "21 Abstract\\nThe known genetic variants determini... \n", 3115 | "22 Abstract\\nHigh-density lipoprotein (HDL) parti... \n", 3116 | "23 Abstract\\nBACKGROUND AND PURPOSE:\\nIschemic st... \n", 3117 | "24 Abstract\\nBACKGROUND:\\nRecurrent venous thromb... \n", 3118 | "25 Abstract\\nBACKGROUND:\\nThe application of a su... \n", 3119 | "26 Abstract\\nBACKGROUND:\\nGenome-wide genetic ass... \n", 3120 | "27 Abstract\\nBACKGROUND:\\nRecent trial data have ... \n", 3121 | "28 Abstract\\nOBJECTIVE:\\nGenetic variability in t... \n", 3122 | "29 Abstract\\nUsing linear regression models, we s... \n", 3123 | ".. ... \n", 3124 | "58 Abstract\\nBACKGROUND AND PURPOSE:\\nIschemic st... \n", 3125 | "59 Abstract\\nPURPOSE:\\nCommon genetic variation i... \n", 3126 | "60 Abstract\\nAge-related macular degeneration (AM... \n", 3127 | "61 Abstract\\nAge-related macular degeneration (AM... \n", 3128 | "62 Abstract\\nBACKGROUND:\\nRecurrent venous thromb... \n", 3129 | "63 Abstract\\nOBJECTIVE:\\nGenetic risk factors for... \n", 3130 | "64 Abstract\\nPURPOSE:\\nTenomodulin (TNMD) is loca... \n", 3131 | "65 Abstract\\nPopulation-based studies have establ... \n", 3132 | "66 Abstract\\nPURPOSE:\\nSeveral genes encoding com... \n", 3133 | "67 Abstract\\nBACKGROUND:\\nAge-related macular deg... \n", 3134 | "68 Abstract\\nBACKGROUND:\\nComplement component 3 ... \n", 3135 | "69 Abstract\\nAge-related macular degeneration (AM... \n", 3136 | "70 Abstract\\nPURPOSE:\\nMajor genetic factors for ... \n", 3137 | "71 Abstract\\nOBJECTIVES:\\nTo evaluate the indepen... \n", 3138 | "72 Abstract\\nA major goal of personalized medicin... \n", 3139 | "73 Abstract\\nThis study was to investigate the as... \n", 3140 | "74 Abstract\\nThe C3 component of complement has d... \n", 3141 | "75 Abstract\\nPURPOSE:\\nTo investigate whether the... \n", 3142 | "76 Abstract\\nPURPOSE:\\nTo evaluate the pharmacoge... \n", 3143 | "77 Abstract\\nPURPOSE:\\nTo determine the associati... 
\n", 3144 | "78 Abstract\\nAge-related macular degeneration (AM... \n", 3145 | "79 Abstract\\nDengue disease is a mosquito-borne i... \n", 3146 | "80 Abstract\\nOBJECTIVE:\\nRisk factors for suscept... \n", 3147 | "81 Abstract\\nPURPOSE:\\nWe assessed the associatio... \n", 3148 | "82 Abstract\\nNonsynonymous single nucleotide poly... \n", 3149 | "83 Abstract\\nAge-related macular degeneration (AM... \n", 3150 | "84 Abstract\\nPURPOSE:\\nTo study the association o... \n", 3151 | "85 Abstract\\nThe association between the rs223019... \n", 3152 | "86 Abstract\\nIMPORTANCE:\\nSingle-nucleotide polym... \n", 3153 | "87 Abstract\\nPURPOSE:\\nThe aim of this pharmacoge... \n", 3154 | "\n", 3155 | " link \n", 3156 | "0 https://www.ncbi.nlm.nih.gov/pubmed/21138453?d... \n", 3157 | "1 https://www.ncbi.nlm.nih.gov/pubmed/16608528?d... \n", 3158 | "2 https://www.ncbi.nlm.nih.gov/pubmed/20170533?d... \n", 3159 | "3 https://www.ncbi.nlm.nih.gov/pubmed/22426792?d... \n", 3160 | "4 https://www.ncbi.nlm.nih.gov/pubmed/301355?dop... \n", 3161 | "5 https://www.ncbi.nlm.nih.gov/pubmed/2539391?do... \n", 3162 | "6 https://www.ncbi.nlm.nih.gov/pubmed/22912729?d... \n", 3163 | "7 https://www.ncbi.nlm.nih.gov/pubmed/25454901?d... \n", 3164 | "8 https://www.ncbi.nlm.nih.gov/pubmed/20068209?d... \n", 3165 | "9 https://www.ncbi.nlm.nih.gov/pubmed/20855565?d... \n", 3166 | "10 https://www.ncbi.nlm.nih.gov/pubmed/22122979?d... \n", 3167 | "11 https://www.ncbi.nlm.nih.gov/pubmed/22403620?d... \n", 3168 | "12 https://www.ncbi.nlm.nih.gov/pubmed/12475937?d... \n", 3169 | "13 https://www.ncbi.nlm.nih.gov/pubmed/12802015?d... \n", 3170 | "14 https://www.ncbi.nlm.nih.gov/pubmed/17196207?d... \n", 3171 | "15 https://www.ncbi.nlm.nih.gov/pubmed/17531380?d... \n", 3172 | "16 https://www.ncbi.nlm.nih.gov/pubmed/18034366?d... \n", 3173 | "17 https://www.ncbi.nlm.nih.gov/pubmed/18194558?d... \n", 3174 | "18 https://www.ncbi.nlm.nih.gov/pubmed/18513389?d... 
\n", 3175 | "19 https://www.ncbi.nlm.nih.gov/pubmed/18549840?d... \n", 3176 | "20 https://www.ncbi.nlm.nih.gov/pubmed/18637884?d... \n", 3177 | "21 https://www.ncbi.nlm.nih.gov/pubmed/18660489?d... \n", 3178 | "22 https://www.ncbi.nlm.nih.gov/pubmed/19041386?d... \n", 3179 | "23 https://www.ncbi.nlm.nih.gov/pubmed/19131662?d... \n", 3180 | "24 https://www.ncbi.nlm.nih.gov/pubmed/19263529?d... \n", 3181 | "25 https://www.ncbi.nlm.nih.gov/pubmed/19682379?d... \n", 3182 | "26 https://www.ncbi.nlm.nih.gov/pubmed/19802338?d... \n", 3183 | "27 https://www.ncbi.nlm.nih.gov/pubmed/20031564?d... \n", 3184 | "28 https://www.ncbi.nlm.nih.gov/pubmed/21316679?d... \n", 3185 | "29 https://www.ncbi.nlm.nih.gov/pubmed/22833659?d... \n", 3186 | ".. ... \n", 3187 | "58 https://www.ncbi.nlm.nih.gov/pubmed/19131662?d... \n", 3188 | "59 https://www.ncbi.nlm.nih.gov/pubmed/19169411?d... \n", 3189 | "60 https://www.ncbi.nlm.nih.gov/pubmed/19202148?d... \n", 3190 | "61 https://www.ncbi.nlm.nih.gov/pubmed/19259132?d... \n", 3191 | "62 https://www.ncbi.nlm.nih.gov/pubmed/19263529?d... \n", 3192 | "63 https://www.ncbi.nlm.nih.gov/pubmed/19330901?d... \n", 3193 | "64 https://www.ncbi.nlm.nih.gov/pubmed/19381347?d... \n", 3194 | "65 https://www.ncbi.nlm.nih.gov/pubmed/19559392?d... \n", 3195 | "66 https://www.ncbi.nlm.nih.gov/pubmed/19661236?d... \n", 3196 | "67 https://www.ncbi.nlm.nih.gov/pubmed/19823576?d... \n", 3197 | "68 https://www.ncbi.nlm.nih.gov/pubmed/19828715?d... \n", 3198 | "69 https://www.ncbi.nlm.nih.gov/pubmed/20157352?d... \n", 3199 | "70 https://www.ncbi.nlm.nih.gov/pubmed/20664795?d... \n", 3200 | "71 https://www.ncbi.nlm.nih.gov/pubmed/21402993?d... \n", 3201 | "72 https://www.ncbi.nlm.nih.gov/pubmed/21455292?d... \n", 3202 | "73 https://www.ncbi.nlm.nih.gov/pubmed/22273503?d... \n", 3203 | "74 https://www.ncbi.nlm.nih.gov/pubmed/22361228?d... \n", 3204 | "75 https://www.ncbi.nlm.nih.gov/pubmed/22705344?d... \n", 3205 | "76 https://www.ncbi.nlm.nih.gov/pubmed/23337555?d... 
\n", 3206 | "77 https://www.ncbi.nlm.nih.gov/pubmed/23582991?d... \n", 3207 | "78 https://www.ncbi.nlm.nih.gov/pubmed/23455636?d... \n", 3208 | "79 https://www.ncbi.nlm.nih.gov/pubmed/23919682?d... \n", 3209 | "80 https://www.ncbi.nlm.nih.gov/pubmed/23068452?d... \n", 3210 | "81 https://www.ncbi.nlm.nih.gov/pubmed/23233260?d... \n", 3211 | "82 https://www.ncbi.nlm.nih.gov/pubmed/25688879?d... \n", 3212 | "83 https://www.ncbi.nlm.nih.gov/pubmed/24675670?d... \n", 3213 | "84 https://www.ncbi.nlm.nih.gov/pubmed/26154559?d... \n", 3214 | "85 https://www.ncbi.nlm.nih.gov/pubmed/26505407?d... \n", 3215 | "86 https://www.ncbi.nlm.nih.gov/pubmed/27099955?d... \n", 3216 | "87 https://www.ncbi.nlm.nih.gov/pubmed/27116510?d... \n", 3217 | "\n", 3218 | "[88 rows x 6 columns]" 3219 | ] 3220 | }, 3221 | "execution_count": 55, 3222 | "metadata": {}, 3223 | "output_type": "execute_result" 3224 | } 3225 | ], 3226 | "source": [ 3227 | "abstracts_df" 3228 | ] 3229 | }, 3230 | { 3231 | "cell_type": "markdown", 3232 | "metadata": {}, 3233 | "source": [ 3234 | "To save for later perusal, I'll export my web scrapings, complete with abstracts and hyperlinks, to a CSV file using the pandas DataFrame.to_csv method. " 3235 | ] 3236 | }, 3237 | { 3238 | "cell_type": "code", 3239 | "execution_count": 56, 3240 | "metadata": {}, 3241 | "outputs": [], 3242 | "source": [ 3243 | "#DataFrame to CSV\n", 3244 | "export_csv = abstracts_df.to_csv(r'/Users/lorajohns/Documents/Python/DNA/DNA_articles.csv')" 3245 | ] 3246 | }, 3247 | { 3248 | "cell_type": "markdown", 3249 | "metadata": {}, 3250 | "source": [ 3251 | "## Reading up on the medical literature\n", 3252 | "\n", 3253 | "Now I have a handy CSV file, nicely formatted to read in Numbers, Excel, or PDF format, with citations to scientific articles analyzing and describing my genotypes with \"significant\" magnitudes and \"bad\" reputations. With the powerful tools Python provides, it's a great time to be alive for literal introspection. 
" 3254 | ] 3255 | } 3256 | ], 3257 | "metadata": { 3258 | "kernelspec": { 3259 | "display_name": "Python 3", 3260 | "language": "python", 3261 | "name": "python3" 3262 | }, 3263 | "language_info": { 3264 | "codemirror_mode": { 3265 | "name": "ipython", 3266 | "version": 3 3267 | }, 3268 | "file_extension": ".py", 3269 | "mimetype": "text/x-python", 3270 | "name": "python", 3271 | "nbconvert_exporter": "python", 3272 | "pygments_lexer": "ipython3", 3273 | "version": "3.7.2" 3274 | } 3275 | }, 3276 | "nbformat": 4, 3277 | "nbformat_minor": 2 3278 | } 3279 | --------------------------------------------------------------------------------