├── LICENSE └── 1001genomes_PAM_GPN_correlation.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Silvan Büdenbender 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /1001genomes_PAM_GPN_correlation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNPlEba2v8XoRYdbXK+KCGE", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "!pip install biopython" 33 | ], 34 | "metadata": { 35 | "colab": { 36 | "base_uri": "https://localhost:8080/" 37 | }, 38 | "id": "AcjQYlyybMKq", 39 | "outputId": "bbd6f38e-43af-44d2-a315-da76ad3f8241" 40 | }, 41 | "execution_count": null, 42 | "outputs": [ 43 | { 44 | "output_type": "stream", 45 | "name": "stdout", 46 | "text": [ 47 | "Collecting biopython\n", 48 | " Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)\n", 49 | "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from biopython) (2.0.2)\n", 50 | "Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)\n", 51 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.2/3.2 MB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 52 | "\u001b[?25hInstalling collected packages: biopython\n", 53 | "Successfully installed biopython-1.86\n" 54 | ] 55 | } 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "source": [ 61 | "import time\n", 62 | "import requests\n", 63 | "from collections import defaultdict\n", 64 | "from io 
class DefaultDict(defaultdict):
    """A ``defaultdict`` whose factory is called with the missing key.

    The standard ``defaultdict`` calls ``default_factory()`` with no
    arguments. This variant calls ``default_factory(key)`` so the value can
    be derived from the key, and stores the result so that later lookups of
    the same key do not invoke the factory again.
    """

    def __missing__(self, key):
        """Build, store and return the value for ``key``.

        Raises:
            KeyError: if no ``default_factory`` was supplied, matching the
                behavior of a plain ``dict`` / ``defaultdict``.
        """
        if self.default_factory is None:
            raise KeyError(key)
        value = self.default_factory(key)
        # Store the value: without this the mapping never caches anything
        # and every lookup would call the factory (here: an HTTP request).
        self[key] = value
        return value


def get_accession_gene_fasta(gene, timeout=30):
    """Return a function that downloads the pseudogenome FASTA of ``gene``.

    Args:
        gene: gene identifier inserted into the request URL
            (e.g. ``"AT2G19110.1"``).
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        A callable ``get_gene(accession) -> str`` that fetches the FASTA text
        for the given accession from the 1001 Genomes API.
    """
    def get_gene(accession):
        """Fetch the FASTA text of the enclosing ``gene`` for ``accession``.

        Raises:
            requests.HTTPError: if the server answers with an error status,
                so an error page is never returned (or cached) as FASTA.
        """
        response = requests.get(
            f"https://tools.1001genomes.org/api/v1/pseudogenomes/strains/{accession}/gids/{gene}",
            timeout=timeout,
        )
        response.raise_for_status()
        return response.text
    return get_gene
"d54e94fd-560b-4097-e523-63bb4556b389" 134 | }, 135 | "execution_count": null, 136 | "outputs": [ 137 | { 138 | "output_type": "execute_result", 139 | "data": { 140 | "text/plain": [ 141 | " chromosome position accession reference variant - impacts \\\n", 142 | "0 2 8279523 9653 T G 40 MODERATE \n", 143 | "1 2 8279523 9655 T G 40 MODERATE \n", 144 | "2 2 8279523 9661 T G 40 MODERATE \n", 145 | "3 2 8279523 9968 T G 40 MODERATE \n", 146 | "4 2 8279539 9525 T A 40 MODERATE \n", 147 | "... ... ... ... ... ... .. ... \n", 148 | "2328 2 8286155 9845 G T 40 MODERATE \n", 149 | "2329 2 8286155 9886 G T 40 MODERATE \n", 150 | "2330 2 8286155 9888 G T 40 MODERATE \n", 151 | "2331 2 8286155 9894 G T 40 MODERATE \n", 152 | "2332 2 8286250 9121 G A 40 MODERATE \n", 153 | "\n", 154 | " effects \n", 155 | "0 missense_variant \n", 156 | "1 missense_variant \n", 157 | "2 missense_variant \n", 158 | "3 missense_variant \n", 159 | "4 missense_variant \n", 160 | "... ... \n", 161 | "2328 missense_variant \n", 162 | "2329 missense_variant \n", 163 | "2330 missense_variant \n", 164 | "2331 missense_variant \n", 165 | "2332 missense_variant \n", 166 | "\n", 167 | "[2333 rows x 8 columns]" 168 | ], 169 | "text/html": [ 170 | "\n", 171 | "
\n", 172 | "
\n", 173 | "\n", 186 | "\n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | "
chromosomepositionaccessionreferencevariant-impactseffects
0282795239653TG40MODERATEmissense_variant
1282795239655TG40MODERATEmissense_variant
2282795239661TG40MODERATEmissense_variant
3282795239968TG40MODERATEmissense_variant
4282795399525TA40MODERATEmissense_variant
...........................
2328282861559845GT40MODERATEmissense_variant
2329282861559886GT40MODERATEmissense_variant
2330282861559888GT40MODERATEmissense_variant
2331282861559894GT40MODERATEmissense_variant
2332282862509121GA40MODERATEmissense_variant
\n", 324 | "

2333 rows × 8 columns

\n", 325 | "
\n", 326 | "
\n", 327 | "\n", 328 | "
\n", 329 | " \n", 337 | "\n", 338 | " \n", 378 | "\n", 379 | " \n", 403 | "
\n", 404 | "\n", 405 | "\n", 406 | "
\n", 407 | " \n", 418 | "\n", 419 | "\n", 508 | "\n", 509 | " \n", 531 | "
\n", 532 | "\n", 533 | "
\n", 534 | " \n", 565 | " \n", 574 | " \n", 586 | "
\n", 587 | "\n", 588 | "
\n", 589 | "
\n" 590 | ], 591 | "application/vnd.google.colaboratory.intrinsic+json": { 592 | "type": "dataframe", 593 | "variable_name": "df", 594 | "summary": "{\n \"name\": \"df\",\n \"rows\": 2333,\n \"fields\": [\n {\n \"column\": \"chromosome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 2,\n \"num_unique_values\": 1,\n \"samples\": [\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"position\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1083,\n \"min\": 8279523,\n \"max\": 8286250,\n \"num_unique_values\": 83,\n \"samples\": [\n 8284492\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"accession\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2602,\n \"min\": 88,\n \"max\": 18696,\n \"num_unique_values\": 959,\n \"samples\": [\n 10010\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"G\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"variant\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"A\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"-\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 40,\n \"max\": 40,\n \"num_unique_values\": 1,\n \"samples\": [\n 40\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"impacts\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"MODERATE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"effects\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"missense_variant\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" 595 | } 596 
def get_start_end_from_seq(seq):
    """Parse the genomic start and end coordinates from a record's ID.

    The ID is split on ``"|"`` and its fifth field is read as
    ``"<chrom>:<start>..<end>"``, e.g. ``"Chr2:8278881..8286445"``.

    Args:
        seq: object with an ``id`` string attribute in the format above.

    Returns:
        ``(start, end)`` as integers.
    """
    location = seq.id.split("|")[4]
    _chrom, span = location.split(":")
    start, end = span.split("..")
    return int(start), int(end)


def get_aa_substitution_pair(variant, record, in_gene_position):
    """Translate the codon around a SNP with and without the substitution.

    The codon is the 3-base slice of ``record.seq`` containing
    ``in_gene_position``, using a frame that starts at index 0 of the record.
    The record is expected to already carry the variant base, so the codon as
    found gives the variant amino acid; the reference amino acid is obtained
    by writing ``variant["reference"]`` back into the codon.

    NOTE(review): the reading frame is derived purely from the offset within
    the record. That only equals the real frame if the record starts in frame
    with the coding sequence (no UTR/introns before the SNP, forward strand)
    -- confirm against the gene annotation.

    Args:
        variant: mapping with a ``"reference"`` base.
        record: object whose ``seq`` supports slicing.
        in_gene_position: 0-based index of the SNP within ``record.seq``.

    Returns:
        ``(reference_aa, variant_aa)`` as returned by ``translate()``.
    """
    in_codon_position = in_gene_position % 3
    codon_start = in_gene_position - in_codon_position
    codon = MutableSeq(record.seq[codon_start:codon_start + 3])

    # Translate before mutating: the record holds the variant allele.
    variant_aa = codon.translate()

    codon[in_codon_position] = variant["reference"]
    reference_aa = codon.translate()

    return reference_aa, variant_aa


def build_variant_substitution_data(variant, fasta_cache, flank=256):
    """Build ``[reference_aa, variant_aa, dna_window]`` for one variant row.

    Args:
        variant: mapping with ``"accession"``, ``"position"``,
            ``"reference"`` and ``"variant"`` entries.
        fasta_cache: mapping from accession to the FASTA text of the gene.
        flank: number of bases taken on each side of the SNP (default 256,
            i.e. a window of up to 513 bases).

    Returns:
        A list of three strings. ``["-", "-", "-"]`` is returned when the
        SNP offset falls outside the record or when the base found in the
        record is not the expected variant base. Windows for SNPs closer
        than ``flank`` bases to either end of the record are shorter,
        because the slice is cut at the sequence boundary.
    """
    fasta = fasta_cache[variant["accession"]]
    record = SeqIO.read(StringIO(fasta), "fasta")

    start, _ = get_start_end_from_seq(record)
    in_gene_position = variant["position"] - start

    # A negative offset would silently index from the end of the sequence and
    # an offset past the end would raise IndexError; treat both as "no data".
    if not 0 <= in_gene_position < len(record.seq):
        return ["-", "-", "-"]

    if variant["variant"] != record.seq[in_gene_position]:
        return ["-", "-", "-"]

    # Clamp the left edge: a negative slice start counts from the end of the
    # sequence and would yield an empty or unrelated window.
    window_start = max(0, in_gene_position - flank)
    snp_centered_dna_window = record.seq[window_start:in_gene_position + flank + 1]

    reference_aa, variant_aa = get_aa_substitution_pair(variant, record, in_gene_position)

    return [str(reference_aa), str(variant_aa), str(snp_centered_dna_window)]
"id": "xD7FLy-Ioi2o" 661 | }, 662 | "execution_count": null, 663 | "outputs": [] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "source": [ 668 | "df[:2]" 669 | ], 670 | "metadata": { 671 | "colab": { 672 | "base_uri": "https://localhost:8080/", 673 | "height": 112 674 | }, 675 | "id": "yKJXVDJFr-Z4", 676 | "outputId": "fb20756f-3684-491d-cfe8-36bd2fba62b1" 677 | }, 678 | "execution_count": null, 679 | "outputs": [ 680 | { 681 | "output_type": "execute_result", 682 | "data": { 683 | "text/plain": [ 684 | " chromosome position accession reference variant - impacts \\\n", 685 | "0 2 8279523 9653 T G 40 MODERATE \n", 686 | "1 2 8279523 9655 T G 40 MODERATE \n", 687 | "\n", 688 | " effects \n", 689 | "0 missense_variant \n", 690 | "1 missense_variant " 691 | ], 692 | "text/html": [ 693 | "\n", 694 | "
\n", 695 | "
\n", 696 | "\n", 709 | "\n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | "
chromosomepositionaccessionreferencevariant-impactseffects
0282795239653TG40MODERATEmissense_variant
1282795239655TG40MODERATEmissense_variant
\n", 748 | "
\n", 749 | "
\n", 750 | "\n", 751 | "
\n", 752 | " \n", 760 | "\n", 761 | " \n", 801 | "\n", 802 | " \n", 826 | "
\n", 827 | "\n", 828 | "\n", 829 | "
\n", 830 | " \n", 841 | "\n", 842 | "\n", 931 | "\n", 932 | " \n", 954 | "
\n", 955 | "\n", 956 | "
\n", 957 | "
\n" 958 | ], 959 | "application/vnd.google.colaboratory.intrinsic+json": { 960 | "type": "dataframe", 961 | "summary": "{\n \"name\": \"df[:2]\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"chromosome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 2,\n \"num_unique_values\": 1,\n \"samples\": [\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"position\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 8279523,\n \"max\": 8279523,\n \"num_unique_values\": 1,\n \"samples\": [\n 8279523\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"accession\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 9653,\n \"max\": 9655,\n \"num_unique_values\": 2,\n \"samples\": [\n 9655\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"T\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"variant\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"G\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"-\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 40,\n \"max\": 40,\n \"num_unique_values\": 1,\n \"samples\": [\n 40\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"impacts\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"MODERATE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"effects\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"missense_variant\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" 962 | } 963 | }, 964 | "metadata": {}, 965 | 
"execution_count": 95 966 | } 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "source": [ 972 | "results = []\n", 973 | "for variant in tqdm(df[:100].to_dict(orient=\"records\")):\n", 974 | " results.append(build_variant_substitution_data(variant, HMA4_fasta_cache))\n", 975 | " time.sleep(0.51)" 976 | ], 977 | "metadata": { 978 | "colab": { 979 | "base_uri": "https://localhost:8080/" 980 | }, 981 | "id": "6Eg_-mHurtL0", 982 | "outputId": "4935ecc6-65d9-4852-bc4f-f82c206fc9ee" 983 | }, 984 | "execution_count": null, 985 | "outputs": [ 986 | { 987 | "output_type": "stream", 988 | "name": "stderr", 989 | "text": [ 990 | "100%|██████████| 100/100 [01:34<00:00, 1.05it/s]\n" 991 | ] 992 | } 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "source": [ 998 | "build_variant_substitution_data(df.iloc[29], HMA4_fasta_cache)" 999 | ], 1000 | "metadata": { 1001 | "colab": { 1002 | "base_uri": "https://localhost:8080/", 1003 | "height": 297 1004 | }, 1005 | "id": "ZETYFRvMyO74", 1006 | "outputId": "db46b392-25e1-4e52-d46d-74e5b77acecf" 1007 | }, 1008 | "execution_count": null, 1009 | "outputs": [ 1010 | { 1011 | "output_type": "error", 1012 | "ename": "AssertionError", 1013 | "evalue": "", 1014 | "traceback": [ 1015 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1016 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 1017 | "\u001b[0;32m/tmp/ipython-input-3953702373.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbuild_variant_substitution_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m29\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mHMA4_fasta_cache\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m 
\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m29\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1018 | "\u001b[0;32m/tmp/ipython-input-3100140708.py\u001b[0m in \u001b[0;36mbuild_variant_substitution_data\u001b[0;34m(variant, fasta_cache)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_start_end_from_seq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0min_gene_position\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvariant\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"position\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mvariant\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"variant\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mrecord\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0min_gene_position\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0msnp_centered_dna_window\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrecord\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0min_gene_position\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m256\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0min_gene_position\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m257\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1019 | "\u001b[0;31mAssertionError\u001b[0m: " 1020 | ] 1021 | } 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "code", 1026 | "source": [ 1027 | "weird_variant = df.iloc[29]\n", 1028 | "\n", 1029 | "fasta = 
HMA4_fasta_cache[weird_variant[\"accession\"]]\n", 1030 | "record = SeqIO.read(StringIO(fasta), \"fasta\")\n", 1031 | "\n", 1032 | "start, _ = get_start_end_from_seq(record)\n", 1033 | "in_gene_position = weird_variant[\"position\"] - start\n", 1034 | "\n", 1035 | "print(record.seq[in_gene_position])\n", 1036 | "print(weird_variant)\n" 1037 | ], 1038 | "metadata": { 1039 | "colab": { 1040 | "base_uri": "https://localhost:8080/" 1041 | }, 1042 | "id": "o-a68W2eygrL", 1043 | "outputId": "8f4c6a75-6028-4574-d94f-aa06ae8234aa" 1044 | }, 1045 | "execution_count": null, 1046 | "outputs": [ 1047 | { 1048 | "output_type": "stream", 1049 | "name": "stdout", 1050 | "text": [ 1051 | "G\n", 1052 | "chromosome 2\n", 1053 | "position 8280931\n", 1054 | "accession 9511\n", 1055 | "reference G\n", 1056 | "variant T\n", 1057 | "- 40\n", 1058 | "impacts MODERATE\n", 1059 | "effects missense_variant\n", 1060 | "Name: 29, dtype: object\n" 1061 | ] 1062 | } 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "source": [ 1068 | "results = pd.DataFrame(results, columns=[\"reference_aa\", \"variant_aa\", \"snp_centered_dna_window\"])\n", 1069 | "results" 1070 | ], 1071 | "metadata": { 1072 | "colab": { 1073 | "base_uri": "https://localhost:8080/", 1074 | "height": 424 1075 | }, 1076 | "id": "heS0L3zyuyin", 1077 | "outputId": "d8dde231-0d38-42fa-8be3-ce1d9d66dedb" 1078 | }, 1079 | "execution_count": null, 1080 | "outputs": [ 1081 | { 1082 | "output_type": "execute_result", 1083 | "data": { 1084 | "text/plain": [ 1085 | " reference_aa variant_aa snp_centered_dna_window\n", 1086 | "0 L V AAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG...\n", 1087 | "1 L V AAAAGTAAACATTTTCAATAAGAAANNNNNNNNNNNNNNNNNNNNG...\n", 1088 | "2 L V AAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG...\n", 1089 | "3 L V NNAAGTAAANNNNTNNAATAANAAAATACAAGACCCATNCCGAAAG...\n", 1090 | "4 F Y AATAAGAAAANNNNNNNNNNNNNNCGNAAGTTTNTTNNNNANAAAA...\n", 1091 | ".. ... ... 
...\n", 1092 | "95 K M TATAACAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAA...\n", 1093 | "96 Y F CAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAAGTTGA...\n", 1094 | "97 - - -\n", 1095 | "98 - - -\n", 1096 | "99 - - -\n", 1097 | "\n", 1098 | "[100 rows x 3 columns]" 1099 | ], 1100 | "text/html": [ 1101 | "\n", 1102 | "
\n", 1103 | "
\n", 1104 | "\n", 1117 | "\n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | "
reference_aavariant_aasnp_centered_dna_window
0LVAAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG...
1LVAAAAGTAAACATTTTCAATAAGAAANNNNNNNNNNNNNNNNNNNNG...
2LVAAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG...
3LVNNAAGTAAANNNNTNNAATAANAAAATACAAGACCCATNCCGAAAG...
4FYAATAAGAAAANNNNNNNNNNNNNNCGNAAGTTTNTTNNNNANAAAA...
............
95KMTATAACAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAA...
96YFCAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAAGTTGA...
97---
98---
99---
\n", 1195 | "

100 rows × 3 columns

\n", 1196 | "
\n", 1197 | "
\n", 1198 | "\n", 1199 | "
\n", 1200 | " \n", 1208 | "\n", 1209 | " \n", 1249 | "\n", 1250 | " \n", 1274 | "
\n", 1275 | "\n", 1276 | "\n", 1277 | "
\n", 1278 | " \n", 1289 | "\n", 1290 | "\n", 1379 | "\n", 1380 | " \n", 1402 | "
\n", 1403 | "\n", 1404 | "
\n", 1405 | " \n", 1436 | " \n", 1445 | " \n", 1457 | "
\n", 1458 | "\n", 1459 | "
\n", 1460 | "
\n" 1461 | ], 1462 | "application/vnd.google.colaboratory.intrinsic+json": { 1463 | "type": "dataframe", 1464 | "variable_name": "results", 1465 | "summary": "{\n \"name\": \"results\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"reference_aa\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"Y\",\n \"F\",\n \"S\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"variant_aa\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"M\",\n \"R\",\n \"V\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"snp_centered_dna_window\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"AAAAAAAAATGTTTNGAACTGTTTCATGATAATGATAACNAAAAAAGTTTTTGCTTTCTTNTTTTTTTTCCTCCGCAAAACAGTCTNAAAGTATAACCAAAAAGCCTATAAATCAATATAATTTGTTGTTTTGATTTACGTTTTACAGAAAATGGCGTTACAAAACAAAGAAGAAGAGAAAAAGAAAGTGNNNNNNNNGCAAAAGAGTTACTTCGNNNNNNNNGGAATCTGTTGTACATCGGAAGTTCCTATAATCAAGAATATTCTCAAGTCACTTGACGGCGTTAAAGAATATTCCGTCATCGTTCCCTCGAGAACCGTGATTGTTGTTCACGACAGTCTCCTCANCTCTCCCTTCCAAATTGGTAAATNTTTTTTTTCTTTGNGATAATAAANNTTTTTTNNNNNNNAAANATTGGTAAATCATTATAANTAAATAGTTATTTAANATTTCTCTAATTTTTAATTTTACTCAGTNAAAAAATAANAATTAANNNNNNTAAANAATTATTT\",\n \"TGGAGGCAGCAGCAGTTGTGTTTCTGTTCACCATATCCGACTGGCTCGAAACAAGAGCTAGCTACAAGGNTTGTNTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTGAAATTNAAACTATAATGGTTTATTAAGGTAATGATTATTGCAAATGTAAAATCTCAGGCGACCTCGGTAATGCAGTCTTTGATGAGCTTAGCTCCACAAAAGGCTACAATAGCAGAGACTGGTGAAGAAGTTGAGGTAGATGAGGTTAAGGTTGATACTGTTGTAGCAGTTAAAGCTGGTGAAACCATACCAATTGATGGGATTGTGGTGGATGGAAACTGTGAAGTAGACGAGAAAACCTTAACGGGCGAAGCATTTCCTGTGCCTAAACAGAGAGATTCTACGGTTTGGGCTGGCACCATCAATCTAAATGGTANGTAATCCTATTTTAAGAGCTTCAAGCTTTATACATTTTTGTTGTAT\",\n 
\"CATTTTCCTCACTTGCAAATATATTTTAANAAATCTTCTTCCACCTTTGTTAATTAATGATTTAATTCTGAAAAACANTGTGTTGCAGCTAAGGCACTAAACGAAGCTAGGTTAGAAGCAAACGTGAGGGTAAACGGAGAAACTAGCTTCAAGAACAAATGGCCGAGCCCTTTCGCCGTAGTTTCCGGCTTACTTCTCCTCCTATCCTTCCTAAAGTTTGTCTACTCGCNTTTACGTTGGNTCGCCGTCGCAGCAGCTGCCGCCGGTNTNTNTCCGATTCTTGCCAAAGCCTTTGCTTCCATTAAAAGGCCTAGGATCGACATCAACATATTGGTCATAATAACCGGTAATACCACTTTCTCCTCTTTTCTTTATGCTGTCGTTATACCANTTTTTTTTNTAGTATTCATTATTAGCATCTAACATTATTTCTCTATATTACTCCGTGACTTAAGAATGATGTGTTCTATAATAATTTGTTAGTTTATGGNTTATCCGTGATCGATGTAACAA\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" 1466 | } 1467 | }, 1468 | "metadata": {}, 1469 | "execution_count": 144 1470 | } 1471 | ] 1472 | }, 1473 | { 1474 | "cell_type": "code", 1475 | "source": [ 1476 | "(results[\"reference_aa\"] == \"-\").sum()" 1477 | ], 1478 | "metadata": { 1479 | "colab": { 1480 | "base_uri": "https://localhost:8080/" 1481 | }, 1482 | "id": "xNoABL4t0bZk", 1483 | "outputId": "29585018-1d3f-46a2-952d-9df5a8cd5373" 1484 | }, 1485 | "execution_count": null, 1486 | "outputs": [ 1487 | { 1488 | "output_type": "execute_result", 1489 | "data": { 1490 | "text/plain": [ 1491 | "np.int64(51)" 1492 | ] 1493 | }, 1494 | "metadata": {}, 1495 | "execution_count": 148 1496 | } 1497 | ] 1498 | }, 1499 | { 1500 | "cell_type": "code", 1501 | "source": [ 1502 | "from google.colab import files\n", 1503 | "results.to_csv(\"results.csv\", index=False)\n", 1504 | "files.download(\"results.csv\")" 1505 | ], 1506 | "metadata": { 1507 | "colab": { 1508 | "base_uri": "https://localhost:8080/", 1509 | "height": 17 1510 | }, 1511 | "id": "xpN0OV2XwoKT", 1512 | "outputId": "ced13f2c-b173-401c-ec77-db778519cbc2" 1513 | }, 1514 | "execution_count": null, 1515 | "outputs": [ 1516 | { 1517 | "output_type": "display_data", 1518 | "data": { 1519 | "text/plain": [ 1520 | "" 1521 | ], 1522 | "application/javascript": [ 1523 | "\n", 1524 | " async function download(id, filename, size) {\n", 1525 | " if (!google.colab.kernel.accessAllowed) 
{\n", 1526 | " return;\n", 1527 | " }\n", 1528 | " const div = document.createElement('div');\n", 1529 | " const label = document.createElement('label');\n", 1530 | " label.textContent = `Downloading \"${filename}\": `;\n", 1531 | " div.appendChild(label);\n", 1532 | " const progress = document.createElement('progress');\n", 1533 | " progress.max = size;\n", 1534 | " div.appendChild(progress);\n", 1535 | " document.body.appendChild(div);\n", 1536 | "\n", 1537 | " const buffers = [];\n", 1538 | " let downloaded = 0;\n", 1539 | "\n", 1540 | " const channel = await google.colab.kernel.comms.open(id);\n", 1541 | " // Send a message to notify the kernel that we're ready.\n", 1542 | " channel.send({})\n", 1543 | "\n", 1544 | " for await (const message of channel.messages) {\n", 1545 | " // Send a message to notify the kernel that we're ready.\n", 1546 | " channel.send({})\n", 1547 | " if (message.buffers) {\n", 1548 | " for (const buffer of message.buffers) {\n", 1549 | " buffers.push(buffer);\n", 1550 | " downloaded += buffer.byteLength;\n", 1551 | " progress.value = downloaded;\n", 1552 | " }\n", 1553 | " }\n", 1554 | " }\n", 1555 | " const blob = new Blob(buffers, {type: 'application/binary'});\n", 1556 | " const a = document.createElement('a');\n", 1557 | " a.href = window.URL.createObjectURL(blob);\n", 1558 | " a.download = filename;\n", 1559 | " div.appendChild(a);\n", 1560 | " a.click();\n", 1561 | " div.remove();\n", 1562 | " }\n", 1563 | " " 1564 | ] 1565 | }, 1566 | "metadata": {} 1567 | }, 1568 | { 1569 | "output_type": "display_data", 1570 | "data": { 1571 | "text/plain": [ 1572 | "" 1573 | ], 1574 | "application/javascript": [ 1575 | "download(\"download_d7f10122-a0c2-4540-863a-4cf24d73e405\", \"results.csv\", 25736)" 1576 | ] 1577 | }, 1578 | "metadata": {} 1579 | } 1580 | ] 1581 | }, 1582 | { 1583 | "cell_type": "code", 1584 | "source": [ 1585 | "results[\"snp_centered_dna_window\"].apply(lambda x: len(x))" 1586 | ], 1587 | "metadata": { 1588 | "colab": { 
1589 | "base_uri": "https://localhost:8080/", 1590 | "height": 147 1591 | }, 1592 | "id": "1alzRqCgu_RZ", 1593 | "outputId": "2b546112-5323-4799-a5bd-9019be3fc192" 1594 | }, 1595 | "execution_count": null, 1596 | "outputs": [ 1597 | { 1598 | "output_type": "execute_result", 1599 | "data": { 1600 | "text/plain": [ 1601 | "0 513\n", 1602 | "1 513\n", 1603 | "Name: snp_centered_dna_window, dtype: int64" 1604 | ], 1605 | "text/html": [ 1606 | "
\n", 1607 | "\n", 1620 | "\n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | "
snp_centered_dna_window
0513
1513
\n", 1638 | "

" 1639 | ] 1640 | }, 1641 | "metadata": {}, 1642 | "execution_count": 127 1643 | } 1644 | ] 1645 | }, 1646 | { 1647 | "cell_type": "code", 1648 | "source": [], 1649 | "metadata": { 1650 | "id": "CuUvWfMpvwOz" 1651 | }, 1652 | "execution_count": null, 1653 | "outputs": [] 1654 | }, 1655 | { 1656 | "cell_type": "code", 1657 | "source": [ 1658 | "first_result = df.loc[4].to_dict()\n", 1659 | "first_result" 1660 | ], 1661 | "metadata": { 1662 | "colab": { 1663 | "base_uri": "https://localhost:8080/" 1664 | }, 1665 | "id": "0AOlo1ha2e1d", 1666 | "outputId": "ace84581-4489-425a-88aa-d4acec34bab3" 1667 | }, 1668 | "execution_count": null, 1669 | "outputs": [ 1670 | { 1671 | "output_type": "execute_result", 1672 | "data": { 1673 | "text/plain": [ 1674 | "{'chromosome': 2,\n", 1675 | " 'position': 8279539,\n", 1676 | " 'accession': 9525,\n", 1677 | " 'reference': 'T',\n", 1678 | " 'variant': 'A',\n", 1679 | " '-': 40,\n", 1680 | " 'impacts': 'MODERATE',\n", 1681 | " 'effects': 'missense_variant'}" 1682 | ] 1683 | }, 1684 | "metadata": {}, 1685 | "execution_count": 59 1686 | } 1687 | ] 1688 | }, 1689 | { 1690 | "cell_type": "code", 1691 | "source": [ 1692 | "get_accession_gene_fasta(HMA4_gene_id)(first_result[\"accession\"])" 1693 | ], 1694 | "metadata": { 1695 | "colab": { 1696 | "base_uri": "https://localhost:8080/", 1697 | "height": 140 1698 | }, 1699 | "id": "b1UmpkUTq3yG", 1700 | "outputId": "36e61bb1-c3cc-4731-9064-6865e209bd31" 1701 | }, 1702 | "execution_count": null, 1703 | "outputs": [ 1704 | { 1705 | "output_type": "execute_result", 1706 | "data": { 1707 | "text/plain": [ 1708 | 
"'>MPI-GMI|Ath-1001-Genomes|pseudo-genome|9525|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2\\nCTACGTTCCTAACACTTCTCTCAACCTTTATCTGATCGCACCAAACCAGTTTTTTCGCATCGGCTNCTTCCTTTTGCTACTAGCTCTCCTCTCTTCTCCGGTNTTTTTGTCTCNCTTCTTAATTCACACAGATTTCATGATAAGTGATGATCTATAACAAGACGCTAACTCTTCTCTTGCATTTCTCGTGTTTTCATTTTCTTGTTACGCCAAATTTATCCCTTCAAAATCNNTTTTTTATGNGTATAGAATCCAAATAATAAGTAAAAGCTGATTCGTCTTCTTCCACTTAACACAAGTAAGCAGTGAGAGGGTGAAGATTTTTCTTTAGGAAAACAAAGAGAGTGAAGATATTTTTTGGCTTGATCTCAACATTATTTTTTCNTAAAAGTAAACATTTTCAATAAGAAAANNNNNNNNNNNNNNCGNAAGTTTNTTNNNNANAAAAAAAAAAAGGNTTTGANCTGTTTCATGATAATGATAACNAAAAAAGTTTTTGCTTTCTTNTTTTTTTTCCTCCGCAAAACAGTCTNAAAGTATAACCAAAAAGCCTATAAATCAATATAATTTGTTGTTTTGATTTACGTTTTACNGAAAATGGCGTTACAAAACAAAGAAGAAGAGAAAAAGAAAGTGAAGAAGTTGCAAAAGAGTTACTACGATGTTCTCGGAATCTGTTGTACATCGGAAGTTCCTATAATCGAGAATATTCTCAAGTCACTTGACGGCGTTAAAGAATATTCCGTCATCGTTCCCTCGAGAACCGTGNTTGTTGTTCACGACAGNNTCCTCATCTCTCCCTTCCNAATTGGTAAATNTTTTTTTTCTTTNTGATAATAAAGNTTTTTTNNNNNAAAAAAATTGGTAAATCATTATAANTAAATAGTTATTTAANATTTCTCTAATTTTTAATTTTACTCAGTNAAAAAATAANAATTAAAAAGCATAAATAANNNNNNNNTATTACACGAAAAGCTCACTTCATCTTATTTCTATTTATAAATTAGTGGTTTTGCGTCATGGTTTGATATTTTAATTAGTCAAATATATGTGATCCAANAAGTCACTTCAACTGAAAATATTTAATATTCTACCACTNAAATTTAATTTGCTTTCCCAAATCATTTCTTCTTAAGAAAACTAGTTTAGANCCAATTCTGTTTTANCAAAGTAATTTTTTCTTAATTGCTTTGCTTTGATATTTATGTACTGATNNNNNNNNNNNNNTGGTTGTNTTTTTAACCTAGGAGCTAAAGGNNAATATTAAAATTAACANTTTTTTCGCCTACAACAAAATAGGATAAACCGTTACTNTTTTTTNTGTTTTATTAATCCCATGAAACACGTTCAGTTAGNATAACCAAAGATTGTGAATAAGGTTCGTCTANTTTTTTTTCAAACTGTATCTAGTAAACCAACAAATTAAAAATGGTAACTTTTATAGAAACGCATAATGATACAATAACGGCAATAATACAACGAATCACATGTTTNTATAAATTCTAAATTTTGCATATCATAAAACCTTTACCACATTNNNTNTTTTNTGCATATTATAGCTAACCTAATACGTGCATATAACGCANNNTACGTGTGTAAACAAATAAATAATATATTTAATTGACATCATATATAGAATTTAATTCCGAAATCATATATAGATAATACATAATAGAANAAAGGTTGTTAGAAAAAGCCTGTCGTGATACTTTACCAAATCTTTTGATATGAATATTTAATTGGTAGTGCATCATACTCGTTACGTAACAATATTTTATAATTTTTATTTGACAGAAAACATATTCTATATTAGACATTNAAAGTNAAAAATAATCAAANAAAAAATCANAAAAAATTAAATAATAACACTATTTTCATCATTAAGAAAATACACTATTTTCAG
TTTATTATNTANTAGTTTTTAAGGTTTGAATTTATAAAAATTGGCTAATATTTGTCATTTTCCTCACTTGCAAATATATTTTAANAAATCTTCTTCCACCTTTGTTAATTNATGATTTAATTCNGAAAAACANNGTGTTGCAGCNAAGGCACTAAACGAAGCTAGGTTAGNAGCAAANGTNAGGNTAAANGGAGAAACTAGCTTCAAGAACAAATGGCCGAGCCCTTTCGCCGTAGTTTCCGGCTTACTNCTCCTCCTATCCTTCCTAAAGTTTGTCTACNCNCNTTTACGTTGGCTCGCCGTCGCAGCAGTTGCCGCCGGTATCTATCCGATTCTTGCCAAAGCCTTTGCTTCCATTAAAAGGCCTAGGATCGACATCAACATATTGGTCATAATAACCGGTAATACCACTTTCTCCTCTTTTCTTTATGCTGTCGTTATACCANTTTTTTTTNTAGTATTCATTATTAGCATCTAACATTATTTCTCTATATTACTCCGTGACTTAAGAATGATGTGTTCTATAATAATTTGTTAGTTTATGGNTTATCCGTGATCGATGTAACAACTAGAAATAATATTGTTACTTATCTAAGATTTGNAAACTTAGCAAATTGTGGTTAAGGTAAAACTATTAATATATATAATTTCTTAAAANTTGTGATCGAGTCTCTATCCCTTTTTCGGTATTTTAAAAAAGCTTACATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAAAAGTTTCCTTTATTTTGTTCGTAACNTTTTTAAATAAATTCATATGGGAAAACTACTAAAGAATAGGTATTAAGAAAAATATTGTATTATGCTCAACTTATTTTACTGAACTCCAATATAGGAAAAATTAAATACGTGTAGCAGTTTGATATGNTTGTTATATTGTAGANTTTATTTGAGTTATAGGTTATAGAGGAAAACTAAAACAACTTTATATTTTTGCCGTAAGCAATGCAAATTNGAGTTATAGTTTATAGGAAAAAATNAAAATGTTCTTAGGATATAGGAAAAGTGAACATGTNATATAAACTTGAAAGTTGGTGTGGAAGATAATGCAAAAACAAAAGGTGCATTTCAAAATACGGGAAAAAGAAAGACGCCTTTCTCTTTTCTGTTTATAAAAATATACATTTTCGTTCATTTCTACTAGTACGATTATTTATAGGAACATACATAATTTCAAATTAAAACCATCTAAAGAGTAGNGAAAATAATTAAATATTTTGTTCAAGGAAAATAGTTATATTGTCTATAACGTAGGCAAGAAAATAAAAATCAGTTTTATTTTCCTNGGGTGCAATTAAGANGNCAATCTTTTTATATAAAATCAGAGTAATTTCATACCAGAAATCCAGACTAAATTTTGAGNTTTGTAGGAAGTCTCAAAAATCATAGTGTGATATCTTTTGATGNTTTTTTCTAAAACAATTACAATTACGTGCCATAGATAGAAAAACGCTAAGCATAGGTTTTTGATTGAAAATGGAAAATGGAAAAAGAACCTTCAACTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATGGAGCTATGGAATTAGTTTTAGTTGNANTTTCTGTTTAGTAGACATAATTAACAANAAACCATACAAAGTCATNTNNANCCAAGTTTTTTTTTTTTTNTNNTTTTTTTTTNTTTCGCTTGGTAGATGTTATAGTGTTACTTTAAGCTTTCTTTGATTTTCTAAAAACAAAATATTTTACACTGAGAGGTGATACAATGTTTTAGGTTTAAACTGATAATGATCCAATTTCTTCAATACAGTGATTGCAACACTTGCAATGCAAGATTTCATGGAGGCAGCAGCAGTTGTGTTCCTATTCACCATATCCGACTGGCTCGAAACAAGAGCTAGCTACAAGGNTTGTTTCNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGTGNNNNNNNNNNNNNNNTGGTNTANNNAGGTAATGATTATTGCAAATGTAAAATCTCAGGCGACCTCGGTAATGCAGTCTTTGATGAGCTTAGCTCCACAAAAGGCTACAATAGCAGAGACTGGTGAAGAAGTTGAGGTAGATGAGGTTAAGGTTGATACTGTTGTAGCAGTTAAAGCTGGTGAAACCATACCAATTGATGGGATTGTGGTGGATGGAAACTGTGAAGTAGACGAGAAAACCTTAACGGGCGAAGCATTTCCTGTGCCTAAACAGAGAGATTCTACGGTTTGGGCTGGNACCATCAATCTAAATGGTANGTAATCCTATTTTAAGAGCTTCAAGCTTTATACATTTTTGTTGTATCTTTCATTCTTTTGATCTTTCTTGGTNTTTTTTTTGCAGGTTACATAAGTGNGAAAACAACTTCTTTAGCGGGTGATTGCGTGGTTGCGAAAATGGCTAAGCTAGTAGAAGAAGCTCAGAGCAGTAAAACCAAATCTCAGAGACTAATAGACAAATGTTCTCAGTACTATACTCCAGGTAAGCAAAGACATAACCAAAACGTNNTNTTTATGTTCTTGATTCGAGTAATTTTGAGACCTCTGTGNTTCTTGTTTTGTTTTCAGCAATCATCTTAGTATCAGCTTGCGTCGCCATTGTCCCGGTTATAATGAAGGTCCACAACCTTAAACATTGGTTCCACCTAGCATTAGTTGTGTTAGTCAGTGGTTGTCCCTGTGGTCTTATCCTCTCTACACCAGTTGCTACTTTCTGTGCACTTACTAAAGCGGCAACTTCCGGACTTCTGATCAAAAGTGCTGATTATCTTGACACTCTCTCAAAGATCAAGATCGTTGCTTTCGATAAAACTGGGACTATTACAAGAGGAGAGTTNATTGTCATAGNTTTCAAGTCNCTCTCTAGAGATATAAACCNACGCAGCTTGCTTTNCTGGNNAAAAAAAAACTTGTGTNCTNNCCATAACNAGTTTGCCGAGATAACTTATGACNGAGACTTTCTTGTTATATAATGTTTCAGGGTATCTAGTGTTGAAAGCAAATCAAGTCATCCAATGGCTGCAACAATCGTGGACTATGCAAAATCTGTTTCTGTTGAGCCTAGGCCTGAAGAGGTTGAGGATTACCAGAACTTTCCAGGTGAAGGAATCTACGGGAAGATTGATGGTAACGATATCTTCATTGGGAACNAAAAGATTGCTTCTCGAGCTGGTTGTTCAACAGGTACTTGTAATATAAAGCTCAACAGAATGTTTTGAGGTCTTGGTATTCTTTAGTCTGAGCACTTGATTNTTCTTTCTTTACAGTTCCAGAGATTGAAGTTGATACCAAAGGCGGGAAGACTGTTGGATACGTCTATGTAGGTGAAAGACTAGCTGGATTTTTCAATCTTTCTGATGCTTGTAGATCTGGTGTTTCTCAAGCAATGGCAGAACTGAAATCTCTAGGAATCAAAACCGCAATGCTAACGGGAGATAATCAAGCCGCGGCAATGCATGCTCAAGAACAGGTGAGACTAATATAAACCATAATTTTAATCACTCTCTTAAGCGAAGACGTTGTTTAATAGTNCCTTGAAATGTCTTATGAAACAGCTAGGGAATGTTTTAGATGTTGTACATGGAGATCTTCTTCCAGAAGATAAGTCCAGAATCATACAAGAGTTTAAGAAAGAGGGACCAACCGCAATGGTAGGGGACGGTGTGAATGATGCACCAGCTTTAGCTACAGCTGATATTGGTATCTCCATGGGAATTTCTGGCTCTGCTCTTGCAACACAAACTGGTAATATTATTCTGATGTCTAATGATATAAGAAGGATACCACAAGCGGTGAAGCTAGCGAGAAGAGCACGACGCAAAGTTGTTGAAAACGTGTGTCTATCAATCATTTTAAAAGCAGGAATNCTCGCTTNGGCATTTGCTGGTCATCCTTTGA
TTTGGGCTGCGGTTCTTGTTGATGTAGGGACTTGTCTGCTTGTNATTTTCAATAGTATGTTGCTGCTGCGAGAGAAGAAAAAGATTGGGAACAAAAAGTGTTACAGGGCTTCTACATCTAAGTTGAATGGTAGGAAACTTGAAGGCGATGATGATTATGTTGTGGACTTAGANGCAGGCTTGTTAAAAAAGAGCGGGAATGGTCAATGCAAATCAAGCTGTTGTGGAGATAAGAAAAATCAAGAGAATGTTGTGATGATGAAACCAAGTAGTAAAACCAGTTCTGATCATTCTCACCCTGGTTGTTGTGGCGATAAGAAGGAAGAAAAAGTGAAGCCGCTTGTGAAAGATGGCTGTTGCAGTGAGAAAACTAGGAAACCAGAGGGAGATATGGTTTCATTGAGCTCATGTANGAAGTCTAGTCATGTNAAACATGACCTGAAAATGNAAGGTGGTTCAGGTTGTTGTGNTAGCAAAAATNAGNAAGNGAAGGAANTAGTAGCAAAAAGCTGTTGTGAGAAACCCAAACAGCAGGNGGNGAGNGTTNGAGACTGCAAGTCTGGTCATTGCGAGAAGAAGAAGCAAGCTGAAGACATTGTTGTCCCGGTGCAGATTATTGGTCATGCATTAACGCATGTGGAGATCGAGTTGCAGACAAAGGAAACCTGCAAAACAAGCTGTTGTGACAGTAAAGATAAGGTTAAGGAGACAGGTTTGCTGCTTTCTAGTGAGAACACNCCTTACCTGGAGAAAGGNGTGCTGATTAAAGATGAAGGAAACTGCAAGTCTGGCAGCGAGAACATGGGGACAGTGAAACAAAGCTGCCATGAGAAGGGCTGCAGCGATGANAAACAAACCGGGGAAATAACTCTTGCTTCGGAGGAAGAGACAGATGATCAAGATTGCTCCTCGGGATGTTGTGTGAACGAGGGAACAGTGAAACAAAGCTTCCATGAGAAGAAGCATTCTGTGTTGGTGGAGAAGGAAGGTTTGGACATGGAAACTGGTTTCTGTTGTGATGCCAAGCTGGTTTGTTGTGGAAACACAGAAGGTGAAGTGAAGGAGCAATGTCGTCTGGAGATAAAGAAAGAAGAACATTGCAAGTCTGGTTGCTGCGGCGAGGAAAAACAAACCGGAGAAATCGCTCTGGNTTCAGAGGAAGAGNNNNNNAGCACGAATTGTTCCACGGGTTGTTGTGTGGACAAAGAAGAAGTGACACAAACCTGTCATGAGAAGCCTGTTAGCTTGGTGGTATCAGGCTTGGAAGTGAAGAAGGATGAGCATTGTGAGAGCTCACACAGAGCCGTCAAGGTAGAGACCTGTTGCAAAGTGAAGATTCCAGAGGCTTGCGCATCAAAATGTAGGGACAGAGCGAAGCGTCACAGTGGTAAAAGCTGTTGCAGGAGTTATGCAAAAGAGNTATGCAGCCACCGCCATCATCATCACCACCACCACCACCATCACCATGTGAGTGCTTGATGGAGATTGATTGAATAACTTAAACTCTTGATGCATCCATCTATTCACATTACGTTTANTCTCATTCCGTGAATGCCGAAANAAAAAACAAAATGTTCCAGCAAAGGCAGTTTATTAGATTAAGCAACTGTGTTATTCATAAAGACAATGCTAGTGATTTTTTTTAAGTACTTTATGTATTGCAATTCCT'" 1709 | ], 1710 | "application/vnd.google.colaboratory.intrinsic+json": { 1711 | "type": "string" 1712 | } 1713 | }, 1714 | "metadata": {}, 1715 | "execution_count": 87 1716 | } 1717 | ] 1718 | }, 1719 | { 1720 | "cell_type": "code", 1721 | "source": [ 1722 | "HMA4_fasta_cache = 
DefaultDict(get_accession_gene_fasta(HMA4_gene_id))" 1723 | ], 1724 | "metadata": { 1725 | "id": "vMJWm-_kqPJO" 1726 | }, 1727 | "execution_count": null, 1728 | "outputs": [] 1729 | }, 1730 | { 1731 | "cell_type": "code", 1732 | "source": [ 1733 | "get_aa_substitution_pair(first_result)" 1734 | ], 1735 | "metadata": { 1736 | "colab": { 1737 | "base_uri": "https://localhost:8080/" 1738 | }, 1739 | "id": "uoAvXA5_l1W_", 1740 | "outputId": "0f777d88-f870-4aae-c161-f6e57a9b8e4d" 1741 | }, 1742 | "execution_count": null, 1743 | "outputs": [ 1744 | { 1745 | "output_type": "execute_result", 1746 | "data": { 1747 | "text/plain": [ 1748 | "(MutableSeq('Y'), MutableSeq('F'))" 1749 | ] 1750 | }, 1751 | "metadata": {}, 1752 | "execution_count": 61 1753 | } 1754 | ] 1755 | }, 1756 | { 1757 | "cell_type": "code", 1758 | "source": [ 1759 | "fasta = get_accession_gene_fasta(HMA4_gene_id)(first_result[\"accession\"])" 1760 | ], 1761 | "metadata": { 1762 | "id": "JrsYwelu6CiX" 1763 | }, 1764 | "execution_count": null, 1765 | "outputs": [] 1766 | }, 1767 | { 1768 | "cell_type": "code", 1769 | "source": [ 1770 | "record = SeqIO.read(StringIO(fasta), \"fasta\")\n", 1771 | "record" 1772 | ], 1773 | "metadata": { 1774 | "colab": { 1775 | "base_uri": "https://localhost:8080/" 1776 | }, 1777 | "id": "DPj6gOYcbT3h", 1778 | "outputId": "f74e2595-2953-4c15-8184-13fbc5a5a5aa" 1779 | }, 1780 | "execution_count": null, 1781 | "outputs": [ 1782 | { 1783 | "output_type": "execute_result", 1784 | "data": { 1785 | "text/plain": [ 1786 | "SeqRecord(seq=Seq('CTACGTTCCTAACANTTCTCTCAACCTTTATCTGATCGCACCAAACCAGTTTTT...CCT'), id='MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2', name='MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2', description='MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2', dbxrefs=[])" 1787 | ] 1788 | }, 1789 | "metadata": {}, 1790 | "execution_count": 
18 1791 | } 1792 | ] 1793 | }, 1794 | { 1795 | "cell_type": "code", 1796 | "source": [ 1797 | "aa = record.seq.translate()" 1798 | ], 1799 | "metadata": { 1800 | "colab": { 1801 | "base_uri": "https://localhost:8080/" 1802 | }, 1803 | "id": "Dlox9xZqdM-e", 1804 | "outputId": "f8c7685c-967a-4055-d46d-6516cb5e66ff" 1805 | }, 1806 | "execution_count": null, 1807 | "outputs": [ 1808 | { 1809 | "output_type": "stream", 1810 | "name": "stderr", 1811 | "text": [ 1812 | "/usr/local/lib/python3.12/dist-packages/Bio/Seq.py:2877: BiopythonWarning: Partial codon, len(sequence) not a multiple of three. Explicitly trim the sequence or add trailing N before translation. This may become an error in future.\n", 1813 | " warnings.warn(\n" 1814 | ] 1815 | } 1816 | ] 1817 | }, 1818 | { 1819 | "cell_type": "code", 1820 | "source": [ 1821 | "start, _ = get_start_end_from_seq(record)\n", 1822 | "start" 1823 | ], 1824 | "metadata": { 1825 | "colab": { 1826 | "base_uri": "https://localhost:8080/" 1827 | }, 1828 | "id": "0WZ4QyJSgohr", 1829 | "outputId": "ba68e1fa-54e0-4243-f9c7-d7c5e1ccb845" 1830 | }, 1831 | "execution_count": null, 1832 | "outputs": [ 1833 | { 1834 | "output_type": "execute_result", 1835 | "data": { 1836 | "text/plain": [ 1837 | "8278881" 1838 | ] 1839 | }, 1840 | "metadata": {}, 1841 | "execution_count": 37 1842 | } 1843 | ] 1844 | }, 1845 | { 1846 | "cell_type": "code", 1847 | "source": [ 1848 | "in_gene_position = first_result[\"position\"] - start" 1849 | ], 1850 | "metadata": { 1851 | "id": "XNRtQwOzeD9-" 1852 | }, 1853 | "execution_count": null, 1854 | "outputs": [] 1855 | }, 1856 | { 1857 | "cell_type": "code", 1858 | "source": [ 1859 | "first_result[\"variant\"] == record.seq[in_gene_position]" 1860 | ], 1861 | "metadata": { 1862 | "colab": { 1863 | "base_uri": "https://localhost:8080/" 1864 | }, 1865 | "id": "xoJgVLKig9F5", 1866 | "outputId": "5a7ca1b8-217f-4acf-ddaa-de3bee4009c9" 1867 | }, 1868 | "execution_count": null, 1869 | "outputs": [ 1870 | { 1871 | 
"output_type": "execute_result", 1872 | "data": { 1873 | "text/plain": [ 1874 | "True" 1875 | ] 1876 | }, 1877 | "metadata": {}, 1878 | "execution_count": 47 1879 | } 1880 | ] 1881 | }, 1882 | { 1883 | "cell_type": "code", 1884 | "source": [ 1885 | "codon_start = (in_gene_position // 3) * 3\n", 1886 | "codon_start" 1887 | ], 1888 | "metadata": { 1889 | "colab": { 1890 | "base_uri": "https://localhost:8080/" 1891 | }, 1892 | "id": "paV0Hfbah0Hy", 1893 | "outputId": "7e18ef33-0e33-41d4-a476-6f9712c3b4e5" 1894 | }, 1895 | "execution_count": null, 1896 | "outputs": [ 1897 | { 1898 | "output_type": "execute_result", 1899 | "data": { 1900 | "text/plain": [ 1901 | "642" 1902 | ] 1903 | }, 1904 | "metadata": {}, 1905 | "execution_count": 48 1906 | } 1907 | ] 1908 | }, 1909 | { 1910 | "cell_type": "code", 1911 | "source": [ 1912 | "in_condon_position = in_gene_position % 3\n", 1913 | "in_condon_position" 1914 | ], 1915 | "metadata": { 1916 | "colab": { 1917 | "base_uri": "https://localhost:8080/" 1918 | }, 1919 | "id": "hpEdQDqAimqV", 1920 | "outputId": "f3d3e3ca-39dc-48c7-d5b3-0665edd3dfcc" 1921 | }, 1922 | "execution_count": null, 1923 | "outputs": [ 1924 | { 1925 | "output_type": "execute_result", 1926 | "data": { 1927 | "text/plain": [ 1928 | "0" 1929 | ] 1930 | }, 1931 | "metadata": {}, 1932 | "execution_count": 49 1933 | } 1934 | ] 1935 | }, 1936 | { 1937 | "cell_type": "code", 1938 | "source": [ 1939 | "codon = MutableSeq(record.seq[codon_start:codon_start+3])" 1940 | ], 1941 | "metadata": { 1942 | "id": "zY5RIgdMiI40" 1943 | }, 1944 | "execution_count": null, 1945 | "outputs": [] 1946 | }, 1947 | { 1948 | "cell_type": "code", 1949 | "source": [ 1950 | "codon.translate()" 1951 | ], 1952 | "metadata": { 1953 | "colab": { 1954 | "base_uri": "https://localhost:8080/" 1955 | }, 1956 | "id": "cDkbbg01iQSp", 1957 | "outputId": "9dd02500-7d28-43ba-f248-cda40a358fda" 1958 | }, 1959 | "execution_count": null, 1960 | "outputs": [ 1961 | { 1962 | "output_type": 
"execute_result", 1963 | "data": { 1964 | "text/plain": [ 1965 | "MutableSeq('V')" 1966 | ] 1967 | }, 1968 | "metadata": {}, 1969 | "execution_count": 55 1970 | } 1971 | ] 1972 | }, 1973 | { 1974 | "cell_type": "code", 1975 | "source": [ 1976 | "codon[in_condon_position] = first_result[\"reference\"]\n", 1977 | "codon.translate()" 1978 | ], 1979 | "metadata": { 1980 | "colab": { 1981 | "base_uri": "https://localhost:8080/" 1982 | }, 1983 | "id": "A7pDErOziTPs", 1984 | "outputId": "98ba4e30-6be5-4e7d-b2a1-936bdacdbae0" 1985 | }, 1986 | "execution_count": null, 1987 | "outputs": [ 1988 | { 1989 | "output_type": "execute_result", 1990 | "data": { 1991 | "text/plain": [ 1992 | "MutableSeq('L')" 1993 | ] 1994 | }, 1995 | "metadata": {}, 1996 | "execution_count": 56 1997 | } 1998 | ] 1999 | }, 2000 | { 2001 | "cell_type": "code", 2002 | "source": [ 2003 | "aa" 2004 | ], 2005 | "metadata": { 2006 | "colab": { 2007 | "base_uri": "https://localhost:8080/" 2008 | }, 2009 | "id": "41MMMDoweAtl", 2010 | "outputId": "45777d91-ac38-4a1e-9d9e-b85bf4dce3af" 2011 | }, 2012 | "execution_count": null, 2013 | "outputs": [ 2014 | { 2015 | "output_type": "execute_result", 2016 | "data": { 2017 | "text/plain": [ 2018 | "Seq('LRS*XFSQPLSDRTKPVFSHRLLPFAXSSPLFSGXFVSLLNSHRFHDK**SIXR...CNS')" 2019 | ] 2020 | }, 2021 | "metadata": {}, 2022 | "execution_count": 22 2023 | } 2024 | ] 2025 | }, 2026 | { 2027 | "cell_type": "code", 2028 | "source": [ 2029 | "record.id" 2030 | ], 2031 | "metadata": { 2032 | "colab": { 2033 | "base_uri": "https://localhost:8080/", 2034 | "height": 35 2035 | }, 2036 | "id": "c4EnAyO0fxGy", 2037 | "outputId": "cbc2bd94-3564-45cb-e5d8-883c261db2fe" 2038 | }, 2039 | "execution_count": null, 2040 | "outputs": [ 2041 | { 2042 | "output_type": "execute_result", 2043 | "data": { 2044 | "text/plain": [ 2045 | "'MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2'" 2046 | ], 2047 | "application/vnd.google.colaboratory.intrinsic+json": { 
2048 | "type": "string" 2049 | } 2050 | }, 2051 | "metadata": {}, 2052 | "execution_count": 31 2053 | } 2054 | ] 2055 | }, 2056 | { 2057 | "cell_type": "code", 2058 | "source": [ 2059 | "record.seq[in_gene_position]" 2060 | ], 2061 | "metadata": { 2062 | "colab": { 2063 | "base_uri": "https://localhost:8080/", 2064 | "height": 35 2065 | }, 2066 | "id": "-y8MHPRrndYi", 2067 | "outputId": "78cdd7ef-10e0-4e3e-80db-eb2351c60ad3" 2068 | }, 2069 | "execution_count": null, 2070 | "outputs": [ 2071 | { 2072 | "output_type": "execute_result", 2073 | "data": { 2074 | "text/plain": [ 2075 | "'G'" 2076 | ], 2077 | "application/vnd.google.colaboratory.intrinsic+json": { 2078 | "type": "string" 2079 | } 2080 | }, 2081 | "metadata": {}, 2082 | "execution_count": 63 2083 | } 2084 | ] 2085 | }, 2086 | { 2087 | "cell_type": "code", 2088 | "source": [ 2089 | "snp_centered_dna_window = record.seq[in_gene_position-256:in_gene_position+257]" 2090 | ], 2091 | "metadata": { 2092 | "id": "jHZW-AwDnmde" 2093 | }, 2094 | "execution_count": null, 2095 | "outputs": [] 2096 | }, 2097 | { 2098 | "cell_type": "code", 2099 | "source": [ 2100 | "len(snp_centered_dna_window)" 2101 | ], 2102 | "metadata": { 2103 | "colab": { 2104 | "base_uri": "https://localhost:8080/" 2105 | }, 2106 | "id": "xd-2KX6Lnxex", 2107 | "outputId": "7de526b8-8d46-4224-d6d4-4f5126bb8605" 2108 | }, 2109 | "execution_count": null, 2110 | "outputs": [ 2111 | { 2112 | "output_type": "execute_result", 2113 | "data": { 2114 | "text/plain": [ 2115 | "513" 2116 | ] 2117 | }, 2118 | "metadata": {}, 2119 | "execution_count": 70 2120 | } 2121 | ] 2122 | }, 2123 | { 2124 | "cell_type": "code", 2125 | "source": [ 2126 | "snp_centered_dna_window[len(snp_centered_dna_window) // 2]" 2127 | ], 2128 | "metadata": { 2129 | "colab": { 2130 | "base_uri": "https://localhost:8080/", 2131 | "height": 35 2132 | }, 2133 | "id": "gjw2pKF5n3NB", 2134 | "outputId": "608e475f-7b65-4538-883b-be29a3ea707f" 2135 | }, 2136 | "execution_count": null, 2137 | "outputs": [ 2138 | { 2139 | 
"output_type": "execute_result", 2140 | "data": { 2141 | "text/plain": [ 2142 | "'G'" 2143 | ], 2144 | "application/vnd.google.colaboratory.intrinsic+json": { 2145 | "type": "string" 2146 | } 2147 | }, 2148 | "metadata": {}, 2149 | "execution_count": 74 2150 | } 2151 | ] 2152 | }, 2153 | { 2154 | "cell_type": "code", 2155 | "source": [ 2156 | "DefaultDict(lambda key: key + 5)" 2157 | ], 2158 | "metadata": { 2159 | "id": "gAo7w-Y33V8O" 2160 | }, 2161 | "execution_count": null, 2162 | "outputs": [] 2163 | }, 2164 | { 2165 | "cell_type": "code", 2166 | "source": [ 2167 | "foo = \"AAATAAA\"\n", 2168 | "\n", 2169 | "foo[len(foo) // 2]" 2170 | ], 2171 | "metadata": { 2172 | "colab": { 2173 | "base_uri": "https://localhost:8080/", 2174 | "height": 35 2175 | }, 2176 | "id": "vHV2Isu9oGDd", 2177 | "outputId": "238229fc-37ab-42a0-e54a-085b756d7db6" 2178 | }, 2179 | "execution_count": null, 2180 | "outputs": [ 2181 | { 2182 | "output_type": "execute_result", 2183 | "data": { 2184 | "text/plain": [ 2185 | "'T'" 2186 | ], 2187 | "application/vnd.google.colaboratory.intrinsic+json": { 2188 | "type": "string" 2189 | } 2190 | }, 2191 | "metadata": {}, 2192 | "execution_count": 76 2193 | } 2194 | ] 2195 | }, 2196 | { 2197 | "cell_type": "code", 2198 | "source": [ 2199 | "foo[3-10:3+2]" 2200 | ], 2201 | "metadata": { 2202 | "colab": { 2203 | "base_uri": "https://localhost:8080/", 2204 | "height": 35 2205 | }, 2206 | "id": "JDgLsIEpoOoZ", 2207 | "outputId": "f8fc1f3c-a984-48dc-c192-73d88787430b" 2208 | }, 2209 | "execution_count": null, 2210 | "outputs": [ 2211 | { 2212 | "output_type": "execute_result", 2213 | "data": { 2214 | "text/plain": [ 2215 | "'AAATA'" 2216 | ], 2217 | "application/vnd.google.colaboratory.intrinsic+json": { 2218 | "type": "string" 2219 | } 2220 | }, 2221 | "metadata": {}, 2222 | "execution_count": 115 2223 | } 2224 | ] 2225 | } 2226 | ] 2227 | } --------------------------------------------------------------------------------