├── LICENSE
└── 1001genomes_PAM_GPN_correlation.ipynb
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Silvan Büdenbender
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/1001genomes_PAM_GPN_correlation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyNPlEba2v8XoRYdbXK+KCGE",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "source": [
32 | "%pip install biopython==1.86"
33 | ],
34 | "metadata": {
35 | "colab": {
36 | "base_uri": "https://localhost:8080/"
37 | },
38 | "id": "AcjQYlyybMKq",
39 | "outputId": "bbd6f38e-43af-44d2-a315-da76ad3f8241"
40 | },
41 | "execution_count": null,
42 | "outputs": [
43 | {
44 | "output_type": "stream",
45 | "name": "stdout",
46 | "text": [
47 | "Collecting biopython\n",
48 | " Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)\n",
49 | "Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages (from biopython) (2.0.2)\n",
50 | "Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)\n",
51 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.2/3.2 MB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
52 | "\u001b[?25hInstalling collected packages: biopython\n",
53 | "Successfully installed biopython-1.86\n"
54 | ]
55 | }
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "source": [
61 | "import time\n",
62 | "import requests\n",
63 | "from collections import defaultdict\n",
64 | "from io import StringIO\n",
65 | "\n",
66 | "from tqdm import tqdm\n",
67 | "import pandas as pd\n",
68 | "import numpy as np\n",
69 | "from Bio import SeqIO\n",
70 | "from Bio.Seq import Seq, MutableSeq"
71 | ],
72 | "metadata": {
73 | "id": "k5L396sZ0qxA"
74 | },
75 | "execution_count": null,
76 | "outputs": []
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": null,
81 | "metadata": {
82 | "id": "hu4QRbwAzV7y"
83 | },
84 | "outputs": [],
85 | "source": [
86 | "HMA4_gene_id = \"AT2G19110.1\"  # gene identifier interpolated as the `gid`/`gids` value of the 1001 Genomes API URLs used in this notebook\n",
87 | "amino_acid_substitution = \"missense_variant\"  # value sent as the `effect` filter of the variants query"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "source": [
93 | "class DefaultDict(defaultdict):  # defaultdict variant whose factory is called with the missing key itself\n",
94 | "    def __missing__(self, key):  # invoked by dict lookup only when `key` is absent\n",
95 | "        return self.setdefault(key, self.default_factory(key))  # store the computed value so later lookups of `key` reuse it\n",
96 | "\n",
97 | "def get_accession_gene_fasta(gene):  # build a one-argument fetcher bound to `gene`\n",
98 | "    def get_gene(accession):  # GET the pseudogenome FASTA text of `gene` for one accession\n",
99 | "        response = requests.get(f\"https://tools.1001genomes.org/api/v1/pseudogenomes/strains/{accession}/gids/{gene}\", timeout=30)  # bounded wait: requests applies no timeout by default\n",
100 | "        return response.text  # NOTE(review): the HTTP status is not checked, so an error body is returned as-is\n",
101 | "    return get_gene\n"
102 | ],
103 | "metadata": {
104 | "id": "XozcmWeH5LBT"
105 | },
106 | "execution_count": null,
107 | "outputs": []
108 | },
109 | {
110 | "cell_type": "code",
111 | "source": [
112 | "HMA4_fasta_cache = DefaultDict(get_accession_gene_fasta(HMA4_gene_id))  # accession -> FASTA text; looking up a missing accession calls the fetcher for HMA4_gene_id"
113 | ],
114 | "metadata": {
115 | "id": "KqQTiutV2Y0e"
116 | },
117 | "execution_count": null,
118 | "outputs": []
119 | },
120 | {
121 | "cell_type": "code",
122 | "source": [
123 | "aa_sub_snps = requests.get(f\"https://tools.1001genomes.org/api/v1.1/variants.json?type=snps;accs=all;gid={HMA4_gene_id};effect={amino_acid_substitution}\", timeout=60)  # bounded wait: requests applies no timeout by default\n",
124 | "df = pd.DataFrame(aa_sub_snps.json()['data'], columns=[\"chromosome\", \"position\", \"accession\", \"reference\", \"variant\", \"-\", \"impacts\", \"effects\"])\n",
125 | "df"
126 | ],
127 | "metadata": {
128 | "id": "S25ZOM6u0zBX",
129 | "colab": {
130 | "base_uri": "https://localhost:8080/",
131 | "height": 424
132 | },
133 | "outputId": "d54e94fd-560b-4097-e523-63bb4556b389"
134 | },
135 | "execution_count": null,
136 | "outputs": [
137 | {
138 | "output_type": "execute_result",
139 | "data": {
140 | "text/plain": [
141 | " chromosome position accession reference variant - impacts \\\n",
142 | "0 2 8279523 9653 T G 40 MODERATE \n",
143 | "1 2 8279523 9655 T G 40 MODERATE \n",
144 | "2 2 8279523 9661 T G 40 MODERATE \n",
145 | "3 2 8279523 9968 T G 40 MODERATE \n",
146 | "4 2 8279539 9525 T A 40 MODERATE \n",
147 | "... ... ... ... ... ... .. ... \n",
148 | "2328 2 8286155 9845 G T 40 MODERATE \n",
149 | "2329 2 8286155 9886 G T 40 MODERATE \n",
150 | "2330 2 8286155 9888 G T 40 MODERATE \n",
151 | "2331 2 8286155 9894 G T 40 MODERATE \n",
152 | "2332 2 8286250 9121 G A 40 MODERATE \n",
153 | "\n",
154 | " effects \n",
155 | "0 missense_variant \n",
156 | "1 missense_variant \n",
157 | "2 missense_variant \n",
158 | "3 missense_variant \n",
159 | "4 missense_variant \n",
160 | "... ... \n",
161 | "2328 missense_variant \n",
162 | "2329 missense_variant \n",
163 | "2330 missense_variant \n",
164 | "2331 missense_variant \n",
165 | "2332 missense_variant \n",
166 | "\n",
167 | "[2333 rows x 8 columns]"
168 | ],
169 | "text/html": [
170 | "\n",
171 | "
\n",
172 | "
\n",
173 | "\n",
186 | "
\n",
187 | " \n",
188 | " \n",
189 | " | \n",
190 | " chromosome | \n",
191 | " position | \n",
192 | " accession | \n",
193 | " reference | \n",
194 | " variant | \n",
195 | " - | \n",
196 | " impacts | \n",
197 | " effects | \n",
198 | "
\n",
199 | " \n",
200 | " \n",
201 | " \n",
202 | " | 0 | \n",
203 | " 2 | \n",
204 | " 8279523 | \n",
205 | " 9653 | \n",
206 | " T | \n",
207 | " G | \n",
208 | " 40 | \n",
209 | " MODERATE | \n",
210 | " missense_variant | \n",
211 | "
\n",
212 | " \n",
213 | " | 1 | \n",
214 | " 2 | \n",
215 | " 8279523 | \n",
216 | " 9655 | \n",
217 | " T | \n",
218 | " G | \n",
219 | " 40 | \n",
220 | " MODERATE | \n",
221 | " missense_variant | \n",
222 | "
\n",
223 | " \n",
224 | " | 2 | \n",
225 | " 2 | \n",
226 | " 8279523 | \n",
227 | " 9661 | \n",
228 | " T | \n",
229 | " G | \n",
230 | " 40 | \n",
231 | " MODERATE | \n",
232 | " missense_variant | \n",
233 | "
\n",
234 | " \n",
235 | " | 3 | \n",
236 | " 2 | \n",
237 | " 8279523 | \n",
238 | " 9968 | \n",
239 | " T | \n",
240 | " G | \n",
241 | " 40 | \n",
242 | " MODERATE | \n",
243 | " missense_variant | \n",
244 | "
\n",
245 | " \n",
246 | " | 4 | \n",
247 | " 2 | \n",
248 | " 8279539 | \n",
249 | " 9525 | \n",
250 | " T | \n",
251 | " A | \n",
252 | " 40 | \n",
253 | " MODERATE | \n",
254 | " missense_variant | \n",
255 | "
\n",
256 | " \n",
257 | " | ... | \n",
258 | " ... | \n",
259 | " ... | \n",
260 | " ... | \n",
261 | " ... | \n",
262 | " ... | \n",
263 | " ... | \n",
264 | " ... | \n",
265 | " ... | \n",
266 | "
\n",
267 | " \n",
268 | " | 2328 | \n",
269 | " 2 | \n",
270 | " 8286155 | \n",
271 | " 9845 | \n",
272 | " G | \n",
273 | " T | \n",
274 | " 40 | \n",
275 | " MODERATE | \n",
276 | " missense_variant | \n",
277 | "
\n",
278 | " \n",
279 | " | 2329 | \n",
280 | " 2 | \n",
281 | " 8286155 | \n",
282 | " 9886 | \n",
283 | " G | \n",
284 | " T | \n",
285 | " 40 | \n",
286 | " MODERATE | \n",
287 | " missense_variant | \n",
288 | "
\n",
289 | " \n",
290 | " | 2330 | \n",
291 | " 2 | \n",
292 | " 8286155 | \n",
293 | " 9888 | \n",
294 | " G | \n",
295 | " T | \n",
296 | " 40 | \n",
297 | " MODERATE | \n",
298 | " missense_variant | \n",
299 | "
\n",
300 | " \n",
301 | " | 2331 | \n",
302 | " 2 | \n",
303 | " 8286155 | \n",
304 | " 9894 | \n",
305 | " G | \n",
306 | " T | \n",
307 | " 40 | \n",
308 | " MODERATE | \n",
309 | " missense_variant | \n",
310 | "
\n",
311 | " \n",
312 | " | 2332 | \n",
313 | " 2 | \n",
314 | " 8286250 | \n",
315 | " 9121 | \n",
316 | " G | \n",
317 | " A | \n",
318 | " 40 | \n",
319 | " MODERATE | \n",
320 | " missense_variant | \n",
321 | "
\n",
322 | " \n",
323 | "
\n",
324 | "
2333 rows × 8 columns
\n",
325 | "
\n",
326 | "
\n",
589 | "
\n"
590 | ],
591 | "application/vnd.google.colaboratory.intrinsic+json": {
592 | "type": "dataframe",
593 | "variable_name": "df",
594 | "summary": "{\n \"name\": \"df\",\n \"rows\": 2333,\n \"fields\": [\n {\n \"column\": \"chromosome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 2,\n \"num_unique_values\": 1,\n \"samples\": [\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"position\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1083,\n \"min\": 8279523,\n \"max\": 8286250,\n \"num_unique_values\": 83,\n \"samples\": [\n 8284492\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"accession\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2602,\n \"min\": 88,\n \"max\": 18696,\n \"num_unique_values\": 959,\n \"samples\": [\n 10010\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"G\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"variant\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 4,\n \"samples\": [\n \"A\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"-\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 40,\n \"max\": 40,\n \"num_unique_values\": 1,\n \"samples\": [\n 40\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"impacts\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"MODERATE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"effects\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"missense_variant\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
595 | }
596 | },
597 | "metadata": {},
598 | "execution_count": 122
599 | }
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "source": [
605 | "def get_start_end_from_seq(seq):  # parse the \"Chr:start..end\" field of the record id into (start, end) ints\n",
606 | "    region_field = seq.id.split(\"|\")[4]  # fifth pipe-separated field of the id\n",
607 | "    _chrom, coordinates = region_field.split(\":\")\n",
608 | "    first, last = coordinates.split(\"..\")\n",
609 | "    return int(first), int(last)"
610 | ],
611 | "metadata": {
612 | "id": "MCg0jtLcgNWn"
613 | },
614 | "execution_count": null,
615 | "outputs": []
616 | },
617 | {
618 | "cell_type": "code",
619 | "source": [
620 | "def get_aa_substitution_pair(variant, record, in_gene_position):\n",
621 | "    # Returns (reference_aa, variant_aa): translations of the 3-base slice of record.seq that contains the SNP,\n",
622 | "    # once as stored in the record and once with variant[\"reference\"] written at the SNP offset.\n",
623 | "    # NOTE(review): triplets are counted from index 0 of record.seq; presumably this is only a real codon\n",
624 | "    # if the record starts in-frame on the coding strand and contains no introns -- TODO confirm.\n",
625 | "    triplet_index, offset_in_triplet = divmod(in_gene_position, 3)\n",
626 | "    triplet_start = triplet_index * 3\n",
627 | "    triplet = MutableSeq(record.seq[triplet_start:triplet_start + 3])\n",
628 | "    variant_aa = triplet.translate()  # translated exactly as found in the record\n",
629 | "    triplet[offset_in_triplet] = variant[\"reference\"]  # overwrite the SNP base with the reference allele\n",
630 | "    reference_aa = triplet.translate()\n",
631 | "    return reference_aa, variant_aa"
632 | ],
633 | "metadata": {
634 | "id": "4lrKW5Fkkf7J"
635 | },
636 | "execution_count": null,
637 | "outputs": []
638 | },
639 | {
640 | "cell_type": "code",
641 | "source": [
642 | "def build_variant_substitution_data(variant, fasta_cache, flank=256):\n",
643 | "    # Returns [reference_aa, variant_aa, dna_window] as strings for one variant row, or [\"-\", \"-\", \"-\"] when\n",
644 | "    # the SNP lies outside the record or the record does not carry the variant allele at that position.\n",
645 | "    # `flank` bases are taken on each side of the SNP; the window is shorter where the record ends first.\n",
646 | "    fasta = fasta_cache[variant[\"accession\"]]\n",
647 | "    record = SeqIO.read(StringIO(fasta), \"fasta\")\n",
648 | "    start, _ = get_start_end_from_seq(record)\n",
649 | "    in_gene_position = variant[\"position\"] - start  # offset into record.seq, taking the header start as its first base\n",
650 | "\n",
651 | "    # A negative offset would silently index from the end of the sequence, so reject out-of-range positions.\n",
652 | "    if not (0 <= in_gene_position < len(record.seq)) or variant[\"variant\"] != record.seq[in_gene_position]:\n",
653 | "        return [\"-\", \"-\", \"-\"]\n",
654 | "\n",
655 | "    window_start = max(0, in_gene_position - flank)  # a negative slice start would be counted from the end of the sequence\n",
656 | "    reference_aa, variant_aa = get_aa_substitution_pair(variant, record, in_gene_position)\n",
657 | "    return [str(reference_aa), str(variant_aa), str(record.seq[window_start:in_gene_position + flank + 1])]"
658 | ],
659 | "metadata": {
660 | "id": "xD7FLy-Ioi2o"
661 | },
662 | "execution_count": null,
663 | "outputs": []
664 | },
665 | {
666 | "cell_type": "code",
667 | "source": [
668 | "df.head(2)"
669 | ],
670 | "metadata": {
671 | "colab": {
672 | "base_uri": "https://localhost:8080/",
673 | "height": 112
674 | },
675 | "id": "yKJXVDJFr-Z4",
676 | "outputId": "fb20756f-3684-491d-cfe8-36bd2fba62b1"
677 | },
678 | "execution_count": null,
679 | "outputs": [
680 | {
681 | "output_type": "execute_result",
682 | "data": {
683 | "text/plain": [
684 | " chromosome position accession reference variant - impacts \\\n",
685 | "0 2 8279523 9653 T G 40 MODERATE \n",
686 | "1 2 8279523 9655 T G 40 MODERATE \n",
687 | "\n",
688 | " effects \n",
689 | "0 missense_variant \n",
690 | "1 missense_variant "
691 | ],
692 | "text/html": [
693 | "\n",
694 | " \n",
695 | "
\n",
696 | "\n",
709 | "
\n",
710 | " \n",
711 | " \n",
712 | " | \n",
713 | " chromosome | \n",
714 | " position | \n",
715 | " accession | \n",
716 | " reference | \n",
717 | " variant | \n",
718 | " - | \n",
719 | " impacts | \n",
720 | " effects | \n",
721 | "
\n",
722 | " \n",
723 | " \n",
724 | " \n",
725 | " | 0 | \n",
726 | " 2 | \n",
727 | " 8279523 | \n",
728 | " 9653 | \n",
729 | " T | \n",
730 | " G | \n",
731 | " 40 | \n",
732 | " MODERATE | \n",
733 | " missense_variant | \n",
734 | "
\n",
735 | " \n",
736 | " | 1 | \n",
737 | " 2 | \n",
738 | " 8279523 | \n",
739 | " 9655 | \n",
740 | " T | \n",
741 | " G | \n",
742 | " 40 | \n",
743 | " MODERATE | \n",
744 | " missense_variant | \n",
745 | "
\n",
746 | " \n",
747 | "
\n",
748 | "
\n",
749 | "
\n",
957 | "
\n"
958 | ],
959 | "application/vnd.google.colaboratory.intrinsic+json": {
960 | "type": "dataframe",
961 | "summary": "{\n \"name\": \"df[:2]\",\n \"rows\": 2,\n \"fields\": [\n {\n \"column\": \"chromosome\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 2,\n \"max\": 2,\n \"num_unique_values\": 1,\n \"samples\": [\n 2\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"position\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 8279523,\n \"max\": 8279523,\n \"num_unique_values\": 1,\n \"samples\": [\n 8279523\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"accession\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1,\n \"min\": 9653,\n \"max\": 9655,\n \"num_unique_values\": 2,\n \"samples\": [\n 9655\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"reference\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"T\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"variant\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"G\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"-\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 40,\n \"max\": 40,\n \"num_unique_values\": 1,\n \"samples\": [\n 40\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"impacts\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"MODERATE\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"effects\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 1,\n \"samples\": [\n \"missense_variant\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
962 | }
963 | },
964 | "metadata": {},
965 | "execution_count": 95
966 | }
967 | ]
968 | },
969 | {
970 | "cell_type": "code",
971 | "source": [
972 | "results = []  # collects one [reference_aa, variant_aa, window] list per processed variant\n",
973 | "for variant in tqdm(df[:100].to_dict(orient=\"records\")):  # only the first 100 variant rows are processed\n",
974 | " results.append(build_variant_substitution_data(variant, HMA4_fasta_cache))\n",
975 | " time.sleep(0.51)  # pause after every row -- presumably to throttle API requests; TODO confirm the actual limit"
976 | ],
977 | "metadata": {
978 | "colab": {
979 | "base_uri": "https://localhost:8080/"
980 | },
981 | "id": "6Eg_-mHurtL0",
982 | "outputId": "4935ecc6-65d9-4852-bc4f-f82c206fc9ee"
983 | },
984 | "execution_count": null,
985 | "outputs": [
986 | {
987 | "output_type": "stream",
988 | "name": "stderr",
989 | "text": [
990 | "100%|██████████| 100/100 [01:34<00:00, 1.05it/s]\n"
991 | ]
992 | }
993 | ]
994 | },
995 | {
996 | "cell_type": "code",
997 | "source": [
998 | "build_variant_substitution_data(df.iloc[29], HMA4_fasta_cache)  # NOTE(review): the stored traceback shows an `assert` that the function as currently defined does not contain"
999 | ],
1000 | "metadata": {
1001 | "colab": {
1002 | "base_uri": "https://localhost:8080/",
1003 | "height": 297
1004 | },
1005 | "id": "ZETYFRvMyO74",
1006 | "outputId": "db46b392-25e1-4e52-d46d-74e5b77acecf"
1007 | },
1008 | "execution_count": null,
1009 | "outputs": [
1010 | {
1011 | "output_type": "error",
1012 | "ename": "AssertionError",
1013 | "evalue": "",
1014 | "traceback": [
1015 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1016 | "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
1017 | "\u001b[0;32m/tmp/ipython-input-3953702373.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mbuild_variant_substitution_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m29\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mHMA4_fasta_cache\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m29\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1018 | "\u001b[0;32m/tmp/ipython-input-3100140708.py\u001b[0m in \u001b[0;36mbuild_variant_substitution_data\u001b[0;34m(variant, fasta_cache)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_start_end_from_seq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrecord\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0min_gene_position\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvariant\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"position\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mvariant\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"variant\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mrecord\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0min_gene_position\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0msnp_centered_dna_window\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrecord\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0min_gene_position\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m256\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0min_gene_position\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m257\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1019 | "\u001b[0;31mAssertionError\u001b[0m: "
1020 | ]
1021 | }
1022 | ]
1023 | },
1024 | {
1025 | "cell_type": "code",
1026 | "source": [
1027 | "weird_variant = df.iloc[29]  # row whose listed variant allele is compared with the fetched record below\n",
1028 | "\n",
1029 | "fasta = HMA4_fasta_cache[weird_variant[\"accession\"]]\n",
1030 | "record = SeqIO.read(StringIO(fasta), \"fasta\")\n",
1031 | "\n",
1032 | "start, _ = get_start_end_from_seq(record)\n",
1033 | "in_gene_position = weird_variant[\"position\"] - start  # same offset computation as in build_variant_substitution_data\n",
1034 | "\n",
1035 | "print(record.seq[in_gene_position])  # base actually present in this accession's record at the SNP offset\n",
1036 | "print(weird_variant)\n"
1037 | ],
1038 | "metadata": {
1039 | "colab": {
1040 | "base_uri": "https://localhost:8080/"
1041 | },
1042 | "id": "o-a68W2eygrL",
1043 | "outputId": "8f4c6a75-6028-4574-d94f-aa06ae8234aa"
1044 | },
1045 | "execution_count": null,
1046 | "outputs": [
1047 | {
1048 | "output_type": "stream",
1049 | "name": "stdout",
1050 | "text": [
1051 | "G\n",
1052 | "chromosome 2\n",
1053 | "position 8280931\n",
1054 | "accession 9511\n",
1055 | "reference G\n",
1056 | "variant T\n",
1057 | "- 40\n",
1058 | "impacts MODERATE\n",
1059 | "effects missense_variant\n",
1060 | "Name: 29, dtype: object\n"
1061 | ]
1062 | }
1063 | ]
1064 | },
1065 | {
1066 | "cell_type": "code",
1067 | "source": [
1068 | "results = pd.DataFrame(results, columns=[\"reference_aa\", \"variant_aa\", \"snp_centered_dna_window\"])  # NOTE: rebinds `results` from the list filled by the loop to a DataFrame\n",
1069 | "results"
1070 | ],
1071 | "metadata": {
1072 | "colab": {
1073 | "base_uri": "https://localhost:8080/",
1074 | "height": 424
1075 | },
1076 | "id": "heS0L3zyuyin",
1077 | "outputId": "d8dde231-0d38-42fa-8be3-ce1d9d66dedb"
1078 | },
1079 | "execution_count": null,
1080 | "outputs": [
1081 | {
1082 | "output_type": "execute_result",
1083 | "data": {
1084 | "text/plain": [
1085 | " reference_aa variant_aa snp_centered_dna_window\n",
1086 | "0 L V AAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG...\n",
1087 | "1 L V AAAAGTAAACATTTTCAATAAGAAANNNNNNNNNNNNNNNNNNNNG...\n",
1088 | "2 L V AAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG...\n",
1089 | "3 L V NNAAGTAAANNNNTNNAATAANAAAATACAAGACCCATNCCGAAAG...\n",
1090 | "4 F Y AATAAGAAAANNNNNNNNNNNNNNCGNAAGTTTNTTNNNNANAAAA...\n",
1091 | ".. ... ... ...\n",
1092 | "95 K M TATAACAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAA...\n",
1093 | "96 Y F CAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAAGTTGA...\n",
1094 | "97 - - -\n",
1095 | "98 - - -\n",
1096 | "99 - - -\n",
1097 | "\n",
1098 | "[100 rows x 3 columns]"
1099 | ],
1100 | "text/html": [
1101 | "\n",
1102 | " \n",
1103 | " \n",
1104 | "\n",
1117 | " \n",
1118 | " \n",
1119 | " \n",
1120 | " | \n",
1121 | " reference_aa | \n",
1122 | " variant_aa | \n",
1123 | " snp_centered_dna_window | \n",
1124 | " \n",
1125 | " \n",
1126 | " \n",
1127 | " \n",
1128 | " | 0 | \n",
1129 | " L | \n",
1130 | " V | \n",
1131 | " AAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG... | \n",
1132 | " \n",
1133 | " \n",
1134 | " | 1 | \n",
1135 | " L | \n",
1136 | " V | \n",
1137 | " AAAAGTAAACATTTTCAATAAGAAANNNNNNNNNNNNNNNNNNNNG... | \n",
1138 | " \n",
1139 | " \n",
1140 | " | 2 | \n",
1141 | " L | \n",
1142 | " V | \n",
1143 | " AAAAGTAAACATTTTCAATAAGAAAATACAAGACCCATACCGAAAG... | \n",
1144 | " \n",
1145 | " \n",
1146 | " | 3 | \n",
1147 | " L | \n",
1148 | " V | \n",
1149 | " NNAAGTAAANNNNTNNAATAANAAAATACAAGACCCATNCCGAAAG... | \n",
1150 | " \n",
1151 | " \n",
1152 | " | 4 | \n",
1153 | " F | \n",
1154 | " Y | \n",
1155 | " AATAAGAAAANNNNNNNNNNNNNNCGNAAGTTTNTTNNNNANAAAA... | \n",
1156 | " \n",
1157 | " \n",
1158 | " | ... | \n",
1159 | " ... | \n",
1160 | " ... | \n",
1161 | " ... | \n",
1162 | " \n",
1163 | " \n",
1164 | " | 95 | \n",
1165 | " K | \n",
1166 | " M | \n",
1167 | " TATAACAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAA... | \n",
1168 | " \n",
1169 | " \n",
1170 | " | 96 | \n",
1171 | " Y | \n",
1172 | " F | \n",
1173 | " CAATTGTGAAATCTCTTGCTATTTTTATAAATGATTTTGAAGTTGA... | \n",
1174 | " \n",
1175 | " \n",
1176 | " | 97 | \n",
1177 | " - | \n",
1178 | " - | \n",
1179 | " - | \n",
1180 | " \n",
1181 | " \n",
1182 | " | 98 | \n",
1183 | " - | \n",
1184 | " - | \n",
1185 | " - | \n",
1186 | " \n",
1187 | " \n",
1188 | " | 99 | \n",
1189 | " - | \n",
1190 | " - | \n",
1191 | " - | \n",
1192 | " \n",
1193 | " \n",
1194 | " \n",
1195 | " 100 rows × 3 columns \n",
1196 | " \n",
1197 | " \n",
1460 | " \n"
1461 | ],
1462 | "application/vnd.google.colaboratory.intrinsic+json": {
1463 | "type": "dataframe",
1464 | "variable_name": "results",
1465 | "summary": "{\n \"name\": \"results\",\n \"rows\": 100,\n \"fields\": [\n {\n \"column\": \"reference_aa\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"Y\",\n \"F\",\n \"S\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"variant_aa\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 12,\n \"samples\": [\n \"M\",\n \"R\",\n \"V\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"snp_centered_dna_window\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 50,\n \"samples\": [\n \"AAAAAAAAATGTTTNGAACTGTTTCATGATAATGATAACNAAAAAAGTTTTTGCTTTCTTNTTTTTTTTCCTCCGCAAAACAGTCTNAAAGTATAACCAAAAAGCCTATAAATCAATATAATTTGTTGTTTTGATTTACGTTTTACAGAAAATGGCGTTACAAAACAAAGAAGAAGAGAAAAAGAAAGTGNNNNNNNNGCAAAAGAGTTACTTCGNNNNNNNNGGAATCTGTTGTACATCGGAAGTTCCTATAATCAAGAATATTCTCAAGTCACTTGACGGCGTTAAAGAATATTCCGTCATCGTTCCCTCGAGAACCGTGATTGTTGTTCACGACAGTCTCCTCANCTCTCCCTTCCAAATTGGTAAATNTTTTTTTTCTTTGNGATAATAAANNTTTTTTNNNNNNNAAANATTGGTAAATCATTATAANTAAATAGTTATTTAANATTTCTCTAATTTTTAATTTTACTCAGTNAAAAAATAANAATTAANNNNNNTAAANAATTATTT\",\n \"TGGAGGCAGCAGCAGTTGTGTTTCTGTTCACCATATCCGACTGGCTCGAAACAAGAGCTAGCTACAAGGNTTGTNTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTGAAATTNAAACTATAATGGTTTATTAAGGTAATGATTATTGCAAATGTAAAATCTCAGGCGACCTCGGTAATGCAGTCTTTGATGAGCTTAGCTCCACAAAAGGCTACAATAGCAGAGACTGGTGAAGAAGTTGAGGTAGATGAGGTTAAGGTTGATACTGTTGTAGCAGTTAAAGCTGGTGAAACCATACCAATTGATGGGATTGTGGTGGATGGAAACTGTGAAGTAGACGAGAAAACCTTAACGGGCGAAGCATTTCCTGTGCCTAAACAGAGAGATTCTACGGTTTGGGCTGGCACCATCAATCTAAATGGTANGTAATCCTATTTTAAGAGCTTCAAGCTTTATACATTTTTGTTGTAT\",\n 
\"CATTTTCCTCACTTGCAAATATATTTTAANAAATCTTCTTCCACCTTTGTTAATTAATGATTTAATTCTGAAAAACANTGTGTTGCAGCTAAGGCACTAAACGAAGCTAGGTTAGAAGCAAACGTGAGGGTAAACGGAGAAACTAGCTTCAAGAACAAATGGCCGAGCCCTTTCGCCGTAGTTTCCGGCTTACTTCTCCTCCTATCCTTCCTAAAGTTTGTCTACTCGCNTTTACGTTGGNTCGCCGTCGCAGCAGCTGCCGCCGGTNTNTNTCCGATTCTTGCCAAAGCCTTTGCTTCCATTAAAAGGCCTAGGATCGACATCAACATATTGGTCATAATAACCGGTAATACCACTTTCTCCTCTTTTCTTTATGCTGTCGTTATACCANTTTTTTTTNTAGTATTCATTATTAGCATCTAACATTATTTCTCTATATTACTCCGTGACTTAAGAATGATGTGTTCTATAATAATTTGTTAGTTTATGGNTTATCCGTGATCGATGTAACAA\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"
1466 | }
1467 | },
1468 | "metadata": {},
1469 | "execution_count": 144
1470 | }
1471 | ]
1472 | },
1473 | {
1474 | "cell_type": "code",
1475 | "source": [
1476 | "results[\"reference_aa\"].eq(\"-\").sum()"
1477 | ],
1478 | "metadata": {
1479 | "colab": {
1480 | "base_uri": "https://localhost:8080/"
1481 | },
1482 | "id": "xNoABL4t0bZk",
1483 | "outputId": "29585018-1d3f-46a2-952d-9df5a8cd5373"
1484 | },
1485 | "execution_count": null,
1486 | "outputs": [
1487 | {
1488 | "output_type": "execute_result",
1489 | "data": {
1490 | "text/plain": [
1491 | "np.int64(51)"
1492 | ]
1493 | },
1494 | "metadata": {},
1495 | "execution_count": 148
1496 | }
1497 | ]
1498 | },
1499 | {
1500 | "cell_type": "code",
1501 | "source": [
1502 | "from google.colab import files  # Colab-specific module, imported here rather than in the notebook's imports cell\n",
1503 | "results.to_csv(\"results.csv\", index=False)\n",
1504 | "files.download(\"results.csv\")  # passes the file just written to Colab's download helper"
1505 | ],
1506 | "metadata": {
1507 | "colab": {
1508 | "base_uri": "https://localhost:8080/",
1509 | "height": 17
1510 | },
1511 | "id": "xpN0OV2XwoKT",
1512 | "outputId": "ced13f2c-b173-401c-ec77-db778519cbc2"
1513 | },
1514 | "execution_count": null,
1515 | "outputs": [
1516 | {
1517 | "output_type": "display_data",
1518 | "data": {
1519 | "text/plain": [
1520 | ""
1521 | ],
1522 | "application/javascript": [
1523 | "\n",
1524 | " async function download(id, filename, size) {\n",
1525 | " if (!google.colab.kernel.accessAllowed) {\n",
1526 | " return;\n",
1527 | " }\n",
1528 | " const div = document.createElement('div');\n",
1529 | " const label = document.createElement('label');\n",
1530 | " label.textContent = `Downloading \"${filename}\": `;\n",
1531 | " div.appendChild(label);\n",
1532 | " const progress = document.createElement('progress');\n",
1533 | " progress.max = size;\n",
1534 | " div.appendChild(progress);\n",
1535 | " document.body.appendChild(div);\n",
1536 | "\n",
1537 | " const buffers = [];\n",
1538 | " let downloaded = 0;\n",
1539 | "\n",
1540 | " const channel = await google.colab.kernel.comms.open(id);\n",
1541 | " // Send a message to notify the kernel that we're ready.\n",
1542 | " channel.send({})\n",
1543 | "\n",
1544 | " for await (const message of channel.messages) {\n",
1545 | " // Send a message to notify the kernel that we're ready.\n",
1546 | " channel.send({})\n",
1547 | " if (message.buffers) {\n",
1548 | " for (const buffer of message.buffers) {\n",
1549 | " buffers.push(buffer);\n",
1550 | " downloaded += buffer.byteLength;\n",
1551 | " progress.value = downloaded;\n",
1552 | " }\n",
1553 | " }\n",
1554 | " }\n",
1555 | " const blob = new Blob(buffers, {type: 'application/binary'});\n",
1556 | " const a = document.createElement('a');\n",
1557 | " a.href = window.URL.createObjectURL(blob);\n",
1558 | " a.download = filename;\n",
1559 | " div.appendChild(a);\n",
1560 | " a.click();\n",
1561 | " div.remove();\n",
1562 | " }\n",
1563 | " "
1564 | ]
1565 | },
1566 | "metadata": {}
1567 | },
1568 | {
1569 | "output_type": "display_data",
1570 | "data": {
1571 | "text/plain": [
1572 | ""
1573 | ],
1574 | "application/javascript": [
1575 | "download(\"download_d7f10122-a0c2-4540-863a-4cf24d73e405\", \"results.csv\", 25736)"
1576 | ]
1577 | },
1578 | "metadata": {}
1579 | }
1580 | ]
1581 | },
1582 | {
1583 | "cell_type": "code",
1584 | "source": [
1585 | "results[\"snp_centered_dna_window\"].apply(len)"
1586 | ],
1587 | "metadata": {
1588 | "colab": {
1589 | "base_uri": "https://localhost:8080/",
1590 | "height": 147
1591 | },
1592 | "id": "1alzRqCgu_RZ",
1593 | "outputId": "2b546112-5323-4799-a5bd-9019be3fc192"
1594 | },
1595 | "execution_count": null,
1596 | "outputs": [
1597 | {
1598 | "output_type": "execute_result",
1599 | "data": {
1600 | "text/plain": [
1601 | "0 513\n",
1602 | "1 513\n",
1603 | "Name: snp_centered_dna_window, dtype: int64"
1604 | ],
1605 | "text/html": [
1606 | "\n",
1607 | "\n",
1620 | " \n",
1621 | " \n",
1622 | " \n",
1623 | " | \n",
1624 | " snp_centered_dna_window | \n",
1625 | " \n",
1626 | " \n",
1627 | " \n",
1628 | " \n",
1629 | " | 0 | \n",
1630 | " 513 | \n",
1631 | " \n",
1632 | " \n",
1633 | " | 1 | \n",
1634 | " 513 | \n",
1635 | " \n",
1636 | " \n",
1637 | " \n",
1638 | " "
1639 | ]
1640 | },
1641 | "metadata": {},
1642 | "execution_count": 127
1643 | }
1644 | ]
1645 | },
1646 | {
1647 | "cell_type": "code",
1648 | "source": [],
1649 | "metadata": {
1650 | "id": "CuUvWfMpvwOz"
1651 | },
1652 | "execution_count": null,
1653 | "outputs": []
1654 | },
1655 | {
1656 | "cell_type": "code",
1657 | "source": [
1658 | "first_result = df.loc[4].to_dict()  # row with index label 4 (not the first row) as a plain dict\n",
1659 | "first_result"
1660 | ],
1661 | "metadata": {
1662 | "colab": {
1663 | "base_uri": "https://localhost:8080/"
1664 | },
1665 | "id": "0AOlo1ha2e1d",
1666 | "outputId": "ace84581-4489-425a-88aa-d4acec34bab3"
1667 | },
1668 | "execution_count": null,
1669 | "outputs": [
1670 | {
1671 | "output_type": "execute_result",
1672 | "data": {
1673 | "text/plain": [
1674 | "{'chromosome': 2,\n",
1675 | " 'position': 8279539,\n",
1676 | " 'accession': 9525,\n",
1677 | " 'reference': 'T',\n",
1678 | " 'variant': 'A',\n",
1679 | " '-': 40,\n",
1680 | " 'impacts': 'MODERATE',\n",
1681 | " 'effects': 'missense_variant'}"
1682 | ]
1683 | },
1684 | "metadata": {},
1685 | "execution_count": 59
1686 | }
1687 | ]
1688 | },
1689 | {
1690 | "cell_type": "code",
1691 | "source": [
1692 | "get_accession_gene_fasta(HMA4_gene_id)(first_result[\"accession\"])"
1693 | ],
1694 | "metadata": {
1695 | "colab": {
1696 | "base_uri": "https://localhost:8080/",
1697 | "height": 140
1698 | },
1699 | "id": "b1UmpkUTq3yG",
1700 | "outputId": "36e61bb1-c3cc-4731-9064-6865e209bd31"
1701 | },
1702 | "execution_count": null,
1703 | "outputs": [
1704 | {
1705 | "output_type": "execute_result",
1706 | "data": {
1707 | "text/plain": [
1708 | "'>MPI-GMI|Ath-1001-Genomes|pseudo-genome|9525|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2\\nCTACGTTCCTAACACTTCTCTCAACCTTTATCTGATCGCACCAAACCAGTTTTTTCGCATCGGCTNCTTCCTTTTGCTACTAGCTCTCCTCTCTTCTCCGGTNTTTTTGTCTCNCTTCTTAATTCACACAGATTTCATGATAAGTGATGATCTATAACAAGACGCTAACTCTTCTCTTGCATTTCTCGTGTTTTCATTTTCTTGTTACGCCAAATTTATCCCTTCAAAATCNNTTTTTTATGNGTATAGAATCCAAATAATAAGTAAAAGCTGATTCGTCTTCTTCCACTTAACACAAGTAAGCAGTGAGAGGGTGAAGATTTTTCTTTAGGAAAACAAAGAGAGTGAAGATATTTTTTGGCTTGATCTCAACATTATTTTTTCNTAAAAGTAAACATTTTCAATAAGAAAANNNNNNNNNNNNNNCGNAAGTTTNTTNNNNANAAAAAAAAAAAGGNTTTGANCTGTTTCATGATAATGATAACNAAAAAAGTTTTTGCTTTCTTNTTTTTTTTCCTCCGCAAAACAGTCTNAAAGTATAACCAAAAAGCCTATAAATCAATATAATTTGTTGTTTTGATTTACGTTTTACNGAAAATGGCGTTACAAAACAAAGAAGAAGAGAAAAAGAAAGTGAAGAAGTTGCAAAAGAGTTACTACGATGTTCTCGGAATCTGTTGTACATCGGAAGTTCCTATAATCGAGAATATTCTCAAGTCACTTGACGGCGTTAAAGAATATTCCGTCATCGTTCCCTCGAGAACCGTGNTTGTTGTTCACGACAGNNTCCTCATCTCTCCCTTCCNAATTGGTAAATNTTTTTTTTCTTTNTGATAATAAAGNTTTTTTNNNNNAAAAAAATTGGTAAATCATTATAANTAAATAGTTATTTAANATTTCTCTAATTTTTAATTTTACTCAGTNAAAAAATAANAATTAAAAAGCATAAATAANNNNNNNNTATTACACGAAAAGCTCACTTCATCTTATTTCTATTTATAAATTAGTGGTTTTGCGTCATGGTTTGATATTTTAATTAGTCAAATATATGTGATCCAANAAGTCACTTCAACTGAAAATATTTAATATTCTACCACTNAAATTTAATTTGCTTTCCCAAATCATTTCTTCTTAAGAAAACTAGTTTAGANCCAATTCTGTTTTANCAAAGTAATTTTTTCTTAATTGCTTTGCTTTGATATTTATGTACTGATNNNNNNNNNNNNNTGGTTGTNTTTTTAACCTAGGAGCTAAAGGNNAATATTAAAATTAACANTTTTTTCGCCTACAACAAAATAGGATAAACCGTTACTNTTTTTTNTGTTTTATTAATCCCATGAAACACGTTCAGTTAGNATAACCAAAGATTGTGAATAAGGTTCGTCTANTTTTTTTTCAAACTGTATCTAGTAAACCAACAAATTAAAAATGGTAACTTTTATAGAAACGCATAATGATACAATAACGGCAATAATACAACGAATCACATGTTTNTATAAATTCTAAATTTTGCATATCATAAAACCTTTACCACATTNNNTNTTTTNTGCATATTATAGCTAACCTAATACGTGCATATAACGCANNNTACGTGTGTAAACAAATAAATAATATATTTAATTGACATCATATATAGAATTTAATTCCGAAATCATATATAGATAATACATAATAGAANAAAGGTTGTTAGAAAAAGCCTGTCGTGATACTTTACCAAATCTTTTGATATGAATATTTAATTGGTAGTGCATCATACTCGTTACGTAACAATATTTTATAATTTTTATTTGACAGAAAACATATTCTATATTAGACATTNAAAGTNAAAAATAATCAAANAAAAAATCANAAAAAATTAAATAATAACACTATTTTCATCATTAAGAAAATACACTA
TTTTCAGTTTATTATNTANTAGTTTTTAAGGTTTGAATTTATAAAAATTGGCTAATATTTGTCATTTTCCTCACTTGCAAATATATTTTAANAAATCTTCTTCCACCTTTGTTAATTNATGATTTAATTCNGAAAAACANNGTGTTGCAGCNAAGGCACTAAACGAAGCTAGGTTAGNAGCAAANGTNAGGNTAAANGGAGAAACTAGCTTCAAGAACAAATGGCCGAGCCCTTTCGCCGTAGTTTCCGGCTTACTNCTCCTCCTATCCTTCCTAAAGTTTGTCTACNCNCNTTTACGTTGGCTCGCCGTCGCAGCAGTTGCCGCCGGTATCTATCCGATTCTTGCCAAAGCCTTTGCTTCCATTAAAAGGCCTAGGATCGACATCAACATATTGGTCATAATAACCGGTAATACCACTTTCTCCTCTTTTCTTTATGCTGTCGTTATACCANTTTTTTTTNTAGTATTCATTATTAGCATCTAACATTATTTCTCTATATTACTCCGTGACTTAAGAATGATGTGTTCTATAATAATTTGTTAGTTTATGGNTTATCCGTGATCGATGTAACAACTAGAAATAATATTGTTACTTATCTAAGATTTGNAAACTTAGCAAATTGTGGTTAAGGTAAAACTATTAATATATATAATTTCTTAAAANTTGTGATCGAGTCTCTATCCCTTTTTCGGTATTTTAAAAAAGCTTACATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAAAAGTTTCCTTTATTTTGTTCGTAACNTTTTTAAATAAATTCATATGGGAAAACTACTAAAGAATAGGTATTAAGAAAAATATTGTATTATGCTCAACTTATTTTACTGAACTCCAATATAGGAAAAATTAAATACGTGTAGCAGTTTGATATGNTTGTTATATTGTAGANTTTATTTGAGTTATAGGTTATAGAGGAAAACTAAAACAACTTTATATTTTTGCCGTAAGCAATGCAAATTNGAGTTATAGTTTATAGGAAAAAATNAAAATGTTCTTAGGATATAGGAAAAGTGAACATGTNATATAAACTTGAAAGTTGGTGTGGAAGATAATGCAAAAACAAAAGGTGCATTTCAAAATACGGGAAAAAGAAAGACGCCTTTCTCTTTTCTGTTTATAAAAATATACATTTTCGTTCATTTCTACTAGTACGATTATTTATAGGAACATACATAATTTCAAATTAAAACCATCTAAAGAGTAGNGAAAATAATTAAATATTTTGTTCAAGGAAAATAGTTATATTGTCTATAACGTAGGCAAGAAAATAAAAATCAGTTTTATTTTCCTNGGGTGCAATTAAGANGNCAATCTTTTTATATAAAATCAGAGTAATTTCATACCAGAAATCCAGACTAAATTTTGAGNTTTGTAGGAAGTCTCAAAAATCATAGTGTGATATCTTTTGATGNTTTTTTCTAAAACAATTACAATTACGTGCCATAGATAGAAAAACGCTAAGCATAGGTTTTTGATTGAAAATGGAAAATGGAAAAAGAACCTTCAACTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATGGAGCTATGGAATTAGTTTTAGTTGNANTTTCTGTTTAGTAGACATAATTAACAANAAACCATACAAAGTCATNTNNANCCAAGTTTTTTTTTTTTTNTNNTTTTTTTTTNTTTCGCTTGGTAGATGTTATAGTGTTACTTTAAGCTTTCTTTGATTTTCTAAAAACAAAATATTTTACACTGAGAGGTGATACAATGTTTTAGGTTTAAACTGATAATGATCCAATTTCTTCAATACAGTGATTGCAACACTTGCAATGCAAGATTTCATGGAGGCAGCAGCAGTTGTGTTCCTATTCACCATATCCGACTGGCTCGAAACAAGAGCTAGCTACAAGGNTTGTTTCNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGTGNNNNNNNNNNNNNNNTGGTNTANNNAGGTAATGATTATTGCAAATGTAAAATCTCAGGCGACCTCGGTAATGCAGTCTTTGATGAGCTTAGCTCCACAAAAGGCTACAATAGCAGAGACTGGTGAAGAAGTTGAGGTAGATGAGGTTAAGGTTGATACTGTTGTAGCAGTTAAAGCTGGTGAAACCATACCAATTGATGGGATTGTGGTGGATGGAAACTGTGAAGTAGACGAGAAAACCTTAACGGGCGAAGCATTTCCTGTGCCTAAACAGAGAGATTCTACGGTTTGGGCTGGNACCATCAATCTAAATGGTANGTAATCCTATTTTAAGAGCTTCAAGCTTTATACATTTTTGTTGTATCTTTCATTCTTTTGATCTTTCTTGGTNTTTTTTTTGCAGGTTACATAAGTGNGAAAACAACTTCTTTAGCGGGTGATTGCGTGGTTGCGAAAATGGCTAAGCTAGTAGAAGAAGCTCAGAGCAGTAAAACCAAATCTCAGAGACTAATAGACAAATGTTCTCAGTACTATACTCCAGGTAAGCAAAGACATAACCAAAACGTNNTNTTTATGTTCTTGATTCGAGTAATTTTGAGACCTCTGTGNTTCTTGTTTTGTTTTCAGCAATCATCTTAGTATCAGCTTGCGTCGCCATTGTCCCGGTTATAATGAAGGTCCACAACCTTAAACATTGGTTCCACCTAGCATTAGTTGTGTTAGTCAGTGGTTGTCCCTGTGGTCTTATCCTCTCTACACCAGTTGCTACTTTCTGTGCACTTACTAAAGCGGCAACTTCCGGACTTCTGATCAAAAGTGCTGATTATCTTGACACTCTCTCAAAGATCAAGATCGTTGCTTTCGATAAAACTGGGACTATTACAAGAGGAGAGTTNATTGTCATAGNTTTCAAGTCNCTCTCTAGAGATATAAACCNACGCAGCTTGCTTTNCTGGNNAAAAAAAAACTTGTGTNCTNNCCATAACNAGTTTGCCGAGATAACTTATGACNGAGACTTTCTTGTTATATAATGTTTCAGGGTATCTAGTGTTGAAAGCAAATCAAGTCATCCAATGGCTGCAACAATCGTGGACTATGCAAAATCTGTTTCTGTTGAGCCTAGGCCTGAAGAGGTTGAGGATTACCAGAACTTTCCAGGTGAAGGAATCTACGGGAAGATTGATGGTAACGATATCTTCATTGGGAACNAAAAGATTGCTTCTCGAGCTGGTTGTTCAACAGGTACTTGTAATATAAAGCTCAACAGAATGTTTTGAGGTCTTGGTATTCTTTAGTCTGAGCACTTGATTNTTCTTTCTTTACAGTTCCAGAGATTGAAGTTGATACCAAAGGCGGGAAGACTGTTGGATACGTCTATGTAGGTGAAAGACTAGCTGGATTTTTCAATCTTTCTGATGCTTGTAGATCTGGTGTTTCTCAAGCAATGGCAGAACTGAAATCTCTAGGAATCAAAACCGCAATGCTAACGGGAGATAATCAAGCCGCGGCAATGCATGCTCAAGAACAGGTGAGACTAATATAAACCATAATTTTAATCACTCTCTTAAGCGAAGACGTTGTTTAATAGTNCCTTGAAATGTCTTATGAAACAGCTAGGGAATGTTTTAGATGTTGTACATGGAGATCTTCTTCCAGAAGATAAGTCCAGAATCATACAAGAGTTTAAGAAAGAGGGACCAACCGCAATGGTAGGGGACGGTGTGAATGATGCACCAGCTTTAGCTACAGCTGATATTGGTATCTCCATGGGAATTTCTGGCTCTGCTCTTGCAACACAAACTGGTAATATTATTCTGATGTCTAATGATATAAGAAGGATACCACAAGCGGTGAAGCTAGCGAGAAGAGCACGACGCAAAGTTGTTGAAAACGTGTGTCTATCAATCATTTTAAAAGCAGGAATNCTCGCTTNGGCATTTGCTGGTCAT
CCTTTGATTTGGGCTGCGGTTCTTGTTGATGTAGGGACTTGTCTGCTTGTNATTTTCAATAGTATGTTGCTGCTGCGAGAGAAGAAAAAGATTGGGAACAAAAAGTGTTACAGGGCTTCTACATCTAAGTTGAATGGTAGGAAACTTGAAGGCGATGATGATTATGTTGTGGACTTAGANGCAGGCTTGTTAAAAAAGAGCGGGAATGGTCAATGCAAATCAAGCTGTTGTGGAGATAAGAAAAATCAAGAGAATGTTGTGATGATGAAACCAAGTAGTAAAACCAGTTCTGATCATTCTCACCCTGGTTGTTGTGGCGATAAGAAGGAAGAAAAAGTGAAGCCGCTTGTGAAAGATGGCTGTTGCAGTGAGAAAACTAGGAAACCAGAGGGAGATATGGTTTCATTGAGCTCATGTANGAAGTCTAGTCATGTNAAACATGACCTGAAAATGNAAGGTGGTTCAGGTTGTTGTGNTAGCAAAAATNAGNAAGNGAAGGAANTAGTAGCAAAAAGCTGTTGTGAGAAACCCAAACAGCAGGNGGNGAGNGTTNGAGACTGCAAGTCTGGTCATTGCGAGAAGAAGAAGCAAGCTGAAGACATTGTTGTCCCGGTGCAGATTATTGGTCATGCATTAACGCATGTGGAGATCGAGTTGCAGACAAAGGAAACCTGCAAAACAAGCTGTTGTGACAGTAAAGATAAGGTTAAGGAGACAGGTTTGCTGCTTTCTAGTGAGAACACNCCTTACCTGGAGAAAGGNGTGCTGATTAAAGATGAAGGAAACTGCAAGTCTGGCAGCGAGAACATGGGGACAGTGAAACAAAGCTGCCATGAGAAGGGCTGCAGCGATGANAAACAAACCGGGGAAATAACTCTTGCTTCGGAGGAAGAGACAGATGATCAAGATTGCTCCTCGGGATGTTGTGTGAACGAGGGAACAGTGAAACAAAGCTTCCATGAGAAGAAGCATTCTGTGTTGGTGGAGAAGGAAGGTTTGGACATGGAAACTGGTTTCTGTTGTGATGCCAAGCTGGTTTGTTGTGGAAACACAGAAGGTGAAGTGAAGGAGCAATGTCGTCTGGAGATAAAGAAAGAAGAACATTGCAAGTCTGGTTGCTGCGGCGAGGAAAAACAAACCGGAGAAATCGCTCTGGNTTCAGAGGAAGAGNNNNNNAGCACGAATTGTTCCACGGGTTGTTGTGTGGACAAAGAAGAAGTGACACAAACCTGTCATGAGAAGCCTGTTAGCTTGGTGGTATCAGGCTTGGAAGTGAAGAAGGATGAGCATTGTGAGAGCTCACACAGAGCCGTCAAGGTAGAGACCTGTTGCAAAGTGAAGATTCCAGAGGCTTGCGCATCAAAATGTAGGGACAGAGCGAAGCGTCACAGTGGTAAAAGCTGTTGCAGGAGTTATGCAAAAGAGNTATGCAGCCACCGCCATCATCATCACCACCACCACCACCATCACCATGTGAGTGCTTGATGGAGATTGATTGAATAACTTAAACTCTTGATGCATCCATCTATTCACATTACGTTTANTCTCATTCCGTGAATGCCGAAANAAAAAACAAAATGTTCCAGCAAAGGCAGTTTATTAGATTAAGCAACTGTGTTATTCATAAAGACAATGCTAGTGATTTTTTTTAAGTACTTTATGTATTGCAATTCCT'"
1709 | ],
1710 | "application/vnd.google.colaboratory.intrinsic+json": {
1711 | "type": "string"
1712 | }
1713 | },
1714 | "metadata": {},
1715 | "execution_count": 87
1716 | }
1717 | ]
1718 | },
1719 | {
1720 | "cell_type": "code",
1721 | "source": [
1722 | "HMA4_fasta_cache = DefaultDict(get_accession_gene_fasta(HMA4_gene_id))"
1723 | ],
1724 | "metadata": {
1725 | "id": "vMJWm-_kqPJO"
1726 | },
1727 | "execution_count": null,
1728 | "outputs": []
1729 | },
1730 | {
1731 | "cell_type": "code",
1732 | "source": [
1733 | "get_aa_substitution_pair(first_result)"
1734 | ],
1735 | "metadata": {
1736 | "colab": {
1737 | "base_uri": "https://localhost:8080/"
1738 | },
1739 | "id": "uoAvXA5_l1W_",
1740 | "outputId": "0f777d88-f870-4aae-c161-f6e57a9b8e4d"
1741 | },
1742 | "execution_count": null,
1743 | "outputs": [
1744 | {
1745 | "output_type": "execute_result",
1746 | "data": {
1747 | "text/plain": [
1748 | "(MutableSeq('Y'), MutableSeq('F'))"
1749 | ]
1750 | },
1751 | "metadata": {},
1752 | "execution_count": 61
1753 | }
1754 | ]
1755 | },
1756 | {
1757 | "cell_type": "code",
1758 | "source": [
1759 | "fasta = get_accession_gene_fasta(HMA4_gene_id)(first_result[\"accession\"])"
1760 | ],
1761 | "metadata": {
1762 | "id": "JrsYwelu6CiX"
1763 | },
1764 | "execution_count": null,
1765 | "outputs": []
1766 | },
1767 | {
1768 | "cell_type": "code",
1769 | "source": [
1770 | "record = SeqIO.read(StringIO(fasta), \"fasta\")\n",
1771 | "record"
1772 | ],
1773 | "metadata": {
1774 | "colab": {
1775 | "base_uri": "https://localhost:8080/"
1776 | },
1777 | "id": "DPj6gOYcbT3h",
1778 | "outputId": "f74e2595-2953-4c15-8184-13fbc5a5a5aa"
1779 | },
1780 | "execution_count": null,
1781 | "outputs": [
1782 | {
1783 | "output_type": "execute_result",
1784 | "data": {
1785 | "text/plain": [
1786 | "SeqRecord(seq=Seq('CTACGTTCCTAACANTTCTCTCAACCTTTATCTGATCGCACCAAACCAGTTTTT...CCT'), id='MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2', name='MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2', description='MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2', dbxrefs=[])"
1787 | ]
1788 | },
1789 | "metadata": {},
1790 | "execution_count": 18
1791 | }
1792 | ]
1793 | },
1794 | {
1795 | "cell_type": "code",
1796 | "source": [
1797 | "aa = record.seq[: len(record.seq) - len(record.seq) % 3].translate()"
1798 | ],
1799 | "metadata": {
1800 | "colab": {
1801 | "base_uri": "https://localhost:8080/"
1802 | },
1803 | "id": "Dlox9xZqdM-e",
1804 | "outputId": "f8c7685c-967a-4055-d46d-6516cb5e66ff"
1805 | },
1806 | "execution_count": null,
1807 | "outputs": [
1808 | {
1809 | "output_type": "stream",
1810 | "name": "stderr",
1811 | "text": [
1812 | "/usr/local/lib/python3.12/dist-packages/Bio/Seq.py:2877: BiopythonWarning: Partial codon, len(sequence) not a multiple of three. Explicitly trim the sequence or add trailing N before translation. This may become an error in future.\n",
1813 | " warnings.warn(\n"
1814 | ]
1815 | }
1816 | ]
1817 | },
1818 | {
1819 | "cell_type": "code",
1820 | "source": [
1821 | "start, _ = get_start_end_from_seq(record)\n",
1822 | "start"
1823 | ],
1824 | "metadata": {
1825 | "colab": {
1826 | "base_uri": "https://localhost:8080/"
1827 | },
1828 | "id": "0WZ4QyJSgohr",
1829 | "outputId": "ba68e1fa-54e0-4243-f9c7-d7c5e1ccb845"
1830 | },
1831 | "execution_count": null,
1832 | "outputs": [
1833 | {
1834 | "output_type": "execute_result",
1835 | "data": {
1836 | "text/plain": [
1837 | "8278881"
1838 | ]
1839 | },
1840 | "metadata": {},
1841 | "execution_count": 37
1842 | }
1843 | ]
1844 | },
1845 | {
1846 | "cell_type": "code",
1847 | "source": [
1848 | "in_gene_position = first_result[\"position\"] - start"
1849 | ],
1850 | "metadata": {
1851 | "id": "XNRtQwOzeD9-"
1852 | },
1853 | "execution_count": null,
1854 | "outputs": []
1855 | },
1856 | {
1857 | "cell_type": "code",
1858 | "source": [
1859 | "first_result[\"variant\"] == record.seq[in_gene_position]"
1860 | ],
1861 | "metadata": {
1862 | "colab": {
1863 | "base_uri": "https://localhost:8080/"
1864 | },
1865 | "id": "xoJgVLKig9F5",
1866 | "outputId": "5a7ca1b8-217f-4acf-ddaa-de3bee4009c9"
1867 | },
1868 | "execution_count": null,
1869 | "outputs": [
1870 | {
1871 | "output_type": "execute_result",
1872 | "data": {
1873 | "text/plain": [
1874 | "True"
1875 | ]
1876 | },
1877 | "metadata": {},
1878 | "execution_count": 47
1879 | }
1880 | ]
1881 | },
1882 | {
1883 | "cell_type": "code",
1884 | "source": [
1885 | "codon_start = in_gene_position - in_gene_position % 3\n",
1886 | "codon_start"
1887 | ],
1888 | "metadata": {
1889 | "colab": {
1890 | "base_uri": "https://localhost:8080/"
1891 | },
1892 | "id": "paV0Hfbah0Hy",
1893 | "outputId": "7e18ef33-0e33-41d4-a476-6f9712c3b4e5"
1894 | },
1895 | "execution_count": null,
1896 | "outputs": [
1897 | {
1898 | "output_type": "execute_result",
1899 | "data": {
1900 | "text/plain": [
1901 | "642"
1902 | ]
1903 | },
1904 | "metadata": {},
1905 | "execution_count": 48
1906 | }
1907 | ]
1908 | },
1909 | {
1910 | "cell_type": "code",
1911 | "source": [
1912 | "in_condon_position = in_gene_position % 3\n",
1913 | "in_condon_position"
1914 | ],
1915 | "metadata": {
1916 | "colab": {
1917 | "base_uri": "https://localhost:8080/"
1918 | },
1919 | "id": "hpEdQDqAimqV",
1920 | "outputId": "f3d3e3ca-39dc-48c7-d5b3-0665edd3dfcc"
1921 | },
1922 | "execution_count": null,
1923 | "outputs": [
1924 | {
1925 | "output_type": "execute_result",
1926 | "data": {
1927 | "text/plain": [
1928 | "0"
1929 | ]
1930 | },
1931 | "metadata": {},
1932 | "execution_count": 49
1933 | }
1934 | ]
1935 | },
1936 | {
1937 | "cell_type": "code",
1938 | "source": [
1939 | "codon = MutableSeq(record.seq[codon_start:codon_start+3])"
1940 | ],
1941 | "metadata": {
1942 | "id": "zY5RIgdMiI40"
1943 | },
1944 | "execution_count": null,
1945 | "outputs": []
1946 | },
1947 | {
1948 | "cell_type": "code",
1949 | "source": [
1950 | "codon.translate()"
1951 | ],
1952 | "metadata": {
1953 | "colab": {
1954 | "base_uri": "https://localhost:8080/"
1955 | },
1956 | "id": "cDkbbg01iQSp",
1957 | "outputId": "9dd02500-7d28-43ba-f248-cda40a358fda"
1958 | },
1959 | "execution_count": null,
1960 | "outputs": [
1961 | {
1962 | "output_type": "execute_result",
1963 | "data": {
1964 | "text/plain": [
1965 | "MutableSeq('V')"
1966 | ]
1967 | },
1968 | "metadata": {},
1969 | "execution_count": 55
1970 | }
1971 | ]
1972 | },
1973 | {
1974 | "cell_type": "code",
1975 | "source": [
1976 | "codon[in_condon_position] = first_result[\"reference\"]\n",
1977 | "codon.translate()"
1978 | ],
1979 | "metadata": {
1980 | "colab": {
1981 | "base_uri": "https://localhost:8080/"
1982 | },
1983 | "id": "A7pDErOziTPs",
1984 | "outputId": "98ba4e30-6be5-4e7d-b2a1-936bdacdbae0"
1985 | },
1986 | "execution_count": null,
1987 | "outputs": [
1988 | {
1989 | "output_type": "execute_result",
1990 | "data": {
1991 | "text/plain": [
1992 | "MutableSeq('L')"
1993 | ]
1994 | },
1995 | "metadata": {},
1996 | "execution_count": 56
1997 | }
1998 | ]
1999 | },
2000 | {
2001 | "cell_type": "code",
2002 | "source": [
2003 | "aa"
2004 | ],
2005 | "metadata": {
2006 | "colab": {
2007 | "base_uri": "https://localhost:8080/"
2008 | },
2009 | "id": "41MMMDoweAtl",
2010 | "outputId": "45777d91-ac38-4a1e-9d9e-b85bf4dce3af"
2011 | },
2012 | "execution_count": null,
2013 | "outputs": [
2014 | {
2015 | "output_type": "execute_result",
2016 | "data": {
2017 | "text/plain": [
2018 | "Seq('LRS*XFSQPLSDRTKPVFSHRLLPFAXSSPLFSGXFVSLLNSHRFHDK**SIXR...CNS')"
2019 | ]
2020 | },
2021 | "metadata": {},
2022 | "execution_count": 22
2023 | }
2024 | ]
2025 | },
2026 | {
2027 | "cell_type": "code",
2028 | "source": [
2029 | "record.id"
2030 | ],
2031 | "metadata": {
2032 | "colab": {
2033 | "base_uri": "https://localhost:8080/",
2034 | "height": 35
2035 | },
2036 | "id": "c4EnAyO0fxGy",
2037 | "outputId": "cbc2bd94-3564-45cb-e5d8-883c261db2fe"
2038 | },
2039 | "execution_count": null,
2040 | "outputs": [
2041 | {
2042 | "output_type": "execute_result",
2043 | "data": {
2044 | "text/plain": [
2045 | "'MPI-GMI|Ath-1001-Genomes|pseudo-genome|9653|Chr2:8278881..8286445|Col-0_gi:AT2G19110.1|V0.2'"
2046 | ],
2047 | "application/vnd.google.colaboratory.intrinsic+json": {
2048 | "type": "string"
2049 | }
2050 | },
2051 | "metadata": {},
2052 | "execution_count": 31
2053 | }
2054 | ]
2055 | },
2056 | {
2057 | "cell_type": "code",
2058 | "source": [
2059 | "record.seq[in_gene_position]"
2060 | ],
2061 | "metadata": {
2062 | "colab": {
2063 | "base_uri": "https://localhost:8080/",
2064 | "height": 35
2065 | },
2066 | "id": "-y8MHPRrndYi",
2067 | "outputId": "78cdd7ef-10e0-4e3e-80db-eb2351c60ad3"
2068 | },
2069 | "execution_count": null,
2070 | "outputs": [
2071 | {
2072 | "output_type": "execute_result",
2073 | "data": {
2074 | "text/plain": [
2075 | "'G'"
2076 | ],
2077 | "application/vnd.google.colaboratory.intrinsic+json": {
2078 | "type": "string"
2079 | }
2080 | },
2081 | "metadata": {},
2082 | "execution_count": 63
2083 | }
2084 | ]
2085 | },
2086 | {
2087 | "cell_type": "code",
2088 | "source": [
2089 | "snp_centered_dna_window = record.seq[max(in_gene_position - 256, 0):in_gene_position + 257]"
2090 | ],
2091 | "metadata": {
2092 | "id": "jHZW-AwDnmde"
2093 | },
2094 | "execution_count": null,
2095 | "outputs": []
2096 | },
2097 | {
2098 | "cell_type": "code",
2099 | "source": [
2100 | "len(snp_centered_dna_window)"
2101 | ],
2102 | "metadata": {
2103 | "colab": {
2104 | "base_uri": "https://localhost:8080/"
2105 | },
2106 | "id": "xd-2KX6Lnxex",
2107 | "outputId": "7de526b8-8d46-4224-d6d4-4f5126bb8605"
2108 | },
2109 | "execution_count": null,
2110 | "outputs": [
2111 | {
2112 | "output_type": "execute_result",
2113 | "data": {
2114 | "text/plain": [
2115 | "513"
2116 | ]
2117 | },
2118 | "metadata": {},
2119 | "execution_count": 70
2120 | }
2121 | ]
2122 | },
2123 | {
2124 | "cell_type": "code",
2125 | "source": [
2126 | "snp_centered_dna_window[len(snp_centered_dna_window) // 2]"
2127 | ],
2128 | "metadata": {
2129 | "colab": {
2130 | "base_uri": "https://localhost:8080/",
2131 | "height": 35
2132 | },
2133 | "id": "gjw2pKF5n3NB",
2134 | "outputId": "608e475f-7b65-4538-883b-be29a3ea707f"
2135 | },
2136 | "execution_count": null,
2137 | "outputs": [
2138 | {
2139 | "output_type": "execute_result",
2140 | "data": {
2141 | "text/plain": [
2142 | "'G'"
2143 | ],
2144 | "application/vnd.google.colaboratory.intrinsic+json": {
2145 | "type": "string"
2146 | }
2147 | },
2148 | "metadata": {},
2149 | "execution_count": 74
2150 | }
2151 | ]
2152 | },
2153 | {
2154 | "cell_type": "code",
2155 | "source": [
2156 | "DefaultDict(lambda key: key + 5)"
2157 | ],
2158 | "metadata": {
2159 | "id": "gAo7w-Y33V8O"
2160 | },
2161 | "execution_count": null,
2162 | "outputs": []
2163 | },
2164 | {
2165 | "cell_type": "code",
2166 | "source": [
2167 | "foo = \"AAATAAA\"\n",
2168 | "\n",
2169 | "foo[len(foo) // 2]"
2170 | ],
2171 | "metadata": {
2172 | "colab": {
2173 | "base_uri": "https://localhost:8080/",
2174 | "height": 35
2175 | },
2176 | "id": "vHV2Isu9oGDd",
2177 | "outputId": "238229fc-37ab-42a0-e54a-085b756d7db6"
2178 | },
2179 | "execution_count": null,
2180 | "outputs": [
2181 | {
2182 | "output_type": "execute_result",
2183 | "data": {
2184 | "text/plain": [
2185 | "'T'"
2186 | ],
2187 | "application/vnd.google.colaboratory.intrinsic+json": {
2188 | "type": "string"
2189 | }
2190 | },
2191 | "metadata": {},
2192 | "execution_count": 76
2193 | }
2194 | ]
2195 | },
2196 | {
2197 | "cell_type": "code",
2198 | "source": [
2199 | "foo[3-10:3+2]"
2200 | ],
2201 | "metadata": {
2202 | "colab": {
2203 | "base_uri": "https://localhost:8080/",
2204 | "height": 35
2205 | },
2206 | "id": "JDgLsIEpoOoZ",
2207 | "outputId": "f8fc1f3c-a984-48dc-c192-73d88787430b"
2208 | },
2209 | "execution_count": null,
2210 | "outputs": [
2211 | {
2212 | "output_type": "execute_result",
2213 | "data": {
2214 | "text/plain": [
2215 | "'AAATA'"
2216 | ],
2217 | "application/vnd.google.colaboratory.intrinsic+json": {
2218 | "type": "string"
2219 | }
2220 | },
2221 | "metadata": {},
2222 | "execution_count": 115
2223 | }
2224 | ]
2225 | }
2226 | ]
2227 | }
--------------------------------------------------------------------------------
|