├── .gitignore
├── 2020_2021
├── Lesson1
│ ├── Exercises1.ipynb
│ └── Lesson1.ipynb
├── Lesson2
│ ├── Exercises2.ipynb
│ └── Lesson2.ipynb
├── Lesson3
│ ├── Exercises3.ipynb
│ └── Lesson3.ipynb
├── Lesson4
│ ├── Exercises4.ipynb
│ └── Lesson4.ipynb
├── Lesson5
│ ├── Exercises5.ipynb
│ ├── Lesson5.ipynb
│ └── protein_sequences
│ │ ├── O00444.fasta
│ │ ├── P49760.fasta
│ │ ├── Q02156.fasta
│ │ ├── Q13188.fasta
│ │ └── Q13627.fasta
├── Lesson6
│ ├── Exercises6.ipynb
│ └── Lesson6.ipynb
├── Lesson7
│ ├── Exercises7.ipynb
│ └── Lesson7.ipynb
├── Lesson8
│ └── Lesson8.ipynb
├── data
│ ├── P04439.fasta
│ ├── RepeatMasker.subset.bed
│ ├── brca_transcripts.txt
│ ├── cervical.csv
│ ├── genetic_code.tsv
│ ├── trio.2010_06.ychr.sites.vcf
│ ├── uniprot_ids.txt
│ ├── utils.py
│ └── validation.py
└── images
│ ├── Integer.jpeg
│ └── List.jpeg
├── 2021_2022
├── Lesson1
│ ├── Exercises1.ipynb
│ └── Lesson1.ipynb
├── Lesson2
│ ├── Exercises2.ipynb
│ └── Lesson2.ipynb
├── Lesson3
│ ├── Exercises3.ipynb
│ └── Lesson3.ipynb
├── Lesson4
│ ├── Exercises4.ipynb
│ └── Lesson4.ipynb
├── Lesson5
│ ├── Exercises5.ipynb
│ └── Lesson5.ipynb
├── Lesson6
│ ├── Exercises6.ipynb
│ └── Lesson6.ipynb
├── Lesson7
│ ├── Exercises7.ipynb
│ └── Lesson7.ipynb
├── Lesson8
│ └── Lesson8.ipynb
├── data
│ ├── P04439.fasta
│ ├── RepeatMasker.subset.bed
│ ├── brca_transcripts.txt
│ ├── cervical.csv
│ ├── genetic_code.tsv
│ ├── my_utils.py
│ ├── trio.2010_06.ychr.sites.vcf
│ ├── uniprot_ids.txt
│ └── validation.py
└── images
│ ├── Integer.jpeg
│ └── List.jpeg
├── 2022_2023
├── Lesson1
│ ├── Exercises1.ipynb
│ └── Lesson1.ipynb
├── Lesson2
│ ├── Exercises2.ipynb
│ └── Lesson2.ipynb
├── Lesson3
│ ├── Exercises3.ipynb
│ └── Lesson3.ipynb
├── Lesson4
│ ├── Exercises4.ipynb
│ └── Lesson4.ipynb
├── Lesson5
│ ├── Exercises5.ipynb
│ └── Lesson5.ipynb
├── Lesson6
│ ├── Exercises6.ipynb
│ └── Lesson6.ipynb
├── Lesson7
│ ├── Exercises7.ipynb
│ └── Lesson7.ipynb
├── Lesson8
│ └── Lesson8.ipynb
├── data
│ ├── P04439.fasta
│ ├── RepeatMasker.subset.bed
│ ├── brca_transcripts.txt
│ ├── cervical.csv
│ ├── genetic_code.tsv
│ ├── my_utils.py
│ ├── trio.2010_06.ychr.sites.vcf
│ ├── uniprot_ids.txt
│ └── validation.py
└── images
│ ├── Integer.jpeg
│ └── List.jpeg
├── 2023_2024
├── Lesson1
│ ├── Exercises1.ipynb
│ └── Lesson1.ipynb
├── Lesson2
│ ├── Exercises2.ipynb
│ └── Lesson2.ipynb
├── Lesson3
│ ├── Exercises3.ipynb
│ └── Lesson3.ipynb
├── Lesson4
│ ├── Exercises4.ipynb
│ └── Lesson4.ipynb
├── Lesson5
│ ├── Exercises5.ipynb
│ └── Lesson5.ipynb
├── Lesson6
│ ├── Exercises6.ipynb
│ └── Lesson6.ipynb
├── Lesson7
│ ├── Exercises7.ipynb
│ └── Lesson7.ipynb
├── Lesson8
│ └── Lesson8.ipynb
├── data
│ ├── P04439.fasta
│ ├── RepeatMasker.subset.bed
│ ├── brca_transcripts.txt
│ ├── genetic_code.tsv
│ ├── my_utils.py
│ ├── uniprot_ids.txt
│ └── validation.py
└── images
│ ├── Integer.jpeg
│ └── List.jpeg
├── 2024_2025
├── Lesson1
│ ├── Exercises1.ipynb
│ └── Lesson1.ipynb
├── Lesson2
│ ├── Exercises2.ipynb
│ └── Lesson2.ipynb
├── Lesson3
│ ├── Exercises3.ipynb
│ └── Lesson3.ipynb
├── Lesson4
│ ├── Exercises4.ipynb
│ └── Lesson4.ipynb
├── Lesson5
│ ├── Exercises5.ipynb
│ └── Lesson5.ipynb
├── Lesson6
│ ├── Exercises6.ipynb
│ └── Lesson6.ipynb
├── Lesson7
│ ├── Exercises7.ipynb
│ └── Lesson7.ipynb
├── Lesson8
│ └── Lesson8.ipynb
├── data
│ ├── P04439.fasta
│ ├── RepeatMasker.subset.bed
│ ├── brca_transcripts.txt
│ ├── genetic_code.tsv
│ ├── my_utils.py
│ ├── uniprot_ids.txt
│ └── validation.py
└── images
│ ├── Integer.jpeg
│ └── List.jpeg
├── ExamResults
├── 2020.12.23.md
├── 2021.01.21.md
├── 2021.04.08.md
├── 2021.06.10.md
├── 2021.09.23.md
├── 2021.12.20.md
├── 2022.01.13.md
├── 2022.04.22.md
├── 2022.07.14.md
├── 2022.09.08.md
├── 2022.12.22.md
├── 2022.12.23.md
├── 2023.02.09.md
├── 2023.06.12.md
├── 2023.09.07.md
├── 2024.01.15.md
├── 2024.06.11.md
├── 2024.12.19.md
├── 2025.02.13.md
└── plot_date_vs_grade.py
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/2020_2021/Lesson1/Exercises1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "The following list is corrupted:"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "mutations = [\n",
18 | " 'p.Ser31Ala',\n",
19 | " 'p.Pro38Leu',\n",
20 | " 'p.Asn100Lys',\n",
21 | " 'p.LEU110VAL',\n",
22 | " 13,\n",
23 | " 4.0,\n",
24 | " True,\n",
25 | " 'p.Tyr341Leu',\n",
26 | " 'AUG',\n",
27 | " 'p.Tyr0Le',\n",
28 | " 'p.Asn1.3Lys',\n",
29 | " 'p.Arg0Leu'\n",
30 | "]"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "How can you check which mutations are valid? Put the valid mutations in a new list. A reasonable output could be:\n",
38 | "\n",
39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "#### Tips"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "valid_aminos = [\n",
56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n",
57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n",
58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n",
59 | "]\n",
60 | "\n",
61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n",
62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n",
63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "valid_mutations = []\n",
73 | "\n",
74 | "# TODO"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Exercise\n",
82 | "\n",
83 | "Write a script to check if a protein sequence is valid."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# TODO"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "### Exercise\n",
100 | "\n",
101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 5,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# TODO"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.8.3"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
--------------------------------------------------------------------------------
/2020_2021/Lesson2/Exercises2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. In particular:\n",
10 | "\n",
11 | "- create a `get_valid_mutation` function which takes a list of mutations as input and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n",
12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n",
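13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation; note that `lstrip('p.')` strips every leading `p` and `.` character rather than the literal prefix, so apply it only to mutations whose amino acid names are already capitalized)."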
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "mutations = [\n",
23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n",
24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n",
25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n",
26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n",
27 | "]\n",
28 | "\n",
29 | "aa_3L_to_1L = {\n",
30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
34 | "}"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# TO DO"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "### Exercise\n",
51 | "\n",
52 | "Write a function which generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# TO DO"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Exercise\n",
69 | "\n",
70 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "# TO DO"
80 | ]
81 | }
82 | ],
83 | "metadata": {
84 | "kernelspec": {
85 | "display_name": "Python 3",
86 | "language": "python",
87 | "name": "python3"
88 | },
89 | "language_info": {
90 | "codemirror_mode": {
91 | "name": "ipython",
92 | "version": 3
93 | },
94 | "file_extension": ".py",
95 | "mimetype": "text/x-python",
96 | "name": "python",
97 | "nbconvert_exporter": "python",
98 | "pygments_lexer": "ipython3",
99 | "version": "3.8.3"
100 | }
101 | },
102 | "nbformat": 4,
103 | "nbformat_minor": 4
104 | }
105 |
--------------------------------------------------------------------------------
/2020_2021/Lesson3/Exercises3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n",
10 | "- Write the rows relating to chromosome 1 to a separate file called `RepeatMasker.subset.chr1.bed`."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# TO DO"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Exercise\n",
27 | "\n",
28 | "Write a function to remove duplicates in a list."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# TO DO"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Exercise\n",
45 | "\n",
46 | "Write a function to calculate the identity between 2 sequences."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# TO DO"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.8.3"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 4
80 | }
--------------------------------------------------------------------------------
/2020_2021/Lesson4/Exercises4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Compute all pair-wise identities (number of identical character pairs)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "sequences = [\n",
19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n",
20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n",
21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n",
22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n",
23 | "]\n",
24 | "\n",
25 | "# TO DO"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Exercise\n",
33 | "\n",
34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n",
35 | "- read the genetic code from the [genetic_code.tsv](../data/genetic_code.tsv) file;\n",
36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# TO DO"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Exercise\n",
53 | "\n",
54 | "Print the index of the first occurrence of the ATG codon.\n",
55 | "\n",
56 | "Try with and without using the `find()` method on strings."
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 3,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# TO DO"
66 | ]
67 | }
68 | ],
69 | "metadata": {
70 | "kernelspec": {
71 | "display_name": "Python 3",
72 | "language": "python",
73 | "name": "python3"
74 | },
75 | "language_info": {
76 | "codemirror_mode": {
77 | "name": "ipython",
78 | "version": 3
79 | },
80 | "file_extension": ".py",
81 | "mimetype": "text/x-python",
82 | "name": "python",
83 | "nbconvert_exporter": "python",
84 | "pygments_lexer": "ipython3",
85 | "version": "3.8.3"
86 | }
87 | },
88 | "nbformat": 4,
89 | "nbformat_minor": 4
90 | }
91 |
--------------------------------------------------------------------------------
/2020_2021/Lesson5/Exercises5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Write a function to search motifs in a sequence.\n",
10 | "\n",
11 | "Try with and without using the `re` module."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCATCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTG
GCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n",
21 | "\n",
22 | "consensus_motifs = {\n",
23 | " 'motif1': 'AGGAG[GT]',\n",
24 | " 'motif2': 'T[AT]AAT',\n",
25 | " 'motif3': 'GG.A.T[AG]'\n",
26 | "}"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Possible printed output:\n",
34 | "```\n",
35 | "AGGAG[GT]\n",
36 | "\t(969, 975) AGGAGG\n",
37 | "\t(1153, 1159) AGGAGG\n",
38 | "\t(1339, 1345) AGGAGT\n",
39 | "\t(1587, 1593) AGGAGG\n",
40 | "\t(1881, 1887) AGGAGG\n",
41 | "\t(1941, 1947) AGGAGG\n",
42 | "T[AT]AAT\n",
43 | "\t(50, 55) TAAAT\n",
44 | "\t(1098, 1103) TAAAT\n",
45 | "\t(1276, 1281) TAAAT\n",
46 | "GG.A.T[AG]\n",
47 | "\t(248, 255) GGTACTG\n",
48 | "\t(983, 990) GGAAATA\n",
49 | "\t(1910, 1917) GGGACTG\n",
50 | "\t(1980, 1987) GGCAGTG\n",
51 | "```\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# TO DO"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Exercise\n",
68 | "\n",
69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 2,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "aa_3L_to_1L = {\n",
79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
83 | "}\n",
84 | "\n",
85 | "#aa_1L_to_3L['A'] --> 'ALA'"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "# TO DO"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Exercise\n",
102 | "\n",
103 | "Write a function to remove invalid amino acids from a protein sequence."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# TO DO"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.8.3"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 4
137 | }
--------------------------------------------------------------------------------
/2020_2021/Lesson5/protein_sequences/O00444.fasta:
--------------------------------------------------------------------------------
1 | >sp|O00444|PLK4_HUMAN Serine/threonine-protein kinase PLK4 OS=Homo sapiens OX=9606 GN=PLK4 PE=1 SV=3
2 | MATCIGEKIEDFKVGNLLGKGSFAGVYRAESIHTGLEVAIKMIDKKAMYKAGMVQRVQNE
3 | VKIHCQLKHPSILELYNYFEDSNYVYLVLEMCHNGEMNRYLKNRVKPFSENEARHFMHQI
4 | ITGMLYLHSHGILHRDLTLSNLLLTRNMNIKIADFGLATQLKMPHEKHYTLCGTPNYISP
5 | EIATRSAHGLESDVWSLGCMFYTLLIGRPPFDTDTVKNTLNKVVLADYEMPSFLSIEAKD
6 | LIHQLLRRNPADRLSLSSVLDHPFMSRNSSTKSKDLGTVEDSIDSGHATISTAITASSST
7 | SISGSLFDKRRLLIGQPLPNKMTVFPKNKSSTDFSSSGDGNSFYTQWGNQETSNSGRGRV
8 | IQDAEERPHSRYLRRAYSSDRSGTSNSQSQAKTYTMERCHSAEMLSVSKRSGGGENEERY
9 | SPTDNNANIFNFFKEKTSSSSGSFERPDNNQALSNHLCPGKTPFPFADPTPQTETVQQWF
10 | GNLQINAHLRKTTEYDSISPNRDFQGHPDLQKDTSKNAWTDTKVKKNSDASDNAHSVKQQ
11 | NTMKYMTALHSKPEIIQQECVFGSDPLSEQSKTRGMEPPWGYQNRTLRSITSPLVAHRLK
12 | PIRQKTKKAVVSILDSEEVCVELVKEYASQEYVKEVLQISSDGNTITIYYPNGGRGFPLA
13 | DRPPSPTDNISRYSFDNLPEKYWRKYQYASRFVQLVRSKSPKITYFTRYAKCILMENSPG
14 | ADFEVWFYDGVKIHKTEDFIQVIEKTGKSYTLKSESEVNSLKEEIKMYMDHANEGHRICL
15 | ALESIISEEERKTRSAPFFPIIIGRKPGSTSSPKALSPPPSVDSNYPTRERASFNRMVMH
16 | SAASPTQAPILNPSMVTNEGLGLTTTASGTDISSNSLKDCLPKSAQLLKSVFVKNVGWAT
17 | QLTSGAVWVQFNDGSQLVVQAGVSSISYTSPNGQTTRYGENEKLPDYIKQKLQCLSSILL
18 | MFSNPTPNFH
19 |
--------------------------------------------------------------------------------
/2020_2021/Lesson5/protein_sequences/P49760.fasta:
--------------------------------------------------------------------------------
1 | >sp|P49760|CLK2_HUMAN Dual specificity protein kinase CLK2 OS=Homo sapiens OX=9606 GN=CLK2 PE=1 SV=1
2 | MPHPRRYHSSERGSRGSYREHYRSRKHKRRRSRSWSSSSDRTRRRRREDSYHVRSRSSYD
3 | DRSSDRRVYDRRYCGSYRRNDYSRDRGDAYYDTDYRHSYEYQRENSSYRSQRSSRRKHRR
4 | RRRRSRTFSRSSSQHSSRRAKSVEDDAEGHLIYHVGDWLQERYEIVSTLGEGTFGRVVQC
5 | VDHRRGGARVALKIIKNVEKYKEAARLEINVLEKINEKDPDNKNLCVQMFDWFDYHGHMC
6 | ISFELLGLSTFDFLKDNNYLPYPIHQVRHMAFQLCQAVKFLHDNKLTHTDLKPENILFVN
7 | SDYELTYNLEKKRDERSVKSTAVRVVDFGSATFDHEHHSTIVSTRHYRAPEVILELGWSQ
8 | PCDVWSIGCIIFEYYVGFTLFQTHDNREHLAMMERILGPIPSRMIRKTRKQKYFYRGRLD
9 | WDENTSAGRYVRENCKPLRRYLTSEAEEHHQLFDLIESMLEYEPAKRLTLGEALQHPFFA
10 | RLRAEPPNKLWDSSRDISR
11 |
--------------------------------------------------------------------------------
/2020_2021/Lesson5/protein_sequences/Q02156.fasta:
--------------------------------------------------------------------------------
1 | >sp|Q02156|KPCE_HUMAN Protein kinase C epsilon type OS=Homo sapiens OX=9606 GN=PRKCE PE=1 SV=1
2 | MVVFNGLLKIKICEAVSLKPTAWSLRHAVGPRPQTFLLDPYIALNVDDSRIGQTATKQKT
3 | NSPAWHDEFVTDVCNGRKIELAVFHDAPIGYDDFVANCTIQFEELLQNGSRHFEDWIDLE
4 | PEGRVYVIIDLSGSSGEAPKDNEERVFRERMRPRKRQGAVRRRVHQVNGHKFMATYLRQP
5 | TYCSHCRDFIWGVIGKQGYQCQVCTCVVHKRCHELIITKCAGLKKQETPDQVGSQRFSVN
6 | MPHKFGIHNYKVPTFCDHCGSLLWGLLRQGLQCKVCKMNVHRRCETNVAPNCGVDARGIA
7 | KVLADLGVTPDKITNSGQRRKKLIAGAESPQPASGSSPSEEDRSKSAPTSPCDQEIKELE
8 | NNIRKALSFDNRGEEHRAASSPDGQLMSPGENGEVRQGQAKRLGLDEFNFIKVLGKGSFG
9 | KVMLAELKGKDEVYAVKVLKKDVILQDDDVDCTMTEKRILALARKHPYLTQLYCCFQTKD
10 | RLFFVMEYVNGGDLMFQIQRSRKFDEPRSRFYAAEVTSALMFLHQHGVIYRDLKLDNILL
11 | DAEGHCKLADFGMCKEGILNGVTTTTFCGTPDYIAPEILQELEYGPSVDWWALGVLMYEM
12 | MAGQPPFEADNEDDLFESILHDDVLYPVWLSKEAVSILKAFMTKNPHKRLGCVASQNGED
13 | AIKQHPFFKEIDWVLLEQKKIKPPFKPRIKTKRDVNNFDQDFTREEPVLTLVDEAIVKQI
14 | NQEEFKGFSYFGEDLMP
15 |
--------------------------------------------------------------------------------
/2020_2021/Lesson5/protein_sequences/Q13188.fasta:
--------------------------------------------------------------------------------
1 | >sp|Q13188|STK3_HUMAN Serine/threonine-protein kinase 3 OS=Homo sapiens OX=9606 GN=STK3 PE=1 SV=2
2 | MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIHKESGQVVAIKQVPV
3 | ESDLQEIIKEISIMQQCDSPYVVKYYGSYFKNTDLWIVMEYCGAGSVSDIIRLRNKTLIE
4 | DEIATILKSTLKGLEYLHFMRKIHRDIKAGNILLNTEGHAKLADFGVAGQLTDTMAKRNT
5 | VIGTPFWMAPEVIQEIGYNCVADIWSLGITSIEMAEGKPPYADIHPMRAIFMIPTNPPPT
6 | FRKPELWSDDFTDFVKKCLVKNPEQRATATQLLQHPFIKNAKPVSILRDLITEAMEIKAK
7 | RHEEQQRELEEEEENSDEDELDSHTMVKTSVESVGTMRATSTMSEGAQTMIEHNSTMLES
8 | DLGTMVINSEDEEEEDGTMKRNATSPQVQRPSFMDYFDKQDFKNKSHENCNQNMHEPFPM
9 | SKNVFPDNWKVPQDGDFDFLKNLSLEELQMRLKALDPMMEREIEELRQRYTAKRQPILDA
10 | MDAKKRRQQNF
11 |
--------------------------------------------------------------------------------
/2020_2021/Lesson5/protein_sequences/Q13627.fasta:
--------------------------------------------------------------------------------
1 | >sp|Q13627|DYR1A_HUMAN Dual specificity tyrosine-phosphorylation-regulated kinase 1A OS=Homo sapiens OX=9606 GN=DYRK1A PE=1 SV=2
2 | MHTGGETSACKPSSVRLAPSFSFHAAGLQMAGQMPHSHQYSDRRQPNISDQQVSALSYSD
3 | QIQQPLTNQVMPDIVMLQRRMPQTFRDPATAPLRKLSVDLIKTYKHINEVYYAKKKRRHQ
4 | QGQGDDSSHKKERKVYNDGYDDDNYDYIVKNGEKWMDRYEIDSLIGKGSFGQVVKAYDRV
5 | EQEWVAIKIIKNKKAFLNQAQIEVRLLELMNKHDTEMKYYIVHLKRHFMFRNHLCLVFEM
6 | LSYNLYDLLRNTNFRGVSLNLTRKFAQQMCTALLFLATPELSIIHCDLKPENILLCNPKR
7 | SAIKIVDFGSSCQLGQRIYQYIQSRFYRSPEVLLGMPYDLAIDMWSLGCILVEMHTGEPL
8 | FSGANEVDQMNKIVEVLGIPPAHILDQAPKARKFFEKLPDGTWNLKKTKDGKREYKPPGT
9 | RKLHNILGVETGGPGGRRAGESGHTVADYLKFKDLILRMLDYDPKTRIQPYYALQHSFFK
10 | KTADEGTNTSNSVSTSPAMEQSQSSGTTSSTSSSSGGSSGTSNSGRARSDPTHQHRHSGG
11 | HFTAAVQAMDCETHSPQVRQQFPAPLGWSGTEAPTQVTVETHPVQETTFHVAPQQNALHH
12 | HHGNSSHHHHHHHHHHHHHGQQALGNRTRPRVYNSPTNSSSTQDSMEVGHSHHSMTSLSS
13 | STTSSSTSSSSTGNQGNQAYQNRPVAANTLDFGQNGAMDVNLTVYSNPRQETGIAGHPTY
14 | QFSANTGPAHYMTEGHLTMRQGADREESPMTGVCVQQSPVASS
15 |
--------------------------------------------------------------------------------
/2020_2021/Lesson6/Exercises6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. Do that in 2 ways:\n",
10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n",
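11 | "- consult the pandas documentation to find out how to load such a tab-separated file directly into a Pandas `DataFrame`, and do that (note that the file has no header row, so the first line must not be used as column names)."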
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 3,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/html": [
22 | "
\n",
23 | "\n",
36 | "
\n",
37 | " \n",
38 | " \n",
39 | " | \n",
40 | " UUU | \n",
41 | " F | \n",
42 | " Phe | \n",
43 | " Phenylalanine | \n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " 0 | \n",
49 | " UUC | \n",
50 | " F | \n",
51 | " Phe | \n",
52 | " Phenylalanine | \n",
53 | "
\n",
54 | " \n",
55 | " 1 | \n",
56 | " UUA | \n",
57 | " L | \n",
58 | " Leu | \n",
59 | " Leucine | \n",
60 | "
\n",
61 | " \n",
62 | " 2 | \n",
63 | " UUG | \n",
64 | " L | \n",
65 | " Leu | \n",
66 | " Leucine | \n",
67 | "
\n",
68 | " \n",
69 | " 3 | \n",
70 | " CUU | \n",
71 | " L | \n",
72 | " Leu | \n",
73 | " Leucine | \n",
74 | "
\n",
75 | " \n",
76 | " 4 | \n",
77 | " CUC | \n",
78 | " L | \n",
79 | " Leu | \n",
80 | " Leucine | \n",
81 | "
\n",
82 | " \n",
83 | " ... | \n",
84 | " ... | \n",
85 | " ... | \n",
86 | " ... | \n",
87 | " ... | \n",
88 | "
\n",
89 | " \n",
90 | " 58 | \n",
91 | " AGG | \n",
92 | " R | \n",
93 | " Arg | \n",
94 | " Arginine | \n",
95 | "
\n",
96 | " \n",
97 | " 59 | \n",
98 | " GGU | \n",
99 | " G | \n",
100 | " Gly | \n",
101 | " Glycine | \n",
102 | "
\n",
103 | " \n",
104 | " 60 | \n",
105 | " GGC | \n",
106 | " G | \n",
107 | " Gly | \n",
108 | " Glycine | \n",
109 | "
\n",
110 | " \n",
111 | " 61 | \n",
112 | " GGA | \n",
113 | " G | \n",
114 | " Gly | \n",
115 | " Glycine | \n",
116 | "
\n",
117 | " \n",
118 | " 62 | \n",
119 | " GGG | \n",
120 | " G | \n",
121 | " Gly | \n",
122 | " Glycine | \n",
123 | "
\n",
124 | " \n",
125 | "
\n",
126 | "
63 rows × 4 columns
\n",
127 | "
"
128 | ],
129 | "text/plain": [
130 | " UUU F Phe Phenylalanine\n",
131 | "0 UUC F Phe Phenylalanine\n",
132 | "1 UUA L Leu Leucine\n",
133 | "2 UUG L Leu Leucine\n",
134 | "3 CUU L Leu Leucine\n",
135 | "4 CUC L Leu Leucine\n",
136 | ".. ... .. ... ...\n",
137 | "58 AGG R Arg Arginine\n",
138 | "59 GGU G Gly Glycine\n",
139 | "60 GGC G Gly Glycine\n",
140 | "61 GGA G Gly Glycine\n",
141 | "62 GGG G Gly Glycine\n",
142 | "\n",
143 | "[63 rows x 4 columns]"
144 | ]
145 | },
146 | "execution_count": 3,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "import pandas as pd\n",
153 | "\n",
154 | "# TO DO"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### Exercise\n",
162 | "\n",
163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with both Python lists and NumPy arrays, and measure the execution times."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 4,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "# TO DO"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### Exercise\n",
180 | "\n",
181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 6,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "# TO DO"
191 | ]
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "Python 3",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.8.3"
211 | }
212 | },
213 | "nbformat": 4,
214 | "nbformat_minor": 4
215 | }
--------------------------------------------------------------------------------
/2020_2021/Lesson6/Lesson6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 6 - 2020/12/03"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Numpy\n",
15 | "[NumPy](https://numpy.org/) (short for *Numerical Python*) is a numerical library for Python which provides an efficient interface to store and operate on data."
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "### A Python Integer Is More Than Just an Integer\n",
23 | "A Python integer is a pointer to a position in memory containing all the Python object information, including the bytes that contain the integer value.\n",
24 | "\n",
25 | ""
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### A Python List Is More Than Just a List\n",
33 | "\n",
34 | "Because of Python's dynamic typing, we can create heterogeneous lists:"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 1,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/plain": [
45 | "[bool, str, float, int]"
46 | ]
47 | },
48 | "execution_count": 1,
49 | "metadata": {},
50 | "output_type": "execute_result"
51 | }
52 | ],
53 | "source": [
54 | "my_list = [True, \"2\", 3.0, 4]\n",
55 | "\n",
56 | "[type(item) for item in my_list]"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "But this flexibility comes at a cost.\n",
64 | "\n",
65 | ""
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "In the special case that all variables are of the same type, much of this information is redundant: it can be much more efficient to store data in a fixed-type array."
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "### Fixed-Type Arrays in Python\n",
80 | "The built-in ``array`` module can be used to create arrays of a uniform type:"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 2,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/plain": [
91 | "array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"
92 | ]
93 | },
94 | "execution_count": 2,
95 | "metadata": {},
96 | "output_type": "execute_result"
97 | }
98 | ],
99 | "source": [
100 | "import array\n",
101 | "\n",
102 | "L = list(range(10))\n",
103 | "A = array.array('i', L) # i indicates integer values\n",
104 | "\n",
105 | "A"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Much more useful, however, is the ``numpy.ndarray`` object of the NumPy package which adds to this efficient *operations* on that data."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 3,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "(numpy.ndarray, array([3, 9, 8, 8, 4, 3, 8, 2, 2]))"
124 | ]
125 | },
126 | "execution_count": 3,
127 | "metadata": {},
128 | "output_type": "execute_result"
129 | }
130 | ],
131 | "source": [
132 | "import numpy as np\n",
133 | "\n",
134 | "x_np = np.random.randint(10, size=9) # One-dimensional array\n",
135 | "\n",
136 | "type(x_np), x_np"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 4,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "x1[3]: 8\n",
149 | "x1[2:5]: [8 8 4]\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "print('x1[3]:', x_np[3]) # Array Indexing\n",
155 | "print('x1[2:5]:', x_np[2:5]) # Array Slicing"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 5,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "3\n",
168 | "9\n",
169 | "8\n",
170 | "8\n",
171 | "4\n",
172 | "3\n",
173 | "8\n",
174 | "2\n",
175 | "2\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "# Iteration\n",
181 | "for element in x_np:\n",
182 | " print(element)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "``numpy.ndarray`` stands for N-dimensional array which means that this object is built to be multi-dimensional, with attributes and methods specifically designed for this feature."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 6,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "[3 9 8 8 4 3 8 2 2]\n",
202 | "[[3 9 8]\n",
203 | " [8 4 3]\n",
204 | " [8 2 2]]\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "grid = x_np.reshape((3, 3)) # Two-dimensional array\n",
210 | "\n",
211 | "print(x_np)\n",
212 | "print(grid)"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 7,
218 | "metadata": {},
219 | "outputs": [
220 | {
221 | "name": "stdout",
222 | "output_type": "stream",
223 | "text": [
224 | "grid.ndim: 2\n",
225 | "grid.shape: (3, 3)\n",
226 | "grid.size: 9\n",
227 | "grid.dtype: int64\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "print(\"grid.ndim: \", grid.ndim)\n",
233 | "print(\"grid.shape:\", grid.shape)\n",
234 | "print(\"grid.size: \", grid.size)\n",
235 | "print(\"grid.dtype:\", grid.dtype)"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### Boolean indexing\n",
243 | "Numpy arrays can be sliced with vectors of booleans (``list``s or other ``ndarray``s) with the same dimensions."
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 8,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "x_np: [3 9 8 8 4 3 8 2 2]\n"
256 | ]
257 | }
258 | ],
259 | "source": [
260 | "print('x_np:', x_np)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 9,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "boolean_np: [False True True True True False True False False]\n"
273 | ]
274 | }
275 | ],
276 | "source": [
277 | "boolean_np = x_np > 3\n",
278 | "\n",
279 | "print('boolean_np:', boolean_np) # Each value states whether the element in the same position is > 3."
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 10,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/plain": [
290 | "[9, 8, 8, 4, 8]"
291 | ]
292 | },
293 | "execution_count": 10,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "[x for x in x_np if x > 3]"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 11,
305 | "metadata": {},
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/plain": [
310 | "array([9, 8, 8, 4, 8])"
311 | ]
312 | },
313 | "execution_count": 11,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "x_np[boolean_np]"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 12,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "2.76 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
332 | "5.36 ms ± 63.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
333 | ]
334 | }
335 | ],
336 | "source": [
337 | "big_array = np.random.rand(10000000)\n",
338 | "\n",
339 | "%timeit [x for x in big_array if x > 3]\n",
340 | "%timeit big_array[big_array > 3]"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "### Vectorized Operations\n",
348 | "Operations between arrays are carried out with a different logic than that of standard lists."
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 13,
354 | "metadata": {},
355 | "outputs": [
356 | {
357 | "name": "stdout",
358 | "output_type": "stream",
359 | "text": [
360 | "x_list + x_list: [3, 9, 8, 8, 4, 3, 8, 2, 2, 3, 9, 8, 8, 4, 3, 8, 2, 2]\n",
361 | "x_np + x_np: [ 6 18 16 16 8 6 16 4 4]\n"
362 | ]
363 | }
364 | ],
365 | "source": [
366 | "x_list = list(x_np)\n",
367 | "\n",
368 | "print('x_list + x_list:', x_list + x_list)\n",
369 | "print('x_np + x_np:', x_np + x_np)"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "| Operator | Equivalent func | Description |\n",
377 | "|---------------|---------------------|---------------------------------------|\n",
378 | "|``+`` |``np.add`` |Addition (e.g., ``1 + 1 = 2``) |\n",
379 | "|``-`` |``np.subtract`` |Subtraction (e.g., ``3 - 2 = 1``) |\n",
380 | "|``-`` |``np.negative`` |Unary negation (e.g., ``-2``) |\n",
381 | "|``*`` |``np.multiply`` |Multiplication (e.g., ``2 * 3 = 6``) |\n",
382 | "|``/`` |``np.divide`` |Division (e.g., ``3 / 2 = 1.5``) |\n",
383 | "|``//`` |``np.floor_divide`` |Floor division (e.g., ``3 // 2 = 1``) |\n",
384 | "|``**`` |``np.power`` |Exponentiation (e.g., ``2 ** 3 = 8``) |\n",
385 | "|``%`` |``np.mod`` |Modulus/remainder (e.g., ``9 % 4 = 1``)|"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 14,
391 | "metadata": {},
392 | "outputs": [
393 | {
394 | "name": "stdout",
395 | "output_type": "stream",
396 | "text": [
397 | "1.41 s ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
398 | "4.11 ms ± 39 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "%timeit sum(big_array)\n",
404 | "%timeit np.sum(big_array) # or big_array.sum()"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "metadata": {},
410 | "source": [
411 | "Important: whenever possible, make sure that you are using the NumPy version of these operations when operating on NumPy arrays."
412 | ]
413 | },
414 | {
415 | "cell_type": "markdown",
416 | "metadata": {},
417 | "source": [
418 | "## Pandas\n",
419 | "\n",
420 | "[Pandas](https://pandas.pydata.org/) is a library built on top of NumPy, which provides an efficient implementation of a ``DataFrame``.\n",
421 | "\n",
422 | "``DataFrame``s can be seen as multidimensional arrays with attached row and column labels, that can present heterogeneous types and/or missing data."
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | "### The Pandas Series Object\n",
430 | "A Pandas ``Series`` is a one-dimensional array of indexed data."
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 15,
436 | "metadata": {},
437 | "outputs": [
438 | {
439 | "data": {
440 | "text/plain": [
441 | "0 RNA\n",
442 | "1 gene\n",
443 | "2 protein\n",
444 | "dtype: object"
445 | ]
446 | },
447 | "execution_count": 15,
448 | "metadata": {},
449 | "output_type": "execute_result"
450 | }
451 | ],
452 | "source": [
453 | "import pandas as pd\n",
454 | "\n",
455 | "data = pd.Series(['RNA', 'gene', 'protein'])\n",
456 | "data"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 16,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "data": {
466 | "text/plain": [
467 | "array(['RNA', 'gene', 'protein'], dtype=object)"
468 | ]
469 | },
470 | "execution_count": 16,
471 | "metadata": {},
472 | "output_type": "execute_result"
473 | }
474 | ],
475 | "source": [
476 | "data.values"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 17,
482 | "metadata": {},
483 | "outputs": [
484 | {
485 | "data": {
486 | "text/plain": [
487 | "RangeIndex(start=0, stop=3, step=1)"
488 | ]
489 | },
490 | "execution_count": 17,
491 | "metadata": {},
492 | "output_type": "execute_result"
493 | }
494 | ],
495 | "source": [
496 | "data.index"
497 | ]
498 | },
499 | {
500 | "cell_type": "markdown",
501 | "metadata": {},
502 | "source": [
503 | "The index need not be an integer, but can consist of values of any type:"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 18,
509 | "metadata": {},
510 | "outputs": [
511 | {
512 | "data": {
513 | "text/plain": [
514 | "ENST RNA\n",
515 | "ENSG gene\n",
516 | "ENSP protein\n",
517 | "dtype: object"
518 | ]
519 | },
520 | "execution_count": 18,
521 | "metadata": {},
522 | "output_type": "execute_result"
523 | }
524 | ],
525 | "source": [
526 | "data = pd.Series(\n",
527 | " ['RNA', 'gene', 'protein'],\n",
528 | " index=['ENST', 'ENSG', 'ENSP']\n",
529 | ")\n",
530 | "data"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 19,
536 | "metadata": {},
537 | "outputs": [
538 | {
539 | "data": {
540 | "text/plain": [
541 | "'gene'"
542 | ]
543 | },
544 | "execution_count": 19,
545 | "metadata": {},
546 | "output_type": "execute_result"
547 | }
548 | ],
549 | "source": [
550 | "data['ENSG']"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "We can construct a ``Series`` from a dictionary, and the way we access its values is similar to how we access those of a dictionary:"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 20,
563 | "metadata": {},
564 | "outputs": [
565 | {
566 | "data": {
567 | "text/plain": [
568 | "ENST RNA\n",
569 | "ENSG gene\n",
570 | "ENSP protein\n",
571 | "dtype: object"
572 | ]
573 | },
574 | "execution_count": 20,
575 | "metadata": {},
576 | "output_type": "execute_result"
577 | }
578 | ],
579 | "source": [
580 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n",
581 | "data = pd.Series(map_dict)\n",
582 | "data"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 21,
588 | "metadata": {},
589 | "outputs": [
590 | {
591 | "data": {
592 | "text/plain": [
593 | "ENSG gene\n",
594 | "ENSP protein\n",
595 | "dtype: object"
596 | ]
597 | },
598 | "execution_count": 21,
599 | "metadata": {},
600 | "output_type": "execute_result"
601 | }
602 | ],
603 | "source": [
604 | "data['ENSG':]"
605 | ]
606 | },
607 | {
608 | "cell_type": "markdown",
609 | "metadata": {},
610 | "source": [
611 | "### The Pandas DataFrame Object\n",
612 | "\n",
613 | "It can be constructed from 2 or more dictionaries with the same keys (or from 2 or more `Series` with the same indexes)."
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 22,
619 | "metadata": {},
620 | "outputs": [
621 | {
622 | "data": {
623 | "text/html": [
624 | "\n",
625 | "\n",
638 | "
\n",
639 | " \n",
640 | " \n",
641 | " | \n",
642 | " mapping type | \n",
643 | " counts | \n",
644 | "
\n",
645 | " \n",
646 | " \n",
647 | " \n",
648 | " ENST | \n",
649 | " RNA | \n",
650 | " 3300 | \n",
651 | "
\n",
652 | " \n",
653 | " ENSG | \n",
654 | " gene | \n",
655 | " 18435 | \n",
656 | "
\n",
657 | " \n",
658 | " ENSP | \n",
659 | " protein | \n",
660 | " 12034 | \n",
661 | "
\n",
662 | " \n",
663 | "
\n",
664 | "
"
665 | ],
666 | "text/plain": [
667 | " mapping type counts\n",
668 | "ENST RNA 3300\n",
669 | "ENSG gene 18435\n",
670 | "ENSP protein 12034"
671 | ]
672 | },
673 | "execution_count": 22,
674 | "metadata": {},
675 | "output_type": "execute_result"
676 | }
677 | ],
678 | "source": [
679 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n",
680 | "count_dict = {'ENST': 3300, 'ENSG': 18435, 'ENSP': 12034}\n",
681 | " \n",
682 | "df = pd.DataFrame({'mapping type': map_dict, 'counts': count_dict})\n",
683 | "df"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": 23,
689 | "metadata": {},
690 | "outputs": [
691 | {
692 | "data": {
693 | "text/plain": [
694 | "Index(['ENST', 'ENSG', 'ENSP'], dtype='object')"
695 | ]
696 | },
697 | "execution_count": 23,
698 | "metadata": {},
699 | "output_type": "execute_result"
700 | }
701 | ],
702 | "source": [
703 | "df.index"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 24,
709 | "metadata": {},
710 | "outputs": [
711 | {
712 | "data": {
713 | "text/plain": [
714 | "Index(['mapping type', 'counts'], dtype='object')"
715 | ]
716 | },
717 | "execution_count": 24,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "df.columns"
724 | ]
725 | },
726 | {
727 | "cell_type": "markdown",
728 | "metadata": {},
729 | "source": [
730 | "We can access a column like a dictionary or in a Pandas way:"
731 | ]
732 | },
733 | {
734 | "cell_type": "code",
735 | "execution_count": 25,
736 | "metadata": {},
737 | "outputs": [
738 | {
739 | "data": {
740 | "text/plain": [
741 | "ENST 3300\n",
742 | "ENSG 18435\n",
743 | "ENSP 12034\n",
744 | "Name: counts, dtype: int64"
745 | ]
746 | },
747 | "execution_count": 25,
748 | "metadata": {},
749 | "output_type": "execute_result"
750 | }
751 | ],
752 | "source": [
753 | "df['counts'] # like a dictionary"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": 26,
759 | "metadata": {},
760 | "outputs": [
761 | {
762 | "data": {
763 | "text/plain": [
764 | "ENST 3300\n",
765 | "ENSG 18435\n",
766 | "ENSP 12034\n",
767 | "Name: counts, dtype: int64"
768 | ]
769 | },
770 | "execution_count": 26,
771 | "metadata": {},
772 | "output_type": "execute_result"
773 | }
774 | ],
775 | "source": [
776 | "df.counts # The Pandas way"
777 | ]
778 | },
779 | {
780 | "cell_type": "code",
781 | "execution_count": 27,
782 | "metadata": {},
783 | "outputs": [
784 | {
785 | "data": {
786 | "text/plain": [
787 | "ENST RNA\n",
788 | "ENSG gene\n",
789 | "ENSP protein\n",
790 | "Name: mapping type, dtype: object"
791 | ]
792 | },
793 | "execution_count": 27,
794 | "metadata": {},
795 | "output_type": "execute_result"
796 | }
797 | ],
798 | "source": [
799 | "df['mapping type']\n",
800 | "#df.mapping type # I can't do it"
801 | ]
802 | }
803 | ],
804 | "metadata": {
805 | "kernelspec": {
806 | "display_name": "Python 3",
807 | "language": "python",
808 | "name": "python3"
809 | },
810 | "language_info": {
811 | "codemirror_mode": {
812 | "name": "ipython",
813 | "version": 3
814 | },
815 | "file_extension": ".py",
816 | "mimetype": "text/x-python",
817 | "name": "python",
818 | "nbconvert_exporter": "python",
819 | "pygments_lexer": "ipython3",
820 | "version": "3.8.3"
821 | }
822 | },
823 | "nbformat": 4,
824 | "nbformat_minor": 4
825 | }
826 |
--------------------------------------------------------------------------------
/2020_2021/Lesson7/Exercises7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# TO DO"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Exercise\n",
26 | "\n",
27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# TO DO"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Exercise\n",
44 | "\n",
45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put in a Python list all the DP values (182, 196, 275, ...)."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# TO DO"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.8.3"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 4
79 | }
80 |
--------------------------------------------------------------------------------
/2020_2021/data/P04439.fasta:
--------------------------------------------------------------------------------
1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2
2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF
3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ
4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL
5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT
6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL
7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL
8 | TACKV
9 |
--------------------------------------------------------------------------------
/2020_2021/data/brca_transcripts.txt:
--------------------------------------------------------------------------------
1 | transcript_id biotype bp aa
2 | ENST00000352993.7 Protein coding 3668 721
3 | ENST00000354071.7 Protein coding 4497 1399
4 | ENST00000461221.5 Nonsense mediated decay 5693 63
5 | ENST00000461574.1 Protein coding 726 242
6 | ENST00000461798.5 Nonsense mediated decay 582 63
7 |
--------------------------------------------------------------------------------
/2020_2021/data/genetic_code.tsv:
--------------------------------------------------------------------------------
1 | UUU F Phe Phenylalanine
2 | UUC F Phe Phenylalanine
3 | UUA L Leu Leucine
4 | UUG L Leu Leucine
5 | CUU L Leu Leucine
6 | CUC L Leu Leucine
7 | CUA L Leu Leucine
8 | CUG L Leu Leucine
9 | AUU I Ile Isoleucine
10 | AUC I Ile Isoleucine
11 | AUA I Ile Isoleucine
12 | AUG M Met Methionine (Start)
13 | GUU V Val Valine
14 | GUC V Val Valine
15 | GUA V Val Valine
16 | GUG V Val Valine
17 | UCU S Ser Serine
18 | UCC S Ser Serine
19 | UCA S Ser Serine
20 | UCG S Ser Serine
21 | CCU P Pro Proline
22 | CCC P Pro Proline
23 | CCA P Pro Proline
24 | CCG P Pro Proline
25 | ACU T Thr Threonine
26 | ACC T Thr Threonine
27 | ACA T Thr Threonine
28 | ACG T Thr Threonine
29 | GCU A Ala Alanine
30 | GCC A Ala Alanine
31 | GCA A Ala Alanine
32 | GCG A Ala Alanine
33 | UAU Y Tyr Tyrosine
34 | UAC Y Tyr Tyrosine
35 | UAA X Stop (Stop)
36 | UAG X Stop (Stop)
37 | CAU H His Histidine
38 | CAC H His Histidine
39 | CAA Q Gln Glutamine
40 | CAG Q Gln Glutamine
41 | AAU N Asn Asparagine
42 | AAC N Asn Asparagine
43 | AAA K Lys Lysine
44 | AAG K Lys Lysine
45 | GAU D Asp Aspartic acid
46 | GAC D Asp Aspartic acid
47 | GAA E Glu Glutamic acid
48 | GAG E Glu Glutamic acid
49 | UGU C Cys Cysteine
50 | UGC C Cys Cysteine
51 | UGA X Stop (Stop)
52 | UGG W Trp Tryptophan
53 | CGU R Arg Arginine
54 | CGC R Arg Arginine
55 | CGA R Arg Arginine
56 | CGG R Arg Arginine
57 | AGU S Ser Serine
58 | AGC S Ser Serine
59 | AGA R Arg Arginine
60 | AGG R Arg Arginine
61 | GGU G Gly Glycine
62 | GGC G Gly Glycine
63 | GGA G Gly Glycine
64 | GGG G Gly Glycine
--------------------------------------------------------------------------------
/2020_2021/data/uniprot_ids.txt:
--------------------------------------------------------------------------------
1 | Q13188
2 | O00444
3 | P49760
4 | PYYY4Z
5 | Q13627
6 | Q02156
7 |
--------------------------------------------------------------------------------
/2020_2021/data/utils.py:
--------------------------------------------------------------------------------
1 | import random
2 |
def generate_string(n, alphabet):
    """Build a random string by sampling ``n`` items from ``alphabet``.

    Each item is picked independently with ``random.choice``, so
    ``alphabet`` may be any non-empty sequence of strings (for example the
    string ``"ACGT"`` or a list of one-letter codes).

    Args:
        n: Number of items to draw; ``0`` yields the empty string.
        alphabet: Sequence of strings to sample from.

    Returns:
        The concatenation of the ``n`` sampled items.

    Raises:
        IndexError: If ``alphabet`` is empty and ``n`` is greater than 0
            (raised by ``random.choice``).
    """
    # str.join assembles the result in one pass instead of growing it with +=.
    return "".join(random.choice(alphabet) for _ in range(n))
9 |
--------------------------------------------------------------------------------
/2020_2021/data/validation.py:
--------------------------------------------------------------------------------
def valid_sequence(sequence, valid_characters):
    """Tell whether ``sequence`` uses only entries of ``valid_characters``.

    Every character is upper-cased before the membership test, so lower-case
    input matches upper-case entries of ``valid_characters``. An empty
    ``sequence`` is reported as valid.
    """
    # all() stops at the first character that is not allowed.
    return all(symbol.upper() in valid_characters for symbol in sequence)
7 |
def validate_dna(sequence):
    """Return True when ``sequence`` contains only the DNA bases A, T, G and C."""
    dna_bases = ['A', 'T', 'G', 'C']
    return valid_sequence(sequence, dna_bases)
10 |
def validate_rna(sequence):
    """Return True when ``sequence`` contains only the RNA bases A, U, G and C."""
    rna_bases = ['A', 'U', 'G', 'C']
    return valid_sequence(sequence, rna_bases)
13 |
def validate_protein(sequence):
    """Return True when ``sequence`` contains only the 20 one-letter amino acid codes listed below."""
    amino_acids = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    return valid_sequence(sequence, amino_acids)
22 |
--------------------------------------------------------------------------------
/2020_2021/images/Integer.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2020_2021/images/Integer.jpeg
--------------------------------------------------------------------------------
/2020_2021/images/List.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2020_2021/images/List.jpeg
--------------------------------------------------------------------------------
/2021_2022/Lesson1/Exercises1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "The following list is corrupted:"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "mutations = [\n",
18 | " 'p.Ser31Ala',\n",
19 | " 'p.Pro38Leu',\n",
20 | " 'p.Asn100Lys',\n",
21 | " 'p.LEU110VAL',\n",
22 | " 13,\n",
23 | " 4.0,\n",
24 | " True,\n",
25 | " 'p.Tyr341Leu',\n",
26 | " 'AUG',\n",
27 | " 'p.Tyr0Le',\n",
28 | " 'p.Asn1.3Lys',\n",
29 | " 'p.Arg0Leu'\n",
30 | "]"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "How can we check which mutations are valid? Put the valid mutations in a new list. A reasonable output could be:\n",
38 | "\n",
39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "#### Tips"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "valid_aminos = [\n",
56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n",
57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n",
58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n",
59 | "]\n",
60 | "\n",
61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n",
62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n",
63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "valid_mutations = []\n",
73 | "\n",
74 | "# TODO"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Exercise\n",
82 | "\n",
83 | "Write a script to check if a protein sequence is valid."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# TODO"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "### Exercise\n",
100 | "\n",
101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 5,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# TODO"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.8.3"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
--------------------------------------------------------------------------------
/2021_2022/Lesson2/Exercises2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. In particular:\n",
10 | "\n",
11 | "- create a `get_valid_mutation` function which takes as input a list of mutations and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n",
12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n",
13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "mutations = [\n",
23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n",
24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n",
25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n",
26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n",
27 | "]\n",
28 | "\n",
29 | "aa_3L_to_1L = {\n",
30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
34 | "}"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# TO DO"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "### Exercise\n",
51 | "\n",
52 | "Write a function which generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "# TO DO"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | "### Exercise\n",
69 | "\n",
70 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method."
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "# TO DO"
80 | ]
81 | }
82 | ],
83 | "metadata": {
84 | "kernelspec": {
85 | "display_name": "Python 3",
86 | "language": "python",
87 | "name": "python3"
88 | },
89 | "language_info": {
90 | "codemirror_mode": {
91 | "name": "ipython",
92 | "version": 3
93 | },
94 | "file_extension": ".py",
95 | "mimetype": "text/x-python",
96 | "name": "python",
97 | "nbconvert_exporter": "python",
98 | "pygments_lexer": "ipython3",
99 | "version": "3.8.10"
100 | }
101 | },
102 | "nbformat": 4,
103 | "nbformat_minor": 4
104 | }
105 |
--------------------------------------------------------------------------------
/2021_2022/Lesson3/Exercises3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n",
10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# TO DO"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Exercise\n",
27 | "\n",
28 | "Write a function to remove duplicates in a list."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# TO DO"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Exercise\n",
45 | "\n",
46 | "Write a function to calculate the identity between 2 sequences."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# TO DO"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.8.10"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 4
80 | }
81 |
--------------------------------------------------------------------------------
/2021_2022/Lesson4/Exercises4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Compute all pair-wise identities (number of identical character pairs)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "sequences = [\n",
19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n",
20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n",
21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n",
22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n",
23 | "]\n",
24 | "\n",
25 | "# TO DO"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Exercise\n",
33 | "\n",
34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n",
35 | "- read the genetic code from the [genetic_code.tsv](../data/genetic_code.tsv) file;\n",
36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# TO DO"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Exercise\n",
53 | "\n",
54 | "Print the index of the first occurrence of the ATG codon.\n",
55 | "\n",
56 | "Try with and without using the `find()` method on strings."
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 3,
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "# TO DO"
66 | ]
67 | }
68 | ],
69 | "metadata": {
70 | "kernelspec": {
71 | "display_name": "Python 3",
72 | "language": "python",
73 | "name": "python3"
74 | },
75 | "language_info": {
76 | "codemirror_mode": {
77 | "name": "ipython",
78 | "version": 3
79 | },
80 | "file_extension": ".py",
81 | "mimetype": "text/x-python",
82 | "name": "python",
83 | "nbconvert_exporter": "python",
84 | "pygments_lexer": "ipython3",
85 | "version": "3.8.10"
86 | }
87 | },
88 | "nbformat": 4,
89 | "nbformat_minor": 4
90 | }
91 |
--------------------------------------------------------------------------------
/2021_2022/Lesson5/Exercises5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Write a function to search motifs in a sequence.\n",
10 | "\n",
11 | "Try with and without using the `re` module."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCATCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTG
GCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n",
21 | "\n",
22 | "consensus_motifs = {\n",
23 | " 'motif1': 'AGGAG[GT]',\n",
24 | " 'motif2': 'T[AT]AAT',\n",
25 | " 'motif3': 'GG.A.T[AG]'\n",
26 | "}"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Possible printed output:\n",
34 | "```\n",
35 | "AGGAG[GT]\n",
36 | "\t(969, 975) AGGAGG\n",
37 | "\t(1153, 1159) AGGAGG\n",
38 | "\t(1339, 1345) AGGAGT\n",
39 | "\t(1587, 1593) AGGAGG\n",
40 | "\t(1881, 1887) AGGAGG\n",
41 | "\t(1941, 1947) AGGAGG\n",
42 | "T[AT]AAT\n",
43 | "\t(50, 55) TAAAT\n",
44 | "\t(1098, 1103) TAAAT\n",
45 | "\t(1276, 1281) TAAAT\n",
46 | "GG.A.T[AG]\n",
47 | "\t(248, 255) GGTACTG\n",
48 | "\t(983, 990) GGAAATA\n",
49 | "\t(1910, 1917) GGGACTG\n",
50 | "\t(1980, 1987) GGCAGTG\n",
51 | "```\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# TO DO"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Exercise\n",
68 | "\n",
69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 2,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "aa_3L_to_1L = {\n",
79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
83 | "}\n",
84 | "\n",
85 | "#aa_1L_to_3L['A'] --> 'ALA'"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "# TO DO"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Exercise\n",
102 | "\n",
103 | "Write a function to remove invalid amino acids from a protein."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# TO DO"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.8.10"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 4
137 | }
138 |
--------------------------------------------------------------------------------
/2021_2022/Lesson6/Exercises6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. Do that in 2 ways:\n",
10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n",
11 | "- check the documentation out to see how to load such a file format into a Pandas `DataFrame` and do that."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 3,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/html": [
22 | "\n",
23 | "\n",
36 | "
\n",
37 | " \n",
38 | " \n",
39 | " | \n",
40 | " UUU | \n",
41 | " F | \n",
42 | " Phe | \n",
43 | " Phenylalanine | \n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " 0 | \n",
49 | " UUC | \n",
50 | " F | \n",
51 | " Phe | \n",
52 | " Phenylalanine | \n",
53 | "
\n",
54 | " \n",
55 | " 1 | \n",
56 | " UUA | \n",
57 | " L | \n",
58 | " Leu | \n",
59 | " Leucine | \n",
60 | "
\n",
61 | " \n",
62 | " 2 | \n",
63 | " UUG | \n",
64 | " L | \n",
65 | " Leu | \n",
66 | " Leucine | \n",
67 | "
\n",
68 | " \n",
69 | " 3 | \n",
70 | " CUU | \n",
71 | " L | \n",
72 | " Leu | \n",
73 | " Leucine | \n",
74 | "
\n",
75 | " \n",
76 | " 4 | \n",
77 | " CUC | \n",
78 | " L | \n",
79 | " Leu | \n",
80 | " Leucine | \n",
81 | "
\n",
82 | " \n",
83 | " ... | \n",
84 | " ... | \n",
85 | " ... | \n",
86 | " ... | \n",
87 | " ... | \n",
88 | "
\n",
89 | " \n",
90 | " 58 | \n",
91 | " AGG | \n",
92 | " R | \n",
93 | " Arg | \n",
94 | " Arginine | \n",
95 | "
\n",
96 | " \n",
97 | " 59 | \n",
98 | " GGU | \n",
99 | " G | \n",
100 | " Gly | \n",
101 | " Glycine | \n",
102 | "
\n",
103 | " \n",
104 | " 60 | \n",
105 | " GGC | \n",
106 | " G | \n",
107 | " Gly | \n",
108 | " Glycine | \n",
109 | "
\n",
110 | " \n",
111 | " 61 | \n",
112 | " GGA | \n",
113 | " G | \n",
114 | " Gly | \n",
115 | " Glycine | \n",
116 | "
\n",
117 | " \n",
118 | " 62 | \n",
119 | " GGG | \n",
120 | " G | \n",
121 | " Gly | \n",
122 | " Glycine | \n",
123 | "
\n",
124 | " \n",
125 | "
\n",
126 | "
63 rows × 4 columns
\n",
127 | "
"
128 | ],
129 | "text/plain": [
130 | " UUU F Phe Phenylalanine\n",
131 | "0 UUC F Phe Phenylalanine\n",
132 | "1 UUA L Leu Leucine\n",
133 | "2 UUG L Leu Leucine\n",
134 | "3 CUU L Leu Leucine\n",
135 | "4 CUC L Leu Leucine\n",
136 | ".. ... .. ... ...\n",
137 | "58 AGG R Arg Arginine\n",
138 | "59 GGU G Gly Glycine\n",
139 | "60 GGC G Gly Glycine\n",
140 | "61 GGA G Gly Glycine\n",
141 | "62 GGG G Gly Glycine\n",
142 | "\n",
143 | "[63 rows x 4 columns]"
144 | ]
145 | },
146 | "execution_count": 3,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "import pandas as pd\n",
153 | "\n",
154 | "# TO DO"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### Exercise\n",
162 | "\n",
163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with Python lists and Numpy Arrays, and quantify the execution times."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 4,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "# TO DO"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### Exercise\n",
180 | "\n",
181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 6,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "# TO DO"
191 | ]
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "Python 3",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.8.10"
211 | }
212 | },
213 | "nbformat": 4,
214 | "nbformat_minor": 4
215 | }
216 |
--------------------------------------------------------------------------------
/2021_2022/Lesson6/Lesson6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Lesson 6 - 2021/11/18"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Numpy\n",
15 | "[NumPy](https://numpy.org/) (short for *Numerical Python*) is a numerical library for Python which provides an efficient interface to store and operate on data."
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "### A Python Integer Is More Than Just an Integer\n",
23 | "A Python integer is a pointer to a position in memory containing all the Python object information, including the bytes that contain the integer value.\n",
24 | "\n",
25 | "![Integer](../images/Integer.jpeg)"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### A Python List Is More Than Just a List\n",
33 | "\n",
34 | "Because of Python's dynamic typing, we can create heterogeneous lists:"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 1,
40 | "metadata": {},
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/plain": [
45 | "[bool, str, float, int]"
46 | ]
47 | },
48 | "execution_count": 1,
49 | "metadata": {},
50 | "output_type": "execute_result"
51 | }
52 | ],
53 | "source": [
54 | "my_list = [True, \"2\", 3.0, 4]\n",
55 | "\n",
56 | "[type(item) for item in my_list]"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "But this flexibility comes at a cost.\n",
64 | "\n",
65 | "![List](../images/List.jpeg)"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "metadata": {},
71 | "source": [
72 | "In the special case that all variables are of the same type, much of this information is redundant: it can be much more efficient to store data in a fixed-type array."
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "metadata": {},
78 | "source": [
79 | "### Fixed-Type Arrays in Python\n",
80 | "The built-in ``array`` module can be used to create arrays of a uniform type:"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 2,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/plain": [
91 | "array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])"
92 | ]
93 | },
94 | "execution_count": 2,
95 | "metadata": {},
96 | "output_type": "execute_result"
97 | }
98 | ],
99 | "source": [
100 | "import array\n",
101 | "\n",
102 | "L = list(range(10))\n",
103 | "A = array.array('i', L) # i indicates integer values\n",
104 | "\n",
105 | "A"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "Much more useful, however, is the ``numpy.ndarray`` object of the NumPy package, which adds efficient *operations* on that data to this efficient storage."
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 3,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "(numpy.ndarray, array([9, 8, 3, 5, 1, 1, 6, 0, 5]))"
124 | ]
125 | },
126 | "execution_count": 3,
127 | "metadata": {},
128 | "output_type": "execute_result"
129 | }
130 | ],
131 | "source": [
132 | "import numpy as np\n",
133 | "\n",
134 | "x_np = np.random.randint(10, size=9) # One-dimensional array\n",
135 | "\n",
136 | "type(x_np), x_np"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 4,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "x1[3]: 5\n",
149 | "x1[2:5]: [3 5 1]\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "print('x1[3]:', x_np[3]) # Array Indexing\n",
155 | "print('x1[2:5]:', x_np[2:5]) # Array Slicing"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 5,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "9\n",
168 | "8\n",
169 | "3\n",
170 | "5\n",
171 | "1\n",
172 | "1\n",
173 | "6\n",
174 | "0\n",
175 | "5\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "# Iteration\n",
181 | "for element in x_np:\n",
182 | " print(element)"
183 | ]
184 | },
185 | {
186 | "cell_type": "markdown",
187 | "metadata": {},
188 | "source": [
189 | "``numpy.ndarray`` stands for N-dimensional array which means that this object is built to be multi-dimensional, with attributes and methods specifically designed for this feature."
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 6,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "[9 8 3 5 1 1 6 0 5]\n",
202 | "[[9 8 3]\n",
203 | " [5 1 1]\n",
204 | " [6 0 5]]\n"
205 | ]
206 | }
207 | ],
208 | "source": [
209 | "grid = x_np.reshape((3, 3)) # Two-dimensional array\n",
210 | "\n",
211 | "print(x_np)\n",
212 | "print(grid)"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 7,
218 | "metadata": {},
219 | "outputs": [
220 | {
221 | "name": "stdout",
222 | "output_type": "stream",
223 | "text": [
224 | "grid.ndim: 2\n",
225 | "grid.shape: (3, 3)\n",
226 | "grid.size: 9\n",
227 | "grid.dtype: int64\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "print(\"grid.ndim: \", grid.ndim)\n",
233 | "print(\"grid.shape:\", grid.shape)\n",
234 | "print(\"grid.size: \", grid.size)\n",
235 | "print(\"grid.dtype:\", grid.dtype)"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "### Boolean indexing\n",
243 | "Numpy arrays can be sliced with vectors of booleans (``list``s or other ``ndarray``s) with the same dimensions."
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 9,
249 | "metadata": {},
250 | "outputs": [
251 | {
252 | "name": "stdout",
253 | "output_type": "stream",
254 | "text": [
255 | "x_np: [9 8 3 5 1 1 6 0 5]\n"
256 | ]
257 | }
258 | ],
259 | "source": [
260 | "print('x_np:', x_np)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 10,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "boolean_np: [ True True False True False False True False True]\n"
273 | ]
274 | }
275 | ],
276 | "source": [
277 | "boolean_np = x_np > 3\n",
278 | "\n",
279 | "print('boolean_np:', boolean_np) # It states whether the element in the same position is > 3."
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 11,
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/plain": [
290 | "[9, 8, 5, 6, 5]"
291 | ]
292 | },
293 | "execution_count": 11,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "[x for x in x_np if x > 3]"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": 12,
305 | "metadata": {},
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/plain": [
310 | "array([9, 8, 5, 6, 5])"
311 | ]
312 | },
313 | "execution_count": 12,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "x_np[boolean_np]"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 13,
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "2.41 s ± 938 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
332 | "5.54 ms ± 18 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
333 | ]
334 | }
335 | ],
336 | "source": [
337 | "big_array = np.random.rand(10000000)\n",
338 | "\n",
339 | "%timeit [x for x in big_array if x > 3]\n",
340 | "%timeit big_array[big_array > 3]"
341 | ]
342 | },
343 | {
344 | "cell_type": "markdown",
345 | "metadata": {},
346 | "source": [
347 | "### Vectorized Operations\n",
348 | "Operations between arrays are carried out with a different logic than that of standard lists."
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 14,
354 | "metadata": {},
355 | "outputs": [
356 | {
357 | "name": "stdout",
358 | "output_type": "stream",
359 | "text": [
360 | "x_list + x_list: [9, 8, 3, 5, 1, 1, 6, 0, 5, 9, 8, 3, 5, 1, 1, 6, 0, 5]\n",
361 | "x_np + x_np: [18 16 6 10 2 2 12 0 10]\n"
362 | ]
363 | }
364 | ],
365 | "source": [
366 | "x_list = list(x_np)\n",
367 | "\n",
368 | "print('x_list + x_list:', x_list + x_list)\n",
369 | "print('x_np + x_np:', x_np + x_np)"
370 | ]
371 | },
372 | {
373 | "cell_type": "markdown",
374 | "metadata": {},
375 | "source": [
376 | "| Operator | Equivalent func | Description |\n",
377 | "|---------------|---------------------|---------------------------------------|\n",
378 | "|``+`` |``np.add`` |Addition (e.g., ``1 + 1 = 2``) |\n",
379 | "|``-`` |``np.subtract`` |Subtraction (e.g., ``3 - 2 = 1``) |\n",
380 | "|``-`` |``np.negative`` |Unary negation (e.g., ``-2``) |\n",
381 | "|``*`` |``np.multiply`` |Multiplication (e.g., ``2 * 3 = 6``) |\n",
382 | "|``/`` |``np.divide`` |Division (e.g., ``3 / 2 = 1.5``) |\n",
383 | "|``//`` |``np.floor_divide`` |Floor division (e.g., ``3 // 2 = 1``) |\n",
384 | "|``**`` |``np.power`` |Exponentiation (e.g., ``2 ** 3 = 8``) |\n",
385 | "|``%`` |``np.mod`` |Modulus/remainder (e.g., ``9 % 4 = 1``)|"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 15,
391 | "metadata": {},
392 | "outputs": [
393 | {
394 | "name": "stdout",
395 | "output_type": "stream",
396 | "text": [
397 | "1.37 s ± 39.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
398 | "4.26 ms ± 7.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
399 | ]
400 | }
401 | ],
402 | "source": [
403 | "%timeit sum(big_array)\n",
404 | "%timeit np.sum(big_array) # or big_array.sum()"
405 | ]
406 | },
407 | {
408 | "cell_type": "markdown",
409 | "metadata": {},
410 | "source": [
411 | "Important: whenever possible, make sure that you are using the NumPy version of these operations when operating on NumPy arrays."
412 | ]
413 | },
414 | {
415 | "cell_type": "markdown",
416 | "metadata": {},
417 | "source": [
418 | "## Pandas\n",
419 | "\n",
420 | "[Pandas](https://pandas.pydata.org/) is a library built on top of NumPy, which provides an efficient implementation of a ``DataFrame``.\n",
421 | "\n",
422 | "``DataFrame``s can be seen as multidimensional arrays with attached row and column labels, that can present heterogeneous types and/or missing data."
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | "### The Pandas Series Object\n",
430 | "A Pandas ``Series`` is a one-dimensional array of indexed data."
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 16,
436 | "metadata": {},
437 | "outputs": [
438 | {
439 | "data": {
440 | "text/plain": [
441 | "0 RNA\n",
442 | "1 gene\n",
443 | "2 protein\n",
444 | "dtype: object"
445 | ]
446 | },
447 | "execution_count": 16,
448 | "metadata": {},
449 | "output_type": "execute_result"
450 | }
451 | ],
452 | "source": [
453 | "import pandas as pd\n",
454 | "\n",
455 | "data = pd.Series(['RNA', 'gene', 'protein'])\n",
456 | "data"
457 | ]
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": 17,
462 | "metadata": {},
463 | "outputs": [
464 | {
465 | "data": {
466 | "text/plain": [
467 | "array(['RNA', 'gene', 'protein'], dtype=object)"
468 | ]
469 | },
470 | "execution_count": 17,
471 | "metadata": {},
472 | "output_type": "execute_result"
473 | }
474 | ],
475 | "source": [
476 | "data.values"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 18,
482 | "metadata": {},
483 | "outputs": [
484 | {
485 | "data": {
486 | "text/plain": [
487 | "RangeIndex(start=0, stop=3, step=1)"
488 | ]
489 | },
490 | "execution_count": 18,
491 | "metadata": {},
492 | "output_type": "execute_result"
493 | }
494 | ],
495 | "source": [
496 | "data.index"
497 | ]
498 | },
499 | {
500 | "cell_type": "markdown",
501 | "metadata": {},
502 | "source": [
503 | "The index need not be an integer, but can consist of values of any type:"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 19,
509 | "metadata": {},
510 | "outputs": [
511 | {
512 | "data": {
513 | "text/plain": [
514 | "ENST RNA\n",
515 | "ENSG gene\n",
516 | "ENSP protein\n",
517 | "dtype: object"
518 | ]
519 | },
520 | "execution_count": 19,
521 | "metadata": {},
522 | "output_type": "execute_result"
523 | }
524 | ],
525 | "source": [
526 | "data = pd.Series(\n",
527 | " ['RNA', 'gene', 'protein'],\n",
528 | " index=['ENST', 'ENSG', 'ENSP']\n",
529 | ")\n",
530 | "data"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 20,
536 | "metadata": {},
537 | "outputs": [
538 | {
539 | "data": {
540 | "text/plain": [
541 | "'gene'"
542 | ]
543 | },
544 | "execution_count": 20,
545 | "metadata": {},
546 | "output_type": "execute_result"
547 | }
548 | ],
549 | "source": [
550 | "data['ENSG']"
551 | ]
552 | },
553 | {
554 | "cell_type": "markdown",
555 | "metadata": {},
556 | "source": [
557 | "We can construct a ``Series`` from a dictionary, and the way we access the values is similar to dictionaries:"
558 | ]
559 | },
560 | {
561 | "cell_type": "code",
562 | "execution_count": 21,
563 | "metadata": {},
564 | "outputs": [
565 | {
566 | "data": {
567 | "text/plain": [
568 | "ENST RNA\n",
569 | "ENSG gene\n",
570 | "ENSP protein\n",
571 | "dtype: object"
572 | ]
573 | },
574 | "execution_count": 21,
575 | "metadata": {},
576 | "output_type": "execute_result"
577 | }
578 | ],
579 | "source": [
580 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n",
581 | "data = pd.Series(map_dict)\n",
582 | "data"
583 | ]
584 | },
585 | {
586 | "cell_type": "code",
587 | "execution_count": 22,
588 | "metadata": {},
589 | "outputs": [
590 | {
591 | "data": {
592 | "text/plain": [
593 | "ENSG gene\n",
594 | "ENSP protein\n",
595 | "dtype: object"
596 | ]
597 | },
598 | "execution_count": 22,
599 | "metadata": {},
600 | "output_type": "execute_result"
601 | }
602 | ],
603 | "source": [
604 | "data['ENSG':]"
605 | ]
606 | },
607 | {
608 | "cell_type": "markdown",
609 | "metadata": {},
610 | "source": [
611 | "### The Pandas DataFrame Object\n",
612 | "\n",
613 | "It can be constructed from 2 or more dictionaries with the same keys (or from 2 or more `Series` with the same indexes)."
614 | ]
615 | },
616 | {
617 | "cell_type": "code",
618 | "execution_count": 24,
619 | "metadata": {},
620 | "outputs": [
621 | {
622 | "data": {
623 | "text/html": [
624 | "\n",
625 | "\n",
638 | "
\n",
639 | " \n",
640 | " \n",
641 | " | \n",
642 | " mapping type | \n",
643 | " counts | \n",
644 | "
\n",
645 | " \n",
646 | " \n",
647 | " \n",
648 | " ENST | \n",
649 | " RNA | \n",
650 | " 3300 | \n",
651 | "
\n",
652 | " \n",
653 | " ENSG | \n",
654 | " gene | \n",
655 | " 18435 | \n",
656 | "
\n",
657 | " \n",
658 | " ENSP | \n",
659 | " protein | \n",
660 | " 12034 | \n",
661 | "
\n",
662 | " \n",
663 | "
\n",
664 | "
"
665 | ],
666 | "text/plain": [
667 | " mapping type counts\n",
668 | "ENST RNA 3300\n",
669 | "ENSG gene 18435\n",
670 | "ENSP protein 12034"
671 | ]
672 | },
673 | "execution_count": 24,
674 | "metadata": {},
675 | "output_type": "execute_result"
676 | }
677 | ],
678 | "source": [
679 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n",
680 | "count_dict = {'ENST': 3300, 'ENSG': 18435, 'ENSP': 12034}\n",
681 | " \n",
682 | "df = pd.DataFrame({'mapping type': map_dict, 'counts': count_dict})\n",
683 | "df"
684 | ]
685 | },
686 | {
687 | "cell_type": "code",
688 | "execution_count": 25,
689 | "metadata": {},
690 | "outputs": [
691 | {
692 | "data": {
693 | "text/plain": [
694 | "Index(['ENST', 'ENSG', 'ENSP'], dtype='object')"
695 | ]
696 | },
697 | "execution_count": 25,
698 | "metadata": {},
699 | "output_type": "execute_result"
700 | }
701 | ],
702 | "source": [
703 | "df.index"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 26,
709 | "metadata": {},
710 | "outputs": [
711 | {
712 | "data": {
713 | "text/plain": [
714 | "Index(['mapping type', 'counts'], dtype='object')"
715 | ]
716 | },
717 | "execution_count": 26,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "df.columns"
724 | ]
725 | },
726 | {
727 | "cell_type": "markdown",
728 | "metadata": {},
729 | "source": [
730 | "We can access a column like a dictionary or in a Pandas way:"
731 | ]
732 | },
733 | {
734 | "cell_type": "code",
735 | "execution_count": 25,
736 | "metadata": {},
737 | "outputs": [
738 | {
739 | "data": {
740 | "text/plain": [
741 | "ENST 3300\n",
742 | "ENSG 18435\n",
743 | "ENSP 12034\n",
744 | "Name: counts, dtype: int64"
745 | ]
746 | },
747 | "execution_count": 25,
748 | "metadata": {},
749 | "output_type": "execute_result"
750 | }
751 | ],
752 | "source": [
753 | "df['counts'] # like a dictionary"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": 26,
759 | "metadata": {},
760 | "outputs": [
761 | {
762 | "data": {
763 | "text/plain": [
764 | "ENST 3300\n",
765 | "ENSG 18435\n",
766 | "ENSP 12034\n",
767 | "Name: counts, dtype: int64"
768 | ]
769 | },
770 | "execution_count": 26,
771 | "metadata": {},
772 | "output_type": "execute_result"
773 | }
774 | ],
775 | "source": [
776 | "df.counts # The Pandas way"
777 | ]
778 | },
779 | {
780 | "cell_type": "code",
781 | "execution_count": 27,
782 | "metadata": {},
783 | "outputs": [
784 | {
785 | "data": {
786 | "text/plain": [
787 | "ENST RNA\n",
788 | "ENSG gene\n",
789 | "ENSP protein\n",
790 | "Name: mapping type, dtype: object"
791 | ]
792 | },
793 | "execution_count": 27,
794 | "metadata": {},
795 | "output_type": "execute_result"
796 | }
797 | ],
798 | "source": [
799 | "df['mapping type']\n",
800 | "#df.mapping type # I can't do it"
801 | ]
802 | }
803 | ],
804 | "metadata": {
805 | "kernelspec": {
806 | "display_name": "Python 3",
807 | "language": "python",
808 | "name": "python3"
809 | },
810 | "language_info": {
811 | "codemirror_mode": {
812 | "name": "ipython",
813 | "version": 3
814 | },
815 | "file_extension": ".py",
816 | "mimetype": "text/x-python",
817 | "name": "python",
818 | "nbconvert_exporter": "python",
819 | "pygments_lexer": "ipython3",
820 | "version": "3.8.10"
821 | }
822 | },
823 | "nbformat": 4,
824 | "nbformat_minor": 4
825 | }
826 |
--------------------------------------------------------------------------------
/2021_2022/Lesson7/Exercises7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# TO DO"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Exercise\n",
26 | "\n",
27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# TO DO"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Exercise\n",
44 | "\n",
45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put in a Python list all the DP values (182, 196, 275, ...)."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# TO DO"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.8.10"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 4
79 | }
80 |
--------------------------------------------------------------------------------
/2021_2022/data/P04439.fasta:
--------------------------------------------------------------------------------
1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2
2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF
3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ
4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL
5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT
6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL
7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL
8 | TACKV
9 |
--------------------------------------------------------------------------------
/2021_2022/data/brca_transcripts.txt:
--------------------------------------------------------------------------------
1 | transcript_id biotype bp aa
2 | ENST00000352993.7 Protein coding 3668 721
3 | ENST00000354071.7 Protein coding 4497 1399
4 | ENST00000461221.5 Nonsense mediated decay 5693 63
5 | ENST00000461574.1 Protein coding 726 242
6 | ENST00000461798.5 Nonsense mediated decay 582 63
7 |
--------------------------------------------------------------------------------
/2021_2022/data/genetic_code.tsv:
--------------------------------------------------------------------------------
1 | UUU F Phe Phenylalanine
2 | UUC F Phe Phenylalanine
3 | UUA L Leu Leucine
4 | UUG L Leu Leucine
5 | CUU L Leu Leucine
6 | CUC L Leu Leucine
7 | CUA L Leu Leucine
8 | CUG L Leu Leucine
9 | AUU I Ile Isoleucine
10 | AUC I Ile Isoleucine
11 | AUA I Ile Isoleucine
12 | AUG M Met Methionine (Start)
13 | GUU V Val Valine
14 | GUC V Val Valine
15 | GUA V Val Valine
16 | GUG V Val Valine
17 | UCU S Ser Serine
18 | UCC S Ser Serine
19 | UCA S Ser Serine
20 | UCG S Ser Serine
21 | CCU P Pro Proline
22 | CCC P Pro Proline
23 | CCA P Pro Proline
24 | CCG P Pro Proline
25 | ACU T Thr Threonine
26 | ACC T Thr Threonine
27 | ACA T Thr Threonine
28 | ACG T Thr Threonine
29 | GCU A Ala Alanine
30 | GCC A Ala Alanine
31 | GCA A Ala Alanine
32 | GCG A Ala Alanine
33 | UAU Y Tyr Tyrosine
34 | UAC Y Tyr Tyrosine
35 | UAA X Stop (Stop)
36 | UAG X Stop (Stop)
37 | CAU H His Histidine
38 | CAC H His Histidine
39 | CAA Q Gln Glutamine
40 | CAG Q Gln Glutamine
41 | AAU N Asn Asparagine
42 | AAC N Asn Asparagine
43 | AAA K Lys Lysine
44 | AAG K Lys Lysine
45 | GAU D Asp Aspartic acid
46 | GAC D Asp Aspartic acid
47 | GAA E Glu Glutamic acid
48 | GAG E Glu Glutamic acid
49 | UGU C Cys Cysteine
50 | UGC C Cys Cysteine
51 | UGA X Stop (Stop)
52 | UGG W Trp Tryptophan
53 | CGU R Arg Arginine
54 | CGC R Arg Arginine
55 | CGA R Arg Arginine
56 | CGG R Arg Arginine
57 | AGU S Ser Serine
58 | AGC S Ser Serine
59 | AGA R Arg Arginine
60 | AGG R Arg Arginine
61 | GGU G Gly Glycine
62 | GGC G Gly Glycine
63 | GGA G Gly Glycine
64 | GGG G Gly Glycine
--------------------------------------------------------------------------------
/2021_2022/data/my_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 |
def generate_string(n, alphabet):
    """Return a random string of ``n`` symbols drawn from ``alphabet``.

    Every symbol is picked independently with ``random.choice``, so
    ``alphabet`` can be any non-empty sequence of strings (for example
    ``'ACGT'`` or a list of one-letter codes).

    Args:
        n: Number of symbols to draw. ``0`` (or a negative value, since
            ``range(n)`` is then empty) yields the empty string.
        alphabet: Sequence of strings to sample from.

    Returns:
        The concatenation of the ``n`` sampled symbols.

    Raises:
        IndexError: If ``alphabet`` is empty and ``n`` is positive
            (raised by ``random.choice``).
    """
    # A single join replaces the repeated ``s += ...`` concatenation; the
    # random draws happen in the same order, so seeded output is unchanged.
    return "".join(random.choice(alphabet) for _ in range(n))
9 |
--------------------------------------------------------------------------------
/2021_2022/data/uniprot_ids.txt:
--------------------------------------------------------------------------------
1 | Q13188
2 | O00444
3 | P49760
4 | PYYY4Z
5 | Q13627
6 | Q02156
7 |
--------------------------------------------------------------------------------
/2021_2022/data/validation.py:
--------------------------------------------------------------------------------
def valid_sequence(sequence, valid_characters):
    """Tell whether every symbol of ``sequence`` is an allowed character.

    Each symbol is upper-cased before it is looked up in
    ``valid_characters``, so lower-case input matches upper-case entries.
    An empty ``sequence`` is reported as valid.

    Args:
        sequence: Iterable of single-character strings (typically a str).
        valid_characters: Container of the allowed (upper-case) symbols.

    Returns:
        ``True`` if all symbols are allowed, ``False`` as soon as one is not.
    """
    return all(symbol.upper() in valid_characters for symbol in sequence)
7 |
def validate_dna(sequence):
    """Check that ``sequence`` contains only the DNA letters A, T, G and C.

    The check is delegated to ``valid_sequence`` with the DNA alphabet.
    """
    dna_alphabet = ['A', 'T', 'G', 'C']
    return valid_sequence(sequence, dna_alphabet)
10 |
def validate_rna(sequence):
    """Check that ``sequence`` contains only the RNA letters A, U, G and C.

    The check is delegated to ``valid_sequence`` with the RNA alphabet.
    """
    rna_alphabet = ['A', 'U', 'G', 'C']
    return valid_sequence(sequence, rna_alphabet)
13 |
def validate_protein(sequence):
    """Check that ``sequence`` uses only the 20 one-letter codes listed below.

    The check is delegated to ``valid_sequence`` with that alphabet.
    """
    amino_acid_codes = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    return valid_sequence(sequence, amino_acid_codes)
22 |
--------------------------------------------------------------------------------
/2021_2022/images/Integer.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2021_2022/images/Integer.jpeg
--------------------------------------------------------------------------------
/2021_2022/images/List.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2021_2022/images/List.jpeg
--------------------------------------------------------------------------------
/2022_2023/Lesson1/Exercises1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "The following list is corrupted:"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "mutations = [\n",
18 | " 'p.Ser31Ala',\n",
19 | " 'p.Pro38Leu',\n",
20 | " 'p.Asn100Lys',\n",
21 | " 'p.LEU110VAL',\n",
22 | " 13,\n",
23 | " 4.0,\n",
24 | " True,\n",
25 | " 'p.Tyr341Leu',\n",
26 | " 'AUG',\n",
27 | " 'p.Tyr0Le',\n",
28 | " 'p.Asn1.3Lys',\n",
29 | " 'p.Arg0Leu'\n",
30 | "]"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "How can we check which mutations are valid? Put the valid mutations in a new list. A reasonable output could be:\n",
38 | "\n",
39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "#### Tips"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "valid_aminos = [\n",
56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n",
57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n",
58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n",
59 | "]\n",
60 | "\n",
61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n",
62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n",
63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "valid_mutations = []\n",
73 | "\n",
74 | "# TODO"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Exercise\n",
82 | "\n",
83 | "Write a script to check if a protein sequence is valid."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# TODO"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "### Exercise\n",
100 | "\n",
101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 5,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# TODO"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3 (ipykernel)",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.10.6"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
136 |
--------------------------------------------------------------------------------
/2022_2023/Lesson2/Exercises2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. In particular:\n",
10 | "\n",
11 | "- create a `get_valid_mutation` function which takes a list of mutations as input and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n",
12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n",
13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "mutations = [\n",
23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n",
24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n",
25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n",
26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n",
27 | "]\n",
28 | "\n",
29 | "aa_3L_to_1L = {\n",
30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
34 | "}"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# TO DO"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "Set of valid mutations that could belong to the HLA class I histocompatibility antigen protein:\n",
51 | "\n",
52 | "`['p.Pro39Arg', 'p.Gly40Ile', 'p.Leu19Gly', 'p.Val49Ile', 'p.Asn90Asp', 'p.Phe133His', 'p.Leu134Cys', 'p.Glu190Ser', 'p.Gln248Val', 'p.Thr249Ile']`"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "### Exercise\n",
60 | "\n",
61 | "Write a function which generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins."
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# TO DO"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### Exercise\n",
78 | "\n",
79 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "# TO DO"
89 | ]
90 | }
91 | ],
92 | "metadata": {
93 | "kernelspec": {
94 | "display_name": "Python 3 (ipykernel)",
95 | "language": "python",
96 | "name": "python3"
97 | },
98 | "language_info": {
99 | "codemirror_mode": {
100 | "name": "ipython",
101 | "version": 3
102 | },
103 | "file_extension": ".py",
104 | "mimetype": "text/x-python",
105 | "name": "python",
106 | "nbconvert_exporter": "python",
107 | "pygments_lexer": "ipython3",
108 | "version": "3.10.6"
109 | }
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 4
113 | }
114 |
--------------------------------------------------------------------------------
/2022_2023/Lesson3/Exercises3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n",
10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# TO DO"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Exercise\n",
27 | "\n",
28 | "Write a function to remove duplicates in a list."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# TO DO"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Exercise\n",
45 | "\n",
46 | "Write a function to calculate the identity between 2 sequences."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# TO DO"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3 (ipykernel)",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.10.6"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 4
80 | }
81 |
--------------------------------------------------------------------------------
/2022_2023/Lesson4/Exercises4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Compute all pair-wise identities (number of identical character pairs)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "sequences = [\n",
19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n",
20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n",
21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n",
22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n",
23 | "]\n",
24 | "\n",
25 | "# TO DO"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Exercise\n",
33 | "\n",
34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n",
35 | "- read the genetic code in the [genetic_code.tsv](../data/genetic_code.tsv) file\n",
36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# TO DO"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Exercise\n",
53 | "\n",
54 | "Print the index of the first occurrence of the ATG codon in `dna_seq`.\n",
55 | "\n",
56 | "Try with and without using the `find()` method on strings.\n",
57 | "\n",
58 | "Do the same with the ribonucleotide sequence in the `P04439.rna.fasta` file (manage the `U` <-> `T` conversion)."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "dna_seq = 'AAAAATCCCGAGGCGGCAUGTATATAGGGCTCCGGAGGCGTAATATAAAA'\n",
68 | "\n",
69 | "# TODO"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3 (ipykernel)",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.10.6"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 4
94 | }
95 |
--------------------------------------------------------------------------------
/2022_2023/Lesson5/Exercises5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Write a function to search motifs in a sequence.\n",
10 | "\n",
11 | "Try with and without using the `re` module."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCATCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTG
GCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n",
21 | "\n",
22 | "consensus_motifs = {\n",
23 | " 'motif1': 'AGGAG[GT]',\n",
24 | " 'motif2': 'T[AT]AAT',\n",
25 | " 'motif3': 'GG.A.T[AG]'\n",
26 | "}"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Possible printed output:\n",
34 | "```\n",
35 | "AGGAG[GT]\n",
36 | "\t(969, 975) AGGAGG\n",
37 | "\t(1153, 1159) AGGAGG\n",
38 | "\t(1339, 1345) AGGAGT\n",
39 | "\t(1587, 1593) AGGAGG\n",
40 | "\t(1881, 1887) AGGAGG\n",
41 | "\t(1941, 1947) AGGAGG\n",
42 | "T[AT]AAT\n",
43 | "\t(50, 55) TAAAT\n",
44 | "\t(1098, 1103) TAAAT\n",
45 | "\t(1276, 1281) TAAAT\n",
46 | "GG.A.T[AG]\n",
47 | "\t(248, 255) GGTACTG\n",
48 | "\t(983, 990) GGAAATA\n",
49 | "\t(1910, 1917) GGGACTG\n",
50 | "\t(1980, 1987) GGCAGTG\n",
51 | "```\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# TO DO"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Exercise\n",
68 | "\n",
69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 2,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "aa_3L_to_1L = {\n",
79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
83 | "}\n",
84 | "\n",
85 | "#aa_1L_to_3L['A'] --> 'ALA'"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "# TO DO"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Exercise\n",
102 | "\n",
103 | "Write a function to remove invalid amino acids from a protein."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# TO DO"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3 (ipykernel)",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.10.6"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 4
137 | }
138 |
--------------------------------------------------------------------------------
/2022_2023/Lesson6/Exercises6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. Do that in 2 ways:\n",
10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n",
11 | "- check the documentation out to see how to load such a file format into a Pandas `DataFrame` and do that."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 3,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/html": [
22 | "\n",
23 | "\n",
36 | "
\n",
37 | " \n",
38 | " \n",
39 | " | \n",
40 | " UUU | \n",
41 | " F | \n",
42 | " Phe | \n",
43 | " Phenylalanine | \n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " 0 | \n",
49 | " UUC | \n",
50 | " F | \n",
51 | " Phe | \n",
52 | " Phenylalanine | \n",
53 | "
\n",
54 | " \n",
55 | " 1 | \n",
56 | " UUA | \n",
57 | " L | \n",
58 | " Leu | \n",
59 | " Leucine | \n",
60 | "
\n",
61 | " \n",
62 | " 2 | \n",
63 | " UUG | \n",
64 | " L | \n",
65 | " Leu | \n",
66 | " Leucine | \n",
67 | "
\n",
68 | " \n",
69 | " 3 | \n",
70 | " CUU | \n",
71 | " L | \n",
72 | " Leu | \n",
73 | " Leucine | \n",
74 | "
\n",
75 | " \n",
76 | " 4 | \n",
77 | " CUC | \n",
78 | " L | \n",
79 | " Leu | \n",
80 | " Leucine | \n",
81 | "
\n",
82 | " \n",
83 | " ... | \n",
84 | " ... | \n",
85 | " ... | \n",
86 | " ... | \n",
87 | " ... | \n",
88 | "
\n",
89 | " \n",
90 | " 58 | \n",
91 | " AGG | \n",
92 | " R | \n",
93 | " Arg | \n",
94 | " Arginine | \n",
95 | "
\n",
96 | " \n",
97 | " 59 | \n",
98 | " GGU | \n",
99 | " G | \n",
100 | " Gly | \n",
101 | " Glycine | \n",
102 | "
\n",
103 | " \n",
104 | " 60 | \n",
105 | " GGC | \n",
106 | " G | \n",
107 | " Gly | \n",
108 | " Glycine | \n",
109 | "
\n",
110 | " \n",
111 | " 61 | \n",
112 | " GGA | \n",
113 | " G | \n",
114 | " Gly | \n",
115 | " Glycine | \n",
116 | "
\n",
117 | " \n",
118 | " 62 | \n",
119 | " GGG | \n",
120 | " G | \n",
121 | " Gly | \n",
122 | " Glycine | \n",
123 | "
\n",
124 | " \n",
125 | "
\n",
126 | "
63 rows × 4 columns
\n",
127 | "
"
128 | ],
129 | "text/plain": [
130 | " UUU F Phe Phenylalanine\n",
131 | "0 UUC F Phe Phenylalanine\n",
132 | "1 UUA L Leu Leucine\n",
133 | "2 UUG L Leu Leucine\n",
134 | "3 CUU L Leu Leucine\n",
135 | "4 CUC L Leu Leucine\n",
136 | ".. ... .. ... ...\n",
137 | "58 AGG R Arg Arginine\n",
138 | "59 GGU G Gly Glycine\n",
139 | "60 GGC G Gly Glycine\n",
140 | "61 GGA G Gly Glycine\n",
141 | "62 GGG G Gly Glycine\n",
142 | "\n",
143 | "[63 rows x 4 columns]"
144 | ]
145 | },
146 | "execution_count": 3,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "import pandas as pd\n",
153 | "\n",
154 | "# TO DO"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### Exercise\n",
162 | "\n",
163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with Python lists and Numpy Arrays, and quantify the execution times."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 4,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "# TO DO"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### Exercise\n",
180 | "\n",
181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 6,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "# TO DO"
191 | ]
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "Python 3 (ipykernel)",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.10.6"
211 | }
212 | },
213 | "nbformat": 4,
214 | "nbformat_minor": 4
215 | }
216 |
--------------------------------------------------------------------------------
/2022_2023/Lesson7/Exercises7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# TO DO"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Exercise\n",
26 | "\n",
27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# TO DO"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Exercise\n",
44 | "\n",
45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put all the DP values (182, 196, 275, ...) into a Python list."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# TO DO"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3 (ipykernel)",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.10.6"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 4
79 | }
80 |
--------------------------------------------------------------------------------
/2022_2023/data/P04439.fasta:
--------------------------------------------------------------------------------
1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2
2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF
3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ
4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL
5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT
6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL
7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL
8 | TACKV
9 |
--------------------------------------------------------------------------------
/2022_2023/data/brca_transcripts.txt:
--------------------------------------------------------------------------------
1 | transcript_id biotype bp aa
2 | ENST00000352993.7 Protein coding 3668 721
3 | ENST00000354071.7 Protein coding 4497 1399
4 | ENST00000461221.5 Nonsense mediated decay 5693 63
5 | ENST00000461574.1 Protein coding 726 242
6 | ENST00000461798.5 Nonsense mediated decay 582 63
7 |
--------------------------------------------------------------------------------
/2022_2023/data/genetic_code.tsv:
--------------------------------------------------------------------------------
1 | UUU F Phe Phenylalanine
2 | UUC F Phe Phenylalanine
3 | UUA L Leu Leucine
4 | UUG L Leu Leucine
5 | CUU L Leu Leucine
6 | CUC L Leu Leucine
7 | CUA L Leu Leucine
8 | CUG L Leu Leucine
9 | AUU I Ile Isoleucine
10 | AUC I Ile Isoleucine
11 | AUA I Ile Isoleucine
12 | AUG M Met Methionine (Start)
13 | GUU V Val Valine
14 | GUC V Val Valine
15 | GUA V Val Valine
16 | GUG V Val Valine
17 | UCU S Ser Serine
18 | UCC S Ser Serine
19 | UCA S Ser Serine
20 | UCG S Ser Serine
21 | CCU P Pro Proline
22 | CCC P Pro Proline
23 | CCA P Pro Proline
24 | CCG P Pro Proline
25 | ACU T Thr Threonine
26 | ACC T Thr Threonine
27 | ACA T Thr Threonine
28 | ACG T Thr Threonine
29 | GCU A Ala Alanine
30 | GCC A Ala Alanine
31 | GCA A Ala Alanine
32 | GCG A Ala Alanine
33 | UAU Y Tyr Tyrosine
34 | UAC Y Tyr Tyrosine
35 | UAA X Stop (Stop)
36 | UAG X Stop (Stop)
37 | CAU H His Histidine
38 | CAC H His Histidine
39 | CAA Q Gln Glutamine
40 | CAG Q Gln Glutamine
41 | AAU N Asn Asparagine
42 | AAC N Asn Asparagine
43 | AAA K Lys Lysine
44 | AAG K Lys Lysine
45 | GAU D Asp Aspartic acid
46 | GAC D Asp Aspartic acid
47 | GAA E Glu Glutamic acid
48 | GAG E Glu Glutamic acid
49 | UGU C Cys Cysteine
50 | UGC C Cys Cysteine
51 | UGA X Stop (Stop)
52 | UGG W Trp Tryptophan
53 | CGU R Arg Arginine
54 | CGC R Arg Arginine
55 | CGA R Arg Arginine
56 | CGG R Arg Arginine
57 | AGU S Ser Serine
58 | AGC S Ser Serine
59 | AGA R Arg Arginine
60 | AGG R Arg Arginine
61 | GGU G Gly Glycine
62 | GGC G Gly Glycine
63 | GGA G Gly Glycine
64 | GGG G Gly Glycine
--------------------------------------------------------------------------------
/2022_2023/data/my_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 |
def generate_string(n, alphabet):
    """Build a random string made of ``n`` picks from ``alphabet``.

    Each pick is an independent ``random.choice(alphabet)`` call; the picks
    are concatenated in the order they were drawn.
    """
    picks = [random.choice(alphabet) for _ in range(n)]
    return "".join(picks)
9 |
--------------------------------------------------------------------------------
/2022_2023/data/uniprot_ids.txt:
--------------------------------------------------------------------------------
1 | Q13188
2 | O00444
3 | P49760
4 | PYYY4Z
5 | Q13627
6 | Q02156
7 |
--------------------------------------------------------------------------------
/2022_2023/data/validation.py:
--------------------------------------------------------------------------------
def valid_sequence(sequence, valid_characters):
    """Tell whether every character of ``sequence`` is allowed.

    Each character is upper-cased before it is looked up in
    ``valid_characters``, so lower-case input matches upper-case entries.
    An empty ``sequence`` is reported as valid.
    """
    return all(symbol.upper() in valid_characters for symbol in sequence)
7 |
def validate_dna(sequence):
    """Return True when ``sequence`` contains only A, T, G or C (in any case)."""
    dna_alphabet = ['A', 'T', 'G', 'C']
    return valid_sequence(sequence, dna_alphabet)
10 |
def validate_rna(sequence):
    """Return True when ``sequence`` contains only A, U, G or C (in any case)."""
    rna_alphabet = ['A', 'U', 'G', 'C']
    return valid_sequence(sequence, rna_alphabet)
13 |
def validate_protein(sequence):
    """Return True when ``sequence`` uses only the 20 one-letter codes below.

    The check is delegated to ``valid_sequence``, which upper-cases each
    character before comparing it with the list.
    """
    amino_acid_codes = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'
    ]
    return valid_sequence(sequence, amino_acid_codes)
22 |
--------------------------------------------------------------------------------
/2022_2023/images/Integer.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2022_2023/images/Integer.jpeg
--------------------------------------------------------------------------------
/2022_2023/images/List.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2022_2023/images/List.jpeg
--------------------------------------------------------------------------------
/2023_2024/Lesson1/Exercises1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "The following list is corrupted:"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "mutations = [\n",
18 | " 'p.Ser31Ala',\n",
19 | " 'p.Pro38Leu',\n",
20 | " 'p.Asn100Lys',\n",
21 | " 'p.LEU110VAL',\n",
22 | " 13,\n",
23 | " 4.0,\n",
24 | " True,\n",
25 | " 'p.Tyr341Leu',\n",
26 | " 'AUG',\n",
27 | " 'p.Tyr0Le',\n",
28 | " 'p.Asn1.3Lys',\n",
29 | " 'p.Arg0Leu'\n",
30 | "]"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "How can you check which mutations are valid? Put the valid mutations in a new list. A reasonable output could be:\n",
38 | "\n",
39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "#### Tips"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "valid_aminos = [\n",
56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n",
57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n",
58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n",
59 | "]\n",
60 | "\n",
61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n",
62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n",
63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "valid_mutations = []\n",
73 | "\n",
74 | "# TODO"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Exercise\n",
82 | "\n",
83 | "Write a script to check if a protein sequence is valid."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# TODO"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "### Exercise\n",
100 | "\n",
101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 5,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# TODO"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3 (ipykernel)",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.10.6"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
136 |
--------------------------------------------------------------------------------
/2023_2024/Lesson2/Exercises2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. In particular:\n",
10 | "\n",
11 | "- create a `get_valid_mutation` function which takes a list of mutations as input and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n",
12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n",
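13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation; note that `lstrip('p.')` strips any leading `p` and `.` characters rather than the exact prefix, so `removeprefix('p.')` is a safer choice on Python 3.9+)."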
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "mutations = [\n",
23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n",
24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n",
25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n",
26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n",
27 | "]\n",
28 | "\n",
29 | "aa_3L_to_1L = {\n",
30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
34 | "}"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# TO DO"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "Set of valid mutations that could belong to the HLA class I histocompatibility antigen protein:\n",
51 | "\n",
52 | "`['p.Pro39Arg', 'p.Gly40Ile', 'p.Leu19Gly', 'p.Val49Ile', 'p.Asn90Asp', 'p.Phe133His', 'p.Leu134Cys', 'p.Glu190Ser', 'p.Gln248Val', 'p.Thr249Ile']`"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "### Exercise\n",
60 | "\n",
61 | "Write a function which generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins."
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# TO DO"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### Exercise\n",
78 | "\n",
79 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "# TO DO"
89 | ]
90 | }
91 | ],
92 | "metadata": {
93 | "kernelspec": {
94 | "display_name": "Python 3 (ipykernel)",
95 | "language": "python",
96 | "name": "python3"
97 | },
98 | "language_info": {
99 | "codemirror_mode": {
100 | "name": "ipython",
101 | "version": 3
102 | },
103 | "file_extension": ".py",
104 | "mimetype": "text/x-python",
105 | "name": "python",
106 | "nbconvert_exporter": "python",
107 | "pygments_lexer": "ipython3",
108 | "version": "3.10.6"
109 | }
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 4
113 | }
114 |
--------------------------------------------------------------------------------
/2023_2024/Lesson3/Exercises3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n",
10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# TO DO"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Exercise\n",
27 | "\n",
28 | "Write a function to remove duplicates in a list."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# TO DO"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Exercise\n",
45 | "\n",
46 | "Write a function to calculate the identity between 2 sequences."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# TO DO"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3 (ipykernel)",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.10.6"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 4
80 | }
81 |
--------------------------------------------------------------------------------
/2023_2024/Lesson4/Exercises4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Compute all pair-wise identities (number of identical character pairs)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "sequences = [\n",
19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n",
20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n",
21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n",
22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n",
23 | "]\n",
24 | "\n",
25 | "# TO DO"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Exercise\n",
33 | "\n",
34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n",
35 | "- read the genetic code in the [genetic_code.tsv](../data/genetic_code.tsv) file\n",
36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# TO DO"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Exercise\n",
53 | "\n",
54 | "Print the index of the first occurrence of the ATG codon in `dna_seq`.\n",
55 | "\n",
56 | "Try with and without using the `find()` method on strings.\n",
57 | "\n",
58 | "Do the same with the ribonucleotide sequence in the `P04439.rna.fasta` file (manage the `U` <-> `T` conversion)."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "dna_seq = 'AAAAATCCCGAGGCGGCAUGTATATAGGGCTCCGGAGGCGTAATATAAAA'\n",
68 | "\n",
69 | "# TODO"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3 (ipykernel)",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.10.6"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 4
94 | }
95 |
--------------------------------------------------------------------------------
/2023_2024/Lesson5/Exercises5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Write a function to search motifs in a sequence.\n",
10 | "\n",
11 | "Try with and without using the `re` module."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCATCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTG
GCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n",
21 | "\n",
22 | "consensus_motifs = {\n",
23 | " 'motif1': 'AGGAG[GT]',\n",
24 | " 'motif2': 'T[AT]AAT',\n",
25 | " 'motif3': 'GG.A.T[AG]'\n",
26 | "}"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Possible printed output:\n",
34 | "```\n",
35 | "AGGAG[GT]\n",
36 | "\t(969, 975) AGGAGG\n",
37 | "\t(1153, 1159) AGGAGG\n",
38 | "\t(1339, 1345) AGGAGT\n",
39 | "\t(1587, 1593) AGGAGG\n",
40 | "\t(1881, 1887) AGGAGG\n",
41 | "\t(1941, 1947) AGGAGG\n",
42 | "T[AT]AAT\n",
43 | "\t(50, 55) TAAAT\n",
44 | "\t(1098, 1103) TAAAT\n",
45 | "\t(1276, 1281) TAAAT\n",
46 | "GG.A.T[AG]\n",
47 | "\t(248, 255) GGTACTG\n",
48 | "\t(983, 990) GGAAATA\n",
49 | "\t(1910, 1917) GGGACTG\n",
50 | "\t(1980, 1987) GGCAGTG\n",
51 | "```\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# TO DO"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Exercise\n",
68 | "\n",
69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 2,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "aa_3L_to_1L = {\n",
79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
83 | "}\n",
84 | "\n",
85 | "#aa_1L_to_3L['A'] --> 'ALA'"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "# TO DO"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Exercise\n",
102 | "\n",
103 | "Write a function to remove invalid amino acids from a protein sequence."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# TO DO"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3 (ipykernel)",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.10.6"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 4
137 | }
138 |
--------------------------------------------------------------------------------
/2023_2024/Lesson6/Exercises6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. Do that in 2 ways:\n",
10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n",
11 | "- check the Pandas documentation to see how to load a tab-separated file into a `DataFrame`, and do that."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 3,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/html": [
22 | "\n",
23 | "\n",
36 | "
\n",
37 | " \n",
38 | " \n",
39 | " | \n",
40 | " UUU | \n",
41 | " F | \n",
42 | " Phe | \n",
43 | " Phenylalanine | \n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " 0 | \n",
49 | " UUC | \n",
50 | " F | \n",
51 | " Phe | \n",
52 | " Phenylalanine | \n",
53 | "
\n",
54 | " \n",
55 | " 1 | \n",
56 | " UUA | \n",
57 | " L | \n",
58 | " Leu | \n",
59 | " Leucine | \n",
60 | "
\n",
61 | " \n",
62 | " 2 | \n",
63 | " UUG | \n",
64 | " L | \n",
65 | " Leu | \n",
66 | " Leucine | \n",
67 | "
\n",
68 | " \n",
69 | " 3 | \n",
70 | " CUU | \n",
71 | " L | \n",
72 | " Leu | \n",
73 | " Leucine | \n",
74 | "
\n",
75 | " \n",
76 | " 4 | \n",
77 | " CUC | \n",
78 | " L | \n",
79 | " Leu | \n",
80 | " Leucine | \n",
81 | "
\n",
82 | " \n",
83 | " ... | \n",
84 | " ... | \n",
85 | " ... | \n",
86 | " ... | \n",
87 | " ... | \n",
88 | "
\n",
89 | " \n",
90 | " 58 | \n",
91 | " AGG | \n",
92 | " R | \n",
93 | " Arg | \n",
94 | " Arginine | \n",
95 | "
\n",
96 | " \n",
97 | " 59 | \n",
98 | " GGU | \n",
99 | " G | \n",
100 | " Gly | \n",
101 | " Glycine | \n",
102 | "
\n",
103 | " \n",
104 | " 60 | \n",
105 | " GGC | \n",
106 | " G | \n",
107 | " Gly | \n",
108 | " Glycine | \n",
109 | "
\n",
110 | " \n",
111 | " 61 | \n",
112 | " GGA | \n",
113 | " G | \n",
114 | " Gly | \n",
115 | " Glycine | \n",
116 | "
\n",
117 | " \n",
118 | " 62 | \n",
119 | " GGG | \n",
120 | " G | \n",
121 | " Gly | \n",
122 | " Glycine | \n",
123 | "
\n",
124 | " \n",
125 | "
\n",
126 | "
63 rows × 4 columns
\n",
127 | "
"
128 | ],
129 | "text/plain": [
130 | " UUU F Phe Phenylalanine\n",
131 | "0 UUC F Phe Phenylalanine\n",
132 | "1 UUA L Leu Leucine\n",
133 | "2 UUG L Leu Leucine\n",
134 | "3 CUU L Leu Leucine\n",
135 | "4 CUC L Leu Leucine\n",
136 | ".. ... .. ... ...\n",
137 | "58 AGG R Arg Arginine\n",
138 | "59 GGU G Gly Glycine\n",
139 | "60 GGC G Gly Glycine\n",
140 | "61 GGA G Gly Glycine\n",
141 | "62 GGG G Gly Glycine\n",
142 | "\n",
143 | "[63 rows x 4 columns]"
144 | ]
145 | },
146 | "execution_count": 3,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "import pandas as pd\n",
153 | "\n",
154 | "# TO DO"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### Exercise\n",
162 | "\n",
163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with both Python lists and NumPy arrays, and measure the execution times."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 4,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "# TO DO"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### Exercise\n",
180 | "\n",
181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 6,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "# TO DO"
191 | ]
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "Python 3 (ipykernel)",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.10.6"
211 | }
212 | },
213 | "nbformat": 4,
214 | "nbformat_minor": 4
215 | }
216 |
--------------------------------------------------------------------------------
/2023_2024/Lesson7/Exercises7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, explicitly specifying the column names (the file has no header row)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# TO DO"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Exercise\n",
26 | "\n",
27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# TO DO"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Exercise\n",
44 | "\n",
45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put all the DP values (182, 196, 275, ...) into a Python list."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# TO DO"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3 (ipykernel)",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.10.6"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 4
79 | }
80 |
--------------------------------------------------------------------------------
/2023_2024/data/P04439.fasta:
--------------------------------------------------------------------------------
1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2
2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF
3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ
4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL
5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT
6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL
7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL
8 | TACKV
9 |
--------------------------------------------------------------------------------
/2023_2024/data/brca_transcripts.txt:
--------------------------------------------------------------------------------
1 | transcript_id biotype bp aa
2 | ENST00000352993.7 Protein coding 3668 721
3 | ENST00000354071.7 Protein coding 4497 1399
4 | ENST00000461221.5 Nonsense mediated decay 5693 63
5 | ENST00000461574.1 Protein coding 726 242
6 | ENST00000461798.5 Nonsense mediated decay 582 63
7 |
--------------------------------------------------------------------------------
/2023_2024/data/genetic_code.tsv:
--------------------------------------------------------------------------------
1 | UUU F Phe Phenylalanine
2 | UUC F Phe Phenylalanine
3 | UUA L Leu Leucine
4 | UUG L Leu Leucine
5 | CUU L Leu Leucine
6 | CUC L Leu Leucine
7 | CUA L Leu Leucine
8 | CUG L Leu Leucine
9 | AUU I Ile Isoleucine
10 | AUC I Ile Isoleucine
11 | AUA I Ile Isoleucine
12 | AUG M Met Methionine (Start)
13 | GUU V Val Valine
14 | GUC V Val Valine
15 | GUA V Val Valine
16 | GUG V Val Valine
17 | UCU S Ser Serine
18 | UCC S Ser Serine
19 | UCA S Ser Serine
20 | UCG S Ser Serine
21 | CCU P Pro Proline
22 | CCC P Pro Proline
23 | CCA P Pro Proline
24 | CCG P Pro Proline
25 | ACU T Thr Threonine
26 | ACC T Thr Threonine
27 | ACA T Thr Threonine
28 | ACG T Thr Threonine
29 | GCU A Ala Alanine
30 | GCC A Ala Alanine
31 | GCA A Ala Alanine
32 | GCG A Ala Alanine
33 | UAU Y Tyr Tyrosine
34 | UAC Y Tyr Tyrosine
35 | UAA X Stop (Stop)
36 | UAG X Stop (Stop)
37 | CAU H His Histidine
38 | CAC H His Histidine
39 | CAA Q Gln Glutamine
40 | CAG Q Gln Glutamine
41 | AAU N Asn Asparagine
42 | AAC N Asn Asparagine
43 | AAA K Lys Lysine
44 | AAG K Lys Lysine
45 | GAU D Asp Aspartic acid
46 | GAC D Asp Aspartic acid
47 | GAA E Glu Glutamic acid
48 | GAG E Glu Glutamic acid
49 | UGU C Cys Cysteine
50 | UGC C Cys Cysteine
51 | UGA X Stop (Stop)
52 | UGG W Trp Tryptophan
53 | CGU R Arg Arginine
54 | CGC R Arg Arginine
55 | CGA R Arg Arginine
56 | CGG R Arg Arginine
57 | AGU S Ser Serine
58 | AGC S Ser Serine
59 | AGA R Arg Arginine
60 | AGG R Arg Arginine
61 | GGU G Gly Glycine
62 | GGC G Gly Glycine
63 | GGA G Gly Glycine
64 | GGG G Gly Glycine
--------------------------------------------------------------------------------
/2023_2024/data/my_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 |
def generate_string(n, alphabet):
    """Build a random string of ``n`` symbols drawn from ``alphabet``.

    Parameters
    ----------
    n : int
        Number of symbols to generate. ``range(n)`` is empty for ``n <= 0``,
        so the empty string is returned in that case.
    alphabet : sequence of str
        Non-empty sequence (for example a ``str`` or a ``list`` of
        characters) to sample from, with replacement, via ``random.choice``.

    Returns
    -------
    str
        The concatenation of the ``n`` chosen symbols.

    Raises
    ------
    IndexError
        If ``alphabet`` is empty and ``n > 0`` (raised by ``random.choice``).
    """
    # Exactly one ``random.choice`` call per symbol, in order, so the result
    # for a given seed is the same as with an explicit accumulation loop;
    # ``join`` avoids re-building the growing string on every iteration.
    return "".join(random.choice(alphabet) for _ in range(n))
9 |
--------------------------------------------------------------------------------
/2023_2024/data/uniprot_ids.txt:
--------------------------------------------------------------------------------
1 | Q13188
2 | O00444
3 | P49760
4 | PYYY4Z
5 | Q13627
6 | Q02156
7 |
--------------------------------------------------------------------------------
/2023_2024/data/validation.py:
--------------------------------------------------------------------------------
def valid_sequence(sequence, valid_characters):
    """Tell whether every symbol of ``sequence`` is an allowed one.

    Each symbol is upper-cased before the membership test, so the check is
    case-insensitive with respect to ``sequence``; entries of
    ``valid_characters`` should therefore be upper-case.

    Parameters
    ----------
    sequence : iterable of str
        The symbols to check (typically a ``str``).
    valid_characters : container of str
        The allowed symbols; anything supporting the ``in`` operator.

    Returns
    -------
    bool
        ``True`` when all symbols are allowed (this includes an empty
        ``sequence``), ``False`` as soon as one symbol is not.
    """
    # ``all`` short-circuits on the first disallowed symbol.
    return all(symbol.upper() in valid_characters for symbol in sequence)
7 |
def validate_dna(sequence):
    """Return whether ``sequence`` consists only of the DNA bases A, T, G, C.

    The check is delegated to ``valid_sequence``, which upper-cases each
    character before testing it.
    """
    dna_bases = ['A', 'T', 'G', 'C']
    return valid_sequence(sequence, dna_bases)
10 |
def validate_rna(sequence):
    """Return whether ``sequence`` consists only of the RNA bases A, U, G, C.

    The check is delegated to ``valid_sequence``, which upper-cases each
    character before testing it.
    """
    rna_bases = ['A', 'U', 'G', 'C']
    return valid_sequence(sequence, rna_bases)
13 |
def validate_protein(sequence):
    """Return whether ``sequence`` uses only the 20 one-letter amino acid codes.

    The check is delegated to ``valid_sequence``, which upper-cases each
    character before testing it.
    """
    # One-letter codes accepted as amino acids by this check.
    amino_acids = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
    ]
    return valid_sequence(sequence, amino_acids)
22 |
--------------------------------------------------------------------------------
/2023_2024/images/Integer.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2023_2024/images/Integer.jpeg
--------------------------------------------------------------------------------
/2023_2024/images/List.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2023_2024/images/List.jpeg
--------------------------------------------------------------------------------
/2024_2025/Lesson1/Exercises1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "The following list is corrupted:"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "mutations = [\n",
18 | " 'p.Ser31Ala',\n",
19 | " 'p.Pro38Leu',\n",
20 | " 'p.Asn100Lys',\n",
21 | " 'p.LEU110VAL',\n",
22 | " 13,\n",
23 | " 4.0,\n",
24 | " True,\n",
25 | " 'p.Tyr341Leu',\n",
26 | " 'AUG',\n",
27 | " 'p.Tyr0Le',\n",
28 | " 'p.Asn1.3Lys',\n",
29 | " 'p.Arg0Leu'\n",
30 | "]"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "How can you check which mutations are valid? Put the valid mutations in a new list. A reasonable output could be:\n",
38 | "\n",
39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "#### Tips"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 2,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "valid_aminos = [\n",
56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n",
57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n",
58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n",
59 | "]\n",
60 | "\n",
61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n",
62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n",
63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "valid_mutations = []\n",
73 | "\n",
74 | "# TODO"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Exercise\n",
82 | "\n",
83 | "Write a script to check if a protein sequence is valid."
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 4,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# TODO"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "### Exercise\n",
100 | "\n",
101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 5,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "# TODO"
111 | ]
112 | }
113 | ],
114 | "metadata": {
115 | "kernelspec": {
116 | "display_name": "Python 3 (ipykernel)",
117 | "language": "python",
118 | "name": "python3"
119 | },
120 | "language_info": {
121 | "codemirror_mode": {
122 | "name": "ipython",
123 | "version": 3
124 | },
125 | "file_extension": ".py",
126 | "mimetype": "text/x-python",
127 | "name": "python",
128 | "nbconvert_exporter": "python",
129 | "pygments_lexer": "ipython3",
130 | "version": "3.10.6"
131 | }
132 | },
133 | "nbformat": 4,
134 | "nbformat_minor": 4
135 | }
136 |
--------------------------------------------------------------------------------
/2024_2025/Lesson2/Exercises2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. In particular:\n",
10 | "\n",
11 | "- create a `get_valid_mutation` function which takes as input a list of mutations and returns a new list containing only the valid mutations (try to use the `startswith` method to check for the presence of the `p.` prefix);\n",
12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n",
13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation; note that `lstrip('p.')` removes any leading `p` and `.` characters, not just the literal prefix)."
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": null,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "mutations = [\n",
23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n",
24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n",
25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n",
26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n",
27 | "]\n",
28 | "\n",
29 | "aa_3L_to_1L = {\n",
30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
34 | "}"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": null,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "# TO DO"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "Set of valid mutations that could belong to the HLA class I histocompatibility antigen protein:\n",
51 | "\n",
52 | "`['p.Pro39Arg', 'p.Gly40Ile', 'p.Leu19Gly', 'p.Val49Ile', 'p.Asn90Asp', 'p.Phe133His', 'p.Leu134Cys', 'p.Glu190Ser', 'p.Gln248Val', 'p.Thr249Ile']`"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "### Exercise\n",
60 | "\n",
61 | "Write a function which generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins."
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "# TO DO"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "### Exercise\n",
78 | "\n",
79 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method."
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "# TO DO"
89 | ]
90 | }
91 | ],
92 | "metadata": {
93 | "kernelspec": {
94 | "display_name": "Python 3 (ipykernel)",
95 | "language": "python",
96 | "name": "python3"
97 | },
98 | "language_info": {
99 | "codemirror_mode": {
100 | "name": "ipython",
101 | "version": 3
102 | },
103 | "file_extension": ".py",
104 | "mimetype": "text/x-python",
105 | "name": "python",
106 | "nbconvert_exporter": "python",
107 | "pygments_lexer": "ipython3",
108 | "version": "3.10.6"
109 | }
110 | },
111 | "nbformat": 4,
112 | "nbformat_minor": 4
113 | }
114 |
--------------------------------------------------------------------------------
/2024_2025/Lesson3/Exercises3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n",
10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "# TO DO"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Exercise\n",
27 | "\n",
28 | "Write a function to remove duplicates in a list."
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "# TO DO"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### Exercise\n",
45 | "\n",
46 | "Write a function to calculate the identity between 2 sequences."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# TO DO"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3 (ipykernel)",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.10.6"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 4
80 | }
81 |
--------------------------------------------------------------------------------
/2024_2025/Lesson4/Exercises4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Compute all pair-wise identities (number of identical character pairs)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "sequences = [\n",
19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n",
20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n",
21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n",
22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n",
23 | "]\n",
24 | "\n",
25 | "# TO DO"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "### Exercise\n",
33 | "\n",
34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n",
35 | "- read the genetic code from the [genetic_code.tsv](../data/genetic_code.tsv) file;\n",
36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with one of its corresponding codons (several codons can encode the same amino acid)."
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "# TO DO"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "### Exercise\n",
53 | "\n",
54 | "Print the index of the first occurrence of the ATG codon in `dna_seq`.\n",
55 | "\n",
56 | "Try with and without using the `find()` method on strings.\n",
57 | "\n",
58 | "Do the same with the ribonucleotide sequence in the `P04439.rna.fasta` file (manage the `U` <-> `T` conversion)."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "dna_seq = 'AAAAATCCCGAGGCGGCAUGTATATAGGGCTCCGGAGGCGTAATATAAAA'\n",
68 | "\n",
69 | "# TODO"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "kernelspec": {
75 | "display_name": "Python 3 (ipykernel)",
76 | "language": "python",
77 | "name": "python3"
78 | },
79 | "language_info": {
80 | "codemirror_mode": {
81 | "name": "ipython",
82 | "version": 3
83 | },
84 | "file_extension": ".py",
85 | "mimetype": "text/x-python",
86 | "name": "python",
87 | "nbconvert_exporter": "python",
88 | "pygments_lexer": "ipython3",
89 | "version": "3.10.6"
90 | }
91 | },
92 | "nbformat": 4,
93 | "nbformat_minor": 4
94 | }
95 |
--------------------------------------------------------------------------------
/2024_2025/Lesson5/Exercises5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Write a function to search motifs in a sequence.\n",
10 | "\n",
11 | "Try with and without using the `re` module."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 1,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCATCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTG
GCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n",
21 | "\n",
22 | "consensus_motifs = {\n",
23 | " 'motif1': 'AGGAG[GT]',\n",
24 | " 'motif2': 'T[AT]AAT',\n",
25 | " 'motif3': 'GG.A.T[AG]'\n",
26 | "}"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "Possible printed output:\n",
34 | "```\n",
35 | "AGGAG[GT]\n",
36 | "\t(969, 975) AGGAGG\n",
37 | "\t(1153, 1159) AGGAGG\n",
38 | "\t(1339, 1345) AGGAGT\n",
39 | "\t(1587, 1593) AGGAGG\n",
40 | "\t(1881, 1887) AGGAGG\n",
41 | "\t(1941, 1947) AGGAGG\n",
42 | "T[AT]AAT\n",
43 | "\t(50, 55) TAAAT\n",
44 | "\t(1098, 1103) TAAAT\n",
45 | "\t(1276, 1281) TAAAT\n",
46 | "GG.A.T[AG]\n",
47 | "\t(248, 255) GGTACTG\n",
48 | "\t(983, 990) GGAAATA\n",
49 | "\t(1910, 1917) GGGACTG\n",
50 | "\t(1980, 1987) GGCAGTG\n",
51 | "```\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# TO DO"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "### Exercise\n",
68 | "\n",
69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 2,
75 | "metadata": {},
76 | "outputs": [],
77 | "source": [
78 | "aa_3L_to_1L = {\n",
79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n",
80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n",
81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n",
82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n",
83 | "}\n",
84 | "\n",
85 | "#aa_1L_to_3L['A'] --> 'ALA'"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "# TO DO"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {},
100 | "source": [
101 | "### Exercise\n",
102 | "\n",
103 | "Write a function to remove invalid amino acids from a protein sequence."
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# TO DO"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3 (ipykernel)",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.10.6"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 4
137 | }
138 |
--------------------------------------------------------------------------------
/2024_2025/Lesson6/Exercises6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. Do that in 2 ways:\n",
10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n",
11 | "- consult the Pandas documentation to find out how to load such a file directly into a Pandas `DataFrame`, and do that."
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 3,
17 | "metadata": {},
18 | "outputs": [
19 | {
20 | "data": {
21 | "text/html": [
22 | "\n",
23 | "\n",
36 | "
\n",
37 | " \n",
38 | " \n",
39 | " | \n",
40 | " UUU | \n",
41 | " F | \n",
42 | " Phe | \n",
43 | " Phenylalanine | \n",
44 | "
\n",
45 | " \n",
46 | " \n",
47 | " \n",
48 | " 0 | \n",
49 | " UUC | \n",
50 | " F | \n",
51 | " Phe | \n",
52 | " Phenylalanine | \n",
53 | "
\n",
54 | " \n",
55 | " 1 | \n",
56 | " UUA | \n",
57 | " L | \n",
58 | " Leu | \n",
59 | " Leucine | \n",
60 | "
\n",
61 | " \n",
62 | " 2 | \n",
63 | " UUG | \n",
64 | " L | \n",
65 | " Leu | \n",
66 | " Leucine | \n",
67 | "
\n",
68 | " \n",
69 | " 3 | \n",
70 | " CUU | \n",
71 | " L | \n",
72 | " Leu | \n",
73 | " Leucine | \n",
74 | "
\n",
75 | " \n",
76 | " 4 | \n",
77 | " CUC | \n",
78 | " L | \n",
79 | " Leu | \n",
80 | " Leucine | \n",
81 | "
\n",
82 | " \n",
83 | " ... | \n",
84 | " ... | \n",
85 | " ... | \n",
86 | " ... | \n",
87 | " ... | \n",
88 | "
\n",
89 | " \n",
90 | " 58 | \n",
91 | " AGG | \n",
92 | " R | \n",
93 | " Arg | \n",
94 | " Arginine | \n",
95 | "
\n",
96 | " \n",
97 | " 59 | \n",
98 | " GGU | \n",
99 | " G | \n",
100 | " Gly | \n",
101 | " Glycine | \n",
102 | "
\n",
103 | " \n",
104 | " 60 | \n",
105 | " GGC | \n",
106 | " G | \n",
107 | " Gly | \n",
108 | " Glycine | \n",
109 | "
\n",
110 | " \n",
111 | " 61 | \n",
112 | " GGA | \n",
113 | " G | \n",
114 | " Gly | \n",
115 | " Glycine | \n",
116 | "
\n",
117 | " \n",
118 | " 62 | \n",
119 | " GGG | \n",
120 | " G | \n",
121 | " Gly | \n",
122 | " Glycine | \n",
123 | "
\n",
124 | " \n",
125 | "
\n",
126 | "
63 rows × 4 columns
\n",
127 | "
"
128 | ],
129 | "text/plain": [
130 | " UUU F Phe Phenylalanine\n",
131 | "0 UUC F Phe Phenylalanine\n",
132 | "1 UUA L Leu Leucine\n",
133 | "2 UUG L Leu Leucine\n",
134 | "3 CUU L Leu Leucine\n",
135 | "4 CUC L Leu Leucine\n",
136 | ".. ... .. ... ...\n",
137 | "58 AGG R Arg Arginine\n",
138 | "59 GGU G Gly Glycine\n",
139 | "60 GGC G Gly Glycine\n",
140 | "61 GGA G Gly Glycine\n",
141 | "62 GGG G Gly Glycine\n",
142 | "\n",
143 | "[63 rows x 4 columns]"
144 | ]
145 | },
146 | "execution_count": 3,
147 | "metadata": {},
148 | "output_type": "execute_result"
149 | }
150 | ],
151 | "source": [
152 | "import pandas as pd\n",
153 | "\n",
154 | "# TO DO"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### Exercise\n",
162 | "\n",
163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with both Python lists and NumPy arrays, and measure the execution times."
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 4,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "# TO DO"
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "### Exercise\n",
180 | "\n",
181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory."
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 6,
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "# TO DO"
191 | ]
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "Python 3 (ipykernel)",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.10.6"
211 | }
212 | },
213 | "nbformat": 4,
214 | "nbformat_minor": 4
215 | }
216 |
--------------------------------------------------------------------------------
/2024_2025/Lesson7/Exercises7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Exercise\n",
8 | "\n",
9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "# TO DO"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "### Exercise\n",
26 | "\n",
27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "# TO DO"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Exercise\n",
44 | "\n",
45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put all the DP values (182, 196, 275, ...) in a Python list."
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 3,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "# TO DO"
55 | ]
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3 (ipykernel)",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.10.6"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 4
79 | }
80 |
--------------------------------------------------------------------------------
/2024_2025/data/P04439.fasta:
--------------------------------------------------------------------------------
1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2
2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF
3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ
4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL
5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT
6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL
7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL
8 | TACKV
9 |
--------------------------------------------------------------------------------
/2024_2025/data/brca_transcripts.txt:
--------------------------------------------------------------------------------
1 | transcript_id biotype bp aa
2 | ENST00000352993.7 Protein coding 3668 721
3 | ENST00000354071.7 Protein coding 4497 1399
4 | ENST00000461221.5 Nonsense mediated decay 5693 63
5 | ENST00000461574.1 Protein coding 726 242
6 | ENST00000461798.5 Nonsense mediated decay 582 63
7 |
--------------------------------------------------------------------------------
/2024_2025/data/genetic_code.tsv:
--------------------------------------------------------------------------------
1 | UUU F Phe Phenylalanine
2 | UUC F Phe Phenylalanine
3 | UUA L Leu Leucine
4 | UUG L Leu Leucine
5 | CUU L Leu Leucine
6 | CUC L Leu Leucine
7 | CUA L Leu Leucine
8 | CUG L Leu Leucine
9 | AUU I Ile Isoleucine
10 | AUC I Ile Isoleucine
11 | AUA I Ile Isoleucine
12 | AUG M Met Methionine (Start)
13 | GUU V Val Valine
14 | GUC V Val Valine
15 | GUA V Val Valine
16 | GUG V Val Valine
17 | UCU S Ser Serine
18 | UCC S Ser Serine
19 | UCA S Ser Serine
20 | UCG S Ser Serine
21 | CCU P Pro Proline
22 | CCC P Pro Proline
23 | CCA P Pro Proline
24 | CCG P Pro Proline
25 | ACU T Thr Threonine
26 | ACC T Thr Threonine
27 | ACA T Thr Threonine
28 | ACG T Thr Threonine
29 | GCU A Ala Alanine
30 | GCC A Ala Alanine
31 | GCA A Ala Alanine
32 | GCG A Ala Alanine
33 | UAU Y Tyr Tyrosine
34 | UAC Y Tyr Tyrosine
35 | UAA X Stop (Stop)
36 | UAG X Stop (Stop)
37 | CAU H His Histidine
38 | CAC H His Histidine
39 | CAA Q Gln Glutamine
40 | CAG Q Gln Glutamine
41 | AAU N Asn Asparagine
42 | AAC N Asn Asparagine
43 | AAA K Lys Lysine
44 | AAG K Lys Lysine
45 | GAU D Asp Aspartic acid
46 | GAC D Asp Aspartic acid
47 | GAA E Glu Glutamic acid
48 | GAG E Glu Glutamic acid
49 | UGU C Cys Cysteine
50 | UGC C Cys Cysteine
51 | UGA X Stop (Stop)
52 | UGG W Trp Tryptophan
53 | CGU R Arg Arginine
54 | CGC R Arg Arginine
55 | CGA R Arg Arginine
56 | CGG R Arg Arginine
57 | AGU S Ser Serine
58 | AGC S Ser Serine
59 | AGA R Arg Arginine
60 | AGG R Arg Arginine
61 | GGU G Gly Glycine
62 | GGC G Gly Glycine
63 | GGA G Gly Glycine
64 | GGG G Gly Glycine
--------------------------------------------------------------------------------
/2024_2025/data/my_utils.py:
--------------------------------------------------------------------------------
1 | import random
2 |
def generate_string(n, alphabet):
    """Build a random string of ``n`` symbols drawn from ``alphabet``.

    Parameters
    ----------
    n : int
        Number of symbols to generate. ``range(n)`` is empty for ``n <= 0``,
        so the empty string is returned in that case.
    alphabet : sequence of str
        Non-empty sequence (for example a ``str`` or a ``list`` of
        characters) to sample from, with replacement, via ``random.choice``.

    Returns
    -------
    str
        The concatenation of the ``n`` chosen symbols.

    Raises
    ------
    IndexError
        If ``alphabet`` is empty and ``n > 0`` (raised by ``random.choice``).
    """
    # Exactly one ``random.choice`` call per symbol, in order, so the result
    # for a given seed is the same as with an explicit accumulation loop;
    # ``join`` avoids re-building the growing string on every iteration.
    return "".join(random.choice(alphabet) for _ in range(n))
9 |
--------------------------------------------------------------------------------
/2024_2025/data/uniprot_ids.txt:
--------------------------------------------------------------------------------
1 | Q13188
2 | O00444
3 | P49760
4 | PYYY4Z
5 | Q13627
6 | Q02156
7 |
--------------------------------------------------------------------------------
/2024_2025/data/validation.py:
--------------------------------------------------------------------------------
def valid_sequence(sequence, valid_characters):
    """Tell whether every symbol of ``sequence`` is an allowed one.

    Each symbol is upper-cased before the membership test, so the check is
    case-insensitive with respect to ``sequence``; entries of
    ``valid_characters`` should therefore be upper-case.

    Parameters
    ----------
    sequence : iterable of str
        The symbols to check (typically a ``str``).
    valid_characters : container of str
        The allowed symbols; anything supporting the ``in`` operator.

    Returns
    -------
    bool
        ``True`` when all symbols are allowed (this includes an empty
        ``sequence``), ``False`` as soon as one symbol is not.
    """
    # ``all`` short-circuits on the first disallowed symbol.
    return all(symbol.upper() in valid_characters for symbol in sequence)
7 |
def validate_dna(sequence):
    """Return whether ``sequence`` consists only of the DNA bases A, T, G, C.

    The check is delegated to ``valid_sequence``, which upper-cases each
    character before testing it.
    """
    dna_bases = ['A', 'T', 'G', 'C']
    return valid_sequence(sequence, dna_bases)
10 |
def validate_rna(sequence):
    """Return whether ``sequence`` consists only of the RNA bases A, U, G, C.

    The check is delegated to ``valid_sequence``, which upper-cases each
    character before testing it.
    """
    rna_bases = ['A', 'U', 'G', 'C']
    return valid_sequence(sequence, rna_bases)
13 |
def validate_protein(sequence):
    """Return whether ``sequence`` uses only the 20 one-letter amino acid codes.

    The check is delegated to ``valid_sequence``, which upper-cases each
    character before testing it.
    """
    # One-letter codes accepted as amino acids by this check.
    amino_acids = [
        'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
        'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
    ]
    return valid_sequence(sequence, amino_acids)
22 |
--------------------------------------------------------------------------------
/2024_2025/images/Integer.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2024_2025/images/Integer.jpeg
--------------------------------------------------------------------------------
/2024_2025/images/List.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2024_2025/images/List.jpeg
--------------------------------------------------------------------------------
/ExamResults/2020.12.23.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0027557 | 27 |
4 | | 0278947 | 29 |
5 | | 0280655 | 30 |
6 | | 0281512 | 26 |
7 | | 0285818 | 28 |
8 | | 0287922 | 30 |
9 |
--------------------------------------------------------------------------------
/ExamResults/2021.01.21.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0287959 | 27 |
4 |
--------------------------------------------------------------------------------
/ExamResults/2021.04.08.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0301264 | 30 |
4 | | 0301247 | 30 |
5 | | 0208810 | 21 |
6 |
--------------------------------------------------------------------------------
/ExamResults/2021.06.10.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0279152 | 28 |
4 |
--------------------------------------------------------------------------------
/ExamResults/2021.09.23.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|--------|
3 | | 0296657 | absent |
4 | | 0259940 | 27 |
5 | | 0292176 | absent |
6 |
--------------------------------------------------------------------------------
/ExamResults/2021.12.20.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0291176 | 30 |
4 | | 0291151 | 28 |
5 | | 0292143 | 26 |
6 |
--------------------------------------------------------------------------------
/ExamResults/2022.01.13.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0293463 | 28 |
4 |
--------------------------------------------------------------------------------
/ExamResults/2022.04.22.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|--------|
3 | | 0294083 | 29 |
4 | | 0296657 | 28 |
5 | | 0299326 | absent |
6 | | 0292176 | 24 |
7 | | 0292378 | 30 |
8 |
--------------------------------------------------------------------------------
/ExamResults/2022.07.14.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|--------|
3 | | 0299326 | absent |
4 |
--------------------------------------------------------------------------------
/ExamResults/2022.09.08.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0299326 | 28 |
4 |
--------------------------------------------------------------------------------
/ExamResults/2022.12.22.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0307827 | 29 |
4 | | 0316609 | 30 |
5 | | 0309343 | 30L |
6 |
--------------------------------------------------------------------------------
/ExamResults/2022.12.23.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0316001 | 25 |
4 | | 0302429 | 30 |
5 | | 0316603 | 28 |
6 | | 0316680 | 30 |
7 | | 0317105 | 27 |
8 |
--------------------------------------------------------------------------------
/ExamResults/2023.02.09.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0315940 | 27 |
4 | | 0292781 | 27 |
5 |
--------------------------------------------------------------------------------
/ExamResults/2023.06.12.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0315940 | 29 |
4 |
--------------------------------------------------------------------------------
/ExamResults/2023.09.07.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0329476 | 30 |
4 |
--------------------------------------------------------------------------------
/ExamResults/2024.01.15.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0336987 | 27 |
4 | | 0323251 | 27 |
5 |
--------------------------------------------------------------------------------
/ExamResults/2024.06.11.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0329807 | 26 |
4 |
--------------------------------------------------------------------------------
/ExamResults/2024.12.19.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0350219 | 30 |
4 | | 0345526 | 29 |
5 |
--------------------------------------------------------------------------------
/ExamResults/2025.02.13.md:
--------------------------------------------------------------------------------
1 | | Student ID | Grade |
2 | |------------|-------|
3 | | 0334169 | 25 |
4 |
--------------------------------------------------------------------------------
/ExamResults/plot_date_vs_grade.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import os
3 | import pandas as pd
4 | from datetime import datetime
5 | import seaborn as sns
6 |
7 | # Function to read the MD file and return a DataFrame
def read_md_file(filepath):
    """Parse one exam-results Markdown table into a DataFrame.

    The first two lines of the file (table header and separator row) are
    skipped. Every remaining line is split on ``|``; lines with fewer than
    three pieces are ignored. Rows whose grade is ``absent`` are dropped,
    the grade ``30L`` is stored as 32, and any other grade is converted
    with ``int``.

    Parameters:
        filepath: path of the Markdown file to read.

    Returns:
        A DataFrame with the columns ``StudentID`` (str) and ``Grade`` (int).

    Raises:
        ValueError: if a grade is neither ``absent``, ``30L`` nor an integer.
    """
    with open(filepath, 'r') as handle:
        rows = handle.readlines()[2:]  # drop the header and the underline

    student_ids = []
    grades = []
    for row in rows:
        cells = row.strip().split('|')
        if len(cells) < 3:
            continue
        raw_grade = cells[2].strip()
        if raw_grade == 'absent':
            # Absent students contribute no data point.
            continue
        # Convert the grade first so that a malformed value aborts before
        # the student ID is recorded.
        grades.append(32 if raw_grade == '30L' else int(raw_grade))
        student_ids.append(cells[1].strip())
    return pd.DataFrame({'StudentID': student_ids, 'Grade': grades})
27 |
# The Markdown tables live in the same folder as this script
directory = os.path.dirname(os.path.realpath(__file__))

# One (Date, Grade) DataFrame per exam file
dfs = []

# Each "<YYYY.MM.DD>.md" file in the folder holds the results of one exam
for filename in os.listdir(directory):
    if not filename.endswith(".md"):
        continue
    date_str = filename[:-3]  # file name without the .md extension
    date = datetime.strptime(date_str, '%Y.%m.%d').date()  # keep the date part only
    filepath = os.path.join(directory, filename)
    df = read_md_file(filepath)
    df['Date'] = date
    dfs.append(df[['Date', 'Grade']])

# Merge all exams into one table, ordered chronologically
data = pd.concat(dfs, ignore_index=True).sort_values(by='Date')

# Plot style and canvas
sns.set_style("whitegrid")
plt.figure(figsize=(20, 12))

# Grade distribution per exam date
sns.boxplot(data=data, x='Date', y='Grade', color="lightblue", width=0.5)

# Overlay the individual grades on top of the boxes
sns.swarmplot(data=data, x='Date', y='Grade', color="navy", size=6, alpha=0.6)

# Title, axis labels and tick formatting
plt.title('Data Structures for Bioinformatics Exam\nDistribution of Grades Over Time', fontsize=28, pad=20)
plt.xlabel('Exam Date', fontsize=24, labelpad=15)
plt.ylabel('Grade', fontsize=24, labelpad=15)
plt.xticks(rotation=45, ha='right', fontsize=20)
plt.yticks(fontsize=20)

# Larger tick labels on both axes
plt.tick_params(axis='both', which='major', labelsize=20)

# Dashed horizontal line at the mean of all plotted grades, labelled at the right edge
mean_grade = data['Grade'].mean()
plt.axhline(y=mean_grade, color='red', linestyle='--', alpha=0.7, linewidth=2)
plt.text(
    plt.xlim()[1],
    mean_grade,
    f' Mean: {mean_grade:.2f}',
    verticalalignment='center',
    fontsize=20,
    color='red',
    fontweight='bold',
)

# Fit everything into the figure and show it
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DataStructuresForBioinformatics
2 |
3 | Material for the **Data Structures for Bioinformatics** course (Master’s degree in Bioinformatics, University of Rome Tor Vergata).
4 |
5 | **Class schedule (2024/2025)**: every Thursday, 15:00–17:00 (GMT+2)
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------