├── .gitignore ├── 2020_2021 ├── Lesson1 │ ├── Exercises1.ipynb │ └── Lesson1.ipynb ├── Lesson2 │ ├── Exercises2.ipynb │ └── Lesson2.ipynb ├── Lesson3 │ ├── Exercises3.ipynb │ └── Lesson3.ipynb ├── Lesson4 │ ├── Exercises4.ipynb │ └── Lesson4.ipynb ├── Lesson5 │ ├── Exercises5.ipynb │ ├── Lesson5.ipynb │ └── protein_sequences │ │ ├── O00444.fasta │ │ ├── P49760.fasta │ │ ├── Q02156.fasta │ │ ├── Q13188.fasta │ │ └── Q13627.fasta ├── Lesson6 │ ├── Exercises6.ipynb │ └── Lesson6.ipynb ├── Lesson7 │ ├── Exercises7.ipynb │ └── Lesson7.ipynb ├── Lesson8 │ └── Lesson8.ipynb ├── data │ ├── P04439.fasta │ ├── RepeatMasker.subset.bed │ ├── brca_transcripts.txt │ ├── cervical.csv │ ├── genetic_code.tsv │ ├── trio.2010_06.ychr.sites.vcf │ ├── uniprot_ids.txt │ ├── utils.py │ └── validation.py └── images │ ├── Integer.jpeg │ └── List.jpeg ├── 2021_2022 ├── Lesson1 │ ├── Exercises1.ipynb │ └── Lesson1.ipynb ├── Lesson2 │ ├── Exercises2.ipynb │ └── Lesson2.ipynb ├── Lesson3 │ ├── Exercises3.ipynb │ └── Lesson3.ipynb ├── Lesson4 │ ├── Exercises4.ipynb │ └── Lesson4.ipynb ├── Lesson5 │ ├── Exercises5.ipynb │ └── Lesson5.ipynb ├── Lesson6 │ ├── Exercises6.ipynb │ └── Lesson6.ipynb ├── Lesson7 │ ├── Exercises7.ipynb │ └── Lesson7.ipynb ├── Lesson8 │ └── Lesson8.ipynb ├── data │ ├── P04439.fasta │ ├── RepeatMasker.subset.bed │ ├── brca_transcripts.txt │ ├── cervical.csv │ ├── genetic_code.tsv │ ├── my_utils.py │ ├── trio.2010_06.ychr.sites.vcf │ ├── uniprot_ids.txt │ └── validation.py └── images │ ├── Integer.jpeg │ └── List.jpeg ├── 2022_2023 ├── Lesson1 │ ├── Exercises1.ipynb │ └── Lesson1.ipynb ├── Lesson2 │ ├── Exercises2.ipynb │ └── Lesson2.ipynb ├── Lesson3 │ ├── Exercises3.ipynb │ └── Lesson3.ipynb ├── Lesson4 │ ├── Exercises4.ipynb │ └── Lesson4.ipynb ├── Lesson5 │ ├── Exercises5.ipynb │ └── Lesson5.ipynb ├── Lesson6 │ ├── Exercises6.ipynb │ └── Lesson6.ipynb ├── Lesson7 │ ├── Exercises7.ipynb │ └── Lesson7.ipynb ├── Lesson8 │ └── Lesson8.ipynb ├── data │ ├── P04439.fasta │ 
├── RepeatMasker.subset.bed │ ├── brca_transcripts.txt │ ├── cervical.csv │ ├── genetic_code.tsv │ ├── my_utils.py │ ├── trio.2010_06.ychr.sites.vcf │ ├── uniprot_ids.txt │ └── validation.py └── images │ ├── Integer.jpeg │ └── List.jpeg ├── 2023_2024 ├── Lesson1 │ ├── Exercises1.ipynb │ └── Lesson1.ipynb ├── Lesson2 │ ├── Exercises2.ipynb │ └── Lesson2.ipynb ├── Lesson3 │ ├── Exercises3.ipynb │ └── Lesson3.ipynb ├── Lesson4 │ ├── Exercises4.ipynb │ └── Lesson4.ipynb ├── Lesson5 │ ├── Exercises5.ipynb │ └── Lesson5.ipynb ├── Lesson6 │ ├── Exercises6.ipynb │ └── Lesson6.ipynb ├── Lesson7 │ ├── Exercises7.ipynb │ └── Lesson7.ipynb ├── Lesson8 │ └── Lesson8.ipynb ├── data │ ├── P04439.fasta │ ├── RepeatMasker.subset.bed │ ├── brca_transcripts.txt │ ├── genetic_code.tsv │ ├── my_utils.py │ ├── uniprot_ids.txt │ └── validation.py └── images │ ├── Integer.jpeg │ └── List.jpeg ├── 2024_2025 ├── Lesson1 │ ├── Exercises1.ipynb │ └── Lesson1.ipynb ├── Lesson2 │ ├── Exercises2.ipynb │ └── Lesson2.ipynb ├── Lesson3 │ ├── Exercises3.ipynb │ └── Lesson3.ipynb ├── Lesson4 │ ├── Exercises4.ipynb │ └── Lesson4.ipynb ├── Lesson5 │ ├── Exercises5.ipynb │ └── Lesson5.ipynb ├── Lesson6 │ ├── Exercises6.ipynb │ └── Lesson6.ipynb ├── Lesson7 │ ├── Exercises7.ipynb │ └── Lesson7.ipynb ├── Lesson8 │ └── Lesson8.ipynb ├── data │ ├── P04439.fasta │ ├── RepeatMasker.subset.bed │ ├── brca_transcripts.txt │ ├── genetic_code.tsv │ ├── my_utils.py │ ├── uniprot_ids.txt │ └── validation.py └── images │ ├── Integer.jpeg │ └── List.jpeg ├── ExamResults ├── 2020.12.23.md ├── 2021.01.21.md ├── 2021.04.08.md ├── 2021.06.10.md ├── 2021.09.23.md ├── 2021.12.20.md ├── 2022.01.13.md ├── 2022.04.22.md ├── 2022.07.14.md ├── 2022.09.08.md ├── 2022.12.22.md ├── 2022.12.23.md ├── 2023.02.09.md ├── 2023.06.12.md ├── 2023.09.07.md ├── 2024.01.15.md ├── 2024.06.11.md ├── 2024.12.19.md ├── 2025.02.13.md └── plot_date_vs_grade.py └── README.md /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /2020_2021/Lesson1/Exercises1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "The following list is corrupted:" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "mutations = [\n", 18 | " 'p.Ser31Ala',\n", 19 | " 'p.Pro38Leu',\n", 20 | " 'p.Asn100Lys',\n", 21 | " 'p.LEU110VAL',\n", 22 | " 13,\n", 23 | " 4.0,\n", 24 | " True,\n", 25 | " 'p.Tyr341Leu',\n", 26 | " 'AUG',\n", 27 | " 'p.Tyr0Le',\n", 28 | " 'p.Asn1.3Lys',\n", 29 | " 'p.Arg0Leu'\n", 30 | "]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "How can you check which mutations are valid? Put the valid mutations in a new list. 
A reasonable output could be:\n", 38 | "\n", 39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "#### Tips" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "valid_aminos = [\n", 56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n", 57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n", 58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n", 59 | "]\n", 60 | "\n", 61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n", 62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n", 63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "valid_mutations = []\n", 73 | "\n", 74 | "# TODO" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Exercise\n", 82 | "\n", 83 | "Write a script to check if a protein sequence is valid." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# TODO" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Exercise\n", 100 | "\n", 101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# TODO" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.8.3" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } -------------------------------------------------------------------------------- /2020_2021/Lesson2/Exercises2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. 
In particular:\n", 10 | "\n", 11 | "- create a `get_valid_mutation` function which takes as input a list of mutations, and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n", 12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n", 13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "mutations = [\n", 23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n", 24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n", 25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n", 26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n", 27 | "]\n", 28 | "\n", 29 | "aa_3L_to_1L = {\n", 30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 34 | "}" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# TO DO" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Exercise\n", 51 | "\n", 52 | "Write a function which generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins." 
53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# TO DO" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Exercise\n", 69 | "\n", 70 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# TO DO" 80 | ] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.8.3" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 4 104 | } 105 | -------------------------------------------------------------------------------- /2020_2021/Lesson3/Exercises3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n", 10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# TO DO" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Exercise\n", 27 | "\n", 28 | "Write a function to remove duplicates in a list." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# TO DO" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Exercise\n", 45 | "\n", 46 | "Write a function to calculate the identity between 2 sequences." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# TO DO" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.8.3" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 4 80 | } -------------------------------------------------------------------------------- /2020_2021/Lesson4/Exercises4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Compute all pair-wise identities (number of identical character pairs)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "sequences = [\n", 19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n", 20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n", 21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n", 22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n", 23 | "]\n", 24 | "\n", 25 | "# TO DO" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Exercise\n", 33 | "\n", 34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n", 35 | "- read the genetic code in the [genetic_code.tsv](../data/genetic_code.tsv) file\n", 36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# TO DO" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Exercise\n", 53 | "\n", 54 | "Print the index of the first occurrence of the ATG codon.\n", 55 | "\n", 56 | "Try with and without using the `find()` method on strings." 
57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# TO DO" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.8.3" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 4 90 | } 91 | -------------------------------------------------------------------------------- /2020_2021/Lesson5/Exercises5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Write a function to search motifs in a sequence.\n", 10 | "\n", 11 | "Try with and without using the `re` module." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCA
TCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTGGCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n", 21 | "\n", 22 | "consensus_motifs = {\n", 23 | " 'motif1': 'AGGAG[GT]',\n", 24 | " 'motif2': 'T[AT]AAT',\n", 25 | " 'motif3': 'GG.A.T[AG]'\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Possible printed output:\n", 34 | "```\n", 35 | "AGGAG[GT]\n", 36 | "\t(969, 975) AGGAGG\n", 37 | "\t(1153, 1159) AGGAGG\n", 38 | "\t(1339, 1345) AGGAGT\n", 39 | "\t(1587, 1593) AGGAGG\n", 40 | "\t(1881, 1887) AGGAGG\n", 41 | "\t(1941, 1947) AGGAGG\n", 42 | "T[AT]AAT\n", 43 | "\t(50, 55) TAAAT\n", 44 | "\t(1098, 1103) TAAAT\n", 45 | "\t(1276, 1281) TAAAT\n", 46 | "GG.A.T[AG]\n", 47 | "\t(248, 255) GGTACTG\n", 48 | "\t(983, 990) GGAAATA\n", 49 | "\t(1910, 1917) GGGACTG\n", 50 | "\t(1980, 1987) GGCAGTG\n", 51 | "```\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# TO DO" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Exercise\n", 68 | "\n", 69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "aa_3L_to_1L = {\n", 79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 83 | "}\n", 84 | "\n", 85 | "#aa_1L_to_3L['A'] --> 'ALA'" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# TO DO" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Exercise\n", 102 | "\n", 103 | "Write a function to remove invalid amino acids from a protein." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# TO DO" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.8.3" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 4 137 | } -------------------------------------------------------------------------------- /2020_2021/Lesson5/protein_sequences/O00444.fasta: -------------------------------------------------------------------------------- 1 | >sp|O00444|PLK4_HUMAN Serine/threonine-protein kinase PLK4 OS=Homo sapiens OX=9606 GN=PLK4 PE=1 SV=3 2 | MATCIGEKIEDFKVGNLLGKGSFAGVYRAESIHTGLEVAIKMIDKKAMYKAGMVQRVQNE 3 | VKIHCQLKHPSILELYNYFEDSNYVYLVLEMCHNGEMNRYLKNRVKPFSENEARHFMHQI 4 | 
ITGMLYLHSHGILHRDLTLSNLLLTRNMNIKIADFGLATQLKMPHEKHYTLCGTPNYISP 5 | EIATRSAHGLESDVWSLGCMFYTLLIGRPPFDTDTVKNTLNKVVLADYEMPSFLSIEAKD 6 | LIHQLLRRNPADRLSLSSVLDHPFMSRNSSTKSKDLGTVEDSIDSGHATISTAITASSST 7 | SISGSLFDKRRLLIGQPLPNKMTVFPKNKSSTDFSSSGDGNSFYTQWGNQETSNSGRGRV 8 | IQDAEERPHSRYLRRAYSSDRSGTSNSQSQAKTYTMERCHSAEMLSVSKRSGGGENEERY 9 | SPTDNNANIFNFFKEKTSSSSGSFERPDNNQALSNHLCPGKTPFPFADPTPQTETVQQWF 10 | GNLQINAHLRKTTEYDSISPNRDFQGHPDLQKDTSKNAWTDTKVKKNSDASDNAHSVKQQ 11 | NTMKYMTALHSKPEIIQQECVFGSDPLSEQSKTRGMEPPWGYQNRTLRSITSPLVAHRLK 12 | PIRQKTKKAVVSILDSEEVCVELVKEYASQEYVKEVLQISSDGNTITIYYPNGGRGFPLA 13 | DRPPSPTDNISRYSFDNLPEKYWRKYQYASRFVQLVRSKSPKITYFTRYAKCILMENSPG 14 | ADFEVWFYDGVKIHKTEDFIQVIEKTGKSYTLKSESEVNSLKEEIKMYMDHANEGHRICL 15 | ALESIISEEERKTRSAPFFPIIIGRKPGSTSSPKALSPPPSVDSNYPTRERASFNRMVMH 16 | SAASPTQAPILNPSMVTNEGLGLTTTASGTDISSNSLKDCLPKSAQLLKSVFVKNVGWAT 17 | QLTSGAVWVQFNDGSQLVVQAGVSSISYTSPNGQTTRYGENEKLPDYIKQKLQCLSSILL 18 | MFSNPTPNFH 19 | -------------------------------------------------------------------------------- /2020_2021/Lesson5/protein_sequences/P49760.fasta: -------------------------------------------------------------------------------- 1 | >sp|P49760|CLK2_HUMAN Dual specificity protein kinase CLK2 OS=Homo sapiens OX=9606 GN=CLK2 PE=1 SV=1 2 | MPHPRRYHSSERGSRGSYREHYRSRKHKRRRSRSWSSSSDRTRRRRREDSYHVRSRSSYD 3 | DRSSDRRVYDRRYCGSYRRNDYSRDRGDAYYDTDYRHSYEYQRENSSYRSQRSSRRKHRR 4 | RRRRSRTFSRSSSQHSSRRAKSVEDDAEGHLIYHVGDWLQERYEIVSTLGEGTFGRVVQC 5 | VDHRRGGARVALKIIKNVEKYKEAARLEINVLEKINEKDPDNKNLCVQMFDWFDYHGHMC 6 | ISFELLGLSTFDFLKDNNYLPYPIHQVRHMAFQLCQAVKFLHDNKLTHTDLKPENILFVN 7 | SDYELTYNLEKKRDERSVKSTAVRVVDFGSATFDHEHHSTIVSTRHYRAPEVILELGWSQ 8 | PCDVWSIGCIIFEYYVGFTLFQTHDNREHLAMMERILGPIPSRMIRKTRKQKYFYRGRLD 9 | WDENTSAGRYVRENCKPLRRYLTSEAEEHHQLFDLIESMLEYEPAKRLTLGEALQHPFFA 10 | RLRAEPPNKLWDSSRDISR 11 | -------------------------------------------------------------------------------- /2020_2021/Lesson5/protein_sequences/Q02156.fasta: 
-------------------------------------------------------------------------------- 1 | >sp|Q02156|KPCE_HUMAN Protein kinase C epsilon type OS=Homo sapiens OX=9606 GN=PRKCE PE=1 SV=1 2 | MVVFNGLLKIKICEAVSLKPTAWSLRHAVGPRPQTFLLDPYIALNVDDSRIGQTATKQKT 3 | NSPAWHDEFVTDVCNGRKIELAVFHDAPIGYDDFVANCTIQFEELLQNGSRHFEDWIDLE 4 | PEGRVYVIIDLSGSSGEAPKDNEERVFRERMRPRKRQGAVRRRVHQVNGHKFMATYLRQP 5 | TYCSHCRDFIWGVIGKQGYQCQVCTCVVHKRCHELIITKCAGLKKQETPDQVGSQRFSVN 6 | MPHKFGIHNYKVPTFCDHCGSLLWGLLRQGLQCKVCKMNVHRRCETNVAPNCGVDARGIA 7 | KVLADLGVTPDKITNSGQRRKKLIAGAESPQPASGSSPSEEDRSKSAPTSPCDQEIKELE 8 | NNIRKALSFDNRGEEHRAASSPDGQLMSPGENGEVRQGQAKRLGLDEFNFIKVLGKGSFG 9 | KVMLAELKGKDEVYAVKVLKKDVILQDDDVDCTMTEKRILALARKHPYLTQLYCCFQTKD 10 | RLFFVMEYVNGGDLMFQIQRSRKFDEPRSRFYAAEVTSALMFLHQHGVIYRDLKLDNILL 11 | DAEGHCKLADFGMCKEGILNGVTTTTFCGTPDYIAPEILQELEYGPSVDWWALGVLMYEM 12 | MAGQPPFEADNEDDLFESILHDDVLYPVWLSKEAVSILKAFMTKNPHKRLGCVASQNGED 13 | AIKQHPFFKEIDWVLLEQKKIKPPFKPRIKTKRDVNNFDQDFTREEPVLTLVDEAIVKQI 14 | NQEEFKGFSYFGEDLMP 15 | -------------------------------------------------------------------------------- /2020_2021/Lesson5/protein_sequences/Q13188.fasta: -------------------------------------------------------------------------------- 1 | >sp|Q13188|STK3_HUMAN Serine/threonine-protein kinase 3 OS=Homo sapiens OX=9606 GN=STK3 PE=1 SV=2 2 | MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIHKESGQVVAIKQVPV 3 | ESDLQEIIKEISIMQQCDSPYVVKYYGSYFKNTDLWIVMEYCGAGSVSDIIRLRNKTLIE 4 | DEIATILKSTLKGLEYLHFMRKIHRDIKAGNILLNTEGHAKLADFGVAGQLTDTMAKRNT 5 | VIGTPFWMAPEVIQEIGYNCVADIWSLGITSIEMAEGKPPYADIHPMRAIFMIPTNPPPT 6 | FRKPELWSDDFTDFVKKCLVKNPEQRATATQLLQHPFIKNAKPVSILRDLITEAMEIKAK 7 | RHEEQQRELEEEEENSDEDELDSHTMVKTSVESVGTMRATSTMSEGAQTMIEHNSTMLES 8 | DLGTMVINSEDEEEEDGTMKRNATSPQVQRPSFMDYFDKQDFKNKSHENCNQNMHEPFPM 9 | SKNVFPDNWKVPQDGDFDFLKNLSLEELQMRLKALDPMMEREIEELRQRYTAKRQPILDA 10 | MDAKKRRQQNF 11 | -------------------------------------------------------------------------------- /2020_2021/Lesson5/protein_sequences/Q13627.fasta: 
-------------------------------------------------------------------------------- 1 | >sp|Q13627|DYR1A_HUMAN Dual specificity tyrosine-phosphorylation-regulated kinase 1A OS=Homo sapiens OX=9606 GN=DYRK1A PE=1 SV=2 2 | MHTGGETSACKPSSVRLAPSFSFHAAGLQMAGQMPHSHQYSDRRQPNISDQQVSALSYSD 3 | QIQQPLTNQVMPDIVMLQRRMPQTFRDPATAPLRKLSVDLIKTYKHINEVYYAKKKRRHQ 4 | QGQGDDSSHKKERKVYNDGYDDDNYDYIVKNGEKWMDRYEIDSLIGKGSFGQVVKAYDRV 5 | EQEWVAIKIIKNKKAFLNQAQIEVRLLELMNKHDTEMKYYIVHLKRHFMFRNHLCLVFEM 6 | LSYNLYDLLRNTNFRGVSLNLTRKFAQQMCTALLFLATPELSIIHCDLKPENILLCNPKR 7 | SAIKIVDFGSSCQLGQRIYQYIQSRFYRSPEVLLGMPYDLAIDMWSLGCILVEMHTGEPL 8 | FSGANEVDQMNKIVEVLGIPPAHILDQAPKARKFFEKLPDGTWNLKKTKDGKREYKPPGT 9 | RKLHNILGVETGGPGGRRAGESGHTVADYLKFKDLILRMLDYDPKTRIQPYYALQHSFFK 10 | KTADEGTNTSNSVSTSPAMEQSQSSGTTSSTSSSSGGSSGTSNSGRARSDPTHQHRHSGG 11 | HFTAAVQAMDCETHSPQVRQQFPAPLGWSGTEAPTQVTVETHPVQETTFHVAPQQNALHH 12 | HHGNSSHHHHHHHHHHHHHGQQALGNRTRPRVYNSPTNSSSTQDSMEVGHSHHSMTSLSS 13 | STTSSSTSSSSTGNQGNQAYQNRPVAANTLDFGQNGAMDVNLTVYSNPRQETGIAGHPTY 14 | QFSANTGPAHYMTEGHLTMRQGADREESPMTGVCVQQSPVASS 15 | -------------------------------------------------------------------------------- /2020_2021/Lesson6/Exercises6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. Do that in 2 ways:\n", 10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n", 11 | "- check the documentation out to see how to load such a file format into a Pandas `DataFrame` and do that." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
UUUFPhePhenylalanine
0UUCFPhePhenylalanine
1UUALLeuLeucine
2UUGLLeuLeucine
3CUULLeuLeucine
4CUCLLeuLeucine
...............
58AGGRArgArginine
59GGUGGlyGlycine
60GGCGGlyGlycine
61GGAGGlyGlycine
62GGGGGlyGlycine
\n", 126 | "

63 rows × 4 columns

\n", 127 | "
" 128 | ], 129 | "text/plain": [ 130 | " UUU F Phe Phenylalanine\n", 131 | "0 UUC F Phe Phenylalanine\n", 132 | "1 UUA L Leu Leucine\n", 133 | "2 UUG L Leu Leucine\n", 134 | "3 CUU L Leu Leucine\n", 135 | "4 CUC L Leu Leucine\n", 136 | ".. ... .. ... ...\n", 137 | "58 AGG R Arg Arginine\n", 138 | "59 GGU G Gly Glycine\n", 139 | "60 GGC G Gly Glycine\n", 140 | "61 GGA G Gly Glycine\n", 141 | "62 GGG G Gly Glycine\n", 142 | "\n", 143 | "[63 rows x 4 columns]" 144 | ] 145 | }, 146 | "execution_count": 3, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "import pandas as pd\n", 153 | "\n", 154 | "# TO DO" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Exercise\n", 162 | "\n", 163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with Python lists and Numpy Arrays, and quantify the execution times." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# TO DO" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Exercise\n", 180 | "\n", 181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# TO DO" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.8.3" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } -------------------------------------------------------------------------------- /2020_2021/Lesson6/Lesson6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 6 - 2020/12/03" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Numpy\n", 15 | "[NumPy](https://numpy.org/) (short for *Numerical Python*) is a numerical library for Python which provides an efficient interface to store and operate on data." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### A Python Integer Is More Than Just an Integer\n", 23 | "A Python integer is a pointer to a position in memory containing all the Python object information, including the bytes that contain the integer value.\n", 24 | "\n", 25 | "![Integer.jpeg](../images/Integer.jpeg)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### A Python List Is More Than Just a List\n", 33 | "\n", 34 | "Because of Python's dynamic typing, we can create heterogeneous lists:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "[bool, str, float, int]" 46 | ] 47 | }, 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "my_list = [True, \"2\", 3.0, 4]\n", 55 | "\n", 56 | "[type(item) for item in my_list]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "But this flexibility comes at a cost.\n", 64 | "\n", 65 | "![List.jpeg](../images/List.jpeg)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In the special case that all variables are of the same type, much of this information is redundant: it can be much more efficient to store data in a fixed-type array." 
73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Fixed-Type Arrays in Python\n", 80 | "The built-in ``array`` module can be used to create arrays of a uniform type:" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 2, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 92 | ] 93 | }, 94 | "execution_count": 2, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "import array\n", 101 | "\n", 102 | "L = list(range(10))\n", 103 | "A = array.array('i', L) # i indicates integer values\n", 104 | "\n", 105 | "A" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Much more useful, however, is the ``numpy.ndarray`` object of the NumPy package which adds to this efficient *operations* on that data." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 3, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "(numpy.ndarray, array([3, 9, 8, 8, 4, 3, 8, 2, 2]))" 124 | ] 125 | }, 126 | "execution_count": 3, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "import numpy as np\n", 133 | "\n", 134 | "x_np = np.random.randint(10, size=9) # One-dimensional array\n", 135 | "\n", 136 | "type(x_np), x_np" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "x1[3]: 8\n", 149 | "x1[2:5]: [8 8 4]\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "print('x1[3]:', x_np[3]) # Array Indexing\n", 155 | "print('x1[2:5]:', x_np[2:5]) # Array Slicing" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | 
"name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "3\n", 168 | "9\n", 169 | "8\n", 170 | "8\n", 171 | "4\n", 172 | "3\n", 173 | "8\n", 174 | "2\n", 175 | "2\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "# Iteration\n", 181 | "for element in x_np:\n", 182 | " print(element)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "``numpy.ndarray`` stands for N-dimensional array which means that this object is built to be multi-dimensional, with attributes and methods specifically designed for this feature." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 6, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "[3 9 8 8 4 3 8 2 2]\n", 202 | "[[3 9 8]\n", 203 | " [8 4 3]\n", 204 | " [8 2 2]]\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "grid = x_np.reshape((3, 3)) # Two-dimensional array\n", 210 | "\n", 211 | "print(x_np)\n", 212 | "print(grid)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "grid.ndim: 2\n", 225 | "grid.shape: (3, 3)\n", 226 | "grid.size: 9\n", 227 | "grid.dtype: int64\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "print(\"grid.ndim: \", grid.ndim)\n", 233 | "print(\"grid.shape:\", grid.shape)\n", 234 | "print(\"grid.size: \", grid.size)\n", 235 | "print(\"grid.dtype:\", grid.dtype)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### Boolean indexing\n", 243 | "Numpy arrays can be sliced with vectors of booleans (``list``s or other ``ndarray``s) with the same dimensions." 
244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 8, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "x_np: [3 9 8 8 4 3 8 2 2]\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "print('x_np:', x_np)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 9, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "boolean_np: [False True True True True False True False False]\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "boolean_np = x_np > 3\n", 278 | "\n", 279 | "print('boolean_np:', boolean_np) # Each entry states whether the element in the same position of x_np is > 3." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 10, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "[9, 8, 8, 4, 8]" 291 | ] 292 | }, 293 | "execution_count": 10, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "[x for x in x_np if x > 3]" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 11, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "array([9, 8, 8, 4, 8])" 311 | ] 312 | }, 313 | "execution_count": 11, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "x_np[boolean_np]" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 12, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "2.76 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 332 | "5.36 ms ± 63.8 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "big_array = np.random.rand(10000000)\n", 338 | "\n", 339 | "%timeit [x for x in big_array if x > 3]\n", 340 | "%timeit big_array[big_array > 3]" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "### Vectorized Operations\n", 348 | "Operations between arrays are carried out with a different logic than that of standard lists." 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 13, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "x_list + x_list: [3, 9, 8, 8, 4, 3, 8, 2, 2, 3, 9, 8, 8, 4, 3, 8, 2, 2]\n", 361 | "x_np + x_np: [ 6 18 16 16 8 6 16 4 4]\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "x_list = list(x_np)\n", 367 | "\n", 368 | "print('x_list + x_list:', x_list + x_list)\n", 369 | "print('x_np + x_np:', x_np + x_np)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "| Operator | Equivalent func | Description |\n", 377 | "|---------------|---------------------|---------------------------------------|\n", 378 | "|``+`` |``np.add`` |Addition (e.g., ``1 + 1 = 2``) |\n", 379 | "|``-`` |``np.subtract`` |Subtraction (e.g., ``3 - 2 = 1``) |\n", 380 | "|``-`` |``np.negative`` |Unary negation (e.g., ``-2``) |\n", 381 | "|``*`` |``np.multiply`` |Multiplication (e.g., ``2 * 3 = 6``) |\n", 382 | "|``/`` |``np.divide`` |Division (e.g., ``3 / 2 = 1.5``) |\n", 383 | "|``//`` |``np.floor_divide`` |Floor division (e.g., ``3 // 2 = 1``) |\n", 384 | "|``**`` |``np.power`` |Exponentiation (e.g., ``2 ** 3 = 8``) |\n", 385 | "|``%`` |``np.mod`` |Modulus/remainder (e.g., ``9 % 4 = 1``)|" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 14, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | 
"1.41 s ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 398 | "4.11 ms ± 39 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "%timeit sum(big_array)\n", 404 | "%timeit np.sum(big_array) # or big_array.sum()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "Important: whenever possible, make sure that you are using the NumPy version of these operations when operating on NumPy arrays." 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "## Pandas\n", 419 | "\n", 420 | "[Pandas](https://pandas.pydata.org/) is a library built on top of NumPy, which provides an efficient implementation of a ``DataFrame``.\n", 421 | "\n", 422 | "``DataFrame``s can be seen as multidimensional arrays with attached row and column labels, that can present heterogeneous types and/or missing data." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "### The Pandas Series Object\n", 430 | "A Pandas ``Series`` is a one-dimensional array of indexed data." 
431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 15, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "0 RNA\n", 442 | "1 gene\n", 443 | "2 protein\n", 444 | "dtype: object" 445 | ] 446 | }, 447 | "execution_count": 15, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "import pandas as pd\n", 454 | "\n", 455 | "data = pd.Series(['RNA', 'gene', 'protein'])\n", 456 | "data" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 16, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/plain": [ 467 | "array(['RNA', 'gene', 'protein'], dtype=object)" 468 | ] 469 | }, 470 | "execution_count": 16, 471 | "metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "data.values" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 17, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "RangeIndex(start=0, stop=3, step=1)" 488 | ] 489 | }, 490 | "execution_count": 17, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "data.index" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "The index need not be an integer, but can consist of values of any type:" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 18, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/plain": [ 514 | "ENST RNA\n", 515 | "ENSG gene\n", 516 | "ENSP protein\n", 517 | "dtype: object" 518 | ] 519 | }, 520 | "execution_count": 18, 521 | "metadata": {}, 522 | "output_type": "execute_result" 523 | } 524 | ], 525 | "source": [ 526 | "data = pd.Series(\n", 527 | " ['RNA', 'gene', 'protein'],\n", 528 | " index=['ENST', 'ENSG', 'ENSP']\n", 529 | ")\n", 530 | "data" 531 | ] 532 | }, 533 | { 534 | 
"cell_type": "code", 535 | "execution_count": 19, 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "'gene'" 542 | ] 543 | }, 544 | "execution_count": 19, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "data['ENSG']" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "We can construct a ``Series`` from a dictionary, and the way we access the values is similar to dictionaries:" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 20, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "ENST RNA\n", 569 | "ENSG gene\n", 570 | "ENSP protein\n", 571 | "dtype: object" 572 | ] 573 | }, 574 | "execution_count": 20, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n", 581 | "data = pd.Series(map_dict)\n", 582 | "data" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 21, 588 | "metadata": {}, 589 | "outputs": [ 590 | { 591 | "data": { 592 | "text/plain": [ 593 | "ENSG gene\n", 594 | "ENSP protein\n", 595 | "dtype: object" 596 | ] 597 | }, 598 | "execution_count": 21, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [ 604 | "data['ENSG':]" 605 | ] 606 | }, 607 | { 608 | "cell_type": "markdown", 609 | "metadata": {}, 610 | "source": [ 611 | "### The Pandas DataFrame Object\n", 612 | "\n", 613 | "It can be constructed from 2 or more dictionaries with the same keys (or from 2 `Series` with the same indexes)." 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 22, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/html": [ 624 | "
\n", 625 | "\n", 638 | "\n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | "
mapping typecounts
ENSTRNA3300
ENSGgene18435
ENSPprotein12034
\n", 664 | "
" 665 | ], 666 | "text/plain": [ 667 | " mapping type counts\n", 668 | "ENST RNA 3300\n", 669 | "ENSG gene 18435\n", 670 | "ENSP protein 12034" 671 | ] 672 | }, 673 | "execution_count": 22, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n", 680 | "count_dict = {'ENST': 3300, 'ENSG': 18435, 'ENSP': 12034}\n", 681 | " \n", 682 | "df = pd.DataFrame({'mapping type': map_dict, 'counts': count_dict})\n", 683 | "df" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 23, 689 | "metadata": {}, 690 | "outputs": [ 691 | { 692 | "data": { 693 | "text/plain": [ 694 | "Index(['ENST', 'ENSG', 'ENSP'], dtype='object')" 695 | ] 696 | }, 697 | "execution_count": 23, 698 | "metadata": {}, 699 | "output_type": "execute_result" 700 | } 701 | ], 702 | "source": [ 703 | "df.index" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 24, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/plain": [ 714 | "Index(['mapping type', 'counts'], dtype='object')" 715 | ] 716 | }, 717 | "execution_count": 24, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "df.columns" 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "We can access a column like a dictionary or in a Pandas way:" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 25, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "data": { 740 | "text/plain": [ 741 | "ENST 3300\n", 742 | "ENSG 18435\n", 743 | "ENSP 12034\n", 744 | "Name: counts, dtype: int64" 745 | ] 746 | }, 747 | "execution_count": 25, 748 | "metadata": {}, 749 | "output_type": "execute_result" 750 | } 751 | ], 752 | "source": [ 753 | "df['counts'] # like a dictionary" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 26, 759 | 
"metadata": {}, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/plain": [ 764 | "ENST 3300\n", 765 | "ENSG 18435\n", 766 | "ENSP 12034\n", 767 | "Name: counts, dtype: int64" 768 | ] 769 | }, 770 | "execution_count": 26, 771 | "metadata": {}, 772 | "output_type": "execute_result" 773 | } 774 | ], 775 | "source": [ 776 | "df.counts # The Pandas way" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 27, 782 | "metadata": {}, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "ENST RNA\n", 788 | "ENSG gene\n", 789 | "ENSP protein\n", 790 | "Name: mapping type, dtype: object" 791 | ] 792 | }, 793 | "execution_count": 27, 794 | "metadata": {}, 795 | "output_type": "execute_result" 796 | } 797 | ], 798 | "source": [ 799 | "df['mapping type']\n", 800 | "#df.mapping type # I can't do it" 801 | ] 802 | } 803 | ], 804 | "metadata": { 805 | "kernelspec": { 806 | "display_name": "Python 3", 807 | "language": "python", 808 | "name": "python3" 809 | }, 810 | "language_info": { 811 | "codemirror_mode": { 812 | "name": "ipython", 813 | "version": 3 814 | }, 815 | "file_extension": ".py", 816 | "mimetype": "text/x-python", 817 | "name": "python", 818 | "nbconvert_exporter": "python", 819 | "pygments_lexer": "ipython3", 820 | "version": "3.8.3" 821 | } 822 | }, 823 | "nbformat": 4, 824 | "nbformat_minor": 4 825 | } 826 | -------------------------------------------------------------------------------- /2020_2021/Lesson7/Exercises7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# TO DO" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Exercise\n", 26 | "\n", 27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# TO DO" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Exercise\n", 44 | "\n", 45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put in a Python list all the DP values (182, 196, 275, ...)." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# TO DO" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.8.3" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /2020_2021/data/P04439.fasta: -------------------------------------------------------------------------------- 1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2 2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF 3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ 4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL 5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT 6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL 7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL 8 | TACKV 9 | -------------------------------------------------------------------------------- /2020_2021/data/brca_transcripts.txt: -------------------------------------------------------------------------------- 1 | transcript_id biotype bp aa 2 | ENST00000352993.7 Protein coding 3668 721 3 | ENST00000354071.7 Protein coding 4497 1399 4 | ENST00000461221.5 Nonsense mediated decay 5693 63 5 | ENST00000461574.1 Protein coding 726 242 6 | ENST00000461798.5 Nonsense mediated decay 582 63 7 | -------------------------------------------------------------------------------- /2020_2021/data/genetic_code.tsv: 
-------------------------------------------------------------------------------- 1 | UUU F Phe Phenylalanine 2 | UUC F Phe Phenylalanine 3 | UUA L Leu Leucine 4 | UUG L Leu Leucine 5 | CUU L Leu Leucine 6 | CUC L Leu Leucine 7 | CUA L Leu Leucine 8 | CUG L Leu Leucine 9 | AUU I Ile Isoleucine 10 | AUC I Ile Isoleucine 11 | AUA I Ile Isoleucine 12 | AUG M Met Methionine (Start) 13 | GUU V Val Valine 14 | GUC V Val Valine 15 | GUA V Val Valine 16 | GUG V Val Valine 17 | UCU S Ser Serine 18 | UCC S Ser Serine 19 | UCA S Ser Serine 20 | UCG S Ser Serine 21 | CCU P Pro Proline 22 | CCC P Pro Proline 23 | CCA P Pro Proline 24 | CCG P Pro Proline 25 | ACU T Thr Threonine 26 | ACC T Thr Threonine 27 | ACA T Thr Threonine 28 | ACG T Thr Threonine 29 | GCU A Ala Alanine 30 | GCC A Ala Alanine 31 | GCA A Ala Alanine 32 | GCG A Ala Alanine 33 | UAU Y Tyr Tyrosine 34 | UAC Y Tyr Tyrosine 35 | UAA X Stop (Stop) 36 | UAG X Stop (Stop) 37 | CAU H His Histidine 38 | CAC H His Histidine 39 | CAA Q Gln Glutamine 40 | CAG Q Gln Glutamine 41 | AAU N Asn Asparagine 42 | AAC N Asn Asparagine 43 | AAA K Lys Lysine 44 | AAG K Lys Lysine 45 | GAU D Asp Aspartic acid 46 | GAC D Asp Aspartic acid 47 | GAA E Glu Glutamic acid 48 | GAG E Glu Glutamic acid 49 | UGU C Cys Cysteine 50 | UGC C Cys Cysteine 51 | UGA X Stop (Stop) 52 | UGG W Trp Tryptophan 53 | CGU R Arg Arginine 54 | CGC R Arg Arginine 55 | CGA R Arg Arginine 56 | CGG R Arg Arginine 57 | AGU S Ser Serine 58 | AGC S Ser Serine 59 | AGA R Arg Arginine 60 | AGG R Arg Arginine 61 | GGU G Gly Glycine 62 | GGC G Gly Glycine 63 | GGA G Gly Glycine 64 | GGG G Gly Glycine -------------------------------------------------------------------------------- /2020_2021/data/uniprot_ids.txt: -------------------------------------------------------------------------------- 1 | Q13188 2 | O00444 3 | P49760 4 | PYYY4Z 5 | Q13627 6 | Q02156 7 | -------------------------------------------------------------------------------- /2020_2021/data/utils.py: 
-------------------------------------------------------------------------------- 1 | import random 2 | 3 | def generate_string(n, alphabet): 4 | s = "" 5 | for i in range(n): 6 | s += random.choice(alphabet) 7 | 8 | return s 9 | -------------------------------------------------------------------------------- /2020_2021/data/validation.py: -------------------------------------------------------------------------------- 1 | def valid_sequence(sequence, valid_characters): 2 | for c in sequence: 3 | if c.upper() not in valid_characters: 4 | return False 5 | 6 | return True 7 | 8 | def validate_dna(sequence): 9 | return valid_sequence(sequence, ['A', 'T', 'G', 'C']) 10 | 11 | def validate_rna(sequence): 12 | return valid_sequence(sequence, ['A', 'U', 'G', 'C']) 13 | 14 | def validate_protein(sequence): 15 | return valid_sequence( 16 | sequence, 17 | [ 18 | 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 19 | 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' 20 | ] 21 | ) 22 | -------------------------------------------------------------------------------- /2020_2021/images/Integer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2020_2021/images/Integer.jpeg -------------------------------------------------------------------------------- /2020_2021/images/List.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2020_2021/images/List.jpeg -------------------------------------------------------------------------------- /2021_2022/Lesson1/Exercises1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "The 
following list is corrupted:" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "mutations = [\n", 18 | " 'p.Ser31Ala',\n", 19 | " 'p.Pro38Leu',\n", 20 | " 'p.Asn100Lys',\n", 21 | " 'p.LEU110VAL',\n", 22 | " 13,\n", 23 | " 4.0,\n", 24 | " True,\n", 25 | " 'p.Tyr341Leu',\n", 26 | " 'AUG',\n", 27 | " 'p.Tyr0Le',\n", 28 | " 'p.Asn1.3Lys',\n", 29 | " 'p.Arg0Leu'\n", 30 | "]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "How can we check which mutations are valid? Put the valid mutations in a new list. A reasonable output could be:\n", 38 | "\n", 39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "#### Tips" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "valid_aminos = [\n", 56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n", 57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n", 58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n", 59 | "]\n", 60 | "\n", 61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n", 62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n", 63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "valid_mutations = []\n", 73 | "\n", 74 | "# TODO" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Exercise\n", 82 | "\n", 83 | "Write a script to check if a protein sequence is valid." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# TODO" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Exercise\n", 100 | "\n", 101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# TODO" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.8.3" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } -------------------------------------------------------------------------------- /2021_2022/Lesson2/Exercises2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. 
In particular:\n", 10 | "\n", 11 | "- create a `get_valid_mutation` function which takes as input a list of mutations, and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n", 12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n", 13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "mutations = [\n", 23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n", 24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n", 25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n", 26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n", 27 | "]\n", 28 | "\n", 29 | "aa_3L_to_1L = {\n", 30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 34 | "}" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# TO DO" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "### Exercise\n", 51 | "\n", 52 | "Write a function which generates 1000000 random strings of length 100, and returns how many of them are valid proteins." 
53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# TO DO" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "### Exercise\n", 69 | "\n", 70 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# TO DO" 80 | ] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "Python 3", 86 | "language": "python", 87 | "name": "python3" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": { 91 | "name": "ipython", 92 | "version": 3 93 | }, 94 | "file_extension": ".py", 95 | "mimetype": "text/x-python", 96 | "name": "python", 97 | "nbconvert_exporter": "python", 98 | "pygments_lexer": "ipython3", 99 | "version": "3.8.10" 100 | } 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 4 104 | } 105 | -------------------------------------------------------------------------------- /2021_2022/Lesson3/Exercises3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n", 10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# TO DO" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Exercise\n", 27 | "\n", 28 | "Write a function to remove duplicates in a list." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# TO DO" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Exercise\n", 45 | "\n", 46 | "Write a function to calculate the identity between 2 sequences." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# TO DO" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.8.10" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 4 80 | } 81 | -------------------------------------------------------------------------------- /2021_2022/Lesson4/Exercises4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Compute all pair-wise identities (number of identical character pairs)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "sequences = [\n", 19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n", 20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n", 21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n", 22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n", 23 | "]\n", 24 | "\n", 25 | "# TO DO" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Exercise\n", 33 | "\n", 34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n", 35 | "- read the genetic code in the [genetic_code.tsv](../data/genetic_code.tsv) file\n", 36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# TO DO" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Exercise\n", 53 | "\n", 54 | "Print the index of the first occurrence of the ATG codon.\n", 55 | "\n", 56 | "Try with and without using the `find()` method on strings." 
57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# TO DO" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.8.10" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 4 90 | } 91 | -------------------------------------------------------------------------------- /2021_2022/Lesson5/Exercises5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Write a function to search motifs in a sequence.\n", 10 | "\n", 11 | "Try with and without using the `re` module." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCA
TCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTGGCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n", 21 | "\n", 22 | "consensus_motifs = {\n", 23 | " 'motif1': 'AGGAG[GT]',\n", 24 | " 'motif2': 'T[AT]AAT',\n", 25 | " 'motif3': 'GG.A.T[AG]'\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Possible printed output:\n", 34 | "```\n", 35 | "AGGAG[GT]\n", 36 | "\t(969, 975) AGGAGG\n", 37 | "\t(1153, 1159) AGGAGG\n", 38 | "\t(1339, 1345) AGGAGT\n", 39 | "\t(1587, 1593) AGGAGG\n", 40 | "\t(1881, 1887) AGGAGG\n", 41 | "\t(1941, 1947) AGGAGG\n", 42 | "T[AT]AAT\n", 43 | "\t(50, 55) TAAAT\n", 44 | "\t(1098, 1103) TAAAT\n", 45 | "\t(1276, 1281) TAAAT\n", 46 | "GG.A.T[AG]\n", 47 | "\t(248, 255) GGTACTG\n", 48 | "\t(983, 990) GGAAATA\n", 49 | "\t(1910, 1917) GGGACTG\n", 50 | "\t(1980, 1987) GGCAGTG\n", 51 | "```\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# TO DO" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Exercise\n", 68 | "\n", 69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "aa_3L_to_1L = {\n", 79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 83 | "}\n", 84 | "\n", 85 | "#aa_1L_to_3L['A'] --> 'ALA'" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# TO DO" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Exercise\n", 102 | "\n", 103 | "Write a function to remove invalid amino acids from a protein." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# TO DO" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.8.10" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 4 137 | } 138 | -------------------------------------------------------------------------------- /2021_2022/Lesson6/Exercises6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. 
Do that in 2 ways:\n", 10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n", 11 | "- check the documentation out to see how to load such a file format into a Pandas `DataFrame` and do that." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
UUUFPhePhenylalanine
0UUCFPhePhenylalanine
1UUALLeuLeucine
2UUGLLeuLeucine
3CUULLeuLeucine
4CUCLLeuLeucine
...............
58AGGRArgArginine
59GGUGGlyGlycine
60GGCGGlyGlycine
61GGAGGlyGlycine
62GGGGGlyGlycine
\n", 126 | "

63 rows × 4 columns

\n", 127 | "
" 128 | ], 129 | "text/plain": [ 130 | " UUU F Phe Phenylalanine\n", 131 | "0 UUC F Phe Phenylalanine\n", 132 | "1 UUA L Leu Leucine\n", 133 | "2 UUG L Leu Leucine\n", 134 | "3 CUU L Leu Leucine\n", 135 | "4 CUC L Leu Leucine\n", 136 | ".. ... .. ... ...\n", 137 | "58 AGG R Arg Arginine\n", 138 | "59 GGU G Gly Glycine\n", 139 | "60 GGC G Gly Glycine\n", 140 | "61 GGA G Gly Glycine\n", 141 | "62 GGG G Gly Glycine\n", 142 | "\n", 143 | "[63 rows x 4 columns]" 144 | ] 145 | }, 146 | "execution_count": 3, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "import pandas as pd\n", 153 | "\n", 154 | "# TO DO" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Exercise\n", 162 | "\n", 163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with Python lists and Numpy Arrays, and quantify the execution times." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# TO DO" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Exercise\n", 180 | "\n", 181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# TO DO" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.8.10" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /2021_2022/Lesson6/Lesson6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lesson 6 - 2021/11/18" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Numpy\n", 15 | "[NumPy](https://numpy.org/) (short for *Numerical Python*) is a numerical library for Python which provides an efficient interface to store and operate on data." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### A Python Integer Is More Than Just an Integer\n", 23 | "A Python integer is a pointer to a position in memory containing all the Python object information, including the bytes that contain the integer value.\n", 24 | "\n", 25 | "![Integer.jpeg](../images/Integer.jpeg)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### A Python List Is More Than Just a List\n", 33 | "\n", 34 | "Because of Python's dynamic typing, we can create heterogeneous lists:" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "[bool, str, float, int]" 46 | ] 47 | }, 48 | "execution_count": 1, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "my_list = [True, \"2\", 3.0, 4]\n", 55 | "\n", 56 | "[type(item) for item in my_list]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "But this flexibility comes at a cost.\n", 64 | "\n", 65 | "![List.jpeg](../images/List.jpeg)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "In the special case that all variables are of the same type, much of this information is redundant: it can be much more efficient to store data in a fixed-type array." 
73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### Fixed-Type Arrays in Python\n", 80 | "The built-in ``array`` module can be used to create arrays of a uniform type:" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 2, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])" 92 | ] 93 | }, 94 | "execution_count": 2, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "import array\n", 101 | "\n", 102 | "L = list(range(10))\n", 103 | "A = array.array('i', L) # i indicates integer values\n", 104 | "\n", 105 | "A" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Much more useful, however, is the ``numpy.ndarray`` object of the NumPy package which adds to this efficient *operations* on that data." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 3, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "(numpy.ndarray, array([9, 8, 3, 5, 1, 1, 6, 0, 5]))" 124 | ] 125 | }, 126 | "execution_count": 3, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "import numpy as np\n", 133 | "\n", 134 | "x_np = np.random.randint(10, size=9) # One-dimensional array\n", 135 | "\n", 136 | "type(x_np), x_np" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 4, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "x1[3]: 5\n", 149 | "x1[2:5]: [3 5 1]\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "print('x1[3]:', x_np[3]) # Array Indexing\n", 155 | "print('x1[2:5]:', x_np[2:5]) # Array Slicing" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | 
"name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "9\n", 168 | "8\n", 169 | "3\n", 170 | "5\n", 171 | "1\n", 172 | "1\n", 173 | "6\n", 174 | "0\n", 175 | "5\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "# Iteration\n", 181 | "for element in x_np:\n", 182 | " print(element)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "``numpy.ndarray`` stands for N-dimensional array which means that this object is built to be multi-dimensional, with attributes and methods specifically designed for this feature." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 6, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "[9 8 3 5 1 1 6 0 5]\n", 202 | "[[9 8 3]\n", 203 | " [5 1 1]\n", 204 | " [6 0 5]]\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "grid = x_np.reshape((3, 3)) # Two-dimensional array\n", 210 | "\n", 211 | "print(x_np)\n", 212 | "print(grid)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "grid.ndim: 2\n", 225 | "grid.shape: (3, 3)\n", 226 | "grid.size: 9\n", 227 | "grid.dtype: int64\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "print(\"grid.ndim: \", grid.ndim)\n", 233 | "print(\"grid.shape:\", grid.shape)\n", 234 | "print(\"grid.size: \", grid.size)\n", 235 | "print(\"grid.dtype:\", grid.dtype)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### Boolean indexing\n", 243 | "Numpy arrays can be sliced with vectors of booleans (``list``s or other ``ndarray``s) with the same dimensions." 
244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 9, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "x_np: [9 8 3 5 1 1 6 0 5]\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "print('x_np:', x_np)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 10, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "boolean_np: [ True True False True False False True False True]\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "boolean_np = x_np > 3\n", 278 | "\n", 279 | "print('boolean_np:', boolean_np) # Each entry states whether the element in the same position of x_np is > 3." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 11, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "[9, 8, 5, 6, 5]" 291 | ] 292 | }, 293 | "execution_count": 11, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "[x for x in x_np if x > 3]" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 12, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "array([9, 8, 5, 6, 5])" 311 | ] 312 | }, 313 | "execution_count": 12, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "x_np[boolean_np]" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 13, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "2.41 s ± 938 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 332 | "5.54 ms ± 18 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "big_array = np.random.rand(10000000)\n", 338 | "\n", 339 | "%timeit [x for x in big_array if x > 3]\n", 340 | "%timeit big_array[big_array > 3]" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "### Vectorized Operations\n", 348 | "Operations between arrays are carried out with a different logic than that of standard lists." 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 14, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "x_list + x_list: [9, 8, 3, 5, 1, 1, 6, 0, 5, 9, 8, 3, 5, 1, 1, 6, 0, 5]\n", 361 | "x_np + x_np: [18 16 6 10 2 2 12 0 10]\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "x_list = list(x_np)\n", 367 | "\n", 368 | "print('x_list + x_list:', x_list + x_list)\n", 369 | "print('x_np + x_np:', x_np + x_np)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "| Operator | Equivalent func | Description |\n", 377 | "|---------------|---------------------|---------------------------------------|\n", 378 | "|``+`` |``np.add`` |Addition (e.g., ``1 + 1 = 2``) |\n", 379 | "|``-`` |``np.subtract`` |Subtraction (e.g., ``3 - 2 = 1``) |\n", 380 | "|``-`` |``np.negative`` |Unary negation (e.g., ``-2``) |\n", 381 | "|``*`` |``np.multiply`` |Multiplication (e.g., ``2 * 3 = 6``) |\n", 382 | "|``/`` |``np.divide`` |Division (e.g., ``3 / 2 = 1.5``) |\n", 383 | "|``//`` |``np.floor_divide`` |Floor division (e.g., ``3 // 2 = 1``) |\n", 384 | "|``**`` |``np.power`` |Exponentiation (e.g., ``2 ** 3 = 8``) |\n", 385 | "|``%`` |``np.mod`` |Modulus/remainder (e.g., ``9 % 4 = 1``)|" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 15, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "name": "stdout", 395 | "output_type": "stream", 396 | "text": [ 397 | 
"1.37 s ± 39.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", 398 | "4.26 ms ± 7.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "%timeit sum(big_array)\n", 404 | "%timeit np.sum(big_array) # or big_array.sum()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "Important: whenever possible, make sure that you are using the NumPy version of these operations when operating on NumPy arrays." 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "## Pandas\n", 419 | "\n", 420 | "[Pandas](https://pandas.pydata.org/) is a library built on top of NumPy, which provides an efficient implementation of a ``DataFrame``.\n", 421 | "\n", 422 | "``DataFrame``s can be seen as multidimensional arrays with attached row and column labels, that can present heterogeneous types and/or missing data." 423 | ] 424 | }, 425 | { 426 | "cell_type": "markdown", 427 | "metadata": {}, 428 | "source": [ 429 | "### The Pandas Series Object\n", 430 | "A Pandas ``Series`` is a one-dimensional array of indexed data." 
431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 16, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "0 RNA\n", 442 | "1 gene\n", 443 | "2 protein\n", 444 | "dtype: object" 445 | ] 446 | }, 447 | "execution_count": 16, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "import pandas as pd\n", 454 | "\n", 455 | "data = pd.Series(['RNA', 'gene', 'protein'])\n", 456 | "data" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 17, 462 | "metadata": {}, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/plain": [ 467 | "array(['RNA', 'gene', 'protein'], dtype=object)" 468 | ] 469 | }, 470 | "execution_count": 17, 471 | "metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "data.values" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 18, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "data": { 486 | "text/plain": [ 487 | "RangeIndex(start=0, stop=3, step=1)" 488 | ] 489 | }, 490 | "execution_count": 18, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "data.index" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": {}, 502 | "source": [ 503 | "The index need not be an integer, but can consist of values of any type:" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 19, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/plain": [ 514 | "ENST RNA\n", 515 | "ENSG gene\n", 516 | "ENSP protein\n", 517 | "dtype: object" 518 | ] 519 | }, 520 | "execution_count": 19, 521 | "metadata": {}, 522 | "output_type": "execute_result" 523 | } 524 | ], 525 | "source": [ 526 | "data = pd.Series(\n", 527 | " ['RNA', 'gene', 'protein'],\n", 528 | " index=['ENST', 'ENSG', 'ENSP']\n", 529 | ")\n", 530 | "data" 531 | ] 532 | }, 533 | { 534 | 
"cell_type": "code", 535 | "execution_count": 20, 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "'gene'" 542 | ] 543 | }, 544 | "execution_count": 20, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "data['ENSG']" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "We can construct a ``Series`` from a dictionary and the way we access the values is similar to dictionaries:" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 21, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "ENST RNA\n", 569 | "ENSG gene\n", 570 | "ENSP protein\n", 571 | "dtype: object" 572 | ] 573 | }, 574 | "execution_count": 21, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n", 581 | "data = pd.Series(map_dict)\n", 582 | "data" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 22, 588 | "metadata": {}, 589 | "outputs": [ 590 | { 591 | "data": { 592 | "text/plain": [ 593 | "ENSG gene\n", 594 | "ENSP protein\n", 595 | "dtype: object" 596 | ] 597 | }, 598 | "execution_count": 22, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [ 604 | "data['ENSG':]" 605 | ] 606 | }, 607 | { 608 | "cell_type": "markdown", 609 | "metadata": {}, 610 | "source": [ 611 | "### The Pandas DataFrame Object\n", 612 | "\n", 613 | "It can be constructed from 2 or more dictionaries with the same keys (or from 2 or more `Series` with the same index)." 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 24, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/html": [ 624 | "
\n", 625 | "\n", 638 | "\n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | "
mapping typecounts
ENSTRNA3300
ENSGgene18435
ENSPprotein12034
\n", 664 | "
" 665 | ], 666 | "text/plain": [ 667 | " mapping type counts\n", 668 | "ENST RNA 3300\n", 669 | "ENSG gene 18435\n", 670 | "ENSP protein 12034" 671 | ] 672 | }, 673 | "execution_count": 24, 674 | "metadata": {}, 675 | "output_type": "execute_result" 676 | } 677 | ], 678 | "source": [ 679 | "map_dict = {'ENST': 'RNA', 'ENSG': 'gene', 'ENSP': 'protein'}\n", 680 | "count_dict = {'ENST': 3300, 'ENSG': 18435, 'ENSP': 12034}\n", 681 | " \n", 682 | "df = pd.DataFrame({'mapping type': map_dict, 'counts': count_dict})\n", 683 | "df" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 25, 689 | "metadata": {}, 690 | "outputs": [ 691 | { 692 | "data": { 693 | "text/plain": [ 694 | "Index(['ENST', 'ENSG', 'ENSP'], dtype='object')" 695 | ] 696 | }, 697 | "execution_count": 25, 698 | "metadata": {}, 699 | "output_type": "execute_result" 700 | } 701 | ], 702 | "source": [ 703 | "df.index" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 26, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/plain": [ 714 | "Index(['mapping type', 'counts'], dtype='object')" 715 | ] 716 | }, 717 | "execution_count": 26, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "df.columns" 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "We can access a column like a dictionary or in a Pandas way:" 731 | ] 732 | }, 733 | { 734 | "cell_type": "code", 735 | "execution_count": 25, 736 | "metadata": {}, 737 | "outputs": [ 738 | { 739 | "data": { 740 | "text/plain": [ 741 | "ENST 3300\n", 742 | "ENSG 18435\n", 743 | "ENSP 12034\n", 744 | "Name: counts, dtype: int64" 745 | ] 746 | }, 747 | "execution_count": 25, 748 | "metadata": {}, 749 | "output_type": "execute_result" 750 | } 751 | ], 752 | "source": [ 753 | "df['counts'] # like a dictionary" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 26, 759 | 
"metadata": {}, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/plain": [ 764 | "ENST 3300\n", 765 | "ENSG 18435\n", 766 | "ENSP 12034\n", 767 | "Name: counts, dtype: int64" 768 | ] 769 | }, 770 | "execution_count": 26, 771 | "metadata": {}, 772 | "output_type": "execute_result" 773 | } 774 | ], 775 | "source": [ 776 | "df.counts # The Pandas way" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 27, 782 | "metadata": {}, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "ENST RNA\n", 788 | "ENSG gene\n", 789 | "ENSP protein\n", 790 | "Name: mapping type, dtype: object" 791 | ] 792 | }, 793 | "execution_count": 27, 794 | "metadata": {}, 795 | "output_type": "execute_result" 796 | } 797 | ], 798 | "source": [ 799 | "df['mapping type']\n", 800 | "#df.mapping type # I can't do it" 801 | ] 802 | } 803 | ], 804 | "metadata": { 805 | "kernelspec": { 806 | "display_name": "Python 3", 807 | "language": "python", 808 | "name": "python3" 809 | }, 810 | "language_info": { 811 | "codemirror_mode": { 812 | "name": "ipython", 813 | "version": 3 814 | }, 815 | "file_extension": ".py", 816 | "mimetype": "text/x-python", 817 | "name": "python", 818 | "nbconvert_exporter": "python", 819 | "pygments_lexer": "ipython3", 820 | "version": "3.8.10" 821 | } 822 | }, 823 | "nbformat": 4, 824 | "nbformat_minor": 4 825 | } 826 | -------------------------------------------------------------------------------- /2021_2022/Lesson7/Exercises7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# TO DO" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Exercise\n", 26 | "\n", 27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# TO DO" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Exercise\n", 44 | "\n", 45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put in a Python list all the DP values (182, 196, 275, ...)." 
46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# TO DO" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.8.10" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /2021_2022/data/P04439.fasta: -------------------------------------------------------------------------------- 1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2 2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF 3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ 4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL 5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT 6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL 7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL 8 | TACKV 9 | -------------------------------------------------------------------------------- /2021_2022/data/brca_transcripts.txt: -------------------------------------------------------------------------------- 1 | transcript_id biotype bp aa 2 | ENST00000352993.7 Protein coding 3668 721 3 | ENST00000354071.7 Protein coding 4497 1399 4 | ENST00000461221.5 Nonsense mediated decay 5693 63 5 | ENST00000461574.1 Protein coding 726 242 6 | ENST00000461798.5 Nonsense mediated decay 582 63 7 | -------------------------------------------------------------------------------- /2021_2022/data/genetic_code.tsv: 
-------------------------------------------------------------------------------- 1 | UUU F Phe Phenylalanine 2 | UUC F Phe Phenylalanine 3 | UUA L Leu Leucine 4 | UUG L Leu Leucine 5 | CUU L Leu Leucine 6 | CUC L Leu Leucine 7 | CUA L Leu Leucine 8 | CUG L Leu Leucine 9 | AUU I Ile Isoleucine 10 | AUC I Ile Isoleucine 11 | AUA I Ile Isoleucine 12 | AUG M Met Methionine (Start) 13 | GUU V Val Valine 14 | GUC V Val Valine 15 | GUA V Val Valine 16 | GUG V Val Valine 17 | UCU S Ser Serine 18 | UCC S Ser Serine 19 | UCA S Ser Serine 20 | UCG S Ser Serine 21 | CCU P Pro Proline 22 | CCC P Pro Proline 23 | CCA P Pro Proline 24 | CCG P Pro Proline 25 | ACU T Thr Threonine 26 | ACC T Thr Threonine 27 | ACA T Thr Threonine 28 | ACG T Thr Threonine 29 | GCU A Ala Alanine 30 | GCC A Ala Alanine 31 | GCA A Ala Alanine 32 | GCG A Ala Alanine 33 | UAU Y Tyr Tyrosine 34 | UAC Y Tyr Tyrosine 35 | UAA X Stop (Stop) 36 | UAG X Stop (Stop) 37 | CAU H His Histidine 38 | CAC H His Histidine 39 | CAA Q Gln Glutamine 40 | CAG Q Gln Glutamine 41 | AAU N Asn Asparagine 42 | AAC N Asn Asparagine 43 | AAA K Lys Lysine 44 | AAG K Lys Lysine 45 | GAU D Asp Aspartic acid 46 | GAC D Asp Aspartic acid 47 | GAA E Glu Glutamic acid 48 | GAG E Glu Glutamic acid 49 | UGU C Cys Cysteine 50 | UGC C Cys Cysteine 51 | UGA X Stop (Stop) 52 | UGG W Trp Tryptophan 53 | CGU R Arg Arginine 54 | CGC R Arg Arginine 55 | CGA R Arg Arginine 56 | CGG R Arg Arginine 57 | AGU S Ser Serine 58 | AGC S Ser Serine 59 | AGA R Arg Arginine 60 | AGG R Arg Arginine 61 | GGU G Gly Glycine 62 | GGC G Gly Glycine 63 | GGA G Gly Glycine 64 | GGG G Gly Glycine -------------------------------------------------------------------------------- /2021_2022/data/my_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def generate_string(n, alphabet): 4 | s = "" 5 | for i in range(n): 6 | s += random.choice(alphabet) 7 | 8 | return s 9 | 
-------------------------------------------------------------------------------- /2021_2022/data/uniprot_ids.txt: -------------------------------------------------------------------------------- 1 | Q13188 2 | O00444 3 | P49760 4 | PYYY4Z 5 | Q13627 6 | Q02156 7 | -------------------------------------------------------------------------------- /2021_2022/data/validation.py: -------------------------------------------------------------------------------- 1 | def valid_sequence(sequence, valid_characters): 2 | for c in sequence: 3 | if c.upper() not in valid_characters: 4 | return False 5 | 6 | return True 7 | 8 | def validate_dna(sequence): 9 | return valid_sequence(sequence, ['A', 'T', 'G', 'C']) 10 | 11 | def validate_rna(sequence): 12 | return valid_sequence(sequence, ['A', 'U', 'G', 'C']) 13 | 14 | def validate_protein(sequence): 15 | return valid_sequence( 16 | sequence, 17 | [ 18 | 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 19 | 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' 20 | ] 21 | ) 22 | -------------------------------------------------------------------------------- /2021_2022/images/Integer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2021_2022/images/Integer.jpeg -------------------------------------------------------------------------------- /2021_2022/images/List.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2021_2022/images/List.jpeg -------------------------------------------------------------------------------- /2022_2023/Lesson1/Exercises1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | 
"source": [ 7 | "### Exercise\n", 8 | "The following list is corrupted:" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "mutations = [\n", 18 | " 'p.Ser31Ala',\n", 19 | " 'p.Pro38Leu',\n", 20 | " 'p.Asn100Lys',\n", 21 | " 'p.LEU110VAL',\n", 22 | " 13,\n", 23 | " 4.0,\n", 24 | " True,\n", 25 | " 'p.Tyr341Leu',\n", 26 | " 'AUG',\n", 27 | " 'p.Tyr0Le',\n", 28 | " 'p.Asn1.3Lys',\n", 29 | " 'p.Arg0Leu'\n", 30 | "]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "How can we check which mutations are valid? Put the valid mutations in a new list. A reasonable output could be:\n", 38 | "\n", 39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "#### Tips" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "valid_aminos = [\n", 56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n", 57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n", 58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n", 59 | "]\n", 60 | "\n", 61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n", 62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n", 63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "valid_mutations = []\n", 73 | "\n", 74 | "# TODO" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Exercise\n", 82 | "\n", 83 | "Write a script to check if a protein sequence is valid." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# TODO" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Exercise\n", 100 | "\n", 101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# TODO" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.10.6" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /2022_2023/Lesson2/Exercises2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. 
In particular:\n", 10 | "\n", 11 | "- create a `get_valid_mutation` function which takes as input a list of mutations, and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n", 12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n", 13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "mutations = [\n", 23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n", 24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n", 25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n", 26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n", 27 | "]\n", 28 | "\n", 29 | "aa_3L_to_1L = {\n", 30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 34 | "}" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# TO DO" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Set of valid mutations that could belong to the HLA class I histocompatibility antigen protein:\n", 51 | "\n", 52 | "`['p.Pro39Arg', 'p.Gly40Ile', 'p.Leu19Gly', 'p.Val49Ile', 'p.Asn90Asp', 'p.Phe133His', 'p.Leu134Cys', 'p.Glu190Ser', 'p.Gln248Val', 'p.Thr249Ile']`" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | 
"metadata": {}, 58 | "source": [ 59 | "### Exercise\n", 60 | "\n", 61 | "Write a function which generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# TO DO" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Exercise\n", 78 | "\n", 79 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# TO DO" 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Python 3 (ipykernel)", 95 | "language": "python", 96 | "name": "python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | "version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.10.6" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 4 113 | } 114 | -------------------------------------------------------------------------------- /2022_2023/Lesson3/Exercises3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n", 10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# TO DO" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Exercise\n", 27 | "\n", 28 | "Write a function to remove duplicates in a list." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# TO DO" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Exercise\n", 45 | "\n", 46 | "Write a function to calculate the identity between 2 sequences." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# TO DO" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3 (ipykernel)", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.10.6" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 4 80 | } 81 | -------------------------------------------------------------------------------- /2022_2023/Lesson4/Exercises4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Compute all pair-wise identities (number of identical character pairs)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "sequences = [\n", 19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n", 20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n", 21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n", 22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n", 23 | "]\n", 24 | "\n", 25 | "# TO DO" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Exercise\n", 33 | "\n", 34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n", 35 | "- read the genetic code in the [genetic_code.tsv](../data/genetic_code.tsv) file\n", 36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# TO DO" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Exercise\n", 53 | "\n", 54 | "Print the index of the first occurrence of the ATG codon in `dna_seq`.\n", 55 | "\n", 56 | "Try with and without using the `find()` method on strings.\n", 57 | "\n", 58 | "Do the same with the ribonucleotide sequence in the `P04439.rna.fasta` file (manage the `U` <-> `T` conversion)." 
59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "dna_seq = 'AAAAATCCCGAGGCGGCAUGTATATAGGGCTCCGGAGGCGTAATATAAAA'\n", 68 | "\n", 69 | "# TODO" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3 (ipykernel)", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.10.6" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 4 94 | } 95 | -------------------------------------------------------------------------------- /2022_2023/Lesson5/Exercises5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Write a function to search motifs in a sequence.\n", 10 | "\n", 11 | "Try with and without using the `re` module." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCA
TCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTGGCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n", 21 | "\n", 22 | "consensus_motifs = {\n", 23 | " 'motif1': 'AGGAG[GT]',\n", 24 | " 'motif2': 'T[AT]AAT',\n", 25 | " 'motif3': 'GG.A.T[AG]'\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Possible printed output:\n", 34 | "```\n", 35 | "AGGAG[GT]\n", 36 | "\t(969, 975) AGGAGG\n", 37 | "\t(1153, 1159) AGGAGG\n", 38 | "\t(1339, 1345) AGGAGT\n", 39 | "\t(1587, 1593) AGGAGG\n", 40 | "\t(1881, 1887) AGGAGG\n", 41 | "\t(1941, 1947) AGGAGG\n", 42 | "T[AT]AAT\n", 43 | "\t(50, 55) TAAAT\n", 44 | "\t(1098, 1103) TAAAT\n", 45 | "\t(1276, 1281) TAAAT\n", 46 | "GG.A.T[AG]\n", 47 | "\t(248, 255) GGTACTG\n", 48 | "\t(983, 990) GGAAATA\n", 49 | "\t(1910, 1917) GGGACTG\n", 50 | "\t(1980, 1987) GGCAGTG\n", 51 | "```\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# TO DO" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Exercise\n", 68 | "\n", 69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "aa_3L_to_1L = {\n", 79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 83 | "}\n", 84 | "\n", 85 | "#aa_1L_to_3L['A'] --> 'ALA'" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# TO DO" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Exercise\n", 102 | "\n", 103 | "Write a function to remove invalid amino acids from a protein." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# TO DO" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 4 137 | } 138 | -------------------------------------------------------------------------------- /2022_2023/Lesson6/Exercises6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. 
Do that in 2 ways:\n", 10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n", 11 | "- check the documentation out to see how to load such a file format into a Pandas `DataFrame` and do that." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
UUUFPhePhenylalanine
0UUCFPhePhenylalanine
1UUALLeuLeucine
2UUGLLeuLeucine
3CUULLeuLeucine
4CUCLLeuLeucine
...............
58AGGRArgArginine
59GGUGGlyGlycine
60GGCGGlyGlycine
61GGAGGlyGlycine
62GGGGGlyGlycine
\n", 126 | "

63 rows × 4 columns

\n", 127 | "
" 128 | ], 129 | "text/plain": [ 130 | " UUU F Phe Phenylalanine\n", 131 | "0 UUC F Phe Phenylalanine\n", 132 | "1 UUA L Leu Leucine\n", 133 | "2 UUG L Leu Leucine\n", 134 | "3 CUU L Leu Leucine\n", 135 | "4 CUC L Leu Leucine\n", 136 | ".. ... .. ... ...\n", 137 | "58 AGG R Arg Arginine\n", 138 | "59 GGU G Gly Glycine\n", 139 | "60 GGC G Gly Glycine\n", 140 | "61 GGA G Gly Glycine\n", 141 | "62 GGG G Gly Glycine\n", 142 | "\n", 143 | "[63 rows x 4 columns]" 144 | ] 145 | }, 146 | "execution_count": 3, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "import pandas as pd\n", 153 | "\n", 154 | "# TO DO" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Exercise\n", 162 | "\n", 163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with Python lists and Numpy Arrays, and quantify the execution times." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# TO DO" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Exercise\n", 180 | "\n", 181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# TO DO" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3 (ipykernel)", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.10.6" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /2022_2023/Lesson7/Exercises7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# TO DO" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Exercise\n", 26 | "\n", 27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# TO DO" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Exercise\n", 44 | "\n", 45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put in a Python list all the DP values (182, 196, 275, ...)." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# TO DO" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3 (ipykernel)", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.10.6" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /2022_2023/data/P04439.fasta: -------------------------------------------------------------------------------- 1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2 2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF 3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ 4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL 5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT 6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL 7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL 8 | TACKV 9 | -------------------------------------------------------------------------------- 
/2022_2023/data/brca_transcripts.txt: -------------------------------------------------------------------------------- 1 | transcript_id biotype bp aa 2 | ENST00000352993.7 Protein coding 3668 721 3 | ENST00000354071.7 Protein coding 4497 1399 4 | ENST00000461221.5 Nonsense mediated decay 5693 63 5 | ENST00000461574.1 Protein coding 726 242 6 | ENST00000461798.5 Nonsense mediated decay 582 63 7 | -------------------------------------------------------------------------------- /2022_2023/data/genetic_code.tsv: -------------------------------------------------------------------------------- 1 | UUU F Phe Phenylalanine 2 | UUC F Phe Phenylalanine 3 | UUA L Leu Leucine 4 | UUG L Leu Leucine 5 | CUU L Leu Leucine 6 | CUC L Leu Leucine 7 | CUA L Leu Leucine 8 | CUG L Leu Leucine 9 | AUU I Ile Isoleucine 10 | AUC I Ile Isoleucine 11 | AUA I Ile Isoleucine 12 | AUG M Met Methionine (Start) 13 | GUU V Val Valine 14 | GUC V Val Valine 15 | GUA V Val Valine 16 | GUG V Val Valine 17 | UCU S Ser Serine 18 | UCC S Ser Serine 19 | UCA S Ser Serine 20 | UCG S Ser Serine 21 | CCU P Pro Proline 22 | CCC P Pro Proline 23 | CCA P Pro Proline 24 | CCG P Pro Proline 25 | ACU T Thr Threonine 26 | ACC T Thr Threonine 27 | ACA T Thr Threonine 28 | ACG T Thr Threonine 29 | GCU A Ala Alanine 30 | GCC A Ala Alanine 31 | GCA A Ala Alanine 32 | GCG A Ala Alanine 33 | UAU Y Tyr Tyrosine 34 | UAC Y Tyr Tyrosine 35 | UAA X Stop (Stop) 36 | UAG X Stop (Stop) 37 | CAU H His Histidine 38 | CAC H His Histidine 39 | CAA Q Gln Glutamine 40 | CAG Q Gln Glutamine 41 | AAU N Asn Asparagine 42 | AAC N Asn Asparagine 43 | AAA K Lys Lysine 44 | AAG K Lys Lysine 45 | GAU D Asp Aspartic acid 46 | GAC D Asp Aspartic acid 47 | GAA E Glu Glutamic acid 48 | GAG E Glu Glutamic acid 49 | UGU C Cys Cysteine 50 | UGC C Cys Cysteine 51 | UGA X Stop (Stop) 52 | UGG W Trp Tryptophan 53 | CGU R Arg Arginine 54 | CGC R Arg Arginine 55 | CGA R Arg Arginine 56 | CGG R Arg Arginine 57 | AGU S Ser Serine 58 | AGC S Ser Serine 
59 | AGA R Arg Arginine 60 | AGG R Arg Arginine 61 | GGU G Gly Glycine 62 | GGC G Gly Glycine 63 | GGA G Gly Glycine 64 | GGG G Gly Glycine -------------------------------------------------------------------------------- /2022_2023/data/my_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def generate_string(n, alphabet): 4 | s = "" 5 | for i in range(n): 6 | s += random.choice(alphabet) 7 | 8 | return s 9 | -------------------------------------------------------------------------------- /2022_2023/data/uniprot_ids.txt: -------------------------------------------------------------------------------- 1 | Q13188 2 | O00444 3 | P49760 4 | PYYY4Z 5 | Q13627 6 | Q02156 7 | -------------------------------------------------------------------------------- /2022_2023/data/validation.py: -------------------------------------------------------------------------------- 1 | def valid_sequence(sequence, valid_characters): 2 | for c in sequence: 3 | if c.upper() not in valid_characters: 4 | return False 5 | 6 | return True 7 | 8 | def validate_dna(sequence): 9 | return valid_sequence(sequence, ['A', 'T', 'G', 'C']) 10 | 11 | def validate_rna(sequence): 12 | return valid_sequence(sequence, ['A', 'U', 'G', 'C']) 13 | 14 | def validate_protein(sequence): 15 | return valid_sequence( 16 | sequence, 17 | [ 18 | 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 19 | 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' 20 | ] 21 | ) 22 | -------------------------------------------------------------------------------- /2022_2023/images/Integer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2022_2023/images/Integer.jpeg -------------------------------------------------------------------------------- /2022_2023/images/List.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2022_2023/images/List.jpeg -------------------------------------------------------------------------------- /2023_2024/Lesson1/Exercises1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "The following list is corrupted:" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "mutations = [\n", 18 | " 'p.Ser31Ala',\n", 19 | " 'p.Pro38Leu',\n", 20 | " 'p.Asn100Lys',\n", 21 | " 'p.LEU110VAL',\n", 22 | " 13,\n", 23 | " 4.0,\n", 24 | " True,\n", 25 | " 'p.Tyr341Leu',\n", 26 | " 'AUG',\n", 27 | " 'p.Tyr0Le',\n", 28 | " 'p.Asn1.3Lys',\n", 29 | " 'p.Arg0Leu'\n", 30 | "]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "How can we check which mutations are valid? Put the valid mutations in a new list.
A reasonable output could be:\n", 38 | "\n", 39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "#### Tips" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "valid_aminos = [\n", 56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n", 57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n", 58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n", 59 | "]\n", 60 | "\n", 61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n", 62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n", 63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "valid_mutations = []\n", 73 | "\n", 74 | "# TODO" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Exercise\n", 82 | "\n", 83 | "Write a script to check if a protein sequence is valid." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# TODO" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Exercise\n", 100 | "\n", 101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# TODO" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.10.6" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /2023_2024/Lesson2/Exercises2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. 
In particular:\n", 10 | "\n", 11 | "- create a `get_valid_mutation` function which takes as input a list of mutations, and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n", 12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n", 13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "mutations = [\n", 23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n", 24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n", 25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n", 26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n", 27 | "]\n", 28 | "\n", 29 | "aa_3L_to_1L = {\n", 30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 34 | "}" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# TO DO" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Set of valid mutations that could belong to the HLA class I histocompatibility antigen protein:\n", 51 | "\n", 52 | "`['p.Pro39Arg', 'p.Gly40Ile', 'p.Leu19Gly', 'p.Val49Ile', 'p.Asn90Asp', 'p.Phe133His', 'p.Leu134Cys', 'p.Glu190Ser', 'p.Gln248Val', 'p.Thr249Ile']`" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 |
"metadata": {}, 58 | "source": [ 59 | "### Exercise\n", 60 | "\n", 61 | "Write a function that generates 1000000 random strings, each 100 characters long, and returns how many of them are valid proteins." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# TO DO" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Exercise\n", 78 | "\n", 79 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# TO DO" 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Python 3 (ipykernel)", 95 | "language": "python", 96 | "name": "python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | "version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.10.6" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 4 113 | } 114 | -------------------------------------------------------------------------------- /2023_2024/Lesson3/Exercises3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n", 10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`."
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# TO DO" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Exercise\n", 27 | "\n", 28 | "Write a function to remove duplicates in a list." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# TO DO" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Exercise\n", 45 | "\n", 46 | "Write a function to calculate the identity between 2 sequences." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# TO DO" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3 (ipykernel)", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.10.6" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 4 80 | } 81 | -------------------------------------------------------------------------------- /2023_2024/Lesson4/Exercises4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Compute all pair-wise identities (number of identical character pairs)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "sequences = [\n", 19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n", 20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n", 21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n", 22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n", 23 | "]\n", 24 | "\n", 25 | "# TO DO" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Exercise\n", 33 | "\n", 34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n", 35 | "- read the genetic code in the [genetic_code.tsv](../data/genetic_code.tsv) file\n", 36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# TO DO" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Exercise\n", 53 | "\n", 54 | "Print the index of the first occurrence of the ATG codon in `dna_seq`.\n", 55 | "\n", 56 | "Try with and without using the `find()` method on strings.\n", 57 | "\n", 58 | "Do the same with the ribonucleotide sequence in the `P04439.rna.fasta` file (manage the `U` <-> `T` conversion)." 
59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "dna_seq = 'AAAAATCCCGAGGCGGCAUGTATATAGGGCTCCGGAGGCGTAATATAAAA'\n", 68 | "\n", 69 | "# TODO" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3 (ipykernel)", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.10.6" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 4 94 | } 95 | -------------------------------------------------------------------------------- /2023_2024/Lesson5/Exercises5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Write a function to search motifs in a sequence.\n", 10 | "\n", 11 | "Try with and without using the `re` module." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCA
TCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTGGCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n", 21 | "\n", 22 | "consensus_motifs = {\n", 23 | " 'motif1': 'AGGAG[GT]',\n", 24 | " 'motif2': 'T[AT]AAT',\n", 25 | " 'motif3': 'GG.A.T[AG]'\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Possible printed output:\n", 34 | "```\n", 35 | "AGGAG[GT]\n", 36 | "\t(969, 975) AGGAGG\n", 37 | "\t(1153, 1159) AGGAGG\n", 38 | "\t(1339, 1345) AGGAGT\n", 39 | "\t(1587, 1593) AGGAGG\n", 40 | "\t(1881, 1887) AGGAGG\n", 41 | "\t(1941, 1947) AGGAGG\n", 42 | "T[AT]AAT\n", 43 | "\t(50, 55) TAAAT\n", 44 | "\t(1098, 1103) TAAAT\n", 45 | "\t(1276, 1281) TAAAT\n", 46 | "GG.A.T[AG]\n", 47 | "\t(248, 255) GGTACTG\n", 48 | "\t(983, 990) GGAAATA\n", 49 | "\t(1910, 1917) GGGACTG\n", 50 | "\t(1980, 1987) GGCAGTG\n", 51 | "```\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# TO DO" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Exercise\n", 68 | "\n", 69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "aa_3L_to_1L = {\n", 79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 83 | "}\n", 84 | "\n", 85 | "#aa_1L_to_3L['A'] --> 'ALA'" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# TO DO" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Exercise\n", 102 | "\n", 103 | "Write a function to remove invalid amino acids from a protein." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# TO DO" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 4 137 | } 138 | -------------------------------------------------------------------------------- /2023_2024/Lesson6/Exercises6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file.
Do that in 2 ways:\n", 10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n", 11 | "- check the documentation out to see how to load such a file format into a Pandas `DataFrame` and do that." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
UUUFPhePhenylalanine
0UUCFPhePhenylalanine
1UUALLeuLeucine
2UUGLLeuLeucine
3CUULLeuLeucine
4CUCLLeuLeucine
...............
58AGGRArgArginine
59GGUGGlyGlycine
60GGCGGlyGlycine
61GGAGGlyGlycine
62GGGGGlyGlycine
\n", 126 | "

63 rows × 4 columns

\n", 127 | "
" 128 | ], 129 | "text/plain": [ 130 | " UUU F Phe Phenylalanine\n", 131 | "0 UUC F Phe Phenylalanine\n", 132 | "1 UUA L Leu Leucine\n", 133 | "2 UUG L Leu Leucine\n", 134 | "3 CUU L Leu Leucine\n", 135 | "4 CUC L Leu Leucine\n", 136 | ".. ... .. ... ...\n", 137 | "58 AGG R Arg Arginine\n", 138 | "59 GGU G Gly Glycine\n", 139 | "60 GGC G Gly Glycine\n", 140 | "61 GGA G Gly Glycine\n", 141 | "62 GGG G Gly Glycine\n", 142 | "\n", 143 | "[63 rows x 4 columns]" 144 | ] 145 | }, 146 | "execution_count": 3, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "import pandas as pd\n", 153 | "\n", 154 | "# TO DO" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Exercise\n", 162 | "\n", 163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with Python lists and Numpy Arrays, and quantify the execution times." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# TO DO" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Exercise\n", 180 | "\n", 181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# TO DO" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3 (ipykernel)", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.10.6" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /2023_2024/Lesson7/Exercises7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# TO DO" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Exercise\n", 26 | "\n", 27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# TO DO" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Exercise\n", 44 | "\n", 45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put in a Python list all the DP values (182, 196, 275, ...)." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# TO DO" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3 (ipykernel)", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.10.6" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /2023_2024/data/P04439.fasta: -------------------------------------------------------------------------------- 1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2 2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF 3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ 4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL 5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT 6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL 7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL 8 | TACKV 9 | -------------------------------------------------------------------------------- 
/2023_2024/data/brca_transcripts.txt: -------------------------------------------------------------------------------- 1 | transcript_id biotype bp aa 2 | ENST00000352993.7 Protein coding 3668 721 3 | ENST00000354071.7 Protein coding 4497 1399 4 | ENST00000461221.5 Nonsense mediated decay 5693 63 5 | ENST00000461574.1 Protein coding 726 242 6 | ENST00000461798.5 Nonsense mediated decay 582 63 7 | -------------------------------------------------------------------------------- /2023_2024/data/genetic_code.tsv: -------------------------------------------------------------------------------- 1 | UUU F Phe Phenylalanine 2 | UUC F Phe Phenylalanine 3 | UUA L Leu Leucine 4 | UUG L Leu Leucine 5 | CUU L Leu Leucine 6 | CUC L Leu Leucine 7 | CUA L Leu Leucine 8 | CUG L Leu Leucine 9 | AUU I Ile Isoleucine 10 | AUC I Ile Isoleucine 11 | AUA I Ile Isoleucine 12 | AUG M Met Methionine (Start) 13 | GUU V Val Valine 14 | GUC V Val Valine 15 | GUA V Val Valine 16 | GUG V Val Valine 17 | UCU S Ser Serine 18 | UCC S Ser Serine 19 | UCA S Ser Serine 20 | UCG S Ser Serine 21 | CCU P Pro Proline 22 | CCC P Pro Proline 23 | CCA P Pro Proline 24 | CCG P Pro Proline 25 | ACU T Thr Threonine 26 | ACC T Thr Threonine 27 | ACA T Thr Threonine 28 | ACG T Thr Threonine 29 | GCU A Ala Alanine 30 | GCC A Ala Alanine 31 | GCA A Ala Alanine 32 | GCG A Ala Alanine 33 | UAU Y Tyr Tyrosine 34 | UAC Y Tyr Tyrosine 35 | UAA X Stop (Stop) 36 | UAG X Stop (Stop) 37 | CAU H His Histidine 38 | CAC H His Histidine 39 | CAA Q Gln Glutamine 40 | CAG Q Gln Glutamine 41 | AAU N Asn Asparagine 42 | AAC N Asn Asparagine 43 | AAA K Lys Lysine 44 | AAG K Lys Lysine 45 | GAU D Asp Aspartic acid 46 | GAC D Asp Aspartic acid 47 | GAA E Glu Glutamic acid 48 | GAG E Glu Glutamic acid 49 | UGU C Cys Cysteine 50 | UGC C Cys Cysteine 51 | UGA X Stop (Stop) 52 | UGG W Trp Tryptophan 53 | CGU R Arg Arginine 54 | CGC R Arg Arginine 55 | CGA R Arg Arginine 56 | CGG R Arg Arginine 57 | AGU S Ser Serine 58 | AGC S Ser Serine 
59 | AGA R Arg Arginine 60 | AGG R Arg Arginine 61 | GGU G Gly Glycine 62 | GGC G Gly Glycine 63 | GGA G Gly Glycine 64 | GGG G Gly Glycine -------------------------------------------------------------------------------- /2023_2024/data/my_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def generate_string(n, alphabet): 4 | s = "" 5 | for i in range(n): 6 | s += random.choice(alphabet) 7 | 8 | return s 9 | -------------------------------------------------------------------------------- /2023_2024/data/uniprot_ids.txt: -------------------------------------------------------------------------------- 1 | Q13188 2 | O00444 3 | P49760 4 | PYYY4Z 5 | Q13627 6 | Q02156 7 | -------------------------------------------------------------------------------- /2023_2024/data/validation.py: -------------------------------------------------------------------------------- 1 | def valid_sequence(sequence, valid_characters): 2 | for c in sequence: 3 | if c.upper() not in valid_characters: 4 | return False 5 | 6 | return True 7 | 8 | def validate_dna(sequence): 9 | return valid_sequence(sequence, ['A', 'T', 'G', 'C']) 10 | 11 | def validate_rna(sequence): 12 | return valid_sequence(sequence, ['A', 'U', 'G', 'C']) 13 | 14 | def validate_protein(sequence): 15 | return valid_sequence( 16 | sequence, 17 | [ 18 | 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 19 | 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' 20 | ] 21 | ) 22 | -------------------------------------------------------------------------------- /2023_2024/images/Integer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2023_2024/images/Integer.jpeg -------------------------------------------------------------------------------- /2023_2024/images/List.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2023_2024/images/List.jpeg -------------------------------------------------------------------------------- /2024_2025/Lesson1/Exercises1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "The following list is corrupted:" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "mutations = [\n", 18 | " 'p.Ser31Ala',\n", 19 | " 'p.Pro38Leu',\n", 20 | " 'p.Asn100Lys',\n", 21 | " 'p.LEU110VAL',\n", 22 | " 13,\n", 23 | " 4.0,\n", 24 | " True,\n", 25 | " 'p.Tyr341Leu',\n", 26 | " 'AUG',\n", 27 | " 'p.Tyr0Le',\n", 28 | " 'p.Asn1.3Lys',\n", 29 | " 'p.Arg0Leu'\n", 30 | "]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "How can we check which mutations are valid? Put the valid mutations in a new list.
A reasonable output could be:\n", 38 | "\n", 39 | "`['p.Ser31Ala', 'p.Pro38Leu', 'p.Asn100Lys', 'p.Leu110Val', 'p.Tyr341Leu']`" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "#### Tips" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "valid_aminos = [\n", 56 | " 'Cys', 'Asp', 'Ser', 'Gln', 'Lys', 'Ile', 'Pro',\n", 57 | " 'Thr', 'Phe', 'Asn', 'Gly', 'His', 'Leu', 'Arg',\n", 58 | " 'Trp', 'Ala', 'Val', 'Glu', 'Tyr', 'Met'\n", 59 | "]\n", 60 | "\n", 61 | "# https://www.geeksforgeeks.org/string-capitalize-python/\n", 62 | "# https://thispointer.com/python-how-to-check-if-an-item-exists-in-list-search-by-value-or-condition/\n", 63 | "# https://stackoverflow.com/questions/1265665/how-can-i-check-if-a-string-represents-an-int-without-using-try-except" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "valid_mutations = []\n", 73 | "\n", 74 | "# TODO" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Exercise\n", 82 | "\n", 83 | "Write a script to check if a protein sequence is valid." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# TODO" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Exercise\n", 100 | "\n", 101 | "Print the amino acid composition of an input protein (23.3% S, 10.1% M, ...)." 
102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# TODO" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.10.6" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 4 135 | } 136 | -------------------------------------------------------------------------------- /2024_2025/Lesson2/Exercises2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "We have a dirty list of mutations. Clean it, and check if the valid mutations can belong to the HLA class I histocompatibility antigen protein. 
In particular:\n", 10 | "\n", 11 | "- create a `get_valid_mutation` function which takes as input a list of mutations, and returns a new list containing only the valid mutations (try to use the `startswith` method to check the presence of the `p.` prefix);\n", 12 | "- read the HLA class I histocompatibility antigen protein sequence from the `P04439.fasta` file;\n", 13 | "- for each valid mutation, check if it can belong to the HLA class I histocompatibility antigen protein sequence (try to use the `lstrip()` method to remove the `p.` prefix from the mutation; note that `lstrip('p.')` strips any leading `p` or `.` characters rather than the literal prefix, so `removeprefix('p.')` is the exact alternative)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "mutations = [\n", 23 | " 'p.thr21ARG', 'AUG', 'p.Pro39Arg', 'p.Gly40Ile', 'p.Thr366Ser', 'p.Leu19Gly',\n", 24 | " 'p.LEU110VAL', 'p.Val49Ile', 'p.Asn90Asp', 13, 'p.Tyr109GIy', 'p.Phe133His',\n", 25 | " 'p.Arg0Leu', 'p.Leu134Cys', 'p.M4t162Arg', True, 'p.Glu190Ser', 'p.Thr213Phe',\n", 26 | " 'p.Tyr0Le', 'p.Cys222Tyr', 'p.GLN248VaL', 'p.Thr249Ile', 'p.Asn1.3Lys', 'p.Ala322Gly'\n", 27 | "]\n", 28 | "\n", 29 | "aa_3L_to_1L = {\n", 30 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 31 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 32 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 33 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 34 | "}" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# TO DO" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Set of valid mutations that could belong to the HLA class I histocompatibility antigen protein:\n", 51 | "\n", 52 | "`['p.Pro39Arg', 'p.Gly40Ile', 'p.Leu19Gly', 'p.Val49Ile', 'p.Asn90Asp', 'p.Phe133His', 'p.Leu134Cys', 'p.Glu190Ser', 'p.Gln248Val', 'p.Thr249Ile']`" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | 
"metadata": {}, 58 | "source": [ 59 | "### Exercise\n", 60 | "\n", 61 | "Write a function which generates 1000000 random strings long 100 characters, and return how many of them are valid proteins." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# TO DO" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Exercise\n", 78 | "\n", 79 | "Write a function that counts the number of times a character appears in the sequence taken as input. Do not use the `count()` method." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# TO DO" 89 | ] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Python 3 (ipykernel)", 95 | "language": "python", 96 | "name": "python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | "version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.10.6" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 4 113 | } 114 | -------------------------------------------------------------------------------- /2024_2025/Lesson3/Exercises3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "- Read the file [./../data/RepeatMasker.subset.bed](../data/RepeatMasker.subset.bed). This is a [BED](https://m.ensembl.org/info/website/upload/bed.html) format file obtained from [UCSC Table browser](http://genome.ucsc.edu/cgi-bin/hgTables).\n", 10 | "- Separate rows relating to chromosome 1 into a different file called `RepeatMasker.subset.chr1.bed`." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# TO DO" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Exercise\n", 27 | "\n", 28 | "Write a function to remove duplicates in a list." 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# TO DO" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Exercise\n", 45 | "\n", 46 | "Write a function to calculate the identity between 2 sequences." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# TO DO" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3 (ipykernel)", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.10.6" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 4 80 | } 81 | -------------------------------------------------------------------------------- /2024_2025/Lesson4/Exercises4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Compute all pair-wise identities (number of identical character pairs)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "sequences = [\n", 19 | " 'CGAACGCCCTAGGCGGGTCAGGGCCGAGGGCGGAGACCAGCGATACAATA',\n", 20 | " 'CGCCCAATCGCCTCTGGAAGTTTGGATGCCCCGTGCGGTAGCCCCAGGTC',\n", 21 | " 'TTTGAGCGCGCGCGCCTCTGTTGAAAACGCCCCGTTCTCGCCGGACAAAA',\n", 22 | " 'AGCCCGAAGAATAATGGACTTTCGCCTTTGTCGCAGCCAGCGATTCCGAC'\n", 23 | "]\n", 24 | "\n", 25 | "# TO DO" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Exercise\n", 33 | "\n", 34 | "- read the HLA class I histocompatibility antigen protein sequence from the [P04439.fasta](../data/P04439.fasta) file;\n", 35 | "- read the genetic code in the [genetic_code.tsv](../data/genetic_code.tsv) file\n", 36 | "- write the corresponding ribonucleotide sequence in a file, `P04439.rna.fasta`, replacing each amino acid with the corresponding codon." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# TO DO" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Exercise\n", 53 | "\n", 54 | "Print the index of the first occurrence of the ATG codon in `dna_seq`.\n", 55 | "\n", 56 | "Try with and without using the `find()` method on strings.\n", 57 | "\n", 58 | "Do the same with the ribonucleotide sequence in the `P04439.rna.fasta` file (manage the `U` <-> `T` conversion)." 
59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "dna_seq = 'AAAAATCCCGAGGCGGCAUGTATATAGGGCTCCGGAGGCGTAATATAAAA'\n", 68 | "\n", 69 | "# TODO" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3 (ipykernel)", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.10.6" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 4 94 | } 95 | -------------------------------------------------------------------------------- /2024_2025/Lesson5/Exercises5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Write a function to search motifs in a sequence.\n", 10 | "\n", 11 | "Try with and without using the `re` module." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "seq = 'TAGGATTACAGGCATGAGCTACCGTATAATGGCCAGGCCCCCTGCCTTTGTAAATAAATTTTCACTGGAACCTGGACACACTTGTTTATGTGTTGTTTGTGCCTGTTTTCACGCTGCGGCAGGAAAGTTGAGTCGTTGTGTCAGAGACCAGAGAGAGAGCCTGCAGAACCTCAAATACTATCTGGCCCTTGCCAGAAAAAGTTTACCAACCCCCTGCCTCCCTGGAATGGGTGGAGGGTGGTTGTAAAGGTACTGGAGGATCTGAAGACATAATAGGGTCCGTGACCCTTGTGAGGTTGTGAAGCTCCCTTAAGGCACATGGTGGCTGGGCTGTGGATTTGGGGTATGGGCAGAGAGTGTGGAGAGCACTTCCAGGGGCCATGTCTGAGAGACTACATGATGCCACTTTGAATGCCCAGTTTGTTCATCCTTTTCTGTTTTCCCCACTTCCCCAGATGGGTGATCTACAATGACCAGAAAGTGTGTGCCTCCGAGAAGCCGCCCAAGGATATAATACATCTACTTCTACCAGAGAGTGGCCAGCTAAGAGCCTGCCTCACCCCTTACCAATGAGGGCAGGGGAAGACCACCTGGCATGAGGGAGAGGGGCTGAGGGATGGACTTCAGCCCCTCTGCTCTGTACCCTTTTTCCTTTTGTCCCCGGCAGCAGGGAAGAAGCTGGAGGCCGTGGGAGAATGGCTGGGCAGAGCAGAGGGGCAGCGATAGACTCTGGGGATGGAGCAGGACGGGGACGGGAGGGGCCGGCCACCTGTCTGTAAGGAGACTTTGTTGCTTCCCCTGCCCCCGGAATCCACAGTGCTCTGCTTCTCTGTGTCGCCCCGCCCAGCCCCCTGGTGTGGAGGGAGGGGTCTCGTTTGTGCGCGTGGGTGTAGCTTTGTGCATCCTCTCCCAGTGGAGCGATCACCTGTGCCTCCCCTCCCCCTTTGTTTGCCCCTGTGTGGTTGGTCAAGGAGGGATGTGAGGGAAATAGGGACCCCCCGACTTGCCCTCCTGCCTCAGTCTTTCCCCCACCCTGTCTCTTCCTTGTCCTTCTCTGGAAAATGCCAAAATACACGATGTGAATAAAAGTACAACGGCTAAATTGTGTCCTGTTTGATACCTTGGGGGAGAGGCTTACCTTCCTGGGGTTAGCAGGAGGGCGCTTAAGAAAACTCCTAACTCTGGCCGCCTCCCTGCCAAAGTCAAGTCTCCACTTTTCACTGGTTCTAGAGCTCTAGGAAAATTGGGGTTGGGTGGGGAGGTGGAGTAGAGTGACTAAATGCCGACACAAAGCCAAGGAAAGATGGAGTGAAGAACCCTTCCCTCTCTTTATTCACACAGGAGTGGAGGATTTCCCAAATGTCCCTAACTGGCTAGCTGGCTTCAGGCTGGGACTCAGTCCCTGCAGTTCCTGCCAGGCCTTGCCAGCCGGGGCGAGGGTTGGGATGATCCTGGCGGCCTATGCCTTATAATGCTGCCCCTCCCGCTGTGAACCCTGCATTTGTCCCGCAAGTTTTCACTCAGGTAGACTCCCTGGGTACAAGGGTGCCTGCTCAGCAGTCGGGCATGAGCTGCTCCGATGGGCGAAGGAGGTTGTCTATCCCACAGTTGGAGAGGGGCCCTCTCTGCCCCAGTGGGCGATCTGGGCTACGGCCAAGTTGCCACCAGCTAGTTCCGCTTGAAAACCACTTCTGGCCCCGTGGGGGACTCAAGTCGCCAAGCGAGGGTTCCCCTGAGCGCCGGAGCTCACAGGTCTCGCCTTGTCCCGAAAGCCCCGCAATCGAGGCGGAGGCGACCGAGCCCCCGACTCTCCTAGAACGTTGCCACAAGAAGGGGGAACGTCGGAACAGTGCA
TCATCGGGCGGCGGCCGGGGCGGCGGCAGGAGGGCGGGCGGGGGGCAGGGCTCCGGGGGACTGGGCGGGCCATGGCGGAGGACGGCGAGGAGGCGGAGTTCCACTTCGCGGCGCTCTATATAAGTGGGCAGTGGCCGCGACTGCGCGCAGACACTGACCTTCAGCGCCTCGGCTCCAGCGCCATGGCGCCCTCCAGGAAGTTCTTCGTTGGGGGAAACTGGAAGATGAACGGGCGGAAGCAGAGTCTGGGGGAGCTCATCGGCACTCTGAACGCGGCCAAGGTGCCGGCCGACACCG'\n", 21 | "\n", 22 | "consensus_motifs = {\n", 23 | " 'motif1': 'AGGAG[GT]',\n", 24 | " 'motif2': 'T[AT]AAT',\n", 25 | " 'motif3': 'GG.A.T[AG]'\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "Possible printed output:\n", 34 | "```\n", 35 | "AGGAG[GT]\n", 36 | "\t(969, 975) AGGAGG\n", 37 | "\t(1153, 1159) AGGAGG\n", 38 | "\t(1339, 1345) AGGAGT\n", 39 | "\t(1587, 1593) AGGAGG\n", 40 | "\t(1881, 1887) AGGAGG\n", 41 | "\t(1941, 1947) AGGAGG\n", 42 | "T[AT]AAT\n", 43 | "\t(50, 55) TAAAT\n", 44 | "\t(1098, 1103) TAAAT\n", 45 | "\t(1276, 1281) TAAAT\n", 46 | "GG.A.T[AG]\n", 47 | "\t(248, 255) GGTACTG\n", 48 | "\t(983, 990) GGAAATA\n", 49 | "\t(1910, 1917) GGGACTG\n", 50 | "\t(1980, 1987) GGCAGTG\n", 51 | "```\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# TO DO" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Exercise\n", 68 | "\n", 69 | "Starting from the `aa_3L_to_1L` dictionary, create a new `aa_1L_to_3L` dictionary where the keys become the values and the values become the keys." 
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 2, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "aa_3L_to_1L = {\n", 79 | " 'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',\n", 80 | " 'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',\n", 81 | " 'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',\n", 82 | " 'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M'\n", 83 | "}\n", 84 | "\n", 85 | "#aa_1L_to_3L['A'] --> 'ALA'" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# TO DO" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### Exercise\n", 102 | "\n", 103 | "Write a function to remove invalid amino acids from a protein sequence." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# TO DO" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3 (ipykernel)", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.10.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 4 137 | } 138 | -------------------------------------------------------------------------------- /2024_2025/Lesson6/Exercises6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file. 
Do that in 2 ways:\n", 10 | "- read the file, put each column in a different dictionary, and create the Pandas `DataFrame` from these dictionaries.\n", 11 | "- check the documentation out to see how to load such a file format into a Pandas `DataFrame` and do that." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "
\n", 23 | "\n", 36 | "\n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
UUUFPhePhenylalanine
0UUCFPhePhenylalanine
1UUALLeuLeucine
2UUGLLeuLeucine
3CUULLeuLeucine
4CUCLLeuLeucine
...............
58AGGRArgArginine
59GGUGGlyGlycine
60GGCGGlyGlycine
61GGAGGlyGlycine
62GGGGGlyGlycine
\n", 126 | "

63 rows × 4 columns

\n", 127 | "
" 128 | ], 129 | "text/plain": [ 130 | " UUU F Phe Phenylalanine\n", 131 | "0 UUC F Phe Phenylalanine\n", 132 | "1 UUA L Leu Leucine\n", 133 | "2 UUG L Leu Leucine\n", 134 | "3 CUU L Leu Leucine\n", 135 | "4 CUC L Leu Leucine\n", 136 | ".. ... .. ... ...\n", 137 | "58 AGG R Arg Arginine\n", 138 | "59 GGU G Gly Glycine\n", 139 | "60 GGC G Gly Glycine\n", 140 | "61 GGA G Gly Glycine\n", 141 | "62 GGG G Gly Glycine\n", 142 | "\n", 143 | "[63 rows x 4 columns]" 144 | ] 145 | }, 146 | "execution_count": 3, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "import pandas as pd\n", 153 | "\n", 154 | "# TO DO" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Exercise\n", 162 | "\n", 163 | "Generate a million random integers from 0 to 999 and sort them in ascending order. Do it with Python lists and Numpy Arrays, and quantify the execution times." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 4, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# TO DO" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### Exercise\n", 180 | "\n", 181 | "Write a function that takes a directory as input (for example, `/home`) and prints only the subdirectories, ignoring the files in the specified directory." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# TO DO" 191 | ] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3 (ipykernel)", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.10.6" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 4 215 | } 216 | -------------------------------------------------------------------------------- /2024_2025/Lesson7/Exercises7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Exercise\n", 8 | "\n", 9 | "Create a Pandas `DataFrame` starting from the [genetic_code.tsv](../data/genetic_code.tsv) file, writing the column names." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "# TO DO" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Exercise\n", 26 | "\n", 27 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and remove the variants with quality lower than 30." 
28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# TO DO" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Exercise\n", 44 | "\n", 45 | "Using Pandas, read the variants in the [trio.2010_06.ychr.sites.vcf](../data/trio.2010_06.ychr.sites.vcf) file (in [VCF format](https://en.wikipedia.org/wiki/Variant_Call_Format)), and put in a Python list all the DP values (182, 196, 275, ...)." 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# TO DO" 55 | ] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3 (ipykernel)", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.10.6" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /2024_2025/data/P04439.fasta: -------------------------------------------------------------------------------- 1 | >sp|P04439|HLAA_HUMAN HLA class I histocompatibility antigen, A alpha chain OS=Homo sapiens OX=9606 GN=HLA-A PE=1 SV=2 2 | MAVMAPRTLLLLLSGALALTQTWAGSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRF 3 | DSDAASQRMEPRAPWIEQEGPEYWDQETRNVKAQSQTDRVDLGTLRGYYNQSEAGSHTIQ 4 | IMYGCDVGSDGRFLRGYRQDAYDGKDYIALNEDLRSWTAADMAAQITKRKWEAAHEAEQL 5 | RAYLDGTCVEWLRRYLENGKETLQRTDPPKTHMTHHPISDHEATLRCWALGFYPAEITLT 6 | WQRDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPKPLTLRWEL 7 | SSQPTIPIVGIIAGLVLLGAVITGAVVAAVMWRRKSSDRKGGSYTQAASSDSAQGSDVSL 8 | TACKV 9 | -------------------------------------------------------------------------------- 
/2024_2025/data/brca_transcripts.txt: -------------------------------------------------------------------------------- 1 | transcript_id biotype bp aa 2 | ENST00000352993.7 Protein coding 3668 721 3 | ENST00000354071.7 Protein coding 4497 1399 4 | ENST00000461221.5 Nonsense mediated decay 5693 63 5 | ENST00000461574.1 Protein coding 726 242 6 | ENST00000461798.5 Nonsense mediated decay 582 63 7 | -------------------------------------------------------------------------------- /2024_2025/data/genetic_code.tsv: -------------------------------------------------------------------------------- 1 | UUU F Phe Phenylalanine 2 | UUC F Phe Phenylalanine 3 | UUA L Leu Leucine 4 | UUG L Leu Leucine 5 | CUU L Leu Leucine 6 | CUC L Leu Leucine 7 | CUA L Leu Leucine 8 | CUG L Leu Leucine 9 | AUU I Ile Isoleucine 10 | AUC I Ile Isoleucine 11 | AUA I Ile Isoleucine 12 | AUG M Met Methionine (Start) 13 | GUU V Val Valine 14 | GUC V Val Valine 15 | GUA V Val Valine 16 | GUG V Val Valine 17 | UCU S Ser Serine 18 | UCC S Ser Serine 19 | UCA S Ser Serine 20 | UCG S Ser Serine 21 | CCU P Pro Proline 22 | CCC P Pro Proline 23 | CCA P Pro Proline 24 | CCG P Pro Proline 25 | ACU T Thr Threonine 26 | ACC T Thr Threonine 27 | ACA T Thr Threonine 28 | ACG T Thr Threonine 29 | GCU A Ala Alanine 30 | GCC A Ala Alanine 31 | GCA A Ala Alanine 32 | GCG A Ala Alanine 33 | UAU Y Tyr Tyrosine 34 | UAC Y Tyr Tyrosine 35 | UAA X Stop (Stop) 36 | UAG X Stop (Stop) 37 | CAU H His Histidine 38 | CAC H His Histidine 39 | CAA Q Gln Glutamine 40 | CAG Q Gln Glutamine 41 | AAU N Asn Asparagine 42 | AAC N Asn Asparagine 43 | AAA K Lys Lysine 44 | AAG K Lys Lysine 45 | GAU D Asp Aspartic acid 46 | GAC D Asp Aspartic acid 47 | GAA E Glu Glutamic acid 48 | GAG E Glu Glutamic acid 49 | UGU C Cys Cysteine 50 | UGC C Cys Cysteine 51 | UGA X Stop (Stop) 52 | UGG W Trp Tryptophan 53 | CGU R Arg Arginine 54 | CGC R Arg Arginine 55 | CGA R Arg Arginine 56 | CGG R Arg Arginine 57 | AGU S Ser Serine 58 | AGC S Ser Serine 
59 | AGA R Arg Arginine 60 | AGG R Arg Arginine 61 | GGU G Gly Glycine 62 | GGC G Gly Glycine 63 | GGA G Gly Glycine 64 | GGG G Gly Glycine -------------------------------------------------------------------------------- /2024_2025/data/my_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def generate_string(n, alphabet): 4 | s = "" 5 | for i in range(n): 6 | s += random.choice(alphabet) 7 | 8 | return s 9 | -------------------------------------------------------------------------------- /2024_2025/data/uniprot_ids.txt: -------------------------------------------------------------------------------- 1 | Q13188 2 | O00444 3 | P49760 4 | PYYY4Z 5 | Q13627 6 | Q02156 7 | -------------------------------------------------------------------------------- /2024_2025/data/validation.py: -------------------------------------------------------------------------------- 1 | def valid_sequence(sequence, valid_characters): 2 | for c in sequence: 3 | if c.upper() not in valid_characters: 4 | return False 5 | 6 | return True 7 | 8 | def validate_dna(sequence): 9 | return valid_sequence(sequence, ['A', 'T', 'G', 'C']) 10 | 11 | def validate_rna(sequence): 12 | return valid_sequence(sequence, ['A', 'U', 'G', 'C']) 13 | 14 | def validate_protein(sequence): 15 | return valid_sequence( 16 | sequence, 17 | [ 18 | 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 19 | 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y' 20 | ] 21 | ) 22 | -------------------------------------------------------------------------------- /2024_2025/images/Integer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2024_2025/images/Integer.jpeg -------------------------------------------------------------------------------- /2024_2025/images/List.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AndreaGuarracino/DataStructuresForBioinformatics/4097823c75e78a6963d224e113833c45315285e6/2024_2025/images/List.jpeg -------------------------------------------------------------------------------- /ExamResults/2020.12.23.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0027557 | 27 | 4 | | 0278947 | 29 | 5 | | 0280655 | 30 | 6 | | 0281512 | 26 | 7 | | 0285818 | 28 | 8 | | 0287922 | 30 | 9 | -------------------------------------------------------------------------------- /ExamResults/2021.01.21.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0287959 | 27 | 4 | -------------------------------------------------------------------------------- /ExamResults/2021.04.08.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0301264 | 30 | 4 | | 0301247 | 30 | 5 | | 0208810 | 21 | 6 | -------------------------------------------------------------------------------- /ExamResults/2021.06.10.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0279152 | 28 | 4 | -------------------------------------------------------------------------------- /ExamResults/2021.09.23.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|--------| 3 | | 0296657 | absent | 4 | | 0259940 | 27 | 5 | | 0292176 | absent | 6 | -------------------------------------------------------------------------------- /ExamResults/2021.12.20.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | 
|------------|-------| 3 | | 0291176 | 30 | 4 | | 0291151 | 28 | 5 | | 0292143 | 26 | 6 | -------------------------------------------------------------------------------- /ExamResults/2022.01.13.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0293463 | 28 | 4 | -------------------------------------------------------------------------------- /ExamResults/2022.04.22.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|--------| 3 | | 0294083 | 29 | 4 | | 0296657 | 28 | 5 | | 0299326 | absent | 6 | | 0292176 | 24 | 7 | | 0292378 | 30 | 8 | -------------------------------------------------------------------------------- /ExamResults/2022.07.14.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|--------| 3 | | 0299326 | absent | 4 | -------------------------------------------------------------------------------- /ExamResults/2022.09.08.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0299326 | 28 | 4 | -------------------------------------------------------------------------------- /ExamResults/2022.12.22.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0307827 | 29 | 4 | | 0316609 | 30 | 5 | | 0309343 | 30L | 6 | -------------------------------------------------------------------------------- /ExamResults/2022.12.23.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0316001 | 25 | 4 | | 0302429 | 30 | 5 | | 0316603 | 28 | 6 | | 0316680 | 30 | 7 | | 0317105 | 27 | 8 | 
-------------------------------------------------------------------------------- /ExamResults/2023.02.09.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0315940 | 27 | 4 | | 0292781 | 27 | 5 | -------------------------------------------------------------------------------- /ExamResults/2023.06.12.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0315940 | 29 | 4 | -------------------------------------------------------------------------------- /ExamResults/2023.09.07.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0329476 | 30 | 4 | -------------------------------------------------------------------------------- /ExamResults/2024.01.15.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0336987 | 27 | 4 | | 0323251 | 27 | 5 | -------------------------------------------------------------------------------- /ExamResults/2024.06.11.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0329807 | 26 | 4 | -------------------------------------------------------------------------------- /ExamResults/2024.12.19.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0350219 | 30 | 4 | | 0345526 | 29 | 5 | -------------------------------------------------------------------------------- /ExamResults/2025.02.13.md: -------------------------------------------------------------------------------- 1 | | Student ID | Grade | 2 | |------------|-------| 3 | | 0334169 | 25 | 4 | 
-------------------------------------------------------------------------------- /ExamResults/plot_date_vs_grade.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import os 3 | import pandas as pd 4 | from datetime import datetime 5 | import seaborn as sns 6 | 7 | # Function to read the MD file and return a DataFrame 8 | def read_md_file(filepath): 9 | with open(filepath, 'r') as file: 10 | lines = file.readlines() 11 | lines = lines[2:] # Skip the header and the underline 12 | student_ids = [] 13 | grades = [] 14 | for line in lines: 15 | parts = line.strip().split('|') 16 | if len(parts) < 3: 17 | continue 18 | grade = parts[2].strip() 19 | if grade == 'absent': # Skip the row if grade is 'absent' 20 | continue 21 | elif grade == '30L': 22 | grades.append(32) 23 | else: 24 | grades.append(int(grade)) 25 | student_ids.append(parts[1].strip()) 26 | return pd.DataFrame({'StudentID': student_ids, 'Grade': grades}) 27 | 28 | # Directory containing the MD files 29 | directory = os.path.dirname(os.path.realpath(__file__)) 30 | 31 | # List to hold DataFrames for each date 32 | dfs = [] 33 | 34 | # Iterate through the files in the directory 35 | for filename in os.listdir(directory): 36 | if filename.endswith(".md"): 37 | date_str = filename[:-3] # Remove the .md extension 38 | date = datetime.strptime(date_str, '%Y.%m.%d').date() # Extract the date only 39 | filepath = os.path.join(directory, filename) 40 | df = read_md_file(filepath) 41 | df['Date'] = date 42 | dfs.append(df[['Date', 'Grade']]) 43 | 44 | # Concatenate all DataFrames 45 | data = pd.concat(dfs, ignore_index=True) 46 | 47 | # Sort by Date 48 | data = data.sort_values(by='Date') 49 | 50 | # Set the style for the plot 51 | sns.set_style("whitegrid") 52 | plt.figure(figsize=(20, 12)) 53 | 54 | # Create the boxplot 55 | sns.boxplot(x='Date', y='Grade', data=data, color="lightblue", width=0.5) 56 | 57 | # Add swarmplot for individual 
data points 58 | sns.swarmplot(x='Date', y='Grade', data=data, color="navy", size=6, alpha=0.6) 59 | 60 | # Customize the plot 61 | plt.title('Data Structures for Bioinformatics Exam\nDistribution of Grades Over Time', fontsize=28, pad=20) 62 | plt.xlabel('Exam Date', fontsize=24, labelpad=15) 63 | plt.ylabel('Grade', fontsize=24, labelpad=15) 64 | plt.xticks(rotation=45, ha='right', fontsize=20) 65 | plt.yticks(fontsize=20) 66 | 67 | # Increase tick label size 68 | plt.tick_params(axis='both', which='major', labelsize=20) 69 | 70 | # Add a horizontal line for the mean grade 71 | mean_grade = data['Grade'].mean() 72 | plt.axhline(y=mean_grade, color='red', linestyle='--', alpha=0.7, linewidth=2) 73 | plt.text(plt.xlim()[1], mean_grade, f' Mean: {mean_grade:.2f}', 74 | verticalalignment='center', fontsize=20, color='red', fontweight='bold') 75 | 76 | # Adjust the layout and display the plot 77 | plt.tight_layout() 78 | plt.show() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataStructuresForBioinformatics 2 | 3 | Material for the **Data Structures for Bioinformatics** course (Master’s degree in Bioinformatics, University of Rome Tor Vergata). 4 | 5 | **Class schedule (2024/2025)**: every Thursday, 15:00-17:00 (GMT+2) 6 | 7 | 8 | 9 | 10 | --------------------------------------------------------------------------------