├── Ch14 ├── sample.txt ├── app │ ├── requirements.txt │ └── app.py ├── README.md ├── Dockerfile ├── ch14-cloud-computing.yml └── s3-bucket.yaml ├── Ch05 ├── README.md ├── ch05-alignment.yml ├── Ch05-1-qc-data.ipynb └── Ch05-4-variant-calling.ipynb ├── Ch06 ├── README.md └── ch06-annotation.yml ├── Requirements.txt ├── Ch02 ├── README.md ├── Ch02-3-pandas-memory.ipynb └── Ch02-2-pandas-pitfalls.ipynb ├── Ch03 ├── README.md ├── example.fasta ├── sample.py ├── Ch03-1-pycodestyle.ipynb ├── Ch03-2-sequence-manipulation.ipynb ├── Ch03-3-read-alignment.ipynb ├── pycodestyle.ipynb ├── Ch03-4-test-writing.ipynb └── Ch02-1-pandas-basic.ipynb ├── Ch04 ├── README.md ├── ch04-data-science.yml ├── Ch04-3-k-means.ipynb ├── Ch04-2-PCA.ipynb ├── Ch04-6-seaborn.ipynb └── Ch04-4-decision-trees.ipynb ├── Ch10 ├── README.md ├── ch10-phylogenetics.yml ├── .ipynb_checkpoints │ └── Ch10-1-preparing-dataset-checkpoint.ipynb ├── Ch10-6-visualizing-phylogenetics.ipynb ├── Ch10-4-reconstructing-trees.ipynb ├── Ch10-5-recursive-trees.ipynb └── Ch10-2-aligning-genetic-data.ipynb ├── Ch11 ├── README.md ├── ch11-population-genomics.yml ├── Ch11-3-exploring-with-sgkit.ipynb ├── Ch11-2-using-sgkit.ipynb └── Ch11-1-plink.ipynb ├── Ch13 ├── README.md └── ch13-genome-editing.yml ├── Ch16 ├── README.md └── ch16-more-workflows.yml ├── Ch17 ├── README.md └── ch17-machine-learning.yml ├── Ch18 ├── README.md └── ch18-single-cell.yml ├── Ch08 ├── README.md ├── ch08-databases.yml ├── Ch08-1-genbank-ncbi.ipynb ├── Ch08-3-pdb-uniprot.ipynb └── Ch08-2-using-sra.ipynb ├── Ch09 ├── README.md ├── ch09-proteins.yml ├── Ch09-5-proteomics.ipynb ├── Ch09-2-molecular-distances.ipynb ├── Ch09-4-py3dmol.ipynb └── Ch09-3-geometric-operations.ipynb ├── Ch12 ├── README.md ├── ch12-applications.yml └── Ch12-1-cobrapy.ipynb ├── Ch15 ├── README.md ├── nextflow │ └── nextflow.config ├── ch15-workflows.yml ├── galaxy │ └── Ch15-1-introducing-galaxy.ipynb └── snakemake │ └── Snakefile ├── README.md ├── Ch01 ├── bioinformatics_base.yml ├── README.md ├── Welcome.py └── Welcome.ipynb ├── Ch07 ├── ch07-genomes.yml ├── Ch07-4-genome-assessment.ipynb ├── Ch07-3-long-read-assembly.ipynb └── Ch07-1-genomes.ipynb ├── LICENSE └── docker └── main └── Dockerfile /Ch14/sample.txt: -------------------------------------------------------------------------------- 1 | This is a sample file. -------------------------------------------------------------------------------- /Ch14/app/requirements.txt: -------------------------------------------------------------------------------- 1 | flask==2.2.3 2 | -------------------------------------------------------------------------------- /Ch05/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch05 2 | -------------------------------------------------------------------------------- /Ch06/README.md: -------------------------------------------------------------------------------- 1 | This is the REAMDE for Ch06. 2 | -------------------------------------------------------------------------------- /Requirements.txt: -------------------------------------------------------------------------------- 1 | # SAB - Requirements.txt 2 | -------------------------------------------------------------------------------- /Ch02/README.md: -------------------------------------------------------------------------------- 1 | This is the README file for Ch02 2 | -------------------------------------------------------------------------------- /Ch03/README.md: -------------------------------------------------------------------------------- 1 | This is the README file for Ch03. 2 | -------------------------------------------------------------------------------- /Ch04/README.md: -------------------------------------------------------------------------------- 1 | This is the README file for Ch04. 2 | -------------------------------------------------------------------------------- /Ch10/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch10. 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch11/README.md: -------------------------------------------------------------------------------- 1 | This is the REAMDE for Ch11. 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch13/README.md: -------------------------------------------------------------------------------- 1 | This is the REAMDE for Ch13. 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch14/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch14. 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch16/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch16. 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch17/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch17. 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch18/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch18 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch08/README.md: -------------------------------------------------------------------------------- 1 | This is the REAMDE file for Ch08. 2 | 3 | 4 | -------------------------------------------------------------------------------- /Ch09/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch09. 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /Ch12/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch12. 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /Ch15/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch15. 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /Ch03/example.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | ATCGTACGATCG 3 | GATCGTACGATC 4 | >seq2 5 | CGTAGCTAGCTA 6 | 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bioinformatics-with-Python-Cookbook-Fourth-Edition 2 | Bioinformatics with Python Cookbook - Fourth Edition, published by Packt 3 | -------------------------------------------------------------------------------- /Ch14/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | FROM python:3.9-slim 3 | 4 | WORKDIR /app 5 | 6 | COPY app/requirements.txt . 7 | RUN pip install --no-cache-dir -r requirements.txt 8 | 9 | COPY app/ . 10 | 11 | EXPOSE 5000 12 | 13 | CMD ["python", "app.py"] 14 | -------------------------------------------------------------------------------- /Ch14/app/app.py: -------------------------------------------------------------------------------- 1 | 2 | from flask import Flask 3 | app = Flask(__name__) 4 | 5 | @app.route('/') 6 | def hello(): 7 | return "Hello from the Docker container!" 8 | 9 | if __name__ == "__main__": 10 | app.run(host='0.0.0.0', port=5000) 11 | -------------------------------------------------------------------------------- /Ch08/ch08-databases.yml: -------------------------------------------------------------------------------- 1 | name: ch08-databases 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.11 8 | - biopython 9 | - jupyterlab 10 | - matplotlib 11 | - numpy 12 | - pandas 13 | - scipy 14 | - pysradb -------------------------------------------------------------------------------- /Ch13/ch13-genome-editing.yml: -------------------------------------------------------------------------------- 1 | name: ch13-genome-editing 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython 9 | - jupyterlab 10 | - matplotlib 11 | - numpy 12 | - pandas 13 | - scipy 14 | - notebook 15 | - seaborn -------------------------------------------------------------------------------- /Ch06/ch06-annotation.yml: -------------------------------------------------------------------------------- 1 | name: bioinformatics_base 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.11 8 | - biopython=1.84 9 | - jupyterlab 10 | - matplotlib 11 | - numpy 12 | - pandas 13 | - scipy 14 | - cyvcf2 15 | - notebook 16 | 17 | -------------------------------------------------------------------------------- /Ch01/bioinformatics_base.yml: -------------------------------------------------------------------------------- 1 | name: bioinformatics_base 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.9.2 11 | - numpy=2.1.0 12 | - pandas=2.2.3 13 | - scipy=1.14.1 14 | - notebook 15 | 16 | -------------------------------------------------------------------------------- /Ch10/ch10-phylogenetics.yml: -------------------------------------------------------------------------------- 1 | name: ch10-phylogenetics 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.11 8 | - biopython 9 | - jupyterlab 10 | - matplotlib 11 | - numpy 12 | - pandas 13 | - scipy 14 | - dendropy 15 | - trimal 16 | - mafft 17 | - muscle 18 | - raxml-ng -------------------------------------------------------------------------------- /Ch05/ch05-alignment.yml: -------------------------------------------------------------------------------- 1 | name: bioinformatics_base 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.10.3 11 | - numpy=2.1.0 12 | - pandas=2.2.3 13 | - scipy=1.14.1 14 | - pysam=0.23.3 15 | - notebook 16 | 17 | -------------------------------------------------------------------------------- /Ch07/ch07-genomes.yml: -------------------------------------------------------------------------------- 1 | name: ch07-genomes 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.11 8 | - biopython 9 | - jupyterlab 10 | - matplotlib 11 | - numpy 12 | - pandas 13 | - scipy 14 | - pyfastx 15 | - networkx 16 | - raven-assembler 17 | - pip 18 | - pip: 19 | - quast -------------------------------------------------------------------------------- /Ch01/README.md: -------------------------------------------------------------------------------- 1 | This is the README for Ch01. 2 | 3 | 4 | These are the commands to build and run the Docker container for the book: 5 | 6 | docker build -t bio https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-fourth-edition.git#main:docker/main 7 | 8 | docker run -ti -p 9875:9875 -v /Users/shanebrubaker/work/docker_files:/data bio 9 | 10 | 11 | -------------------------------------------------------------------------------- /Ch14/ch14-cloud-computing.yml: -------------------------------------------------------------------------------- 1 | name: ch14-cloud-computing 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.9.2 11 | - numpy=2.1.0 12 | - pandas=2.2.3 13 | - scipy=1.14.1 14 | - cyvcf2=0.31.1 15 | - notebook 16 | - boto3=1.38.13 17 | -------------------------------------------------------------------------------- /Ch15/nextflow/nextflow.config: -------------------------------------------------------------------------------- 1 | 2 | process { 3 | cpus = 1 4 | memory = '2 GB' 5 | time = '30m' 6 | } 7 | 8 | executor { 9 | name = 'local' 10 | cpus = 4 11 | } 12 | 13 | report { 14 | enabled = true 15 | file = 'reports/execution_report.html' 16 | } 17 | 18 | timeline { 19 | enabled = true 20 | file = 'reports/timeline.html' 21 | } 22 | -------------------------------------------------------------------------------- /Ch15/ch15-workflows.yml: -------------------------------------------------------------------------------- 1 | name: ch15-workflows 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.9.2 11 | - numpy=2.1.0 12 | - pandas=2.2.3 13 | - scipy=1.14. 14 | - cyvcf2=0.31.1 15 | - notebook 16 | - bioblend=1.5.0 17 | - sra-tools>=3.0 18 | 19 | -------------------------------------------------------------------------------- /Ch16/ch16-more-workflows.yml: -------------------------------------------------------------------------------- 1 | name: ch16-more-workflows 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.9.2 11 | - numpy=2.1.0 12 | - pandas=2.2.3 13 | - scipy=1.14. 14 | - cyvcf2=0.31.1 15 | - notebook 16 | - bioblend=1.5.0 17 | - sra-tools>=3.0 18 | 19 | -------------------------------------------------------------------------------- /Ch14/s3-bucket.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Description: CloudFormation template to create an S3 bucket 3 | 4 | Resources: 5 | MyS3Bucket: 6 | Type: AWS::S3::Bucket 7 | Properties: 8 | BucketName: my-simple-cf-bucket-123456 # Must be globally unique 9 | 10 | Outputs: 11 | BucketName: 12 | Description: The name of the created S3 bucket 13 | Value: !Ref MyS3Bucket 14 | 15 | -------------------------------------------------------------------------------- /Ch09/ch09-proteins.yml: -------------------------------------------------------------------------------- 1 | name: ch09-proteins 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.9.2 11 | - seaborn=0.13.2 12 | - numpy=2.1.0 13 | - pandas=2.2.3 14 | - scipy=1.14. 15 | - cyvcf2=0.31.1 16 | - notebook 17 | - nglview=3.1.4 18 | - pyteomics=4.7.5 19 | - py3Dmol=2.5.1 20 | 21 | -------------------------------------------------------------------------------- /Ch11/ch11-population-genomics.yml: -------------------------------------------------------------------------------- 1 | name: ch11-population-genomics 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.11 8 | - biopython 9 | - jupyterlab 10 | - matplotlib 11 | - numpy 12 | - pandas 13 | - seaborn 14 | - scipy 15 | - notebook 16 | - dask 17 | - dask-ml 18 | - scikit-learn 19 | - pyarrow 20 | - pip 21 | - pip: 22 | - sgkit[plink] 23 | - cbgen -------------------------------------------------------------------------------- /Ch04/ch04-data-science.yml: -------------------------------------------------------------------------------- 1 | name: ch04-data-science 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.9.2 11 | - numpy=2.1.0 12 | - pandas=2.2.3 13 | - scipy=1.14.1 14 | - notebook 15 | - scikit-learn=1.7.0 16 | - scipy=1.14.1 17 | - seaborn=0.13.2 18 | - umap-learn=0.5.7 19 | - ipywidgets=8.1.7 20 | 21 | -------------------------------------------------------------------------------- /Ch12/ch12-applications.yml: -------------------------------------------------------------------------------- 1 | name: ch12-metabolic-modeling 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython 9 | - jupyterlab 10 | - matplotlib 11 | - numpy 12 | - pandas 13 | - scipy 14 | - notebook 15 | - seaborn 16 | - ViennaRNA 17 | - requests 18 | - pip 19 | - pip: 20 | - cobra 21 | - python-libsbml-experimental 22 | - sgkit[plink] 23 | - cbgen -------------------------------------------------------------------------------- /Ch18/ch18-single-cell.yml: -------------------------------------------------------------------------------- 1 | name: ch18-single-cell 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib>=3.9 11 | - numpy>=2.1.0 12 | - pandas>=2.2 13 | - scipy>=1.14 14 | - notebook 15 | - shapely 16 | - scanpy 17 | - igraph 18 | - leidenalg 19 | - scikit-image 20 | - seaborn 21 | - scikit-learn 22 | - networkx 23 | -------------------------------------------------------------------------------- /Ch17/ch17-machine-learning.yml: -------------------------------------------------------------------------------- 1 | name: ch17-machine-learning 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.12 8 | - biopython=1.84 9 | - jupyterlab=4.4.0 10 | - matplotlib=3.10.3 11 | - numpy>=2.2.6 # Updated for cyvcf2 compatibility 12 | - pandas=2.2.3 13 | - scipy=1.14.1 14 | - pysam>=0.23.3 # Use >= instead of = for flexibility 15 | - notebook 16 | - pytorch=2.0.1 17 | - seaborn=0.13.2 18 | - transformers=4.30.2 19 | - plotly=6.3.1 20 | - tqdm=4.67.1 21 | - cyvcf2 # Now this should work 22 | -------------------------------------------------------------------------------- /Ch01/Welcome.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # formats: ipynb,py:light 5 | # text_representation: 6 | # extension: .py 7 | # format_name: light 8 | # format_version: '1.5' 9 | # jupytext_version: 1.17.1 10 | # kernelspec: 11 | # display_name: Python 3 (ipykernel) 12 | # language: python 13 | # name: python3 14 | # --- 15 | 16 | # + 17 | # BioInformatics with Python Cookbook - Fourth Edition # 18 | 19 | # + 20 | # Welcome to the book! # 21 | # 1-1 Welcome # 22 | # - 23 | 24 | print("Welcome to the BioInformatics with Python Cookbook Fourth Edition!") 25 | 26 | # + 27 | # Install packages using Conda 28 | # - 29 | 30 | # ! conda install -y biopython==1.84 jupyterlab==4.3.0 matplotlib==3.9.2 numpy==2.1.0 pandas==2.2.3 scipy==1.14.1 31 | 32 | # + 33 | # Install Jupytext 34 | # - 35 | 36 | # ! pip install jupytext 37 | 38 | # + 39 | ## End of Notebook ## 40 | -------------------------------------------------------------------------------- /Ch03/sample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os, sys # Unused imports and multiple imports on one line 4 | 5 | def example_function(a, b): # Missing docstring 6 | if a > b: 7 | print("a is greater than b") # Improper indentation 8 | else: 9 | print("b is greater or equal to a") # Extra indentation 10 | 11 | class ExampleClass: # Missing class docstring 12 | def __init__(self,value): 13 | self.value=value # Missing spaces around '=' operator 14 | self.data = [] # Unused attribute 15 | 16 | def add_data(self, item): # Unused method argument 'item' 17 | pass 18 | 19 | def display(self): 20 | print("Value: ", self.value) # Space before comma is bad style 21 | 22 | # Unused variable and name not in snake_case 23 | BADVariableName = 42 24 | 25 | # Long line exceeding 80 characters 26 | print("This is a really, really, really, really, really, really, really long line of code.") 27 | 28 | example_function(10, 5) # Function call with no meaningful context 29 | 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Ch10/.ipynb_checkpoints/Ch10-1-preparing-dataset-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "56dc5d7f-7c92-42c9-adea-9cc2f1a275c0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch10-1 - Preparing a dataset for phylogenetic analysis" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "be988ce8-4b24-455d-ac96-262d509f6d88", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "fa5178b5-b30a-4a27-b910-cfa4bb5f7bde", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "b5803ed5-b13f-42d7-98ca-adef0206431f", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "1165eb6a-33d1-44ce-a138-e325dd821e5c", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "7fe9b62b-470b-4c1a-ba72-91b886102a64", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "Python 3 (ipykernel)", 57 | "language": "python", 58 | "name": "python3" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": { 62 | "name": "ipython", 63 | "version": 3 64 | }, 65 | "file_extension": ".py", 66 | "mimetype": "text/x-python", 67 | "name": "python", 68 | "nbconvert_exporter": "python", 69 | "pygments_lexer": "ipython3", 70 | "version": "3.11.3" 71 | } 72 | }, 73 | "nbformat": 4, 74 | "nbformat_minor": 5 75 | } 76 | -------------------------------------------------------------------------------- /docker/main/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/anaconda3:latest 2 | 3 | LABEL maintainer="Shane Brubaker " 4 | 5 | # Fix: Remove asterisks from ENV variable 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | 8 | # Install system packages and bioinformatics tools in a single layer 9 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \ 10 | # System packages 11 | git \ 12 | wget \ 13 | curl \ 14 | build-essential \ 15 | unzip \ 16 | graphviz \ 17 | libgraphviz-dev \ 18 | pkg-config \ 19 | swig \ 20 | libx11-dev \ 21 | libgsl0-dev \ 22 | libopenblas-dev \ 23 | liblapacke-dev \ 24 | # Bioinformatics tools 25 | samtools \ 26 | mafft \ 27 | muscle \ 28 | raxml \ 29 | tabix \ 30 | && rm -rf /var/lib/apt/lists/* \ 31 | && apt-get clean 32 | 33 | # Clone the repository 34 | RUN git clone https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-fourth-Edition.git 35 | 36 | # Update conda and configure channels 37 | RUN conda update -n base conda && \ 38 | conda config --add channels conda-forge && \ 39 | conda config --add channels bioconda && \ 40 | conda config --set channel_priority strict 41 | 42 | # Create bioinformatics environment 43 | RUN conda env create -f /Bioinformatics-with-Python-Cookbook-fourth-Edition/Ch01/bioinformatics_base.yml 44 | 45 | # Install additional packages if needed 46 | # RUN conda run -n bioinformatics_base pip install pyarrow==8.0.0 47 | 48 | # Initialize conda for bash and set up environment activation 49 | RUN conda init bash && \ 50 | echo "conda activate bioinformatics_base" >> /root/.bashrc && \ 51 | echo "setterm -foreground magenta" >> /etc/bash.bashrc 52 | 53 | # Create workspace directory 54 | RUN mkdir -p /workspace 55 | 56 | # Set working directory 57 | WORKDIR /Bioinformatics-with-Python-Cookbook-fourth-Edition 58 | 59 | # Create a non-root user for security (optional but recommended) 60 | # RUN useradd -m -s /bin/bash biouser && \ 61 | # chown -R biouser:biouser /Bioinformatics-with-Python-Cookbook-fourth-Edition /workspace 62 | # USER biouser 63 | 64 | # Expose port 65 | EXPOSE 9875 66 | 67 | # Set environment variables for Jupyter 68 | ENV JUPYTER_ENABLE_LAB=yes 69 | ENV JUPYTER_TOKEN="" 70 | 71 | # Start Jupyter Lab with better formatting 72 | CMD ["conda", "run", "--no-capture-output", "-n", "bioinformatics_base", \ 73 | "jupyter-lab", \ 74 | "--ip=0.0.0.0", \ 75 | "--no-browser", \ 76 | "--allow-root", \ 77 | "--port=9875", \ 78 | "--NotebookApp.token=", \ 79 | "--NotebookApp.password="] -------------------------------------------------------------------------------- /Ch03/Ch03-1-pycodestyle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "09c662f9-9479-42fb-9e60-88d839b3db11", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# 3-1 - Linting and Style" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "c9631482-1cb5-49ec-b372-26c7c7081c72", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# pycodestyle usage" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "id": "2df1320f-a791-4033-8739-3ff8cfaebe59", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Load the pycodestyle extention\n", 31 | "%load_ext pycodestyle_magic" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "0d1faf3d-4a2f-4c99-824a-8433eb9c237a", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%%pycodestyle\n", 42 | "import os, sys # Unused imports and multiple imports on one line\n", 43 | "def example_function(a, b): # Missing docstring\n", 44 | " if a > b:\n", 45 | " print(\"a is greater than b\") # Improper indentation\n", 46 | " else:\n", 47 | " print(\"b is greater or equal to a\") # Extra indentation\n", 48 | "# Long line exceeding 80 characters\n", 49 | "print(\n", 50 | " \"This is a really, really, really, really, really, really, really long line of code.\"\n", 51 | ")\n", 52 | "example_function(10, 5) # Function call with no meaningful context" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "ea6cde80-0604-43b0-9908-484fee5e31be", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Above in the output you should see a series of style suggestions" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "1c0cdd0a-df99-4d3e-b5e5-c57d7fe0027c", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "## End of Notebook ##" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3 (ipykernel)", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.11.13" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 5 97 | } 98 | -------------------------------------------------------------------------------- /Ch11/Ch11-3-exploring-with-sgkit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "d1aa5bcd-9b5f-4391-a89f-a6b245f4d8cc", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch11-3-exploring-with-sgkit" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "64c849ae-086f-4208-9f03-032afe7f41a9", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# 1. Import libraries and get data\n", 21 | "import numpy as np \n", 22 | "import xarray as xr \n", 23 | "import sgkit as sg \n", 24 | "from sgkit.io import plink \n", 25 | "data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\\t') " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "id": "fd86462b-0072-4cd2-a342-02d0dca685df", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# 2. Get variant stats\n", 36 | "variant_stats = sg.variant_stats(data) \n", 37 | "variant_stats " 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "07c0a37b-0d57-4026-97d3-0446bb0cb30d", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# 3. Look at variant call rate\n", 48 | "variant_stats.variant_call_rate.to_series().describe() " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "92dc45ca-1b3b-4704-8984-d5f710cde72a", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# 4. Sample statistics\n", 59 | "sample_stats = sg.sample_stats(data) \n", 60 | "sample_stats " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "9a47d767-c735-4d06-aa58-6c79442fa4b0", 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# 5. Sample call rates\n", 71 | "sample_stats.sample_call_rate.to_series().hist() " 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "41b7cb11-bdd8-469b-a262-881cdb5d126b", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "## End of Notebook ##" 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3 (ipykernel)", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.11.14" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 5 106 | } 107 | -------------------------------------------------------------------------------- /Ch15/galaxy/Ch15-1-introducing-galaxy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "816fb516-37d9-49b7-a262-1dab6bdb32a6", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch15-1 Introducing Galaxy" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "7cb3ca76-dbce-4b1e-81d5-6ebc07e49d26", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "## In this exercise you will set up an account on usegalaxy.org and learn the Galaxy interface\n", 21 | "# You will be running the commands below in the Terminal to set up a Docker container for Galaxy\n", 22 | "#. These instructions are notes for you to follow and run in your terminal\n", 23 | "# - no actual Jupyter notebook work will be used here" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "2ef173f6", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "### Docker Installation of Galaxy ###\n", 34 | "# First make sure you have Docker installed\n", 35 | "# Follow the instructions in Chapter 1-2 \"Installing the required software with Docker\"\n", 36 | "# Then Register with Docker here:\n", 37 | "# https://app.docker.com/signup \n", 38 | "# Test that you can log into Docker from the terminal like this: \n", 39 | "# docker login " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "cba11fcf-4910-4828-aa4f-f0901663713a", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Docker Pull of the Galaxy image\n", 50 | "# docker pull bgruening/galaxy-stable " 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "7b13f410-1827-46b7-8b6c-572da46a13d1", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Create a directory to store the data for the Docker container:\n", 61 | "# mkdir /tmp/galaxy_data " 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "da1f5697-b697-4a0e-a5b0-3abeddf48e8c", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# Docker Run\n", 72 | "# docker run -d -p 8080:80 --platform linux/amd64 -v /tmp/galaxy_data:/export --name galaxy bgruening/galaxy-stable " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "id": "d0ac053f-5ef2-4f38-9675-e9f7f06645c6", 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# Check that Galaxy container is running (this will also show you what port it is running on):\n", 83 | "# docker ps " 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "8d41b995-1ed8-45e8-a1ba-406539dcb95e", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "## End of Notebook ##" 94 | ] 95 | } 96 | ], 97 | "metadata": { 98 | "kernelspec": { 99 | "display_name": "Python 3 (ipykernel)", 100 | "language": "python", 101 | "name": "python3" 102 | }, 103 | "language_info": { 104 | "codemirror_mode": { 105 | "name": "ipython", 106 | "version": 3 107 | }, 108 | "file_extension": ".py", 109 | "mimetype": "text/x-python", 110 | "name": "python", 111 | "nbconvert_exporter": "python", 112 | "pygments_lexer": "ipython3", 113 | "version": "3.12.10" 114 | } 115 | }, 116 | "nbformat": 4, 117 | "nbformat_minor": 5 118 | } 119 | -------------------------------------------------------------------------------- /Ch15/snakemake/Snakefile: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from pathlib import Path 4 | 5 | # Configuration 6 | SAMPLES = ["sample1", "sample2", "sample3"] 7 | DATA_DIR = "data" 8 | RESULTS_DIR = "results" 9 | 10 | # Target rule - what we want to produce 11 | rule all: 12 | input: 13 | # FastQC reports (real) 14 | expand(f"{RESULTS_DIR}/fastqc/{{sample}}_R1_fastqc.html", sample=SAMPLES), 15 | expand(f"{RESULTS_DIR}/fastqc/{{sample}}_R2_fastqc.html", sample=SAMPLES), 16 | # Mock outputs 17 | expand(f"{RESULTS_DIR}/alignment/{{sample}}.bam", sample=SAMPLES), 18 | expand(f"{RESULTS_DIR}/variants/{{sample}}.vcf", sample=SAMPLES), 19 | f"{RESULTS_DIR}/multiqc_report.html", 20 | f"{RESULTS_DIR}/pipeline_summary.json" 21 | 22 | # Real FastQC rule 23 | rule fastqc: 24 | input: 25 | fastq=f"{DATA_DIR}/raw/{{sample}}_{{read}}.fastq.gz" 26 | output: 27 | html=f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.html", 28 | zip=f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.zip" 29 | params: 30 | outdir=f"{RESULTS_DIR}/fastqc" 31 | log: 32 | "logs/fastqc_{sample}_{read}.log" 33 | shell: 34 | """ 35 | # Check if fastqc is available, if not use mock 36 | if command -v fastqc >/dev/null 2>&1; then 37 | fastqc {input.fastq} -o {params.outdir} --extract 2> {log} 38 | else 39 | echo "FastQC not found, creating mock output..." > {log} 40 | python scripts/mock_fastqc.py {input.fastq} {params.outdir} {wildcards.sample} {wildcards.read} 41 | fi 42 | """ 43 | 44 | # Mock alignment rule 45 | rule align_reads: 46 | input: 47 | r1=f"{DATA_DIR}/raw/{{sample}}_R1.fastq.gz", 48 | r2=f"{DATA_DIR}/raw/{{sample}}_R2.fastq.gz" 49 | output: 50 | bam=f"{RESULTS_DIR}/alignment/{{sample}}.bam", 51 | bai=f"{RESULTS_DIR}/alignment/{{sample}}.bam.bai" 52 | log: 53 | "logs/align_{sample}.log" 54 | shell: 55 | """ 56 | echo "Mock alignment for {wildcards.sample}" > {log} 57 | python scripts/mock_alignment.py {input.r1} {input.r2} {output.bam} {output.bai} 58 | """ 59 | 60 | # Mock variant calling rule 61 | rule call_variants: 62 | input: 63 | bam=f"{RESULTS_DIR}/alignment/{{sample}}.bam", 64 | bai=f"{RESULTS_DIR}/alignment/{{sample}}.bam.bai" 65 | output: 66 | vcf=f"{RESULTS_DIR}/variants/{{sample}}.vcf" 67 | log: 68 | "logs/variants_{sample}.log" 69 | shell: 70 | """ 71 | echo "Mock variant calling for {wildcards.sample}" > {log} 72 | python scripts/mock_variants.py {input.bam} {output.vcf} 73 | """ 74 | 75 | # Mock MultiQC rule 76 | rule multiqc: 77 | input: 78 | fastqc_reports=expand(f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.zip", 79 | sample=SAMPLES, read=["R1", "R2"]), 80 | bams=expand(f"{RESULTS_DIR}/alignment/{{sample}}.bam", sample=SAMPLES) 81 | output: 82 | report=f"{RESULTS_DIR}/multiqc_report.html" 83 | log: 84 | "logs/multiqc.log" 85 | shell: 86 | """ 87 | echo "Mock MultiQC report generation" > {log} 88 | python scripts/mock_multiqc.py {output.report} 89 | """ 90 | 91 | # Pipeline summary rule 92 | rule pipeline_summary: 93 | input: 94 | vcfs=expand(f"{RESULTS_DIR}/variants/{{sample}}.vcf", sample=SAMPLES), 95 | multiqc=f"{RESULTS_DIR}/multiqc_report.html" 96 | output: 97 | summary=f"{RESULTS_DIR}/pipeline_summary.json" 98 | log: 99 | "logs/summary.log" 100 | shell: 101 | """ 102 | echo "Generating pipeline summary" > {log} 103 | python scripts/generate_summary.py {output.summary} 104 | """ 105 | -------------------------------------------------------------------------------- /Ch03/Ch03-2-sequence-manipulation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "feab660d-2231-4911-ab09-da9d7c6a5397", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch03-2 Sequence Manipulation" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "4d62f32a-abf2-4964-af3c-c9c9ad8fcd35", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Function Definition\n", 21 | "def parse_fasta(file_path):\n", 22 | " \"\"\"\n", 23 | " Parses a FASTA file and returns a dictionary with sequence headers as keys and sequences as values.\n", 24 | " Parameters:\n", 25 | " - file_path (str): Path to the FASTA file.\n", 26 | " Returns:\n", 27 | " - dict: A dictionary where keys are sequence headers and values are sequences.\n", 28 | " \"\"\"\n", 29 | " fasta_dict = {}\n", 30 | " with open(file_path, 'r') as file:\n", 31 | " header = None\n", 32 | " sequence = []\n", 33 | " for line in file:\n", 34 | " line = line.strip()\n", 35 | " if line.startswith(\">\"): # Header line\n", 36 | " if header: # Save the previous sequence\n", 37 | " fasta_dict[header] = ''.join(sequence)\n", 38 | " header = line[1:] # Remove \">\"\n", 39 | " sequence = [] # Reset sequence list\n", 40 | " else:\n", 41 | " sequence.append(line)\n", 42 | " if header: # Save the last sequence\n", 43 | " fasta_dict[header] = ''.join(sequence)\n", 44 | " return fasta_dict" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "id": "16c87949-80c7-409b-8fe4-f9db06031c42", 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "Header: seq1\n", 58 | "Sequence: ATCGTACGATCGGATCGTACGATC\n", 59 | "Header: seq2\n", 60 | "Sequence: CGTAGCTAGCTA\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "# Example usage:\n", 66 | "fasta_file = \"example.fasta\"\n", 67 | "fasta_records = parse_fasta(fasta_file)\n", 68 | "for header, seq in fasta_records.items():\n", 69 | " print(f\"Header: {header}\")\n", 70 | " print(f\"Sequence: {seq}\")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "347b2c76-3fe4-4a90-a31b-cb91f7abc157", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "695b0e93-5357-4d2d-925e-72c70dfafb07", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "a41df267-cab9-4ff8-aee5-5efaa8d83982", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "b238aba9-097d-4aa6-b424-9280c54a1e8d", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "id": "0ee8aae9-1366-4eea-8392-5b8e16a1d84a", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3 (ipykernel)", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.11.3" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 5 135 | } 136 | -------------------------------------------------------------------------------- /Ch07/Ch07-4-genome-assessment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "fca2f045", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch07-4 genome assessment" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "8a7ad84e", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install QUAST\n", 21 | "! pip install quast" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "9500e3dd", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Check that QUAST is installed\n", 32 | "! quast.py --version" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "fecf93af", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Import libraries\n", 43 | "import subprocess" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "00032b5e", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Function to run QUAST on a genome assembly\n", 54 | "def run_quast(assembly_file, reference_file=None, output_dir=\"quast_output\"):\n", 55 | " \"\"\"\n", 56 | " Runs QUAST to assess the quality of a genome assembly.\n", 57 | "\n", 58 | " Parameters:\n", 59 | " assembly_file (str): Path to the assembled genome FASTA file.\n", 60 | " reference_file (str, optional): Path to the reference genome FASTA file. Defaults to None.\n", 61 | " output_dir (str): Directory to save QUAST results. Defaults to \"quast_output\".\n", 62 | "\n", 63 | " Returns:\n", 64 | " None\n", 65 | " \"\"\"\n", 66 | " try:\n", 67 | " command = [\"quast.py\", assembly_file, \"-o\", output_dir]\n", 68 | " if reference_file:\n", 69 | " command.extend([\"-r\", reference_file])\n", 70 | "\n", 71 | " print(f\"Running QUAST...\\nCommand: {' '.join(command)}\")\n", 72 | " subprocess.run(command, check=True)\n", 73 | " print(f\"QUAST analysis complete. Results saved in: {output_dir}\")\n", 74 | " except FileNotFoundError:\n", 75 | " print(\"QUAST is not installed or not found in the system PATH.\")\n", 76 | " except subprocess.CalledProcessError as e:\n", 77 | " print(f\"Error running QUAST: {e}\")\n", 78 | " except Exception as e:\n", 79 | " print(f\"Unexpected error: {e}\")" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "ab030149", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "## Main code to run our QUAST function ##\n", 90 | "if __name__ == \"__main__\":\n", 91 | " assembly = \"output/ecoli-assembly.fasta\"\n", 92 | " reference = None # Set to \"reference.fasta\" if available\n", 93 | " output = \"quast_results\"\n", 94 | " run_quast(assembly, reference, output)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "8a33b21b", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# Open the QUAST report\n", 105 | "! open quast_results/report.html" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "002ea16f", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "## End of Notebook ##" 116 | ] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3 (ipykernel)", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.11.3" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 5 140 | } 141 | -------------------------------------------------------------------------------- /Ch03/Ch03-3-read-alignment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "fe161674-e8b6-41e1-b0a0-ca0fa241b4d4", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch03-3 Read Alignment " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "86f2be45-730f-49c0-8eb2-bf6af68d2456", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Libraries\n", 21 | "from Bio import pairwise2\n", 22 | "from Bio.pairwise2 import format_alignment" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "aabb521e-fa09-4904-92df-05eeef0367ef", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Example using Pairwise2\n", 33 | "# Define the sequences\n", 34 | "seq1 = \"ACGTGCTAGCTAG\"\n", 35 | "seq2 = \"ACGTCGATGCTA\"\n", 36 | "\n", 37 | "# Perform global alignment\n", 38 | "alignments = pairwise2.align.globalxx(seq1, seq2)\n", 39 | "\n", 40 | "# Display the best alignment\n", 41 | "print(\"Best alignment:\")\n", 42 | "print(format_alignment(*alignments[0]))\n", 43 | "\n", 44 | "# Perform local alignment\n", 45 | "local_alignments = pairwise2.align.localxx(seq1, seq2)\n", 46 | "\n", 47 | "# Display the best local alignment\n", 48 | "print(\"\\nBest local alignment:\")\n", 49 | "print(format_alignment(*local_alignments[0]))" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "fbf817f0-ed49-4952-8bcd-04c8ba96812d", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Example with PairwiseAligner" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "f8cd0758-a2b5-4206-9a3f-6439db557f49", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# Libraries\n", 70 | "from Bio.Align import PairwiseAligner" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "b07b39ea-e3e4-4571-a4af-9ddf07516ead", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Define the sequences\n", 81 | "seq1 = \"ACGTGCTAGCTAG\"\n", 82 | "seq2 = \"ACGTCGATGCTA\"\n", 83 | "\n", 84 | "# Initialize the PairwiseAligner\n", 85 | "aligner = PairwiseAligner()\n", 86 | "\n", 87 | "# Set alignment scoring (optional, defaults to match=1, mismatch=0, gap=-1)\n", 88 | "aligner.match_score = 1\n", 89 | "aligner.mismatch_score = -1\n", 90 | "aligner.open_gap_score = -1\n", 91 | "aligner.extend_gap_score = -0.5\n", 92 | "\n", 93 | "# Perform global alignment\n", 94 | "global_alignments = aligner.align(seq1, seq2)\n", 95 | "\n", 96 | "# Display the best global alignment\n", 97 | "print(\"Best global alignment:\")\n", 98 | "print(global_alignments[0])\n", 99 | "print(f\"Score: {global_alignments[0].score}\")\n", 100 | "\n", 101 | "# Perform local alignment\n", 102 | "aligner.mode = 'local' # Switch to local alignment mode\n", 103 | "local_alignments = aligner.align(seq1, seq2)\n", 104 | "\n", 105 | "# Display the best local alignment\n", 106 | "print(\"\\nBest local alignment:\")\n", 107 | "print(local_alignments[0])\n", 108 | "print(f\"Score: {local_alignments[0].score}\")\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "5d6b9d0f-a0a0-4f8e-a65f-9efffc636abe", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "c325b1fb-c3f5-44f9-b87d-fef313281353", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "166103c3-5096-4449-830c-ab49988a0981", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [] 134 | } 135 | ], 136 | "metadata": { 137 | "kernelspec": { 138 | "display_name": "Python 3 (ipykernel)", 139 | "language": "python", 140 | "name": "python3" 141 | }, 142 | "language_info": { 143 | "codemirror_mode": { 144 | "name": "ipython", 145 | "version": 3 146 | }, 147 | "file_extension": ".py", 148 | "mimetype": "text/x-python", 149 | "name": "python", 150 | "nbconvert_exporter": "python", 151 | "pygments_lexer": "ipython3", 152 | "version": "3.11.3" 153 | } 154 | }, 155 | "nbformat": 4, 156 | "nbformat_minor": 5 157 | } 158 | -------------------------------------------------------------------------------- /Ch10/Ch10-6-visualizing-phylogenetics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "ea8ee1c6-9f19-4571-8242-eeae2df0c867", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch10-6 - Visualizing Phylogenetic data [Updated to use raxml-ng]" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "8bbae47e-f0a7-45b5-b024-41656fab9414", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# 30 & 31. Load phylogenetic data and Draw Trees\n", 21 | "from copy import deepcopy\n", 22 | "from Bio import Phylo\n", 23 | "# Define the correct RAxML-NG output files\n", 24 | "best_tree_file = \"ebola_tree.raxml.bestTreeCollapsed\" # Best ML tree\n", 25 | "# Read the best ML tree\n", 26 | "ebola_tree = Phylo.read(best_tree_file, \"newick\")\n", 27 | "ebola_tree.name = \"Ebolavirus Tree\"\n", 28 | "# Print tree structures for verification\n", 29 | "Phylo.draw_ascii(ebola_tree)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "a26ed965-d007-4bf9-a045-9a1683fc29cf", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# 32. Bio.Phylo [Updated]\n", 40 | "import matplotlib.pyplot as plt\n", 41 | "from Bio import Phylo\n", 42 | "# Define the RAxML-NG output file\n", 43 | "simplified_tree_file = \"ebola_tree.raxml.bestTreeCollapsed\" # Previous output of raxml-ng\n", 44 | "# Read the tree (RAxML-NG outputs trees in Newick format)\n", 45 | "ebola_simple_tree = Phylo.read(simplified_tree_file, \"newick\")\n", 46 | "# Create a figure and axis\n", 47 | "fig = plt.figure(figsize=(16, 22))\n", 48 | "ax = fig.add_subplot(111)\n", 49 | "# Function to conditionally label branches\n", 50 | "def label_branches(clade):\n", 51 | " if clade.branch_length and clade.branch_length > 0.02:\n", 52 | " return f\"{clade.branch_length:.3f}\" # Format to 3 decimal places\n", 53 | " return None\n", 54 | "# Draw the tree with branch labels\n", 55 | "Phylo.draw(ebola_simple_tree, branch_labels=label_branches, axes=ax)\n", 56 | "# Show the plot\n", 57 | "plt.show()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "81667248-d700-451b-aac9-e103a1311c31", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# 33. Plot the complete dataset\n", 68 | "fig = plt.figure(figsize=(16, 22)) \n", 69 | "ax = fig.add_subplot(111) \n", 70 | "from collections import OrderedDict \n", 71 | "my_colors = OrderedDict({ \n", 72 | "'EBOV_2014': 'red', \n", 73 | "'EBOV': 'magenta', \n", 74 | "'BDBV': 'cyan', \n", 75 | "'SUDV': 'blue', \n", 76 | "'RESTV' : 'green', \n", 77 | "'TAFV' : 'yellow' \n", 78 | "}) \n", 79 | "\n", 80 | "def get_color(name): \n", 81 | " for pref, color in my_colors.items(): \n", 82 | " if name.find(pref) > -1: \n", 83 | " return color \n", 84 | " return 'grey' \n", 85 | "\n", 86 | "def color_tree(node, fun_color=get_color): \n", 87 | " if node.is_terminal(): \n", 88 | " node.color = fun_color(node.name) \n", 89 | " else: \n", 90 | " my_children = set() \n", 91 | " for child in node.clades: \n", 92 | " color_tree(child, fun_color) \n", 93 | " my_children.add(child.color.to_hex()) \n", 94 | " if len(my_children) == 1: \n", 95 | " node.color = child.color \n", 96 | " else: \n", 97 | " node.color = 'grey' \n", 98 | "\n", 99 | "ebola_color_tree = deepcopy(ebola_tree) \n", 100 | "color_tree(ebola_color_tree.root) \n", 101 | "Phylo.draw(ebola_color_tree, axes=ax, label_func=lambda x: x.name.split(' ')[0][1:] if x.name is not None else None) " 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "f7b5e7f2-2e4c-4dc9-a982-579a9371f36b", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "## End of Notebook ##" 112 | ] 113 | } 114 | ], 115 | "metadata": { 116 | "kernelspec": { 117 | "display_name": "Python 3 (ipykernel)", 118 | "language": "python", 119 | "name": "python3" 120 | }, 121 | "language_info": { 122 | "codemirror_mode": { 123 | "name": "ipython", 124 | "version": 3 125 | }, 126 | "file_extension": ".py", 127 | "mimetype": "text/x-python", 128 | "name": "python", 129 | "nbconvert_exporter": "python", 130 | "pygments_lexer": "ipython3", 131 | "version": "3.12.10" 132 | } 133 | }, 134 | "nbformat": 4, 135 | "nbformat_minor": 5 136 | } 137 | -------------------------------------------------------------------------------- /Ch03/pycodestyle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "86c669e3-1d83-459f-8a43-321dda3b7071", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Requirement already satisfied: flake8 in /opt/anaconda3/lib/python3.12/site-packages (7.0.0)\n", 14 | "Requirement already satisfied: mccabe<0.8.0,>=0.7.0 in /opt/anaconda3/lib/python3.12/site-packages (from flake8) (0.7.0)\n", 15 | "Requirement already satisfied: pycodestyle<2.12.0,>=2.11.0 in /opt/anaconda3/lib/python3.12/site-packages (from flake8) (2.11.1)\n", 16 | "Requirement already satisfied: pyflakes<3.3.0,>=3.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from flake8) (3.2.0)\n", 17 | "Requirement already satisfied: pycodestyle in /opt/anaconda3/lib/python3.12/site-packages (2.11.1)\n", 18 | "Requirement already satisfied: pycodestyle_magic in /opt/anaconda3/lib/python3.12/site-packages (0.5)\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "!pip install flake8\n", 24 | "!pip install pycodestyle \n", 25 | "!pip install pycodestyle_magic" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 11, 31 | "id": "186d6989-aed9-4cc0-b01e-c225ae3e1ab5", 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "The pycodestyle_magic extension is already loaded. To reload it, use:\n", 39 | " %reload_ext pycodestyle_magic\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "%load_ext pycodestyle_magic" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 12, 50 | "id": "beba9f29-84ad-4b96-a694-24efe2826bc4", 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stderr", 55 | "output_type": "stream", 56 | "text": [ 57 | "2:25: W291 trailing whitespace\n", 58 | "3:7: E225 missing whitespace around operator\n", 59 | "3:10: W291 trailing whitespace\n", 60 | "4:1: W391 blank line at end of file\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "%%pycodestyle \n", 66 | "# 3.1.2 - Example Code A \n", 67 | "my_var=10 " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 14, 73 | "id": "2c42823c-29c5-4960-bcc8-894468d70219", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stderr", 78 | "output_type": "stream", 79 | "text": [ 80 | "2:10: E401 multiple imports on one line\n", 81 | "3:1: E302 expected 2 blank lines, found 0\n", 82 | "5:7: E111 indentation is not a multiple of 4\n", 83 | "9:1: E305 expected 2 blank lines after class or function definition, found 0\n", 84 | "9:80: E501 line too long (92 > 79 characters)\n", 85 | "12:1: W391 blank line at end of file\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "%%pycodestyle \n", 91 | "import os, sys # Unused imports and multiple imports on one line\n", 92 | "def example_function(a, b): # Missing docstring\n", 93 | " if a > b:\n", 94 | " print(\"a is greater than b\") # Improper indentation\n", 95 | " else:\n", 96 | " print(\"b is greater or equal to a\") # Extra indentation\n", 97 | "# Long line exceeding 80 characters\n", 98 | "print(\"This is a really, really, really, really, really, really, really long line of code.\")\n", 99 | "\n", 100 | "example_function(10, 5) # Function call with no meaningful context\n" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "c5c1e0dc-3037-4296-8a8e-6a0450265359", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "76b8fde1-79b6-476d-8611-7e627cc8d646", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "id": "b58b633b-4ecb-4a51-94a9-3a290a5b758e", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3 (ipykernel)", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.12.2" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 5 149 | } 150 | -------------------------------------------------------------------------------- /Ch10/Ch10-4-reconstructing-trees.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "35ae8cd1-cffe-4704-850a-92366b2eb71e", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch10-4 - Reconstructing Phylogenetic Trees [Updated to use raxml-ng]" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "f8aa4582-f223-4cb1-a1b5-e8627660e512", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install raxml-ng\n", 21 | "! brew install raxml-ng" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "1d77eaff-414f-4aad-bfc6-6d1c1149046e", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# 18. Use DendroPy to reconstruct genus dataset (takes 15-30 minutes)\n", 32 | "import os\n", 33 | "import subprocess\n", 34 | "# Define input and output paths\n", 35 | "data_path = \"trim.fasta\"\n", 36 | "output_prefix = \"ebola_tree\"\n", 37 | "# Check if the input file exists\n", 38 | "if not os.path.exists(data_path):\n", 39 | " raise FileNotFoundError(f\"Error: The file {data_path} does not exist!\")\n", 40 | "# Define the RAxML-NG command\n", 41 | "cmd = [\n", 42 | " \"raxml-ng\",\n", 43 | " \"--msa\", data_path, # Input sequence alignment\n", 44 | " \"--model\", \"GTR+G\", # Substitution model\n", 45 | " \"--prefix\", output_prefix, # Output file prefix\n", 46 | " \"--search\", # Perform Maximum Likelihood tree search\n", 47 | "]\n", 48 | "# Run RAxML-NG\n", 49 | "try:\n", 50 | " subprocess.run(cmd, check=True)\n", 51 | " print(f\"RAxML-NG completed successfully. Output files are saved with prefix '{output_prefix}'\")\n", 52 | "except subprocess.CalledProcessError as e:\n", 53 | " print(f\"Error running RAxML-NG: {e}\")\n", 54 | "# Optional: Clean up RAxML-NG temporary files\n", 55 | "for ext in [\".raxml.log\", \".raxml.bestTree\", \".raxml.rba\", \".raxml.rfdist\"]:\n", 56 | " file_path = f\"{output_prefix}{ext}\"\n", 57 | " if os.path.exists(file_path):\n", 58 | " os.remove(file_path)\n", 59 | "print(\"Temporary files cleaned up.\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "0b212241-685f-41c2-9d61-c6fd7345f36e", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# You will see ebola_tree.raxml* files in the working directory " 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "4b569640-903e-4867-a2ef-a7df4a607fcb", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# 19. Save the files - skip this because we already have the files" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "3c1d688c-d48d-4663-a45a-02585abd7b44", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# 20. Visualize the genus tree\n", 90 | "import matplotlib.pyplot as plt\n", 91 | "from Bio import Phylo\n", 92 | "\n", 93 | "# Define the correct output tree file from RAxML-NG\n", 94 | "tree_file = \"ebola_tree.raxml.bestTreeCollapsed\" # Based on the raxml-ng output from the previous step\n", 95 | "\n", 96 | "# Read the tree in Newick format (RAxML-NG default)\n", 97 | "my_ebola_tree = Phylo.read(tree_file, \"newick\")\n", 98 | "\n", 99 | "# Set a name for the tree\n", 100 | "my_ebola_tree.name = \"Our Ebolavirus Tree\"\n", 101 | "\n", 102 | "# Plot the tree\n", 103 | "fig = plt.figure(figsize=(16, 18))\n", 104 | "ax = fig.add_subplot(1, 1, 1)\n", 105 | "Phylo.draw(my_ebola_tree, axes=ax)\n", 106 | "\n", 107 | "plt.show() # Display the tree" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "da4121ba-afdd-4817-b4ae-495630628c3c", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# 21 onward Skipped" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "7d166c03-eb19-4eb0-bb44-58fc5627b0c4", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "## End of Notebook ##" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3 (ipykernel)", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.12.10" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 5 152 | } 153 | -------------------------------------------------------------------------------- /Ch08/Ch08-1-genbank-ncbi.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "babd109e", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch08-1 Accessing Genbank and Navigating the NCBI" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "7be76015", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# 1. Import modules and configure email\n", 21 | "from Bio import Entrez, SeqIO \n", 22 | "Entrez.email = 'put@your.email.here' \n", 23 | "# 2. Make output dir\n", 24 | "! mkdir -p output" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "facc0855", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# 3. Find the CRT gene of Plasmodium\n", 35 | "handle = Entrez.esearch(db='nucleotide', term='CRT[Gene Name] AND \"Plasmodium falciparum\"[Organism]') \n", 36 | "rec_list = Entrez.read(handle) \n", 37 | "if int(rec_list['RetMax']) < int(rec_list['Count']): \n", 38 | " handle = Entrez.esearch(db='nucleotide', term='CRT[Gene Name] AND \"Plasmodium falciparum\"[Organism]', \n", 39 | " retmax=rec_list['Count'])\n", 40 | " rec_list = Entrez.read(handle) " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "f7c260aa", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# 4. Retrieve records\n", 51 | "id_list = rec_list['IdList'] \n", 52 | "hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb') " 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "78617567", 59 | "metadata": { 60 | "scrolled": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "# 5. Read and parse results\n", 65 | "recs = list(SeqIO.parse(hdl, 'gb')) " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "944f99cb", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# 6. Review the record\n", 76 | "for rec in recs: \n", 77 | " if rec.name == 'KM288867':\n", 78 | " break \n", 79 | "print(rec.name) \n", 80 | "print(rec.description) " 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "7dd46b0d", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# 7. Extract features\n", 91 | "for feature in rec.features: \n", 92 | " if feature.type == 'gene':\n", 93 | " print(feature.qualifiers['gene']) \n", 94 | " elif feature.type == 'exon': \n", 95 | " loc = feature.location \n", 96 | " print(loc.start, loc.end, loc.strand) \n", 97 | " else: \n", 98 | " print('not processed:\\n%s' % feature) " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "a3f5356f", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# 8. Review annotations\n", 109 | "for name, value in rec.annotations.items(): \n", 110 | " print('%s=%s' % (name, value)) " 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "fb87731a", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# 9. Access the Sequence\n", 121 | "print(len(rec.seq))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "00dcb7a3", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# 10. There's More... (Pubmed search)\n", 132 | "from Bio import Medline \n", 133 | "refs = rec.annotations['references'] \n", 134 | "for ref in refs: \n", 135 | " if ref.pubmed_id != '': \n", 136 | " print(ref.pubmed_id) \n", 137 | " handle = Entrez.efetch(db='pubmed', id=[ref.pubmed_id], rettype='medline', retmode='text') \n", 138 | " records = Medline.parse(handle) \n", 139 | " for med_rec in records: \n", 140 | " for k, v in med_rec.items(): \n", 141 | " print('%s: %s' % (k, v)) " 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "19e2f484", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "## End of Notebook ##" 152 | ] 153 | } 154 | ], 155 | "metadata": { 156 | "kernelspec": { 157 | "display_name": "Python 3 (ipykernel)", 158 | "language": "python", 159 | "name": "python3" 160 | }, 161 | "language_info": { 162 | "codemirror_mode": { 163 | "name": "ipython", 164 | "version": 3 165 | }, 166 | "file_extension": ".py", 167 | "mimetype": "text/x-python", 168 | "name": "python", 169 | "nbconvert_exporter": "python", 170 | "pygments_lexer": "ipython3", 171 | "version": "3.11.3" 172 | } 173 | }, 174 | "nbformat": 4, 175 | "nbformat_minor": 5 176 | } 177 | -------------------------------------------------------------------------------- /Ch09/Ch09-5-proteomics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "03576b95-0a35-4fbb-b0ec-7f451c854b86", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch09-5 - Proteomics Analysis" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "95cb6809-e14b-446e-8c48-75b66f789bff", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "## Install Packages ##\n", 21 | "! pip install biopython matplotlib pandas seaborn pyteomics" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "52e3d25f-8449-4c99-970d-23b36c60a465", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Import Libraries \n", 32 | "import pandas as pd\n", 33 | "import matplotlib.pyplot as plt\n", 34 | "import seaborn as sns\n", 35 | "from Bio.SeqUtils.ProtParam import ProteinAnalysis\n", 36 | "from pyteomics import parser, mass" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "fe53ba18-a22f-4154-b80f-620aa3c2254c", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Define a sample protein sequence (P53 - TP53 Tumor Suppressor)\n", 47 | "protein_sequence = (\n", 48 | " \"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPG\"\n", 49 | " \"PDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT\"\n", 50 | " \"GFVKVGQSTSRHKKLMFKTEGPDSD\"\n", 51 | ")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "ae2732b2-b6a1-492d-8887-415a498e7bd2", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Analyze protein properties\n", 62 | "protein = ProteinAnalysis(protein_sequence)\n", 63 | "molecular_weight = protein.molecular_weight()\n", 64 | "hydrophobicity = protein.gravy()\n", 65 | "isoelectric_point = protein.isoelectric_point()\n", 66 | "amino_acid_composition = protein.count_amino_acids()\n", 67 | "\n", 68 | "print(f\"Protein Molecular Weight: {molecular_weight:.2f} Da\")\n", 69 | "print(f\"Protein Hydrophobicity (GRAVY): {hydrophobicity:.2f}\")\n", 70 | "print(f\"Protein Isoelectric Point (pI): {isoelectric_point:.2f}\")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "962694fa-5081-4aa6-9d13-c829f914cc76", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Perform in-silico trypsin digestion\n", 81 | "peptides = sorted(list(parser.cleave(protein_sequence, parser.expasy_rules['trypsin']))) # Convert set to list" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "bdd68dd6-774e-4d65-972c-35c9778df3be", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Calculate peptide masses\n", 92 | "peptide_masses = [mass.calculate_mass(sequence=p) for p in peptides]" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "80d4c748-df80-4664-9187-00a673fcb1aa", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Create DataFrame for peptide properties\n", 103 | "df = pd.DataFrame({'Peptide': peptides, 'Mass (Da)': peptide_masses})\n", 104 | "\n", 105 | "# Filter out very small peptides (e.g., <500 Da)\n", 106 | "df = df[df['Mass (Da)'] > 500]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "ca73d09f-7673-4226-8b90-ba6e3210cb3f", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Display top 10 peptides\n", 117 | "print(\"\\nTop 10 Peptides:\")\n", 118 | "print(df.head(10))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "b8d288a6-6533-4220-bcbd-ce49e94c022b", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Visualization: Peptide mass distribution\n", 129 | "plt.figure(figsize=(10, 5))\n", 130 | "sns.histplot(df['Mass (Da)'], bins=30, kde=True, color=\"blue\")\n", 131 | "plt.xlabel(\"Peptide Mass (Da)\")\n", 132 | "plt.ylabel(\"Frequency\")\n", 133 | "plt.title(\"Peptide Mass Distribution (Trypsin Digest)\")\n", 134 | "plt.show()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "58a56f8d-57b3-4ff1-8bf1-f88c2b32c04f", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "## End of Notebook ##" 145 | ] 146 | } 147 | ], 148 | "metadata": { 149 | "kernelspec": { 150 | "display_name": "Python 3 (ipykernel)", 151 | "language": "python", 152 | "name": "python3" 153 | }, 154 | "language_info": { 155 | "codemirror_mode": { 156 | "name": "ipython", 157 | "version": 3 158 | }, 159 | "file_extension": ".py", 160 | "mimetype": "text/x-python", 161 | "name": "python", 162 | "nbconvert_exporter": "python", 163 | "pygments_lexer": "ipython3", 164 | "version": "3.11.3" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 5 169 | } 170 | -------------------------------------------------------------------------------- /Ch07/Ch07-3-long-read-assembly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "1d3b4ccb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch07-3 - Long Read Assembly" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "9210885f", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Get nanopore reads for E Coli (Loman lab)\n", 21 | "! wget https://nanopore.s3.climb.ac.uk/MAP006-1_2D_pass.fasta\n", 22 | "! mv MAP006-1_2D_pass.fasta data/" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "7d11b44f", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Install Raven\n", 33 | "! git clone https://github.com/lbcb-sci/raven.git && cd raven\n", 34 | "! cmake -S ./ -B./build -DRAVEN_BUILD_EXE=1 -DCMAKE_BUILD_TYPE=Release\n", 35 | "!cmake --build build" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "c5e0a7dc", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# To put Raven in your path\n", 46 | "! export PATH=$PATH:~/work/CookBook/Ch07/raven/build/bin" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "16e6690e", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# To add Raven to your .zschrc\n", 57 | "! echo 'export PATH=$PATH:~/work/CookBook/Ch07/raven/build/bin' >> ~/.zshrc\n", 58 | "! source ~/.zshrc" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "9a8244ca", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# Check if raven is working (you may need to restart your Jupyter notebook)\n", 69 | "! raven --help" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 1, 75 | "id": "608bdf6b-0e59-4780-8eba-0310b42fa548", 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# Import Libraries\n", 80 | "import subprocess" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "f6971be9", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# Run Raven assembler\n", 91 | "def run_raven(input_fasta, output_fasta):\n", 92 | " \"\"\"\n", 93 | " Runs Raven to assemble a genome from an input FASTA file.\n", 94 | "\n", 95 | " Parameters:\n", 96 | " input_fasta (str): Path to the input FASTA file containing long reads.\n", 97 | " output_fasta (str): Path to save the assembled genome in FASTA format.\n", 98 | "\n", 99 | " Returns:\n", 100 | " None\n", 101 | " \"\"\"\n", 102 | " try:\n", 103 | " print(f\"Running Raven on {input_fasta}...\")\n", 104 | " # Use stdout redirection for Raven output\n", 105 | " with open(output_fasta, \"w\") as output_file:\n", 106 | " command = [\"raven\", input_fasta]\n", 107 | " subprocess.run(command, stdout=output_file, check=True)\n", 108 | " print(f\"Assembly completed. Output saved to {output_fasta}\")\n", 109 | " except FileNotFoundError:\n", 110 | " print(\"Error: Raven is not installed or not found in the system PATH.\")\n", 111 | " except subprocess.CalledProcessError as e:\n", 112 | " print(f\"Error running Raven: {e}\")\n", 113 | " except Exception as e:\n", 114 | " print(f\"Unexpected error: {e}\")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "e754c1e6", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Run the Raven function on our long read input data\n", 125 | "if __name__ == \"__main__\":\n", 126 | " input_fasta = \"data/MAP006-1_2D_pass.fasta\"\n", 127 | " output_fasta = \"assembly.fasta\"\n", 128 | " run_raven(input_fasta, output_fasta)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "68ccff06", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# Move the assembly fasta file to the output directory\n", 139 | "! mkdir -p output\n", 140 | "! mv assembly.fasta output/ecoli-assembly.fasta" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "51b3ff98", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "## End of Noteboook ##" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3 (ipykernel)", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.11.3" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 5 175 | } 176 | -------------------------------------------------------------------------------- /Ch11/Ch11-2-using-sgkit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "b2b305b9-9ca7-47b6-a9e3-99628fae7a9a", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch11-2-using-sgkit" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "2e374b2d-e950-4706-9616-fe2a91569257", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install libraries\n", 21 | "! pip install sgkit\n", 22 | "! pip install 'sgkit[plink]'" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "89d77eba-d7cd-4c7e-b719-d0e89d9e1952", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# You may experience compatibility issues with pyarrow - if so uninstall it and update it\n", 33 | "! pip uninstall pyarrow -y\n", 34 | "! pip install pyarrow --no-cache-dir\n", 35 | "# Or try (from Terminal):\n", 36 | "# Uninstall from BOTH\n", 37 | "# conda uninstall pyarrow -y\n", 38 | "# pip uninstall pyarrow -y\n", 39 | "# Reinstall with conda ONLY\n", 40 | "# conda install -c conda-forge pyarrow -y" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "d6ef6bdb-7b0c-4862-abdc-d49936149a76", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# You may also experience Dask incompatibilities; If so try:\n", 51 | "! pip install --upgrade dask-ml\n", 52 | "! pip install --upgrade dask" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "49fa98d7-b20c-4b7c-8059-b41731966c35", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# 1. Load dataset from previous recipe\n", 63 | "import numpy as np \n", 64 | "from sgkit.io import plink \n", 65 | "data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\\t') " 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "8882ef96-e1a2-41b7-9796-c8bce3d6d8ad", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# 2. Check the structure of the data\n", 76 | "data " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "d898cb18-c020-49dd-896f-9c8c4bd9e629", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# 3. Get summary information\n", 87 | "print(data.dims) " 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "11d2e4fc-a090-4794-b2fb-b5272592dc86", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# 4. Extract information about samples\n", 98 | "print(len(data.sample_id.values)) \n", 99 | "print(data.sample_id.values) \n", 100 | "print(data.sample_family_id.values) \n", 101 | "print(data.sample_sex.values) " 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "id": "e9d1cc35-d74a-4e34-a716-d578fc9fc7f7", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# 5. Review contigs\n", 112 | "print(data.contigs) " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "a9e73ded-57b6-4dd8-be30-b0235c264af8", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# 6. Look at the variants\n", 123 | "print(len(data.variant_contig.values)) \n", 124 | "print(data.variant_contig.values) \n", 125 | "print(data.variant_position.values) \n", 126 | "print(data.variant_allele.values) \n", 127 | "print(data.variant_id.values) " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "9edce820-55b9-4355-81ce-b6dd2b428023", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# 7. Look at Genotype data\n", 138 | "call_genotype = data.call_genotype.values \n", 139 | "print(call_genotype.shape) \n", 140 | "first_individual = call_genotype[:,0,:] \n", 141 | "first_variant = call_genotype[0,:,:] \n", 142 | "first_variant_of_first_individual = call_genotype[0,0,:] \n", 143 | "print(first_variant_of_first_individual) \n", 144 | "print(data.sample_family_id.values[0], data.sample_id.values[0]) \n", 145 | "print(data.variant_allele.values[0]) " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "540f6197-efa9-4ae0-a9ab-b27aa50778ef", 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "## End of Notebook ##" 156 | ] 157 | } 158 | ], 159 | "metadata": { 160 | "kernelspec": { 161 | "display_name": "Python 3 (ipykernel)", 162 | "language": "python", 163 | "name": "python3" 164 | }, 165 | "language_info": { 166 | "codemirror_mode": { 167 | "name": "ipython", 168 | "version": 3 169 | }, 170 | "file_extension": ".py", 171 | "mimetype": "text/x-python", 172 | "name": "python", 173 | "nbconvert_exporter": "python", 174 | "pygments_lexer": "ipython3", 175 | "version": "3.11.14" 176 | } 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 5 180 | } 181 | -------------------------------------------------------------------------------- /Ch02/Ch02-3-pandas-memory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "fbeff717", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch02-3 Pandas Memory" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "71c9ea98-4713-4e8d-9a39-8fa2b764e200", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "########################## Notes & Updates ##############################################################\n", 21 | "# If you are using Docker and your data directory is mapped to \"/data\" then you can use the commented-out\n", 22 | "# Docker lines below in place of the primary line (which you will comment out when running)\n", 23 | "# You will also find other alternative lines or blocks that can be used to avoid potential issues \n", 24 | "#########################################################################################################" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "1e62c334", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Libraries \n", 35 | "import numpy as np \n", 36 | "import pandas as pd " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "dbf171c1", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Load data\n", 47 | "vdata = pd.read_csv(\"data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\", low_memory=False) \n", 48 | "# vdata = pd.read_csv(\"/data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\", low_memory=False) # Docker\n", 49 | "vdata.info(memory_usage=\"deep\") " 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "5258bc2e", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Inspect the size of each column\n", 60 | "for name in vdata.columns:\n", 61 | " col_bytes = vdata[name].memory_usage(index=False, deep=True) \n", 62 | " col_type = vdata[name].dtype\n", 63 | " print(name, col_type, col_bytes // (1024 ** 2)) " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "75883fe5", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Review the Died column\n", 74 | "vdata.DIED.memory_usage(index=False, deep=True) \n", 75 | "vdata.DIED.fillna(False).astype(bool).memory_usage(index=False, deep=True) " 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "551e9be4", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# State column\n", 86 | "vdata[\"STATE\"] = vdata.STATE.str.upper() \n", 87 | "states = list(vdata[\"STATE\"].unique()) \n", 88 | "vdata[\"encoded_state\"] = vdata.STATE.apply(lambda state: states.index(state)) \n", 89 | "vdata[\"encoded_state\"] = vdata[\"encoded_state\"].astype(np.uint8) \n", 90 | "vdata[\"STATE\"].memory_usage(index=False, deep=True) \n", 91 | "vdata[\"encoded_state\"].memory_usage(index=False, deep=True) " 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "26d93734", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# Apply optimizations while loading the data\n", 102 | "states = list(pd.read_csv(\"vdata_sample.csv.gz\",\n", 103 | " converters={\"STATE\": lambda state: state.upper()}, \n", 104 | " usecols=[\"STATE\"] \n", 105 | ")[\"STATE\"].unique()) " 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "bd132f8b", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "# Skip the symptom_text column\n", 116 | "vdata = pd.read_csv(\"vdata_sample.csv.gz\", index_col=\"VAERS_ID\",\n", 117 | " converters={\n", 118 | " \"DIED\": lambda died: died == \"Y\", \"STATE\": lambda state: states.index(state.upper())\n", 119 | " }, usecols=lambda name: name != \"SYMPTOM_TEXT\")\n", 120 | "vdata[\"STATE\"] = vdata[\"STATE\"].astype(np.uint8)\n", 121 | "vdata.info(memory_usage=\"deep\") " 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "28e8a98c-39d0-407f-a3ca-aa37b12bcad0", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# Note - it is ok to get a dtype warning on the above" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "a6e2cac3", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# End of Notebook #" 142 | ] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3 (ipykernel)", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.12.11" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 5 166 | } 167 | -------------------------------------------------------------------------------- /Ch03/Ch03-4-test-writing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0679a213-c3a1-443b-bb61-98b5c7dd980c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch03-4 Test Writing" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 9, 16 | "id": "09272938-f7b9-44f8-8d02-a62b654032b1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Libraries\n", 21 | "import unittest\n", 22 | "from Bio.Align import PairwiseAligner" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 10, 28 | "id": "95af1088-176c-404e-b1bd-ef19e7fbd11d", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "FF\n", 36 | "======================================================================\n", 37 | "FAIL: test_global_alignment (__main__.TestPairwiseAligner.test_global_alignment)\n", 38 | "Test global alignment.\n", 39 | "----------------------------------------------------------------------\n", 40 | "Traceback (most recent call last):\n", 41 | " File \"/var/folders/53/kmyyy3057lndfb0bpwx_2pkr0000gn/T/ipykernel_37172/1953800863.py\", line 24, in test_global_alignment\n", 42 | " self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n", 43 | "AssertionError: 6.0 != 9.0 within 1 places (3.0 difference)\n", 44 | "\n", 45 | "======================================================================\n", 46 | "FAIL: test_local_alignment (__main__.TestPairwiseAligner.test_local_alignment)\n", 47 | "Test local alignment.\n", 48 | "----------------------------------------------------------------------\n", 49 | "Traceback (most recent call last):\n", 50 | " File \"/var/folders/53/kmyyy3057lndfb0bpwx_2pkr0000gn/T/ipykernel_37172/1953800863.py\", line 39, in test_local_alignment\n", 51 | " self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n", 52 | "AssertionError: 7.0 != 6.0 within 1 places (1.0 difference)\n", 53 | "\n", 54 | "----------------------------------------------------------------------\n", 55 | "Ran 2 tests in 0.005s\n", 56 | "\n", 57 | "FAILED (failures=2)\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "# Define the test Class\n", 63 | "class TestPairwiseAligner(unittest.TestCase):\n", 64 | "\n", 65 | " def setUp(self):\n", 66 | " \"\"\"Set up test cases with sequences and aligner.\"\"\"\n", 67 | " self.seq1 = \"ACGTGCTAGCTAG\"\n", 68 | " self.seq2 = \"ACGTCGATGCTA\"\n", 69 | " self.aligner = PairwiseAligner()\n", 70 | " self.aligner.match_score = 1\n", 71 | " self.aligner.mismatch_score = -1\n", 72 | " self.aligner.open_gap_score = -1\n", 73 | " self.aligner.extend_gap_score = -0.5\n", 74 | "\n", 75 | " def test_global_alignment(self):\n", 76 | " \"\"\"Test global alignment.\"\"\"\n", 77 | " alignments = self.aligner.align(self.seq1, self.seq2)\n", 78 | " best_alignment = alignments[0]\n", 79 | " \n", 80 | " # Expected results\n", 81 | " expected_score = 9.0\n", 82 | " expected_target = \"ACGTGCTAGCTAG\"\n", 83 | " expected_query = \"ACGTCGATGCTA-\"\n", 84 | " \n", 85 | " self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n", 86 | " self.assertEqual(str(best_alignment).splitlines()[0], \"target 0 ACGTGCTAGCTAG 13\")\n", 87 | " self.assertEqual(str(best_alignment).splitlines()[2], \"query 0 ACGTCGATGCTA- 12\")\n", 88 | "\n", 89 | " def test_local_alignment(self):\n", 90 | " \"\"\"Test local alignment.\"\"\"\n", 91 | " self.aligner.mode = 'local'\n", 92 | " alignments = self.aligner.align(self.seq1, self.seq2)\n", 93 | " best_alignment = alignments[0]\n", 94 | " \n", 95 | " # Expected results\n", 96 | " expected_score = 6.0\n", 97 | " expected_target = \"GTGCTAG\"\n", 98 | " expected_query = \"GTCGATG\"\n", 99 | " \n", 100 | " self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n", 101 | " self.assertIn(expected_target, str(best_alignment))\n", 102 | " self.assertIn(expected_query, str(best_alignment))\n", 103 | "\n", 104 | "# Run the tests\n", 105 | "if __name__ == \"__main__\":\n", 106 | " unittest.main(argv=[''], exit=False)\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "6793ddb1-4ca7-4915-b9a1-f2e81d939e8f", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3 (ipykernel)", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.11.3" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 5 139 | } 140 | -------------------------------------------------------------------------------- /Ch07/Ch07-1-genomes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "a2023924", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch07-1-genomes" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "c614152d", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Download T2T reference genome\n", 21 | "! mkdir -p data\n", 22 | "! wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz\n", 23 | "! gunzip GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz\n", 24 | "! mv GCA_009914755.4_T2T-CHM13v2.0_genomic.fna data/T2T_genome.fasta" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "4d8c6315", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# Install pyfastx\n", 35 | "! pip install pyfastx" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 6, 41 | "id": "a7b566a0", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# Import libraries\n", 46 | "import pyfastx" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 1, 52 | "id": "f426a4be", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Input genome file\n", 57 | "genome_fasta = \"data/T2T_genome.fasta\"" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "id": "bc05ab68", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def compute_genome_size(fasta_file):\n", 68 | " \"\"\"\n", 69 | " Compute the total genome size from a FASTA file.\n", 70 | " \"\"\"\n", 71 | " genome_size = 0\n", 72 | " genome = pyfastx.Fasta(fasta_file, build_index=False)\n", 73 | " for _, seq in genome: # Use the sequence directly from the tuple\n", 74 | " genome_size += len(seq)\n", 75 | " return genome_size" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 8, 81 | "id": "1c39876c", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "def compute_gc_content(fasta_file):\n", 86 | " \"\"\"\n", 87 | " Compute the overall GC content of the genome.\n", 88 | " \"\"\"\n", 89 | " total_bases = 0\n", 90 | " gc_count = 0\n", 91 | " genome = pyfastx.Fasta(fasta_file, build_index=False)\n", 92 | " for _, seq in genome: # Use the sequence directly from the tuple\n", 93 | " total_bases += len(seq)\n", 94 | " gc_count += seq.upper().count('G') + seq.upper().count('C')\n", 95 | " return (gc_count / total_bases) * 100 if total_bases > 0 else 0" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "id": "05443299", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "def compute_n50(fasta_file):\n", 106 | " \"\"\"\n", 107 | " Compute the N50 value for the genome.\n", 108 | " \"\"\"\n", 109 | " lengths = []\n", 110 | " genome = pyfastx.Fasta(fasta_file, build_index=False)\n", 111 | " lengths = sorted([len(seq) for _, seq in genome], reverse=True)\n", 112 | " \n", 113 | " cumulative_length = 0\n", 114 | " total_length = sum(lengths)\n", 115 | " for length in lengths:\n", 116 | " cumulative_length += length\n", 117 | " if cumulative_length >= total_length / 2:\n", 118 | " return length\n", 119 | " return 0" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 9, 125 | "id": "86d5d764", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "Genome Quality Metrics for data/T2T_genome.fasta:\n", 133 | "Total Genome Size: 3,117,292,070 bp\n", 134 | "GC Content: 40.75%\n", 135 | "N50: 150,617,247 bp\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "def assess_quality(fasta_file):\n", 141 | " \"\"\"\n", 142 | " Assess the quality of a T2T genome by calculating key metrics.\n", 143 | " \"\"\"\n", 144 | " genome_size = compute_genome_size(fasta_file)\n", 145 | " gc_content = compute_gc_content(fasta_file)\n", 146 | " n50 = compute_n50(fasta_file)\n", 147 | "\n", 148 | " print(f\"Genome Quality Metrics for {fasta_file}:\")\n", 149 | " print(f\"Total Genome Size: {genome_size:,} bp\")\n", 150 | " print(f\"GC Content: {gc_content:.2f}%\")\n", 151 | " print(f\"N50: {n50:,} bp\")\n", 152 | "\n", 153 | "# Run the quality assessment\n", 154 | "if __name__ == \"__main__\":\n", 155 | " assess_quality(genome_fasta)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "a43d8913", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "## End of Notebook ##" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.11.3" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } 191 | -------------------------------------------------------------------------------- /Ch04/Ch04-3-k-means.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "50013f28", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch04-3 K Means Analysis of Breast Cancer dataset" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "559118f4", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install scipy\n", 21 | "! pip install scipy==1.15.3 # Note that the latest conda scipy we could install was 1.14.1 in the YAML file" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "e81ad048", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Libraries\n", 32 | "from sklearn.datasets import load_breast_cancer\n", 33 | "from sklearn.cluster import KMeans\n", 34 | "from sklearn.preprocessing import StandardScaler\n", 35 | "from sklearn.decomposition import PCA\n", 36 | "import matplotlib.pyplot as plt\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "from sklearn.metrics import accuracy_score\n", 40 | "from scipy.stats import mode" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "2c0b64a8", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Load Breast Cancer dataset\n", 51 | "data = load_breast_cancer()\n", 52 | "X = data.data # Features array\n", 53 | "y = data.target # Labels / classes" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "48f15654", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Normalize the data\n", 64 | "scaler = StandardScaler()\n", 65 | "X_scaled = scaler.fit_transform(X)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "05349bc4", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# K-Means clustering\n", 76 | "kmeans = KMeans(n_clusters=2, random_state=42, n_init=10) # We know there are 2 classes; initialize 10 times\n", 77 | "clusters = kmeans.fit_predict(X_scaled)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "b155954b", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# Build a dataframe to analyze\n", 88 | "bc_kmeans_df = pd.DataFrame(X_scaled, columns=data.feature_names)\n", 89 | "bc_kmeans_df['Cluster'] = clusters\n", 90 | "bc_kmeans_df['True Label'] = y" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "e681d95b", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "# Align the labels to avoid Cluster Flipping\n", 101 | "def align_labels(true_labels, cluster_labels):\n", 102 | " # Map cluster labels to the majority true label in each cluster\n", 103 | " new_labels = np.zeros_like(cluster_labels)\n", 104 | " for cluster in np.unique(cluster_labels):\n", 105 | " mask = cluster_labels == cluster\n", 106 | " new_labels[mask] = mode(true_labels[mask], keepdims=False)[0]\n", 107 | " return new_labels\n", 108 | "aligned_clusters = align_labels(y, clusters)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "461f859d", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Evaluate accuracy of clustering\n", 119 | "accuracy = accuracy_score(y, aligned_clusters)\n", 120 | "print(f\"Accuracy of clustering: {accuracy:.2f}\")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "b1c96bff", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# Reduce the dimensions using PCA\n", 131 | "pca = PCA(n_components=2) # Use 2 components\n", 132 | "X_pca = pca.fit_transform(X_scaled)\n", 133 | "bc_kmeans_df['PC1'] = X_pca[:, 0]\n", 134 | "bc_kmeans_df['PC2'] = X_pca[:, 1]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "9538605b", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# Plot the K-means clustering results\n", 145 | "plt.figure(figsize=(8, 6))\n", 146 | "for cluster, color, marker in zip([0, 1], ['red', 'blue'], ['o', '^']):\n", 147 | " subset = bc_kmeans_df[bc_kmeans_df['Cluster'] == cluster]\n", 148 | " plt.scatter(subset['PC1'], subset['PC2'], c=color, label=f'Cluster {cluster}', marker=marker, alpha=0.7)\n", 149 | "\n", 150 | "plt.title('K-Means Clustering on Breast Cancer Dataset')\n", 151 | "plt.xlabel('Principal Component 1')\n", 152 | "plt.ylabel('Principal Component 2')\n", 153 | "plt.legend()\n", 154 | "plt.grid()\n", 155 | "plt.show()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "09d475f1", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "## End of Notebook ##" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.12.10" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } 191 | -------------------------------------------------------------------------------- /Ch04/Ch04-2-PCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "3bca36d9", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch04-2-PCA " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "0dabaee9", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install scikit-learn\n", 21 | "! pip install scikit-learn==1.7.0" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "283a2653", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Libraries\n", 32 | "from sklearn.datasets import load_breast_cancer\n", 33 | "from sklearn.decomposition import PCA\n", 34 | "from sklearn.preprocessing import StandardScaler\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "42defdf1", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Load Breast Cancer dataset\n", 47 | "bc_data = load_breast_cancer()\n", 48 | "X = bc_data.data # Features\n", 49 | "y = bc_data.target # Target labels (0 = malignant, 1 = benign)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "3c11d674", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "type(bc_data) # See what type of data is returned by sklearn - it is a Bunch" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "572461df", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# Standardize the data so that mean = 0 & variance = 1\n", 70 | "scaler = StandardScaler()\n", 71 | "X_scaled = scaler.fit_transform(X)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "24527df4", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# Perform PCA with 3 components\n", 82 | "bc_pca = PCA(n_components=3)\n", 83 | "X_bc_pca = bc_pca.fit_transform(X_scaled)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "bdd8a277", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Load ther results into a dataframe\n", 94 | "bc_pca_df = pd.DataFrame(X_bc_pca, columns=['PC1', 'PC2', 'PC3'])\n", 95 | "bc_pca_df['label'] = y" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "ecd40b92", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# 3D Scatter Plot for PC1, PC2, and PC3\n", 106 | "fig = plt.figure(figsize=(10, 8))\n", 107 | "ax = fig.add_subplot(111, projection='3d')\n", 108 | "# Loop to create the 3 axes\n", 109 | "for label, color, marker in zip([0, 1], ['red', 'blue'], ['o', '^']):\n", 110 | " subset = bc_pca_df[bc_pca_df['label'] == label]\n", 111 | " ax.scatter(subset['PC1'], subset['PC2'], subset['PC3'], c=color, label=bc_data.target_names[label], marker=marker, alpha=0.7)\n", 112 | "# end for\n", 113 | "ax.set_title('PCA on Breast Cancer Dataset (3D View: PC1, PC2, PC3)')\n", 114 | "ax.set_xlabel('Principal Component 1')\n", 115 | "ax.set_ylabel('Principal Component 2')\n", 116 | "ax.set_zlabel('Principal Component 3')\n", 117 | "ax.legend()\n", 118 | "plt.show()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "5bca3af8", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Get the explained variance ratio\n", 129 | "explained_variance = bc_pca.explained_variance_ratio_\n", 130 | "\n", 131 | "# Print explained variance for each component\n", 132 | "for i, variance in enumerate(explained_variance, 1):\n", 133 | " print(f\"Explained variance for PC{i}: {variance:.2f}\")\n", 134 | "\n", 135 | "# Print cumulative explained variance\n", 136 | "cumulative_variance = explained_variance.sum()\n", 137 | "print(f\"Total variance explained by the first 3 components: {cumulative_variance:.2f}\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "882b2889", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Plot cumulative explained variance\n", 148 | "cumulative_variance = explained_variance.cumsum()\n", 149 | "plt.figure(figsize=(8, 6))\n", 150 | "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')\n", 151 | "plt.title('Cumulative Explained Variance by Principal Components')\n", 152 | "plt.xlabel('Number of Principal Components')\n", 153 | "plt.ylabel('Cumulative Explained Variance')\n", 154 | "plt.grid(True)\n", 155 | "plt.show()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "01d4c628", 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "## End of Notebook ##" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.8.6" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 5 190 | } 191 | -------------------------------------------------------------------------------- /Ch05/Ch05-1-qc-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "64c65fdc-8012-4a2d-9585-f89a062504fa", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch05-1-qc-data" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "29cf5d46-8106-42bf-a02c-3072044e3e86", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Get data\n", 21 | "! wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR390/SRR390728/SRR390728_1.fastq.gz \n", 22 | "! wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR390/SRR390728/SRR390728_2.fastq.gz \n", 23 | "! mv SRR390728_1.fastq.gz data/\n", 24 | "! mv SRR390728_2.fastq.gz data/" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "eb999d32-b0fe-4eb0-aa68-fae649cf9b45", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# If you want to install FastQC from within your notebook use this\n", 35 | "! yes | conda install -c bioconda fastqc" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "08952a4f-473e-49eb-a003-9216a7541bd2", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# Install MultiQC if desired\n", 46 | "! yes | conda install -c bioconda multiqc" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "82b62e83-37eb-43db-b7da-ea7b5d26cf89", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# Import Libaries\n", 57 | "import os\n", 58 | "import subprocess" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "25367b17-2af5-4eb9-9ff9-45aa32be9ee3", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "def run_fastqc(input_dir, output_dir):\n", 69 | " \"\"\"\n", 70 | " Function to Run FastQC on all FASTQ files in the input directory.\n", 71 | " \"\"\"\n", 72 | " # Create an output directory for the FastQC reports\n", 73 | " os.makedirs(output_dir, exist_ok=True)\n", 74 | " # List all of the FASTQ files in the input directory\n", 75 | " fastq_files = [f for f in os.listdir(input_dir) if f.endswith((\".fastq\", \".fastq.gz\"))]\n", 76 | " if not fastq_files:\n", 77 | " print(\"Could not find any FASTQ files in the input directory.\")\n", 78 | " return\n", 79 | " print(\"Running FastQC...\")\n", 80 | " # Build the FastQC command\n", 81 | " fastqc_command = [\"fastqc\", \"-o\", output_dir] + [os.path.join(input_dir, f) for f in fastq_files]\n", 82 | " # Execute FastQC\n", 83 | " subprocess.run(fastqc_command)\n", 84 | " print(\"FastQC analysis Completed.\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "5ef0cee9-cf95-4d57-844e-74fb7e1ee531", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "def run_multiqc(input_dir, output_dir):\n", 95 | " \"\"\"\n", 96 | " Run MultiQC for aggregation of FastQC results.\n", 97 | " \"\"\"\n", 98 | " # Create output directory for the reports\n", 99 | " os.makedirs(output_dir, exist_ok=True)\n", 100 | " print(\"Running MultiQC...\")\n", 101 | " # Build the MultiQC command\n", 102 | " multiqc_command = [\"multiqc\", input_dir, \"-o\", output_dir]\n", 103 | " # Execute the MultiQC commands\n", 104 | " subprocess.run(multiqc_command)\n", 105 | " print(\"Finished...MultiQC report(s) generated.\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "c95cbbfd-96ab-49f8-96da-5e68800f130f", 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "def main():\n", 116 | " \"\"\"\n", 117 | " Main function to perform quality control using FastQC and MultiQC.\n", 118 | " \"\"\"\n", 119 | " # Define the input and output directories\n", 120 | " input_dir = \"./data\" # We placed our fastq files in the data subdirectory\n", 121 | " fastqc_output_dir = \"fastqc_output\"\n", 122 | " multiqc_output_dir = \"multiqc_output\"\n", 123 | " # Run FastQC\n", 124 | " run_fastqc(input_dir, fastqc_output_dir)\n", 125 | " # Run MultiQC\n", 126 | " run_multiqc(fastqc_output_dir, multiqc_output_dir)\n", 127 | " print(f\"MultiQC report saved in: {os.path.abspath(multiqc_output_dir)}\")\n", 128 | "if __name__ == \"__main__\":\n", 129 | " main()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 16, 135 | "id": "0b2af0c2-6dfc-4dad-9d4b-0492f2cf0ed5", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Review the report\n", 140 | "! open multiqc_output/multiqc_report.html " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "caaa2bfc-75cc-461c-a84d-a1b27d3b3262", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# End of Notebook #" 151 | ] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "Python 3 (ipykernel)", 157 | "language": "python", 158 | "name": "python3" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.11.3" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 5 175 | } 176 | -------------------------------------------------------------------------------- /Ch12/Ch12-1-cobrapy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "3badbb9f-6117-46ee-9978-87f10180a0cf", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch12-1 - Metabolic Modelling with CobraPy" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "6722835f-7af1-4edd-964b-6ab25676fc50", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install the CobraPy package\n", 21 | "! pip install cobra" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "544f1e2e-b155-48e4-bd35-08f36ca43007", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Download the E. coli SBML mode from BiGG\n", 32 | "! wget http://bigg.ucsd.edu/static/models/e_coli_core.xml" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "329c2c54-e29e-45e1-90a9-c846f5da2c30", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# Load the E. coli core model \n", 43 | "import cobra\n", 44 | "model = cobra.io.read_sbml_model(\"e_coli_core.xml\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "41e0bb39-4e8a-421f-a190-9c4e8a9993ab", 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# -----------------------------\n", 55 | "# Step 1: Run FBA on the Wild‐Type Model\n", 56 | "# -----------------------------\n", 57 | "solution_wt = model.optimize()\n", 58 | "print(\"Wild-type growth rate (objective value):\", solution_wt.objective_value)\n", 59 | "print(\"Flux distribution for key reactions:\")\n", 60 | "for rxn in model.reactions[:10]:\n", 61 | " print(f\"{rxn.id}: {solution_wt.fluxes[rxn.id]}\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "id": "e649b3cf-9ba0-450c-8ab1-aa998272e961", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# Review the objective function\n", 72 | "from cobra.util.solver import linear_reaction_coefficients\n", 73 | "print(\"Objective direction:\", model.objective.direction) # 'max' or 'min'\n", 74 | "for rxn, coef in linear_reaction_coefficients(model).items():\n", 75 | " print(f\"{rxn.id}: {coef}\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "30534058-aae7-464a-86fb-93e2370aa71c", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# -----------------------------\n", 86 | "# Step 2: Simulate a Gene Knockout\n", 87 | "# -----------------------------\n", 88 | "# For example, let's knock out gene \"b0001\" (this is one of the genes in the E. coli model)\n", 89 | "gene_to_knockout = \"b0351\"\n", 90 | "with model:\n", 91 | " # Knock out the gene (this automatically adjusts the reactions associated with the gene)\n", 92 | " model.genes.get_by_id(gene_to_knockout).knock_out()\n", 93 | " \n", 94 | " # Optimize the model after knockout\n", 95 | " solution_ko = model.optimize()\n", 96 | " print(f\"\\nGrowth rate after knocking out gene {gene_to_knockout}:\", solution_ko.objective_value)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "204c324d-2ce1-4f5d-b886-7de9624e17bd", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# Review genes you could knock out in your model\n", 107 | "print(\"Available gene IDs in the model:\")\n", 108 | "for gene in model.genes:\n", 109 | " print(gene.id)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": "955ca8b8-9265-4a57-aa43-0901659f6f58", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# Step 2b - Try other gene knock outs\n", 120 | "gene_to_knockout = \"s0001\"\n", 121 | "with model:\n", 122 | " # Knock out the gene (this automatically adjusts the reactions associated with the gene)\n", 123 | " model.genes.get_by_id(gene_to_knockout).knock_out()\n", 124 | " \n", 125 | " # Optimize the model after knockout\n", 126 | " solution_ko = model.optimize()\n", 127 | " print(f\"\\nGrowth rate after knocking out gene {gene_to_knockout}:\", solution_ko.objective_value)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "a51b8ecb-ff43-4e3e-9895-ce8761d06cd4", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# -----------------------------\n", 138 | "# Step 3: Compare Flux Distributions (Optional)\n", 139 | "# -----------------------------\n", 140 | "print(\"\\nChange in fluxes for selected reactions after knockout:\")\n", 141 | "for rxn in model.reactions[:10]:\n", 142 | " flux_change = solution_wt.fluxes[rxn.id] - solution_ko.fluxes[rxn.id]\n", 143 | " print(f\"{rxn.id}: Δ flux = {flux_change:.2f}\")" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "abf612f6-ebc9-4b19-ad8d-7b9e18266497", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "## End of Notebook ##" 154 | ] 155 | } 156 | ], 157 | "metadata": { 158 | "kernelspec": { 159 | "display_name": "Python 3 (ipykernel)", 160 | "language": "python", 161 | "name": "python3" 162 | }, 163 | "language_info": { 164 | "codemirror_mode": { 165 | "name": "ipython", 166 | "version": 3 167 | }, 168 | "file_extension": ".py", 169 | "mimetype": "text/x-python", 170 | "name": "python", 171 | "nbconvert_exporter": "python", 172 | "pygments_lexer": "ipython3", 173 | "version": "3.11.3" 174 | } 175 | }, 176 | "nbformat": 4, 177 | "nbformat_minor": 5 178 | } 179 | -------------------------------------------------------------------------------- /Ch05/Ch05-4-variant-calling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "d481a727-d038-4493-8951-d80384b8196f", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch05-4 - Variant Calling with FreeBayes" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "07842b42-1212-48d7-96dd-7fa67bc32e09", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install FreeBayes\n", 21 | "! brew install freebayes" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "deb1070d-c3fc-49b6-aca3-26418b7a0460", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Import Libraries\n", 32 | "import subprocess\n", 33 | "import os" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "b507cff0-6e62-4d90-b49b-850f28c2ef41", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "def run_command(cmd):\n", 44 | " \"\"\"\n", 45 | " Run a shell command and check for errors.\n", 46 | " \"\"\"\n", 47 | " print(f\"Running: {' '.join(cmd)}\")\n", 48 | " subprocess.run(cmd, check=True)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "15054a71-65ef-434c-8ca3-d8bb558dde61", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "def index_reference(reference_fasta):\n", 59 | " \"\"\"\n", 60 | " Index the reference genome for FreeBayes and Samtools.\n", 61 | " \"\"\"\n", 62 | " print(\"Indexing the reference genome...\")\n", 63 | " # Generate a FASTA index for samtools and FreeBayes\n", 64 | " run_command([\"samtools\", \"faidx\", reference_fasta])\n", 65 | " print(\"Reference indexing complete.\\n\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "397d713d-c8f4-450e-a6fd-e436fd8ba8b3", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "def sort_and_index_bam(input_bam, output_sorted_bam):\n", 76 | " \"\"\"\n", 77 | " Sort and index the BAM file using Samtools.\n", 78 | " \"\"\"\n", 79 | " print(\"Sorting and indexing the BAM file...\")\n", 80 | " # Sort the BAM file\n", 81 | " run_command([\"samtools\", \"sort\", \"-o\", output_sorted_bam, input_bam])\n", 82 | " # Index the sorted BAM file\n", 83 | " run_command([\"samtools\", \"index\", output_sorted_bam])\n", 84 | " print(f\"Sorted BAM file: {output_sorted_bam}\\n\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "17bc0aea-e8a6-42d5-a1c0-52091f5591d9", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "def call_variants_with_freebayes(reference_fasta, input_bam, output_vcf):\n", 95 | " \"\"\"\n", 96 | " Call variants using FreeBayes.\n", 97 | " \"\"\"\n", 98 | " print(\"Calling variants with FreeBayes...\")\n", 99 | " cmd = [\n", 100 | " \"freebayes\",\n", 101 | " \"-f\", reference_fasta, # Reference genome\n", 102 | " input_bam # Sorted BAM file\n", 103 | " ]\n", 104 | " # Write the VCF output to a file\n", 105 | " with open(output_vcf, \"w\") as vcf_file:\n", 106 | " subprocess.run(cmd, stdout=vcf_file, check=True)\n", 107 | " print(f\"Variants called successfully. Output VCF: {output_vcf}\\n\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "63629749-515d-4b26-9822-76894edbaf94", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "def main():\n", 118 | " \"\"\"\n", 119 | " Main function to automate the FreeBayes variant calling workflow.\n", 120 | " \"\"\"\n", 121 | " # Input files\n", 122 | " reference_fasta = \"data/ecoli_genome/ecoli_reference.fasta\" # Path to the reference genome\n", 123 | " input_bam = \"data/output/aligned_reads.sam\" # Input BAM file (unsorted)\n", 124 | " # Output files\n", 125 | " output_sorted_bam = \"output/aligned_reads_sorted.bam\"\n", 126 | " output_vcf = \"output/variants.vcf\"\n", 127 | " # Create output directory\n", 128 | " os.makedirs(\"output\", exist_ok=True)\n", 129 | " try:\n", 130 | " # Step 1: Index the reference genome\n", 131 | " index_reference(reference_fasta)\n", 132 | " # Step 2: Sort and index the BAM file\n", 133 | " sort_and_index_bam(input_bam, output_sorted_bam)\n", 134 | " # Step 3: Call variants using FreeBayes\n", 135 | " call_variants_with_freebayes(reference_fasta, output_sorted_bam, output_vcf)\n", 136 | " except subprocess.CalledProcessError as e:\n", 137 | " print(f\"Error occurred while running a command: {e}\")\n", 138 | " except Exception as e:\n", 139 | " print(f\"Unexpected error: {e}\")\n", 140 | "\n", 141 | "if __name__ == \"__main__\":\n", 142 | " main()" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "8a30fbe7-e309-4b3e-b591-e40e6ad9feac", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "## End of Notebook ##" 153 | ] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "Python 3 (ipykernel)", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.11.3" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 5 177 | } 178 | -------------------------------------------------------------------------------- /Ch04/Ch04-6-seaborn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "a4c28fae", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch04-6 - Build a UMAP using Seaborn" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "116b7cac", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install pacakges\n", 21 | "! pip install seaborn\n", 22 | "! pip install umap-learn==0.5.7\n", 23 | "! pip install ipywidgets==8.1.7" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "43b3220b", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Libraries\n", 34 | "import numpy as np\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import seaborn as sns\n", 37 | "from sklearn.datasets import load_breast_cancer\n", 38 | "from sklearn.preprocessing import StandardScaler\n", 39 | "import umap" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "61307b2e", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Load Breast Cancer dataset\n", 50 | "data = load_breast_cancer()\n", 51 | "X, y = data.data, data.target" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "00e9f97a-f3d5-4e06-af1c-c0472ccc3914", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Normalization\n", 62 | "scaler = StandardScaler()\n", 63 | "X_scaled = scaler.fit_transform(X)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "3b2d9615-2fa3-4925-97b5-0ce0ec50cb7c", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Create UMAP embedding\n", 74 | "umap_reducer = umap.UMAP(\n", 75 | " n_neighbors=15, # Controls local vs global structure\n", 76 | " min_dist=0.1, # Controls how tightly points are packed\n", 77 | " n_components=2, # 2D visualization\n", 78 | " random_state=42, # For reproducibility\n", 79 | " n_jobs=1\n", 80 | ")\n", 81 | "X_umap = umap_reducer.fit_transform(X_scaled)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "8499af82-6043-4a8f-a251-1a2e3df718e6", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Visualize the Umap\n", 92 | "plt.figure(figsize=(10, 8))\n", 93 | "for i in [0, 1]:\n", 94 | " mask = y == i\n", 95 | " plt.scatter(\n", 96 | " X_umap[mask, 0], \n", 97 | " X_umap[mask, 1], \n", 98 | " label=data.target_names[i],\n", 99 | " alpha=0.7,\n", 100 | " edgecolors='black', \n", 101 | " linewidth=0.5\n", 102 | " )\n", 103 | "plt.title('UMAP Visualization of Breast Cancer Dataset', fontsize=16)\n", 104 | "plt.xlabel('UMAP Dimension 1', fontsize=12)\n", 105 | "plt.ylabel('UMAP Dimension 2', fontsize=12)\n", 106 | "plt.legend()\n", 107 | "plt.grid(True, linestyle='--', alpha=0.7)\n", 108 | "plt.tight_layout()\n", 109 | "plt.show()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": "ffb69ab5-e629-4c84-aa71-186b026fa5aa", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# Explore different UMAP parameters to improve the Clustering\n", 120 | "def plot_umap_parameter_comparison():\n", 121 | " # Create a figure with subplots for different UMAP configurations\n", 122 | " fig, axs = plt.subplots(2, 2, figsize=(16, 16))\n", 123 | " \n", 124 | " # Different n_neighbors values\n", 125 | " neighbors_values = [5, 15, 30, 50]\n", 126 | " \n", 127 | " for i, n_neighbors in enumerate(neighbors_values):\n", 128 | " row = i // 2\n", 129 | " col = i % 2\n", 130 | " \n", 131 | " # Create UMAP embedding with specific n_neighbors\n", 132 | " umap_reducer = umap.UMAP(\n", 133 | " n_neighbors=n_neighbors,\n", 134 | " min_dist=0.1,\n", 135 | " n_components=2,\n", 136 | " random_state=42,\n", 137 | " n_jobs=1\n", 138 | " )\n", 139 | " X_umap = umap_reducer.fit_transform(X_scaled)\n", 140 | " \n", 141 | " # Plot\n", 142 | " axs[row, col].scatter(\n", 143 | " X_umap[:, 0], \n", 144 | " X_umap[:, 1], \n", 145 | " c=y, \n", 146 | " cmap='viridis', \n", 147 | " alpha=0.7,\n", 148 | " edgecolors='black', \n", 149 | " linewidth=0.5\n", 150 | " )\n", 151 | " axs[row, col].set_title(f'UMAP (n_neighbors = {n_neighbors})')\n", 152 | " axs[row, col].set_xlabel('UMAP Dimension 1')\n", 153 | " axs[row, col].set_ylabel('UMAP Dimension 2')\n", 154 | " \n", 155 | " plt.tight_layout()\n", 156 | " plt.show()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "id": "0d39763c-4518-4fe5-a8fa-78eee13d5818", 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Run parameter comparison\n", 167 | "plot_umap_parameter_comparison()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "e3042be9-5bb6-4e4a-8a54-d13994886821", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "## End of Notebook ##" 178 | ] 179 | } 180 | ], 181 | "metadata": { 182 | "kernelspec": { 183 | "display_name": "Python 3 (ipykernel)", 184 | "language": "python", 185 | "name": "python3" 186 | }, 187 | "language_info": { 188 | "codemirror_mode": { 189 | "name": "ipython", 190 | "version": 3 191 | }, 192 | "file_extension": ".py", 193 | "mimetype": "text/x-python", 194 | "name": "python", 195 | "nbconvert_exporter": "python", 196 | "pygments_lexer": "ipython3", 197 | "version": "3.12.10" 198 | } 199 | }, 200 | "nbformat": 4, 201 | "nbformat_minor": 5 202 | } 203 | -------------------------------------------------------------------------------- /Ch10/Ch10-5-recursive-trees.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "81fc78c9-b88d-4371-910a-dfcd2fe49f4d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch10-5 Playing Recursively with Trees [Updated to use RAxML-NG]" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "e19afca5-8cd9-4ab4-93ec-691cee7f9145", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# 23. Load the raxml-generated tree for Ebola viruses\n", 21 | "import dendropy\n", 22 | "# Define the correct tree file output from RAxML-NG\n", 23 | "tree_file = \"ebola_tree.raxml.bestTreeCollapsed\" # Based on raxml-ng output from previous recipe\n", 24 | "# Load the tree using Newick format (RAxML-NG outputs trees in Newick)\n", 25 | "ebola_raxml = dendropy.Tree.get_from_path(tree_file, schema=\"newick\")\n", 26 | "# Optional: Print the tree to verify\n", 27 | "print(ebola_raxml.as_string(schema=\"newick\"))" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "25fde98b-6356-4a6f-9248-59665b54596f", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# 24. Compute the level of each node\n", 38 | "def compute_level(node, level=0): \n", 39 | " for child in node.child_nodes(): \n", 40 | " compute_level(child, level + 1) \n", 41 | " if node.taxon is not None: \n", 42 | " print(\"%s: %d %d\" % (node.taxon, node.level(), level)) \n", 43 | "compute_level(ebola_raxml.seed_node) " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "6ad0c4ef-5321-46cb-b48a-94c49595ac56", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# 25. Compute the height of each node\n", 54 | "def compute_height(node): \n", 55 | " children = node.child_nodes() \n", 56 | " if len(children) == 0: \n", 57 | " height = 0 \n", 58 | " else: \n", 59 | " height = 1 + max(map(lambda x: compute_height(x), children)) \n", 60 | " desc = node.taxon or 'Internal' \n", 61 | " print(\"%s: %d %d\" % (desc, height, node.level())) \n", 62 | " return height \n", 63 | "compute_height(ebola_raxml.seed_node) " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "99069105-cdde-4ed7-b8a8-38bb2601cb95", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# 26. Compute the number of offspring for each node\n", 74 | "def compute_nofs(node): \n", 75 | " children = node.child_nodes() \n", 76 | " nofs = len(children) \n", 77 | " map(lambda x: compute_nofs(x), children) \n", 78 | " desc = node.taxon or 'Internal' \n", 79 | " print(\"%s: %d %d\" % (desc, nofs, node.level())) \n", 80 | "compute_nofs(ebola_raxml.seed_node) " 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "bc73aca6-d055-4eda-9619-a9cf77e87922", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# 27. Print out the leaves\n", 91 | "def print_nodes(node): \n", 92 | " for child in node.child_nodes(): \n", 93 | " print_nodes(child) \n", 94 | " if node.taxon is not None: \n", 95 | " print('%s (%d)' % (node.taxon, node.level())) \n", 96 | "print_nodes(ebola_raxml.seed_node) " 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "id": "c475fc49-a772-40b6-a83f-4f443970d2c2", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# 28. Print leaf nodes in breadth-first manner\n", 107 | "from collections import deque \n", 108 | "def print_breadth(tree): \n", 109 | " queue = deque() \n", 110 | " queue.append(tree.seed_node) \n", 111 | " while len(queue) > 0: \n", 112 | " process_node = queue.popleft() \n", 113 | " if process_node.taxon is not None: \n", 114 | " print('%s (%d)' % (process_node.taxon, process_node.level())) \n", 115 | " else: \n", 116 | " for child in process_node.child_nodes(): \n", 117 | " queue.append(child) \n", 118 | "print_breadth(ebola_raxml) " 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "1cb1caef-8aaa-4b4f-8531-66d0ebe1dbdf", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# 29. Getting back to the real dataset\n", 129 | "from copy import deepcopy \n", 130 | "simple_ebola = deepcopy(ebola_raxml) \n", 131 | "def simplify_tree(node): \n", 132 | " prefs = set() \n", 133 | " for leaf in node.leaf_nodes(): \n", 134 | " my_toks = leaf.taxon.label.split(' ') \n", 135 | " if my_toks[0] == 'EBOV': \n", 136 | " prefs.add('EBOV' + my_toks[1]) \n", 137 | " else: \n", 138 | " prefs.add(my_toks[0]) \n", 139 | " if len(prefs) == 1: \n", 140 | " print(prefs, len(node.leaf_nodes())) \n", 141 | " node.taxon = dendropy.Taxon(label=list(prefs)[0]) \n", 142 | " node.set_child_nodes([]) \n", 143 | " else: \n", 144 | " for child in node.child_nodes(): \n", 145 | " simplify_tree(child) \n", 146 | "simplify_tree(simple_ebola.seed_node) \n", 147 | "simple_ebola.ladderize() \n", 148 | "simple_ebola.write_to_path('ebola_simple.nex', 'nexus') " 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "b9caaa6c-4251-4db0-9ca4-47d4fbd9a747", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "## End of Notebook ##" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.11.3" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /Ch09/Ch09-2-molecular-distances.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "9f15edc4", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch 9-2 - Computing molecular distances on a PDB file" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "261dd6e0", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Load model\n", 21 | "from Bio import PDB \n", 22 | "repository = PDB.PDBList() \n", 23 | "parser = PDB.PDBParser() \n", 24 | "repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') \n", 25 | "p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "id": "189e72bb", 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "# Get Zincs\n", 36 | "zns = []\n", 37 | "for atom in p53_1tup.get_atoms(): \n", 38 | " if atom.element == 'ZN': \n", 39 | " zns.append(atom) \n", 40 | "for zn in zns: \n", 41 | " print(zn, zn.coord) " 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "45708c7f", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# Get the distance between one atom and another set of atoms\n", 52 | "import math \n", 53 | "def get_closest_atoms(pdb_struct, ref_atom, distance): \n", 54 | " atoms = {} \n", 55 | " rx, ry, rz = ref_atom.coord \n", 56 | " for atom in pdb_struct.get_atoms(): \n", 57 | " if atom == ref_atom: \n", 58 | " continue \n", 59 | " x, y, z = atom.coord \n", 60 | " my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) \n", 61 | " if my_dist < distance: \n", 62 | " atoms[atom] = my_dist \n", 63 | " return atoms " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "def34d3d", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Compute atoms near zincs\n", 74 | "for zn in zns: \n", 75 | " print() \n", 76 | " print(zn.coord) \n", 77 | " atoms = get_closest_atoms(p53_1tup, zn, 4) \n", 78 | " for atom, distance in atoms.items(): \n", 79 | " print(atom.element, distance, atom.coord) " 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "id": "5aeceeae", 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# How many atoms do we get as we increase the distance\n", 90 | "for distance in [1, 2, 4, 8, 16, 32, 64, 128]: \n", 91 | " my_atoms = [] \n", 92 | " for zn in zns: \n", 93 | " atoms = get_closest_atoms(p53_1tup, zn, distance) \n", 94 | " my_atoms.append(len(atoms)) \n", 95 | " print(distance, my_atoms) " 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "310865b4", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Time the output\n", 106 | "import timeit \n", 107 | "nexecs = 10 \n", 108 | "print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], 4.0)', \n", 109 | " 'from __main__ import get_closest_atoms, p53_1tup, zns', \n", 110 | " number=nexecs) / nexecs * 1000) " 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "753d85a2", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# A better distance function\n", 121 | "def get_closest_alternative(pdb_struct, ref_atom, distance): \n", 122 | " atoms = {} \n", 123 | " rx, ry, rz = ref_atom.coord \n", 124 | " for atom in pdb_struct.get_atoms(): \n", 125 | " if atom == ref_atom: \n", 126 | " continue \n", 127 | " x, y, z = atom.coord \n", 128 | " if abs(x - rx) > distance or abs(y - ry) > distance or abs(z - rz) > distance: \n", 129 | " continue \n", 130 | " my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) \n", 131 | " if my_dist < distance: \n", 132 | " atoms[atom] = my_dist \n", 133 | " return atoms " 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "f6e2e5d7", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# Now let's time it:\n", 144 | "print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], 4.0)', \n", 145 | " 'from __main__ import get_closest_alternative, p53_1tup, zns', \n", 146 | " number=nexecs) / nexecs * 1000) " 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "ad69ba33", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "# Compare the function with different distances\n", 157 | "print('Standard') \n", 158 | "for distance in [1, 4, 16, 64, 128]: \n", 159 | " print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], distance)', \n", 160 | " 'from __main__ import get_closest_atoms, p53_1tup, zns, distance', \n", 161 | " number=nexecs) / nexecs * 1000) \n", 162 | "print('Optimized') \n", 163 | "for distance in [1, 4, 16, 64, 128]: \n", 164 | " print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], distance)', \n", 165 | " 'from __main__ import get_closest_alternative, p53_1tup, zns, distance', \n", 166 | " number=nexecs) / nexecs * 1000)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "26d469c5", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "## End of Notebook ##" 177 | ] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.11.3" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 5 201 | } 202 | -------------------------------------------------------------------------------- /Ch09/Ch09-4-py3dmol.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "65a69498-8fdc-4ff3-9ded-0e93e5abbacb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch 09-4 py3dmol" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "5709bf95-8fa1-4815-83a6-c27c4c95e02a", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install dependencies\n", 21 | "! pip install py3Dmol" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "f54c2935", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Import Libraries\n", 32 | "import py3Dmol" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "id": "21aef57c-0cdb-4f48-a0a9-4ed7f7664a28", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "application/3dmoljs_load.v0": "
\n

3Dmol.js failed to load for some reason. Please check your browser console for error messages.

\n
\n", 44 | "text/html": [ 45 | "
\n", 46 | "

3Dmol.js failed to load for some reason. Please check your browser console for error messages.

\n", 47 | "
\n", 48 | "" 92 | ] 93 | }, 94 | "metadata": {}, 95 | "output_type": "display_data" 96 | } 97 | ], 98 | "source": [ 99 | "# Set up the Py3Dmol viewer for an example protein \n", 100 | "view = py3Dmol.view(query='pdb:1crn') # 1crn = Crambin\n", 101 | "view.setStyle({'cartoon': {'color': 'spectrum'}})\n", 102 | "view.zoomTo()\n", 103 | "view.show()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "a29bfd8c-680c-4cf9-8252-7453809a1baa", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "## End of Notebook ##" 114 | ] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.12.11" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 5 138 | } 139 | -------------------------------------------------------------------------------- /Ch02/Ch02-2-pandas-pitfalls.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "9a6d23f7", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch02-2 - Pitfalls of joining data with pandas" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "cd5b26a0-b105-4219-a1df-b21464aa2675", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "########################## Notes & Updates ##############################################################\n", 21 | "# If you are using Docker and your data directory is mapped to \"/data\" then you can use the commented-out\n", 22 | "# Docker lines below in place of the primary line (which you will comment out when running)\n", 23 | "# You will also find other alternative lines or blocks that can be used to avoid potential issues \n", 24 | "#########################################################################################################" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "4d64b416", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# 1. Import Libraries\n", 35 | "import pandas as pd" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "923bc568", 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# 2. Jumble the data using random sampling\n", 46 | "vdata = pd.read_csv(\"data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\") \n", 47 | "# vdata = pd.read_csv(\"/data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\") # Docker\n", 48 | "vdata.sample(frac=0.9).to_csv(\"vdata_sample.csv.gz\", index=False) \n", 49 | "vax = pd.read_csv(\"data/2021VAERSVAX.csv.gz\", encoding=\"iso-8859-1\") \n", 50 | "# vax = pd.read_csv(\"/data/2021VAERSVAX.csv.gz\", encoding=\"iso-8859-1\") # Docker \n", 51 | "vax.sample(frac=0.9).to_csv(\"vax_sample.csv.gz\", index=False) \n", 52 | "# Note - it is ok to get a dtype warning here" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "id": "d0ef3b0d", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# 3. Inner join on the tables\n", 63 | "vdata = pd.read_csv(\"vdata_sample.csv.gz\", low_memory=False) \n", 64 | "vax = pd.read_csv(\"vax_sample.csv.gz\", low_memory=False) \n", 65 | "vdata_with_vax = vdata.join(vax.set_index(\"VAERS_ID\"), on=\"VAERS_ID\", how=\"inner\") \n", 66 | "# vdata_with_vax = vdata.merge(vax, on=\"VAERS_ID\", how=\"inner\") # Docker - Alternate method \n", 67 | "len(vdata), len(vax), len(vdata_with_vax) " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "9ed5a95b", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# 4. Find the data not captured by the join\n", 78 | "lost_vdata = vdata.loc[~vdata.index.isin(vdata_with_vax.index)] \n", 79 | "lost_vdata \n", 80 | "lost_vax = vax[~vax[\"VAERS_ID\"].isin(vdata.index)] \n", 81 | "lost_vax " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "13329af6", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# 5. Left outer join\n", 92 | "vdata_with_vax_left = vdata.join(vax.set_index(\"VAERS_ID\"), on=\"VAERS_ID\") \n", 93 | "vdata_with_vax_left.groupby(\"VAERS_ID\").size().sort_values() \n", 94 | "# vdata_with_vax_left = vdata.merge(vax, on=\"VAERS_ID\", how=\"left\") # Docker - alternate version\n", 95 | "# vdata_with_vax_left.groupby(\"VAERS_ID\").size().sort_values() # Docker - alternate version" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "24d2f205", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# 6. Right join\n", 106 | "dead = vdata[vdata.DIED == \"Y\"] \n", 107 | "vax19 = vax[vax.VAX_TYPE == \"COVID19\"] \n", 108 | "vax19_dead = vax19.join(dead.set_index(\"VAERS_ID\"), on=\"VAERS_ID\", how=\"right\") \n", 109 | "len(vax19), len(dead), len(vax19_dead) \n", 110 | "len(vax19_dead[vax19_dead.VAERS_ID.duplicated()]) \n", 111 | "len(vax19_dead) - len(dead) " 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "a92b51dc-111a-4b05-a5b4-1124a631bc03", 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "# 6. Right join (alternate method)\n", 122 | "# dead = vdata[vdata.DIED == \"Y\"] \n", 123 | "# vax19 = vax[vax.VAX_TYPE == \"COVID19\"] \n", 124 | "# vax19_dead = vax19.merge(dead, on=\"VAERS_ID\", how=\"right\")\n", 125 | "# len(vax19), len(dead), len(vax19_dead)\n", 126 | "# len(vax19_dead[vax19_dead.VAERS_ID.duplicated()])\n", 127 | "# len(vax19_dead) - len(dead) " 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "b877c4b5", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# Problematic lots\n", 138 | "vax19_dead[\"STATE\"] = vax19_dead[\"STATE\"].str.upper() \n", 139 | "dead_lot = vax19_dead[[\"VAERS_ID\", \"VAX_LOT\", \"STATE\"]].set_index([\"VAERS_ID\", \"VAX_LOT\"]) \n", 140 | "dead_lot_clean = dead_lot[~dead_lot.index.duplicated()] \n", 141 | "dead_lot_clean = dead_lot_clean.reset_index() \n", 142 | "dead_lot_clean[dead_lot_clean.VAERS_ID.isna()] \n", 143 | "baddies = dead_lot_clean.groupby(\"VAX_LOT\").size().sort_values(ascending=False) " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "6b3cc8a5", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# Print problematic lots\n", 154 | "for i, (lot, cnt) in enumerate(baddies.items()):\n", 155 | " print(lot, cnt, len(dead_lot_clean[dead_lot_clean.VAX_LOT == lot].groupby(\"STATE\")))\n", 156 | " if i == 10:\n", 157 | " break" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "d249a4e0", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "# End of Notebook #" 168 | ] 169 | } 170 | ], 171 | "metadata": { 172 | "kernelspec": { 173 | "display_name": "Python 3 (ipykernel)", 174 | "language": "python", 175 | "name": "python3" 176 | }, 177 | "language_info": { 178 | "codemirror_mode": { 179 | "name": "ipython", 180 | "version": 3 181 | }, 182 | "file_extension": ".py", 183 | "mimetype": "text/x-python", 184 | "name": "python", 185 | "nbconvert_exporter": "python", 186 | "pygments_lexer": "ipython3", 187 | "version": "3.12.11" 188 | } 189 | }, 190 | "nbformat": 4, 191 | "nbformat_minor": 5 192 | } 193 | -------------------------------------------------------------------------------- /Ch01/Welcome.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "20b669c8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# BioInformatics with Python Cookbook - Fourth Edition #" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "32e90c95", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Welcome to the book! #\n", 21 | "# 1-1 Welcome #" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "id": "727b23fe", 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Welcome to the BioInformatics with Python Cookbook Fourth Edition!\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "print(\"Welcome to the BioInformatics with Python Cookbook Fourth Edition!\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "id": "2437ebca-218f-498a-ad08-158f5364face", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Install packages using Conda" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 4, 55 | "id": "869bd2c5-2117-493e-a0c9-aefd885b8259", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Retrieving notices: done\n", 63 | "Collecting package metadata (current_repodata.json): done\n", 64 | "Solving environment: unsuccessful initial attempt using frozen solve. Retrying with flexible solve.\n", 65 | "Collecting package metadata (repodata.json): \\ WARNING conda.models.version:get_matcher(563): Using .* with relational operator is superfluous and deprecated and will be removed in a future version of conda. Your spec was 1.8.0.*, but conda is ignoring the .* and treating it as 1.8.0\n", 66 | "WARNING conda.models.version:get_matcher(563): Using .* with relational operator is superfluous and deprecated and will be removed in a future version of conda. Your spec was 1.9.0.*, but conda is ignoring the .* and treating it as 1.9.0\n", 67 | "done\n", 68 | "Solving environment: - ^C\n", 69 | "unsuccessful initial attempt using frozen solve. Retrying with flexible solve.\n", 70 | "\n", 71 | "CondaError: KeyboardInterrupt\n", 72 | "\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "! conda install -y biopython==1.84 jupyterlab==4.3.0 matplotlib==3.9.2 numpy==2.1.0 pandas==2.2.3 scipy==1.14.1 " 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "id": "257691cc-bbe3-438d-89d2-94a537c789ab", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# Install Jupytext" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "id": "c0e9235a-7f3f-4633-9134-13f50fb17209", 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "Requirement already satisfied: jupytext in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (1.17.1)\n", 101 | "Requirement already satisfied: markdown-it-py>=1.0 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (2.2.0)\n", 102 | "Requirement already satisfied: mdit-py-plugins in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (0.5.0)\n", 103 | "Requirement already satisfied: nbformat in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (5.10.4)\n", 104 | "Requirement already satisfied: packaging in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (25.0)\n", 105 | "Requirement already satisfied: pyyaml in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (6.0.2)\n", 106 | "Requirement already satisfied: mdurl~=0.1 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from markdown-it-py>=1.0->jupytext) (0.1.0)\n", 107 | "Requirement already satisfied: fastjsonschema>=2.15 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (2.20.0)\n", 108 | "Requirement already satisfied: jsonschema>=2.6 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (4.25.0)\n", 109 | "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (5.8.1)\n", 110 | "Requirement already satisfied: traitlets>=5.1 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (5.14.3)\n", 111 | "Requirement already satisfied: attrs>=22.2.0 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (24.3.0)\n", 112 | "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (2023.7.1)\n", 113 | "Requirement already satisfied: referencing>=0.28.4 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (0.36.2)\n", 114 | "Requirement already satisfied: rpds-py>=0.7.1 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (0.22.3)\n", 115 | "Requirement already satisfied: platformdirs>=2.5 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupyter-core!=5.0.*,>=4.12->nbformat->jupytext) (4.3.7)\n", 116 | "Requirement already satisfied: typing-extensions>=4.4.0 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from referencing>=0.28.4->jsonschema>=2.6->nbformat->jupytext) (4.14.1)\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "! pip install jupytext" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "d5abaf2d", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "## End of Notebook ##" 132 | ] 133 | } 134 | ], 135 | "metadata": { 136 | "jupytext": { 137 | "formats": "ipynb,py:light" 138 | }, 139 | "kernelspec": { 140 | "display_name": "Python 3 (ipykernel)", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.11.13" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 5 159 | } 160 | -------------------------------------------------------------------------------- /Ch10/Ch10-2-aligning-genetic-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "0ba48448-2dc8-4c01-bf7b-d73e960ce9dc", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch10-2 - Aligning genetic data [Updated]" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "3d640c71-be77-4c60-885f-9b03489326d3", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install packages\n", 21 | "! brew install trimal\n", 22 | "! brew install mafft\n", 23 | "! brew install muscle" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "63c8069a-c598-4e76-ad49-18051468d11a", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# 8. Run MAFFT to align the genomes\n", 34 | "# Note- this takes about 30-60 minutes to run\n", 35 | "import subprocess\n", 36 | "from Bio.Align.Applications import MafftCommandline\n", 37 | "# Define the MAFFT command\n", 38 | "mafft_cline = MafftCommandline(input=\"sample.fasta\", ep=0.123, reorder=True, maxiterate=1000, localpair=True)\n", 39 | "# Print the command (for debugging purposes)\n", 40 | "print(\"Running MAFFT with command:\", mafft_cline)\n", 41 | "# Run MAFFT using subprocess\n", 42 | "process = subprocess.run(\n", 43 | " str(mafft_cline), # Convert command to string\n", 44 | " shell=True, # Run in shell environment\n", 45 | " capture_output=True, # Capture stdout and stderr\n", 46 | " text=True # Ensure output is captured as text (string)\n", 47 | ")\n", 48 | "# Check for errors\n", 49 | "if process.returncode != 0:\n", 50 | " print(\"Error running MAFFT:\", process.stderr)\n", 51 | "else:\n", 52 | " # Save the aligned output to a file\n", 53 | " with open(\"align.fasta\", \"w\") as w:\n", 54 | " w.write(process.stdout)\n", 55 | "print(\"Alignment completed and saved to align.fasta\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "63f8fcd5-d243-44b1-9f22-f29d27f48181", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# Once the above is completed, you should see the file align.fasta in your working directory" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "a2d30c45-147c-41a8-9935-859b77d7787f", 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# 9. Use TrimAl to trim sequences\n", 76 | "import os\n", 77 | "os.system('trimal -automated1 -in align.fasta -out trim.fasta -fasta') " 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "aa59be90-5b5c-4115-856b-d0a642d0e376", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# You should see the file trim.fasta as the output" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "29a14d97-e1c0-42c1-a46f-86c3327edce7", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# 10. Run MUSCLE to align the proteins (This uses MUSCLE V5)\n", 98 | "import subprocess\n", 99 | "import os\n", 100 | "\n", 101 | "my_genes = ['NP', 'L', 'VP35', 'VP40'] \n", 102 | "for gene in my_genes:\n", 103 | "\n", 104 | " input_file = f\"{gene}_P.fasta\"\n", 105 | " output_file = f\"{gene}_P_align.fasta\"\n", 106 | "\n", 107 | " # Verify if the input file exists\n", 108 | " if not os.path.exists(input_file):\n", 109 | " print(f\"Error: Input file '{input_file}' not found.\")\n", 110 | " else:\n", 111 | " # Construct the correct command for MUSCLE v5+\n", 112 | " muscle_cmd = f\"muscle -align {input_file} -output {output_file}\"\n", 113 | "\n", 114 | " print(f\"Running MUSCLE with command: {muscle_cmd}\")\n", 115 | "\n", 116 | " # Run MUSCLE using subprocess\n", 117 | " process = subprocess.run(\n", 118 | " muscle_cmd, shell=True, capture_output=True, text=True\n", 119 | " )\n", 120 | "\n", 121 | " # Check for errors\n", 122 | " if process.returncode != 0:\n", 123 | " print(\"Error running MUSCLE:\", process.stderr)\n", 124 | " else:\n", 125 | " print(f\"Alignment completed and saved to {output_file}\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "bd0737c6-786f-4c60-b68b-5820f0e1e140", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# You should see file four files: NP_P_align.fasta, L_P_align.fasta, VP35_P_align.fasta, VP40_P_align.fasta as the output" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "2f3391e6-c1bd-4196-8ddf-272118e58bc7", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# 11. Align genes by back-translation\n", 146 | "from Bio import SeqIO \n", 147 | "from Bio.Seq import Seq \n", 148 | "from Bio.SeqRecord import SeqRecord \n", 149 | "for gene in my_genes: \n", 150 | " gene_seqs = {} \n", 151 | " unal_gene = SeqIO.parse('%s.fasta' % gene, 'fasta') \n", 152 | " for rec in unal_gene: \n", 153 | " gene_seqs[rec.id] = rec.seq \n", 154 | " al_prot = SeqIO.parse('%s_P_align.fasta' % gene, 'fasta') \n", 155 | " al_genes = [] \n", 156 | " for protein in al_prot: \n", 157 | " my_id = protein.id \n", 158 | " seq = '' \n", 159 | " pos = 0 \n", 160 | " for c in protein.seq: \n", 161 | " if c == '-': \n", 162 | " seq += '---' \n", 163 | " else: \n", 164 | " seq += str(gene_seqs[my_id][pos:pos + 3]) \n", 165 | " pos += 3 \n", 166 | " al_genes.append(SeqRecord(Seq(seq), id=my_id)) \n", 167 | " SeqIO.write(al_genes, '%s_align.fasta' % gene, 'fasta') " 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "ce889cf2-4c3e-4344-897f-442cc559e599", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# You should see 4 output files: NP_align.fasta, L_align.fasta, VP35_align.fasta, VP40_align.fasta" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "56e37a0c-fba0-45f5-8e1d-a73445504e4b", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "## End of Notebook ##" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.3" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 5 212 | } 213 | -------------------------------------------------------------------------------- /Ch03/Ch02-1-pandas-basic.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "57186378", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch02-1-pandas-basic\n", 11 | "# Overview of basic pandas functionality for manipulating data files and tables" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "0bda3b91", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# Libraries\n", 22 | "import pandas as pd" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "8e82984b", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Read in the Vaccine data\n", 33 | "vdata = pd.read_csv(\"2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\") " 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "8acaabe3", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "vdata.columns " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "id": "c0f9e4a3", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "vdata.dtypes" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "1ca72187", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Get the shape of your data\n", 64 | "vdata.shape" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "374fc081", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "# Access a pandas array using an integer-based location\n", 75 | "vdata.iloc[0] " 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "id": "b1e0f929", 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# Set the index using a column\n", 86 | "vdata = vdata.set_index(\"VAERS_ID\") " 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "10cdfb94", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# Get the data using a key\n", 97 | "vdata.loc[916600] " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "5e40a3bc", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# Use head to look at the top part of the data\n", 108 | "vdata.head(3) " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "7405daa6", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Retrieve the first 3 rows using an array specification\n", 119 | "vdata.iloc[:3] " 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "28dde50b", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# Restrict the output to certain columns\n", 130 | "vdata.iloc[:5, 2:4] " 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "93836659", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "# Compute the maximum age in the dataset\n", 141 | "vdata[\"AGE_YRS\"].max() " 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "813b90ef", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "# A different style of notation\n", 152 | "vdata.AGE_YRS.max() " 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "id": "39bc5c15", 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# Plot the data\n", 163 | "vdata[\"AGE_YRS\"].sort_values().plot(use_index=False) " 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "6a686cf9", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# Second plot\n", 174 | "vdata[\"AGE_YRS\"].plot.hist(bins=20) " 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "714251b7", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# Plot using matplotlib\n", 185 | "import matplotlib.pyplot as plt \n", 186 | "fig, ax = plt.subplots(1, 2, sharey=True) \n", 187 | "fig.suptitle(\"Age of adverse events\") \n", 188 | "vdata[\"AGE_YRS\"].sort_values().plot(use_index=False, ax=ax[0], xlabel=\"Obervation\", ylabel=\"Age\") \n", 189 | "vdata[\"AGE_YRS\"].plot.hist(bins=20, orientation=\"horizontal\") " 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "2a9d6911", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Count events per year\n", 200 | "vdata[\"AGE_YRS\"].dropna().apply(lambda x: int(x)).value_counts() " 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "663f65b4", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Count the number of people who died\n", 211 | "vdata.DIED.value_counts(dropna=False) " 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "d88bec1d", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Set the is_dead column\n", 222 | "vdata[\"is_dead\"] = (vdata.DIED == \"Y\") " 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "7eecbe5e", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "# Associate data about deaths with vaccine involved\n", 233 | "dead = vdata[vdata.is_dead] \n", 234 | "vax = pd.read_csv(\"2021VAERSVAX.csv.gz\", encoding=\"iso-8859-1\").set_index(\"VAERS_ID\") \n", 235 | "vax.groupby(\"VAX_TYPE\").size().sort_values() \n", 236 | "vax19 = vax[vax.VAX_TYPE == \"COVID19\"] \n", 237 | "vax19_dead = dead.join(vax19) " 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "id": "a0020e32", 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "# Top 10 covid vaccine lots\n", 248 | "baddies = vax19_dead.groupby(\"VAX_LOT\").size().sort_values(ascending=False) " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "id": "5f8f71be", 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# Print out the top lots\n", 259 | "for i, (lot,cnt) in enumerate(baddies.items()):\n", 260 | " print(lot, cnt, len(vax19_dead[vax19_dead.VAX_LOT == lot].groupby(\"STATE\")))\n", 261 | " if i == 10:\n", 262 | " break" 263 | ] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "Python 3 (ipykernel)", 269 | "language": "python", 270 | "name": "python3" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 3 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython3", 282 | "version": "3.12.2" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 5 287 | } 288 | -------------------------------------------------------------------------------- /Ch08/Ch08-3-pdb-uniprot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "785942a2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch08-3 - Using PDB & UniProt" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "16f46a4c", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Import libraries\n", 21 | "import requests\n", 22 | "import sys\n", 23 | "import json" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "af788ba3", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# 1. Fetch protein data from UniProt given an Accession\n", 34 | "def fetch_protein_data_json(accession):\n", 35 | " \"\"\"\n", 36 | " Fetch protein data from the EBI Proteins API in JSON format.\n", 37 | "\n", 38 | " Parameters:\n", 39 | " accession (str): Protein accession number (e.g., P21802).\n", 40 | "\n", 41 | " Returns:\n", 42 | " dict: The protein data as a JSON object (Python dictionary).\n", 43 | " \"\"\"\n", 44 | " request_url = f\"https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession={accession}\"\n", 45 | " headers = {\"Accept\": \"application/json\"} # Request JSON format\n", 46 | "\n", 47 | " try:\n", 48 | " print(f\"Fetching data for accession: {accession}\")\n", 49 | " response = requests.get(request_url, headers=headers, timeout=30)\n", 50 | " response.raise_for_status() # Raise an exception for HTTP errors\n", 51 | " return response.json() # Parse JSON response directly into a Python dictionary\n", 52 | " except requests.exceptions.RequestException as e:\n", 53 | " print(f\"Error fetching protein data: {e}\")\n", 54 | " sys.exit(1)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "id": "069dfb5f", 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# Function to save the JSON data to a file\n", 65 | "def save_json_to_file(data, filename):\n", 66 | " \"\"\"\n", 67 | " Save JSON data to a file.\n", 68 | "\n", 69 | " Parameters:\n", 70 | " data (dict): The JSON data to save.\n", 71 | " filename (str): The name of the file to save the data to.\n", 72 | "\n", 73 | " Returns:\n", 74 | " None\n", 75 | " \"\"\"\n", 76 | " try:\n", 77 | " with open(filename, \"w\") as json_file:\n", 78 | " json.dump(data, json_file, indent=4)\n", 79 | " print(f\"Protein data saved to {filename}\")\n", 80 | " except IOError as e:\n", 81 | " print(f\"Error saving data to file: {e}\")\n", 82 | " sys.exit(1)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "id": "6966d15e", 89 | "metadata": { 90 | "scrolled": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "# Execute functions for accession number of interest\n", 95 | "def main():\n", 96 | " accession = \"P21802\" # Example accession\n", 97 | " output_file = \"protein_data.json\" # File to save the JSON response\n", 98 | "\n", 99 | " # Fetch protein data\n", 100 | " protein_data = fetch_protein_data_json(accession)\n", 101 | "\n", 102 | " # Print the JSON data as a Python dictionary\n", 103 | " print(\"Protein Data (JSON):\")\n", 104 | " print(protein_data)\n", 105 | "\n", 106 | " # Save the data to a file\n", 107 | " save_json_to_file(protein_data, output_file)\n", 108 | "\n", 109 | "if __name__ == \"__main__\":\n", 110 | " main()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "4c2a2954", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# Move file to output\n", 121 | "! mv protein_data.json output/" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "08d994f1", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# 2. Query PDB" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "5b72aefe", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Import Libraries\n", 142 | "import os\n", 143 | "import requests\n", 144 | "from Bio import PDB" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "id": "62f76cc5", 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# Download from PDB given and ID\n", 155 | "def download_pdb(pdb_id, output_dir=\"output\"):\n", 156 | " \"\"\"\n", 157 | " Downloads a PDB file and associated metadata from the Protein Data Bank.\n", 158 | "\n", 159 | " :param pdb_id: The 4-character PDB ID (e.g., '1A8M')\n", 160 | " :param output_dir: Directory where files will be saved\n", 161 | " \"\"\"\n", 162 | " pdb_id = pdb_id.lower() # Ensure the PDB ID is lowercase\n", 163 | " base_url = \"https://files.rcsb.org/download\"\n", 164 | " metadata_url = f\"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}\"\n", 165 | " pdb_url = f\"{base_url}/{pdb_id}.pdb\"\n", 166 | " \n", 167 | " # Create output directory if it doesn't exist\n", 168 | " os.makedirs(output_dir, exist_ok=True)\n", 169 | " \n", 170 | " # Download PDB file\n", 171 | " pdb_file_path = os.path.join(output_dir, f\"{pdb_id}.pdb\")\n", 172 | " response = requests.get(pdb_url)\n", 173 | " if response.status_code == 200:\n", 174 | " with open(pdb_file_path, \"w\") as file:\n", 175 | " file.write(response.text)\n", 176 | " print(f\"PDB file saved at: {pdb_file_path}\")\n", 177 | " else:\n", 178 | " print(f\"Failed to download PDB file for {pdb_id}.\")\n", 179 | " \n", 180 | " # Download metadata\n", 181 | " metadata_file_path = os.path.join(output_dir, f\"{pdb_id}_metadata.json\")\n", 182 | " response = requests.get(metadata_url)\n", 183 | " if response.status_code == 200:\n", 184 | " with open(metadata_file_path, \"w\") as file:\n", 185 | " file.write(response.text)\n", 186 | " print(f\"Metadata saved at: {metadata_file_path}\")\n", 187 | " else:\n", 188 | " print(f\"Failed to download metadata for {pdb_id}.\")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "f3dab7d9", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "# Run the function for our protein\n", 199 | "pdb_id = \"1A8M\" \n", 200 | "download_pdb(pdb_id)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "6167a641", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "## End of Notebook ##" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3 (ipykernel)", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.11.3" 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 5 235 | } 236 | -------------------------------------------------------------------------------- /Ch09/Ch09-3-geometric-operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e5b41ab1", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch09-3 Geometric Operations" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "b6cfdd54-15f6-4826-9d5b-f46af6ba0b01", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Installations (if not already completed)\n", 21 | "# ! pip install biopython" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "993c591b-f603-488d-8c5f-a5f7cabee97d", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# 1. Import Libraries\n", 32 | "import numpy as np\n", 33 | "from Bio import PDB \n", 34 | "import pandas as pd" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "9a4aa2bf", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# 2. Retrieve data\n", 45 | "repository = PDB.PDBList() \n", 46 | "parser = PDB.PDBParser() \n", 47 | "repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') \n", 48 | "p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') \n", 49 | "# Note - it is OK to get warnings about \"Structure Exists\" here" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "17b74123", 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# 3. Recall residue types\n", 60 | "my_residues = set() \n", 61 | "for residue in p53_1tup.get_residues(): \n", 62 | " my_residues.add(residue.id[0]) \n", 63 | "print(my_residues) " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "9a7aa5fe", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# 4. Compute masses for chains, zincs, and waters\n", 74 | "# Function to compute mass\n", 75 | "def get_mass(atoms, accept_fun=lambda x: True):\n", 76 | " return sum([atom.mass for atom in atoms if accept_fun(atom)])\n", 77 | "# Extract chain names\n", 78 | "chain_names = [chain.id for chain in p53_1tup.get_chains()]\n", 79 | "# Initialize NumPy array for masses\n", 80 | "my_mass = np.ndarray((len(chain_names), 3))\n", 81 | "# Iterate over chains to compute mass\n", 82 | "for i, chain in enumerate(p53_1tup.get_chains()):\n", 83 | " my_mass[i, 0] = get_mass(chain.get_atoms())\n", 84 | "print(\"Mass array:\", my_mass)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "f89881a3-60ac-4a4c-8c92-ce23dd8427b4", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# Alternate for the above with better formatting\n", 95 | "import numpy as np\n", 96 | "\n", 97 | "# Function to compute mass\n", 98 | "def get_mass(atoms, accept_fun=lambda x: True):\n", 99 | " \"\"\"Calculate total mass of atoms that pass the acceptance function.\"\"\"\n", 100 | " return sum([atom.mass for atom in atoms if accept_fun(atom)])\n", 101 | "\n", 102 | "# Filter functions for different atom types\n", 103 | "def is_not_water(atom):\n", 104 | " \"\"\"Returns True if atom is not part of a water molecule.\"\"\"\n", 105 | " return atom.get_parent().get_resname() != 'HOH'\n", 106 | "\n", 107 | "def is_zinc(atom):\n", 108 | " \"\"\"Returns True if atom is zinc.\"\"\"\n", 109 | " return atom.element == 'ZN'\n", 110 | "\n", 111 | "def is_water(atom):\n", 112 | " \"\"\"Returns True if atom is part of a water molecule.\"\"\"\n", 113 | " return atom.get_parent().get_resname() == 'HOH'\n", 114 | "\n", 115 | "# Extract chain names\n", 116 | "chain_names = [chain.id for chain in p53_1tup.get_chains()]\n", 117 | "\n", 118 | "# Initialize NumPy array for masses (3 columns: No water, Zincs, Water)\n", 119 | "my_mass = np.zeros((len(chain_names), 3))\n", 120 | "\n", 121 | "# Iterate over chains to compute mass for each category\n", 122 | "for i, chain in enumerate(p53_1tup.get_chains()):\n", 123 | " my_mass[i, 0] = get_mass(chain.get_atoms(), is_not_water) # No water\n", 124 | " my_mass[i, 1] = get_mass(chain.get_atoms(), is_zinc) # Zincs\n", 125 | " my_mass[i, 2] = get_mass(chain.get_atoms(), is_water) # Water\n", 126 | "\n", 127 | "# Create nicely formatted table\n", 128 | "print(\"Mass Distribution by Chain (Daltons)\")\n", 129 | "print(\"=\" * 45)\n", 130 | "print(f\"{'Chain':<6} {'No water':<12} {'Zincs':<12} {'Water':<12}\")\n", 131 | "print(\"-\" * 45)\n", 132 | "\n", 133 | "for i, chain_id in enumerate(chain_names):\n", 134 | " print(f\"{chain_id:<6} {my_mass[i, 0]:<12.2f} {my_mass[i, 1]:<12.2f} {my_mass[i, 2]:<12.2f}\")\n", 135 | "\n", 136 | "print(\"-\" * 45)\n", 137 | "\n", 138 | "# Calculate and display totals\n", 139 | "totals = np.sum(my_mass, axis=0)\n", 140 | "print(f\"{'Total':<6} {totals[0]:<12.2f} {totals[1]:<12.2f} {totals[2]:<12.2f}\")\n", 141 | "\n", 142 | "print(f\"\\nRaw mass array:\\n{my_mass}\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "088df255", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# 5. Compute geometric center and center of mass\n", 153 | "def get_center(atoms, \n", 154 | " weight_fun=lambda atom: 1 if atom.parent.id[0] != 'W' else 0): \n", 155 | " xsum = ysum = zsum = 0.0 \n", 156 | " acum = 0.0 \n", 157 | " for atom in atoms: \n", 158 | " x, y, z = atom.coord \n", 159 | " weight = weight_fun(atom) \n", 160 | " acum += weight \n", 161 | " xsum += weight * x \n", 162 | " ysum += weight * y \n", 163 | " zsum += weight * z \n", 164 | " return xsum / acum, ysum / acum, zsum / acum \n", 165 | " print(get_center(p53_1tup.get_atoms())) \n", 166 | " print(get_center(p53_1tup.get_atoms(), \n", 167 | " weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0)) " 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "1b5c3c06", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# 6. Compute center of mass and geometric center of each chain\n", 178 | "my_center = np.ndarray((len(chain_names), 6)) \n", 179 | "for i, chain in enumerate(p53_1tup.get_chains()): \n", 180 | " x, y, z = get_center(chain.get_atoms()) \n", 181 | " my_center[i, 0] = x \n", 182 | " my_center[i, 1] = y \n", 183 | " my_center[i, 2] = z \n", 184 | " x, y, z = get_center(chain.get_atoms(), \n", 185 | " weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0) \n", 186 | " my_center[i, 3] = x \n", 187 | " my_center[i, 4] = y \n", 188 | " my_center[i, 5] = z \n", 189 | "weights = pd.DataFrame(my_center, index=chain_names, \n", 190 | " columns=['X', 'Y', 'Z', 'X (Mass)', 'Y (Mass)', 'Z (Mass)']) \n", 191 | "print(weights) " 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "d7b3bd59", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "## End of Notebook ##" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3 (ipykernel)", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.12.11" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 5 226 | } 227 | -------------------------------------------------------------------------------- /Ch08/Ch08-2-using-sra.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5259b0e5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch08-2 Using the Short Read Archive" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "debe30e4", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# To get the following code to run, you'll need to make sure fasterq-dump is already in your PATH\n", 21 | "# If you have not already done so, you'll want to install it and add it to your PATH\n", 22 | "# Then close out your notebooks and restart jupyter notebook from a terminal where you know you can\n", 23 | "# see fasterq-dump in your path" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "id": "c2f80ae6", 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# Refer to Ch 5 Recipe 2 for sra tools install\n", 34 | "# Run the command below in your terminal to make sure fasterq-dump is in your path and add it to your zshrc file\n", 35 | "# (check that your path is correct, it may not be the same as below)\n", 36 | "! echo 'export PATH=$PATH:~/Software/sratoolkit.3.1.1-mac-x86_64/bin' >> ~/.zshrc\n", 37 | "! source ~/.zshrc \n", 38 | "# check that fasterq-dump is working\n", 39 | "! fasterq-dump -h\n", 40 | "# Then restart your jupyter notebook" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "id": "9d917dd1", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Install pysradb\n", 51 | "! pip install pysradb" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "9a425eee", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# Import Libraries\n", 62 | "import os\n", 63 | "import subprocess\n", 64 | "from pysradb.sraweb import SRAweb" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "3728a992", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "def fetch_sra_metadata(sra_accession):\n", 75 | " \"\"\"\n", 76 | " Retrieve metadata for a given SRA accession using pysradb.\n", 77 | " \n", 78 | " Parameters:\n", 79 | " sra_accession (str): SRA study or run accession (e.g., SRP, SRX, SRA, or ERR).\n", 80 | " \n", 81 | " Returns:\n", 82 | " metadata (DataFrame): Metadata table for the SRA accession.\n", 83 | " \"\"\"\n", 84 | " db = SRAweb()\n", 85 | " metadata = db.sra_metadata(sra_accession, detailed=True)\n", 86 | " return metadata" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "4a86b2b1", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "def download_sra_run(run_accession, output_dir=\"sra_data\"):\n", 97 | " \"\"\"\n", 98 | " Download SRA run data using fasterq-dump.\n", 99 | " \n", 100 | " Parameters:\n", 101 | " run_accession (str): The specific SRA run accession (e.g., SRR12345678).\n", 102 | " output_dir (str): Directory to save the downloaded data.\n", 103 | " \n", 104 | " Returns:\n", 105 | " None\n", 106 | " \"\"\"\n", 107 | " if not os.path.exists(output_dir):\n", 108 | " os.makedirs(output_dir)\n", 109 | " \n", 110 | " try:\n", 111 | " print(f\"Downloading SRA run {run_accession}...\")\n", 112 | " # Command to download and convert to FASTQ\n", 113 | " subprocess.run(\n", 114 | " [\"fasterq-dump\", run_accession, \"--outdir\", output_dir, \"--split-files\"],\n", 115 | " check=True\n", 116 | " )\n", 117 | " print(f\"Download complete. Files saved in {output_dir}\")\n", 118 | " except subprocess.CalledProcessError as e:\n", 119 | " print(f\"Error downloading {run_accession}: {e}\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "2946335e", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "def main():\n", 130 | " sra_accession = \"SRR536546\" # Small test dataset\n", 131 | " # Fetch metadata\n", 132 | " metadata = fetch_sra_metadata(sra_accession)\n", 133 | " print(\"Metadata for the accession:\")\n", 134 | " print(metadata)\n", 135 | "\n", 136 | " # Download the first run as an example\n", 137 | " if not metadata.empty:\n", 138 | " first_run = metadata[\"run_accession\"].iloc[0]\n", 139 | " download_sra_run(first_run)\n", 140 | " else:\n", 141 | " print(\"No runs found for this accession.\")\n", 142 | "if __name__ == \"__main__\":\n", 143 | " main()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "55f23863", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# Move folder to output\n", 154 | "! mv sra_data output/" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "26340898", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# Use BLAST #" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "id": "373178eb", 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "# Query using BLAST via the NCBI API\n", 175 | "from Bio.Blast import NCBIWWW\n", 176 | "from Bio.Blast import NCBIXML\n", 177 | "\n", 178 | "# Define a sample FASTA sequence\n", 179 | "query_sequence = \">test_query\\nATGGCCATTGTAATCATGTTCTAATAGTGTTCA\"\n", 180 | "\n", 181 | "# Submit the query to NCBI BLAST (nucleotide BLAST: blastn)\n", 182 | "result_handle = NCBIWWW.qblast(\"blastn\", \"nt\", query_sequence)\n", 183 | "\n", 184 | "# Save the results to a file\n", 185 | "with open(\"blast_result.xml\", \"w\") as out_file:\n", 186 | " out_file.write(result_handle.read())\n", 187 | "\n", 188 | "print(\"BLAST search completed! Results saved in 'blast_result.xml'\")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "c1bc752f", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "# Parse the BLAST output #\n", 199 | "# Read BLAST results from XML file\n", 200 | "with open(\"blast_result.xml\") as result_file:\n", 201 | " blast_records = NCBIXML.read(result_file)\n", 202 | "\n", 203 | "# Print top hits\n", 204 | "for alignment in blast_records.alignments[:5]: # Display top 5 hits\n", 205 | " print(f\"Hit: {alignment.title}\")\n", 206 | " for hsp in alignment.hsps:\n", 207 | " print(f\" Score: {hsp.score}, E-value: {hsp.expect}\")" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "dfc177b1", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# Move the file to the output subdirectory\n", 218 | "! mv blast_result.xml output/" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "fc02d67c", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "## End of Notebook ##" 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3 (ipykernel)", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.12.11" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 5 253 | } 254 | -------------------------------------------------------------------------------- /Ch11/Ch11-1-plink.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "24b7b42f-b2e4-44dd-93f4-6a9367318395", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch11-1-plink [Updated]" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "e2e4f87a-2179-4f3b-b561-26f83ada3027", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install Plink\n", 21 | "#. Download the appropriate binary for your system from here:\n", 22 | "#. https://www.cog-genomics.org/plink/2.0/\n", 23 | "# Move the file from your Downloads directory to your Ch11 working directory \n", 24 | "# unzip the file\n", 25 | "# Test by running: \n", 26 | "# ./plink2\n", 27 | "# You will most likely get a message saying you cannot trust this file. If so, do the following:\n", 28 | " # 1. Go into your Mac Settings ->\n", 29 | " # 2. Open System Preferences → Security & Privacy:\n", 30 | " # 3. Click on the General tab.\n", 31 | " # 4. Look for a message saying that \"plink2 was blocked because it is from an unidentified developer.\"\n", 32 | " # 5. Click Allow Anyway\n", 33 | "# You should now be able to run ./plink2 \n", 34 | "# You may need to click Allow All again and provide the administrator password for your Mac" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "e8ec5e2c-2eca-40cc-ab26-8ff7cb994469", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "# Download the data\n", 45 | "# First go here and download the 2 files:\n", 46 | "# https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/\n", 47 | "# move them to your Ch11/data working directory\n", 48 | "# Unzip the files:\n", 49 | "! gunzip data/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz \n", 50 | "! gunzip data/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz \n", 51 | "# Get the relationships file\n", 52 | "! wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt\n", 53 | "! mv relationships_w_pops_041510.txt data/" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "dded2683-c7c3-473d-a54f-d8c3b873765e", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# 1 - get metadata for samples\n", 64 | "# Import libraries\n", 65 | "import os\n", 66 | "from collections import defaultdict \n", 67 | "f = open('data/relationships_w_pops_041510.txt') \n", 68 | "pop_ind = defaultdict(list) \n", 69 | "f.readline() # header \n", 70 | "offspring = [] \n", 71 | "for l in f: \n", 72 | " toks = l.rstrip().split('\\t') \n", 73 | " fam_id = toks[0] \n", 74 | " ind_id = toks[1] \n", 75 | " mom = toks[2] \n", 76 | " dad = toks[3] \n", 77 | " if mom != '0' or dad != '0': \n", 78 | " offspring.append((fam_id, ind_id)) \n", 79 | " pop = toks[-1] \n", 80 | "pop_ind[pop].append((fam_id, ind_id)) \n", 81 | "f.close() " 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "1d74e714-f0d8-41be-b19d-599c0be59cd5", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# 2 - subsample the data\n", 92 | "#. Note - replace the path below with your path to plink2\n", 93 | "! ./plink2 --pedmap data/hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap10 --thin 0.1 --geno 0.1 --export ped \n", 94 | "! ./plink2 --pedmap data/hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap1 --thin 0.01 --geno 0.1 --export ped " 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "636bfbe0-6c0a-4597-9fc7-32a947225272", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# 3. Generate subsets with just the autosomes\n", 105 | "def get_non_auto_SNPs(map_file, exclude_file): \n", 106 | " f = open(map_file) \n", 107 | " w = open(exclude_file, 'w') \n", 108 | " for l in f: \n", 109 | " toks = l.rstrip().split('\\t') \n", 110 | " try: \n", 111 | " chrom = int(toks[0]) \n", 112 | " except ValueError: \n", 113 | " rs = toks[1] \n", 114 | " w.write('%s\\n' % rs) \n", 115 | " w.close() \n", 116 | "get_non_auto_SNPs('hapmap1.map', 'exclude1.txt') \n", 117 | "get_non_auto_SNPs('hapmap10.map', 'exclude10.txt') \n", 118 | "os.system('./plink2 --pedmap hapmap1 --out hapmap1_auto --exclude exclude1.txt --export ped') \n", 119 | "os.system('./plink2 --pedmap hapmap10 --out hapmap10_auto --exclude exclude10.txt --export ped') " 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "id": "c9444cbc-36b2-4131-9d6f-e19a6d6ff0b1", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "# 5. Function to generate a list of SNPs belonging to autosomes is defined above" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "e0240969-9150-48e3-9bd9-7384f747adab", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# 6. Datasets without offspring\n", 140 | "os.system('./plink2 --pedmap hapmap10_auto --filter-founders --out hapmap10_auto_noofs --export ped') " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "e864875b-0d9c-4bcb-9262-76b8f6eaffc3", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# 7. Generate LD-pruned dataset\n", 151 | "os.system('./plink2 --pedmap hapmap10_auto_noofs --indep-pairwise 50 10 0.1 --out keep --export ped') \n", 152 | "#os.system('~/work/CookBook/Ch11/plink2 --pedmap hapmap10_auto_noofs --extract keep.prune.in --recode --out hapmap10_auto_noofs_ld --export ped')\n", 153 | "# Remove --recode from the original commmand to remove an error: \n", 154 | "os.system('./plink2 --pedmap hapmap10_auto_noofs --extract keep.prune.in --out hapmap10_auto_noofs_ld --export ped')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "12df8491-9975-433e-a746-7a85536d7581", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# 8. Recode cases in different formats\n", 165 | "# os.system('~/work/CookBook/Ch11/plink2 --file hapmap10_auto_noofs_ld --recode12 tab --out hapmap10_auto_noofs_ld_12 --export ped 12') \n", 166 | "#. Note - fixed above original command to not use --file\n", 167 | "os.system('./plink2 --pedmap hapmap10_auto_noofs_ld --export ped --out hapmap10_auto_noofs_ld_12')\n", 168 | "# os.system('~/work/CookBook/Ch11/plink2 --make-bed --file hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld') \n", 169 | "# Note - fixed above original command to not use --file\n", 170 | "os.system('./plink2 --pedmap hapmap10_auto_noofs_ld --make-bed --out hapmap10_auto_noofs_ld')" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "9d74a18a-f332-4dd9-8de0-9239b3fb303e", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# 9. Extract a single chromosome for analysis\n", 181 | "os.system('./plink2 --pedmap hapmap10_auto_noofs --chr 2 --out hapmap10_auto_noofs_2 --export ped') " 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "12902b40-83a5-4f38-bd70-b9e5b4bbbfc1", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "## End of Notebook ##" 192 | ] 193 | } 194 | ], 195 | "metadata": { 196 | "kernelspec": { 197 | "display_name": "Python 3 (ipykernel)", 198 | "language": "python", 199 | "name": "python3" 200 | }, 201 | "language_info": { 202 | "codemirror_mode": { 203 | "name": "ipython", 204 | "version": 3 205 | }, 206 | "file_extension": ".py", 207 | "mimetype": "text/x-python", 208 | "name": "python", 209 | "nbconvert_exporter": "python", 210 | "pygments_lexer": "ipython3", 211 | "version": "3.11.14" 212 | } 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 5 216 | } 217 | -------------------------------------------------------------------------------- /Ch04/Ch04-4-decision-trees.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5c902a08", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "# Ch04-4 - Using Decision Trees to Explore Breast Cancer data" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "344986b2", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# Install the Seaborn library for graphing\n", 21 | "! pip install seaborn==0.13.2" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "bb4bb351", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# Import necessary libraries\n", 32 | "from sklearn.datasets import load_breast_cancer\n", 33 | "from sklearn.model_selection import train_test_split\n", 34 | "from sklearn.tree import DecisionTreeClassifier, plot_tree\n", 35 | "from sklearn.metrics import (\n", 36 | " accuracy_score, \n", 37 | " confusion_matrix, \n", 38 | " classification_report, \n", 39 | " precision_score, \n", 40 | " recall_score, \n", 41 | " f1_score\n", 42 | ")\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "import seaborn as sns\n", 45 | "import numpy as np" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "c71173e5", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Load the breast cancer dataset\n", 56 | "data = load_breast_cancer()\n", 57 | "X, y = data.data, data.target" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "b19fbc89", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# Split the data into training and testing sets\n", 68 | "X_train, X_test, y_train, y_test = train_test_split(\n", 69 | " X, y, test_size=0.2, random_state=42\n", 70 | ")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "df42c41a", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Create and train the decision tree classifier\n", 81 | "dt_classifier = DecisionTreeClassifier(\n", 82 | " random_state=42, # For reproducibility\n", 83 | " max_depth=5, # Limit tree depth to prevent overfitting\n", 84 | " criterion='gini' # Can also use 'entropy'\n", 85 | ")\n", 86 | "dt_classifier.fit(X_train, y_train)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "be10774e", 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "# Make predictions\n", 97 | "y_pred = dt_classifier.predict(X_test)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "0f16bec7", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# Performance metrics\n", 108 | "print(\"Decision Tree Performance Metrics:\")\n", 109 | "print(\"-\" * 30)\n", 110 | "print(f\"Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n", 111 | "print(f\"Precision: {precision_score(y_test, y_pred):.4f}\")\n", 112 | "print(f\"Recall: {recall_score(y_test, y_pred):.4f}\")\n", 113 | "print(f\"F1 Score: {f1_score(y_test, y_pred):.4f}\")" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "c85362d2", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# Detailed classification report\n", 124 | "print(\"\\nDetailed Classification Report:\")\n", 125 | "print(classification_report(y_test, y_pred, \n", 126 | " target_names=data.target_names))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "e61f82e4", 133 | "metadata": { 134 | "scrolled": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "# Confusion Matrix Visualization\n", 139 | "plt.figure(figsize=(8, 6))\n", 140 | "cm = confusion_matrix(y_test, y_pred)\n", 141 | "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n", 142 | " xticklabels=data.target_names,\n", 143 | " yticklabels=data.target_names)\n", 144 | "plt.title('Confusion Matrix for Decision Tree')\n", 145 | "plt.xlabel('Predicted Label')\n", 146 | "plt.ylabel('True Label')\n", 147 | "plt.tight_layout()\n", 148 | "plt.show()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "1fe7bfba", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# Feature Importance Visualization\n", 159 | "plt.figure(figsize=(10, 6))\n", 160 | "feature_importance = dt_classifier.feature_importances_\n", 161 | "sorted_idx = np.argsort(feature_importance)\n", 162 | "pos = np.arange(sorted_idx.shape[0]) + .5\n", 163 | "\n", 164 | "plt.barh(pos, feature_importance[sorted_idx], align='center')\n", 165 | "plt.yticks(pos, [data.feature_names[i] for i in sorted_idx])\n", 166 | "plt.xlabel('Feature Importance')\n", 167 | "plt.title('Decision Tree Feature Importance')\n", 168 | "plt.tight_layout()\n", 169 | "plt.show()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "99250c28", 176 | "metadata": { 177 | "scrolled": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "# Visualize the Decision Tree\n", 182 | "plt.figure(figsize=(20,10))\n", 183 | "plot_tree(dt_classifier, \n", 184 | " feature_names=data.feature_names,\n", 185 | " class_names=data.target_names,\n", 186 | " filled=True, \n", 187 | " rounded=True)\n", 188 | "plt.title('Decision Tree Classifier')\n", 189 | "plt.show()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "d7fc8563-a9e9-4694-8f05-51ecfeca5487", 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Clearer view of the image\n", 200 | "import matplotlib.pyplot as plt\n", 201 | "from sklearn import tree\n", 202 | "\n", 203 | "# Create a larger figure with higher resolution\n", 204 | "plt.figure(figsize=(24, 12), dpi=300) # Larger size, higher resolution\n", 205 | "\n", 206 | "# Plot the decision tree\n", 207 | "tree.plot_tree(dt_classifier, \n", 208 | " feature_names=data.feature_names,\n", 209 | " class_names=data.target_names,\n", 210 | " filled=True, \n", 211 | " rounded=True,\n", 212 | " fontsize=14) # Increase font size\n", 213 | "\n", 214 | "# Add a title\n", 215 | "plt.title('Decision Tree Classifier', fontsize=18)\n", 216 | "\n", 217 | "# Show the plot\n", 218 | "plt.show()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "ea51a67a", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# Optional: Cross-validation for more robust performance estimation\n", 229 | "from sklearn.model_selection import cross_val_score\n", 230 | "cv_scores = cross_val_score(dt_classifier, X, y, cv=5)\n", 231 | "print(\"\\nCross-Validation Scores:\")\n", 232 | "print(f\"Mean CV Score: {cv_scores.mean():.4f}\")\n", 233 | "print(f\"Standard Deviation: {cv_scores.std():.4f}\")" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "7f2160aa", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "## End of Notebook ##" 244 | ] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3 (ipykernel)", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.8.6" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 5 268 | } 269 | --------------------------------------------------------------------------------