├── Ch14
    ├── sample.txt
    ├── app
    │   ├── requirements.txt
    │   └── app.py
    ├── README.md
    ├── Dockerfile
    ├── ch14-cloud-computing.yml
    └── s3-bucket.yaml
├── Ch05
    ├── README.md
    ├── ch05-alignment.yml
    ├── Ch05-1-qc-data.ipynb
    └── Ch05-4-variant-calling.ipynb
├── Ch06
    ├── README.md
    └── ch06-annotation.yml
├── Requirements.txt
├── Ch02
    ├── README.md
    ├── Ch02-3-pandas-memory.ipynb
    └── Ch02-2-pandas-pitfalls.ipynb
├── Ch03
    ├── README.md
    ├── example.fasta
    ├── sample.py
    ├── Ch03-1-pycodestyle.ipynb
    ├── Ch03-2-sequence-manipulation.ipynb
    ├── Ch03-3-read-alignment.ipynb
    ├── pycodestyle.ipynb
    ├── Ch03-4-test-writing.ipynb
    └── Ch02-1-pandas-basic.ipynb
├── Ch04
    ├── README.md
    ├── ch04-data-science.yml
    ├── Ch04-3-k-means.ipynb
    ├── Ch04-2-PCA.ipynb
    ├── Ch04-6-seaborn.ipynb
    └── Ch04-4-decision-trees.ipynb
├── Ch10
    ├── README.md
    ├── ch10-phylogenetics.yml
    ├── .ipynb_checkpoints
    │   └── Ch10-1-preparing-dataset-checkpoint.ipynb
    ├── Ch10-6-visualizing-phylogenetics.ipynb
    ├── Ch10-4-reconstructing-trees.ipynb
    ├── Ch10-5-recursive-trees.ipynb
    └── Ch10-2-aligning-genetic-data.ipynb
├── Ch11
    ├── README.md
    ├── ch11-population-genomics.yml
    ├── Ch11-3-exploring-with-sgkit.ipynb
    ├── Ch11-2-using-sgkit.ipynb
    └── Ch11-1-plink.ipynb
├── Ch13
    ├── README.md
    └── ch13-genome-editing.yml
├── Ch16
    ├── README.md
    └── ch16-more-workflows.yml
├── Ch17
    ├── README.md
    └── ch17-machine-learning.yml
├── Ch18
    ├── README.md
    └── ch18-single-cell.yml
├── Ch08
    ├── README.md
    ├── ch08-databases.yml
    ├── Ch08-1-genbank-ncbi.ipynb
    ├── Ch08-3-pdb-uniprot.ipynb
    └── Ch08-2-using-sra.ipynb
├── Ch09
    ├── README.md
    ├── ch09-proteins.yml
    ├── Ch09-5-proteomics.ipynb
    ├── Ch09-2-molecular-distances.ipynb
    ├── Ch09-4-py3dmol.ipynb
    └── Ch09-3-geometric-operations.ipynb
├── Ch12
    ├── README.md
    ├── ch12-applications.yml
    └── Ch12-1-cobrapy.ipynb
├── Ch15
    ├── README.md
    ├── galaxy
    │   ├── Ch15-1-bonus-using-galaxy-apis.pdf
    │   └── Ch15-1-introducing-galaxy.ipynb
    ├── nextflow
    │   └── nextflow.config
    ├── ch15-workflows.yml
    └── snakemake
    │   └── Snakefile
├── Ch01
    ├── bioinformatics_base.yml
    ├── README.md
    ├── Welcome.py
    └── Welcome.ipynb
├── Ch07
    ├── ch07-genomes.yml
    ├── Ch07-4-genome-assessment.ipynb
    ├── Ch07-3-long-read-assembly.ipynb
    └── Ch07-1-genomes.ipynb
├── LICENSE
└── docker
    └── main
        └── Dockerfile


/Ch14/sample.txt:
--------------------------------------------------------------------------------
1 | This is a sample file.


--------------------------------------------------------------------------------
/Ch14/app/requirements.txt:
--------------------------------------------------------------------------------
1 | flask==2.2.3
2 | # Flask 2.2.x imports werkzeug.urls.url_quote, which Werkzeug 3.0 removed.
3 | # Flask itself only sets a lower bound on Werkzeug, so pin the 2.2 series here
4 | # to keep `python app.py` importable inside the image.
5 | werkzeug==2.2.3
6 | 


--------------------------------------------------------------------------------
/Ch05/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch05
2 | 


--------------------------------------------------------------------------------
/Ch06/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch06.
2 | 


--------------------------------------------------------------------------------
/Requirements.txt:
--------------------------------------------------------------------------------
1 | # SAB - Requirements.txt
2 | 


--------------------------------------------------------------------------------
/Ch02/README.md:
--------------------------------------------------------------------------------
1 | This is the README file for Ch02
2 | 


--------------------------------------------------------------------------------
/Ch03/README.md:
--------------------------------------------------------------------------------
1 | This is the README file for Ch03.
2 | 


--------------------------------------------------------------------------------
/Ch04/README.md:
--------------------------------------------------------------------------------
1 | This is the README file for Ch04.
2 | 


--------------------------------------------------------------------------------
/Ch10/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch10.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch11/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch11.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch13/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch13.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch14/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch14.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch16/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch16.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch17/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch17.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch18/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch18
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch08/README.md:
--------------------------------------------------------------------------------
1 | This is the README file for Ch08.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/Ch09/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch09.
2 | 
3 | 
4 | 
5 | 


--------------------------------------------------------------------------------
/Ch12/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch12.
2 | 
3 | 
4 | 
5 | 


--------------------------------------------------------------------------------
/Ch15/README.md:
--------------------------------------------------------------------------------
1 | This is the README for Ch15.
2 | 
3 | 
4 | 
5 | 


--------------------------------------------------------------------------------
/Ch03/example.fasta:
--------------------------------------------------------------------------------
1 | >seq1
2 | ATCGTACGATCG
3 | GATCGTACGATC
4 | >seq2
5 | CGTAGCTAGCTA
6 | 
7 | 


--------------------------------------------------------------------------------
/Ch15/galaxy/Ch15-1-bonus-using-galaxy-apis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-Fourth-Edition/HEAD/Ch15/galaxy/Ch15-1-bonus-using-galaxy-apis.pdf


--------------------------------------------------------------------------------
/Ch14/Dockerfile:
--------------------------------------------------------------------------------
 1 | 
 2 | FROM python:3.9-slim
 3 | 
 4 | WORKDIR /app
 5 | 
 6 | COPY app/requirements.txt .
 7 | RUN pip install --no-cache-dir -r requirements.txt
 8 | 
 9 | COPY app/ .
10 | 
11 | EXPOSE 5000
12 | 
13 | CMD ["python", "app.py"]
14 | 


--------------------------------------------------------------------------------
/Ch14/app/app.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from flask import Flask
 3 | app = Flask(__name__)
 4 | 
 5 | @app.route('/')
 6 | def hello():
 7 |     return "Hello from the Docker container!"
 8 | 
 9 | if __name__ == "__main__":
10 |     app.run(host='0.0.0.0', port=5000)
11 | 


--------------------------------------------------------------------------------
/Ch08/ch08-databases.yml:
--------------------------------------------------------------------------------
 1 | name: ch08-databases
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.11
 8 |   - biopython
 9 |   - jupyterlab
10 |   - matplotlib
11 |   - numpy
12 |   - pandas
13 |   - scipy
14 |   - pysradb


--------------------------------------------------------------------------------
/Ch13/ch13-genome-editing.yml:
--------------------------------------------------------------------------------
 1 | name: ch13-genome-editing
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython
 9 |   - jupyterlab
10 |   - matplotlib
11 |   - numpy
12 |   - pandas
13 |   - scipy
14 |   - notebook
15 |   - seaborn


--------------------------------------------------------------------------------
/Ch06/ch06-annotation.yml:
--------------------------------------------------------------------------------
 1 | name: bioinformatics_base  # NOTE(review): same env name as Ch01/bioinformatics_base.yml although this file is ch06-annotation.yml — confirm the shared name is intended
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.11
 8 |   - biopython=1.84
 9 |   - jupyterlab
10 |   - matplotlib
11 |   - numpy
12 |   - pandas
13 |   - scipy
14 |   - cyvcf2  # the one package here that Ch01/bioinformatics_base.yml does not list
15 |   - notebook
16 | 
17 | 


--------------------------------------------------------------------------------
/Ch01/bioinformatics_base.yml:
--------------------------------------------------------------------------------
 1 | name: bioinformatics_base
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0 
10 |   - matplotlib=3.9.2
11 |   - numpy=2.1.0 
12 |   - pandas=2.2.3
13 |   - scipy=1.14.1
14 |   - notebook
15 | 
16 | 


--------------------------------------------------------------------------------
/Ch10/ch10-phylogenetics.yml:
--------------------------------------------------------------------------------
 1 | name: ch10-phylogenetics
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.11
 8 |   - biopython
 9 |   - jupyterlab
10 |   - matplotlib
11 |   - numpy
12 |   - pandas
13 |   - scipy
14 |   - dendropy
15 |   - trimal
16 |   - mafft
17 |   - muscle
18 |   - raxml-ng


--------------------------------------------------------------------------------
/Ch05/ch05-alignment.yml:
--------------------------------------------------------------------------------
 1 | name: bioinformatics_base  # NOTE(review): same env name as Ch01/bioinformatics_base.yml although this file is ch05-alignment.yml — confirm the shared name is intended
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0 
10 |   - matplotlib=3.10.3
11 |   - numpy=2.1.0 
12 |   - pandas=2.2.3
13 |   - scipy=1.14.1
14 |   - pysam=0.23.3  # the one package here that Ch01/bioinformatics_base.yml does not list
15 |   - notebook
16 | 
17 | 


--------------------------------------------------------------------------------
/Ch07/ch07-genomes.yml:
--------------------------------------------------------------------------------
 1 | name: ch07-genomes
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.11
 8 |   - biopython
 9 |   - jupyterlab
10 |   - matplotlib
11 |   - numpy
12 |   - pandas
13 |   - scipy
14 |   - pyfastx
15 |   - networkx
16 |   - raven-assembler
17 |   - pip
18 |   - pip:
19 |       - quast


--------------------------------------------------------------------------------
/Ch01/README.md:
--------------------------------------------------------------------------------
 1 | This is the README for Ch01.
 2 | 
 3 | 
 4 | These are the commands to build and run the Docker container for the book:
 5 | 
 6 | docker build -t bio https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-fourth-edition.git#main:docker/main
 7 | 
 8 | docker run -ti -p 9875:9875 -v /Users/shanebrubaker/work/docker_files:/data bio 
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/Ch14/ch14-cloud-computing.yml:
--------------------------------------------------------------------------------
 1 | name: ch14-cloud-computing
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0 
10 |   - matplotlib=3.9.2
11 |   - numpy=2.1.0 
12 |   - pandas=2.2.3
13 |   - scipy=1.14.1
14 |   - cyvcf2=0.31.1
15 |   - notebook
16 |   - boto3=1.38.13
17 | 


--------------------------------------------------------------------------------
/Ch15/nextflow/nextflow.config:
--------------------------------------------------------------------------------
 1 | 
 2 | process {
 3 |     cpus = 1
 4 |     memory = '2 GB'
 5 |     time = '30m'
 6 | }
 7 | 
 8 | executor {
 9 |     name = 'local'
10 |     cpus = 4
11 | }
12 | 
13 | report {
14 |     enabled = true
15 |     file = 'reports/execution_report.html'
16 | }
17 | 
18 | timeline {
19 |     enabled = true
20 |     file = 'reports/timeline.html'
21 | }
22 | 


--------------------------------------------------------------------------------
/Ch15/ch15-workflows.yml:
--------------------------------------------------------------------------------
 1 | name: ch15-workflows  # conda environment for the Ch15 workflow recipes
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0
10 |   - matplotlib=3.9.2
11 |   - numpy=2.1.0
12 |   - pandas=2.2.3
13 |   - scipy=1.14.1  # completed pin: was the truncated spec "1.14."; 1.14.1 is what the other pinned chapter environments use
14 |   - cyvcf2=0.31.1
15 |   - notebook
16 |   - bioblend=1.5.0
17 |   - sra-tools>=3.0
18 | 
19 | 


--------------------------------------------------------------------------------
/Ch16/ch16-more-workflows.yml:
--------------------------------------------------------------------------------
 1 | name: ch16-more-workflows  # conda environment for the Ch16 workflow recipes
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0
10 |   - matplotlib=3.9.2
11 |   - numpy=2.1.0
12 |   - pandas=2.2.3
13 |   - scipy=1.14.1  # completed pin: was the truncated spec "1.14."; 1.14.1 is what the other pinned chapter environments use
14 |   - cyvcf2=0.31.1
15 |   - notebook
16 |   - bioblend=1.5.0
17 |   - sra-tools>=3.0
18 | 
19 | 


--------------------------------------------------------------------------------
/Ch14/s3-bucket.yaml:
--------------------------------------------------------------------------------
 1 | AWSTemplateFormatVersion: '2010-09-09'
 2 | Description: CloudFormation template to create an S3 bucket
 3 | 
 4 | # The bucket name is a parameter so the stack can be deployed without editing
 5 | # this file; the default keeps the name this template used before.
 6 | Parameters:
 7 |   BucketNameParam:
 8 |     Type: String
 9 |     Default: my-simple-cf-bucket-123456
10 |     Description: Name of the S3 bucket to create (must be globally unique)
11 | 
12 | Resources:
13 |   MyS3Bucket:
14 |     Type: AWS::S3::Bucket
15 |     Properties:
16 |       BucketName: !Ref BucketNameParam  # Must be globally unique
17 | 
18 | Outputs:
19 |   BucketName:
20 |     Description: The name of the created S3 bucket
21 |     Value: !Ref MyS3Bucket  # Ref on an S3 bucket resource returns the bucket name
22 | 
23 | 


--------------------------------------------------------------------------------
/Ch09/ch09-proteins.yml:
--------------------------------------------------------------------------------
 1 | name: ch09-proteins  # conda environment for the Ch09 protein recipes
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0
10 |   - matplotlib=3.9.2
11 |   - seaborn=0.13.2
12 |   - numpy=2.1.0
13 |   - pandas=2.2.3
14 |   - scipy=1.14.1  # completed pin: was the truncated spec "1.14."; 1.14.1 is what the other pinned chapter environments use
15 |   - cyvcf2=0.31.1
16 |   - notebook
17 |   - nglview=3.1.4
18 |   - pyteomics=4.7.5
19 |   - py3Dmol=2.5.1
20 | 
21 | 


--------------------------------------------------------------------------------
/Ch11/ch11-population-genomics.yml:
--------------------------------------------------------------------------------
 1 | name: ch11-population-genomics
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.11
 8 |   - biopython
 9 |   - jupyterlab
10 |   - matplotlib
11 |   - numpy
12 |   - pandas
13 |   - seaborn
14 |   - scipy
15 |   - notebook
16 |   - dask
17 |   - dask-ml
18 |   - scikit-learn
19 |   - pyarrow
20 |   - pip
21 |   - pip:
22 |     - sgkit[plink]
23 |     - cbgen


--------------------------------------------------------------------------------
/Ch04/ch04-data-science.yml:
--------------------------------------------------------------------------------
 1 | name: ch04-data-science  # conda environment for the Ch04 data-science recipes
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0
10 |   - matplotlib=3.9.2
11 |   - numpy=2.1.0
12 |   - pandas=2.2.3
13 |   - scipy=1.14.1  # listed once; an identical second scipy=1.14.1 entry further down was removed
14 |   - notebook
15 |   - scikit-learn=1.7.0
16 |   - seaborn=0.13.2
17 |   - umap-learn=0.5.7
18 |   - ipywidgets=8.1.7
19 | 
20 | 


--------------------------------------------------------------------------------
/Ch12/ch12-applications.yml:
--------------------------------------------------------------------------------
 1 | name: ch12-metabolic-modeling
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython
 9 |   - jupyterlab
10 |   - matplotlib
11 |   - numpy
12 |   - pandas
13 |   - scipy
14 |   - notebook
15 |   - seaborn
16 |   - ViennaRNA
17 |   - requests
18 |   - pip
19 |   - pip:
20 |     - cobra
21 |     - python-libsbml-experimental
22 |     - sgkit[plink]
23 |     - cbgen


--------------------------------------------------------------------------------
/Ch18/ch18-single-cell.yml:
--------------------------------------------------------------------------------
 1 | name: ch18-single-cell
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0 
10 |   - matplotlib>=3.9
11 |   - numpy>=2.1.0
12 |   - pandas>=2.2
13 |   - scipy>=1.14
14 |   - notebook
15 |   - shapely
16 |   - scanpy
17 |   - igraph
18 |   - leidenalg
19 |   - scikit-image
20 |   - seaborn
21 |   - scikit-learn
22 |   - networkx
23 | 


--------------------------------------------------------------------------------
/Ch17/ch17-machine-learning.yml:
--------------------------------------------------------------------------------
 1 | name: ch17-machine-learning
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.12
 8 |   - biopython=1.84
 9 |   - jupyterlab=4.4.0
10 |   - matplotlib=3.10.3
11 |   - numpy>=2.2.6  # Updated for cyvcf2 compatibility
12 |   - pandas=2.2.3
13 |   - scipy=1.14.1
14 |   - pysam>=0.23.3  # Use >= instead of = for flexibility
15 |   - notebook
16 |   - pytorch=2.0.1  # NOTE(review): 2.0.1 appears to predate Python 3.12 and NumPy 2.x support — confirm this solves together with python=3.12 and numpy>=2.2.6
17 |   - seaborn=0.13.2
18 |   - transformers=4.30.2  # NOTE(review): release of the same era as pytorch 2.0.1 — re-check if the pytorch pin changes
19 |   - plotly=6.3.1
20 |   - tqdm=4.67.1
21 |   - cyvcf2  # Now this should work
22 | 
22 | 


--------------------------------------------------------------------------------
/Ch01/Welcome.py:
--------------------------------------------------------------------------------
 1 | # ---
 2 | # jupyter:
 3 | #   jupytext:
 4 | #     formats: ipynb,py:light
 5 | #     text_representation:
 6 | #       extension: .py
 7 | #       format_name: light
 8 | #       format_version: '1.5'
 9 | #       jupytext_version: 1.17.1
10 | #   kernelspec:
11 | #     display_name: Python 3 (ipykernel)
12 | #     language: python
13 | #     name: python3
14 | # ---
15 | 
16 | # +
17 | # BioInformatics with Python Cookbook - Fourth Edition #
18 | 
19 | # +
20 | # Welcome to the book! #
21 | # 1-1 Welcome #
22 | # -
23 | 
24 | print("Welcome to the BioInformatics with Python Cookbook Fourth Edition!")
25 | 
26 | # +
27 | # Install packages using Conda
28 | # -
29 | 
30 | # ! conda install -y biopython==1.84 jupyterlab==4.3.0 matplotlib==3.9.2 numpy==2.1.0 pandas==2.2.3 scipy==1.14.1 
31 | 
32 | # +
33 | # Install Jupytext
34 | # -
35 | 
36 | # ! pip install jupytext
37 | 
38 | # +
39 | ## End of Notebook ##
40 | 


--------------------------------------------------------------------------------
/Ch03/sample.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os, sys  # Unused imports and multiple imports on one line
 4 | 
 5 | def example_function(a, b):   # Missing docstring
 6 |     if a > b:
 7 |       print("a is greater than b")  # Improper indentation
 8 |     else:
 9 |         print("b is greater or equal to a")  # Extra indentation
10 | 
11 | class ExampleClass:  # Missing class docstring
12 |     def __init__(self,value):
13 |         self.value=value   # Missing spaces around '=' operator
14 |         self.data = []   # Unused attribute
15 | 
16 |     def add_data(self, item):  # Unused method argument 'item'
17 |         pass
18 | 
19 |     def display(self):
20 |         print("Value: ", self.value)  # Space before comma is bad style
21 | 
22 | # Unused variable and name not in snake_case
23 | BADVariableName = 42
24 | 
25 | # Long line exceeding 80 characters
26 | print("This is a really, really, really, really, really, really, really long line of code.")
27 | 
28 | example_function(10, 5)  # Function call with no meaningful context
29 | 
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Packt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Ch10/.ipynb_checkpoints/Ch10-1-preparing-dataset-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "id": "56dc5d7f-7c92-42c9-adea-9cc2f1a275c0",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "# Ch10-1 - Preparing a dataset for phylogenetic analysis"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": null,
16 |    "id": "be988ce8-4b24-455d-ac96-262d509f6d88",
17 |    "metadata": {},
18 |    "outputs": [],
19 |    "source": []
20 |   },
21 |   {
22 |    "cell_type": "code",
23 |    "execution_count": null,
24 |    "id": "fa5178b5-b30a-4a27-b910-cfa4bb5f7bde",
25 |    "metadata": {},
26 |    "outputs": [],
27 |    "source": []
28 |   },
29 |   {
30 |    "cell_type": "code",
31 |    "execution_count": null,
32 |    "id": "b5803ed5-b13f-42d7-98ca-adef0206431f",
33 |    "metadata": {},
34 |    "outputs": [],
35 |    "source": []
36 |   },
37 |   {
38 |    "cell_type": "code",
39 |    "execution_count": null,
40 |    "id": "1165eb6a-33d1-44ce-a138-e325dd821e5c",
41 |    "metadata": {},
42 |    "outputs": [],
43 |    "source": []
44 |   },
45 |   {
46 |    "cell_type": "code",
47 |    "execution_count": null,
48 |    "id": "7fe9b62b-470b-4c1a-ba72-91b886102a64",
49 |    "metadata": {},
50 |    "outputs": [],
51 |    "source": []
52 |   }
53 |  ],
54 |  "metadata": {
55 |   "kernelspec": {
56 |    "display_name": "Python 3 (ipykernel)",
57 |    "language": "python",
58 |    "name": "python3"
59 |   },
60 |   "language_info": {
61 |    "codemirror_mode": {
62 |     "name": "ipython",
63 |     "version": 3
64 |    },
65 |    "file_extension": ".py",
66 |    "mimetype": "text/x-python",
67 |    "name": "python",
68 |    "nbconvert_exporter": "python",
69 |    "pygments_lexer": "ipython3",
70 |    "version": "3.11.3"
71 |   }
72 |  },
73 |  "nbformat": 4,
74 |  "nbformat_minor": 5
75 | }
76 | 


--------------------------------------------------------------------------------
/docker/main/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM continuumio/anaconda3:latest
 2 | 
 3 | LABEL maintainer="Shane Brubaker <shanebrubaker55@gmail.com>"
 4 | 
 5 | # Fix: Remove asterisks from ENV variable
 6 | ENV DEBIAN_FRONTEND=noninteractive
 7 | 
 8 | # Install system packages and bioinformatics tools in a single layer
 9 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \
10 |     # System packages
11 |     git \
12 |     wget \
13 |     curl \
14 |     build-essential \
15 |     unzip \
16 |     graphviz \
17 |     libgraphviz-dev \
18 |     pkg-config \
19 |     swig \
20 |     libx11-dev \
21 |     libgsl0-dev \
22 |     libopenblas-dev \
23 |     liblapacke-dev \
24 |     # Bioinformatics tools
25 |     samtools \
26 |     mafft \
27 |     muscle \
28 |     raxml \
29 |     tabix \
30 |     && rm -rf /var/lib/apt/lists/* \
31 |     && apt-get clean
32 | 
33 | # Clone the repository
34 | RUN git clone https://github.com/PacktPublishing/Bioinformatics-with-Python-Cookbook-fourth-Edition.git
35 | 
36 | # Update conda without prompting (-y); --add prepends, so conda-forge is added last to rank first under strict priority
37 | RUN conda update -y -n base conda && \
38 |     conda config --add channels bioconda && \
39 |     conda config --add channels conda-forge && \
40 |     conda config --set channel_priority strict
41 | 
42 | # Create bioinformatics environment
43 | RUN conda env create -f /Bioinformatics-with-Python-Cookbook-fourth-Edition/Ch01/bioinformatics_base.yml
44 | 
45 | # Install additional packages if needed
46 | # RUN conda run -n bioinformatics_base pip install pyarrow==8.0.0
47 | 
48 | # Initialize conda for bash and set up environment activation
49 | RUN conda init bash && \
50 |     echo "conda activate bioinformatics_base" >> /root/.bashrc && \
51 |     echo "setterm -foreground magenta" >> /etc/bash.bashrc
52 | 
53 | # Create workspace directory
54 | RUN mkdir -p /workspace
55 | 
56 | # Set working directory
57 | WORKDIR /Bioinformatics-with-Python-Cookbook-fourth-Edition
58 | 
59 | # Create a non-root user for security (optional but recommended)
60 | # RUN useradd -m -s /bin/bash biouser && \
61 | #     chown -R biouser:biouser /Bioinformatics-with-Python-Cookbook-fourth-Edition /workspace
62 | # USER biouser
63 | 
64 | # Expose port
65 | EXPOSE 9875
66 | 
67 | # Set environment variables for Jupyter
68 | ENV JUPYTER_ENABLE_LAB=yes
69 | ENV JUPYTER_TOKEN=""
70 | 
71 | # Start Jupyter Lab with better formatting
72 | CMD ["conda", "run", "--no-capture-output", "-n", "bioinformatics_base", \
73 |      "jupyter-lab", \
74 |      "--ip=0.0.0.0", \
75 |      "--no-browser", \
76 |      "--allow-root", \
77 |      "--port=9875", \
78 |      "--NotebookApp.token=", \
79 |      "--NotebookApp.password="]


--------------------------------------------------------------------------------
/Ch03/Ch03-1-pycodestyle.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "id": "09c662f9-9479-42fb-9e60-88d839b3db11",
 7 |    "metadata": {},
 8 |    "outputs": [],
 9 |    "source": [
10 |     "# 3-1 - Linting and Style"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": null,
16 |    "id": "c9631482-1cb5-49ec-b372-26c7c7081c72",
17 |    "metadata": {},
18 |    "outputs": [],
19 |    "source": [
20 |     "# pycodestyle usage"
21 |    ]
22 |   },
23 |   {
24 |    "cell_type": "code",
25 |    "execution_count": null,
26 |    "id": "2df1320f-a791-4033-8739-3ff8cfaebe59",
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "# Load the pycodestyle extension\n",
31 |     "%load_ext pycodestyle_magic"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": null,
37 |    "id": "0d1faf3d-4a2f-4c99-824a-8433eb9c237a",
38 |    "metadata": {},
39 |    "outputs": [],
40 |    "source": [
41 |     "%%pycodestyle\n",
42 |     "import os, sys  # Unused imports and multiple imports on one line\n",
43 |     "def example_function(a, b):  # Missing docstring\n",
44 |     "    if a > b:\n",
45 |     "        print(\"a is greater than b\")  # Improper indentation\n",
46 |     "    else:\n",
47 |     "        print(\"b is greater or equal to a\")  # Extra indentation\n",
48 |     "# Long line exceeding 80 characters\n",
49 |     "print(\n",
50 |     "    \"This is a really, really, really, really, really, really, really long line of code.\"\n",
51 |     ")\n",
52 |     "example_function(10, 5)  # Function call with no meaningful context"
53 |    ]
54 |   },
55 |   {
56 |    "cell_type": "code",
57 |    "execution_count": null,
58 |    "id": "ea6cde80-0604-43b0-9908-484fee5e31be",
59 |    "metadata": {},
60 |    "outputs": [],
61 |    "source": [
62 |     "# Above in the output you should see a series of style suggestions"
63 |    ]
64 |   },
65 |   {
66 |    "cell_type": "code",
67 |    "execution_count": null,
68 |    "id": "1c0cdd0a-df99-4d3e-b5e5-c57d7fe0027c",
69 |    "metadata": {},
70 |    "outputs": [],
71 |    "source": [
72 |     "## End of Notebook ##"
73 |    ]
74 |   }
75 |  ],
76 |  "metadata": {
77 |   "kernelspec": {
78 |    "display_name": "Python 3 (ipykernel)",
79 |    "language": "python",
80 |    "name": "python3"
81 |   },
82 |   "language_info": {
83 |    "codemirror_mode": {
84 |     "name": "ipython",
85 |     "version": 3
86 |    },
87 |    "file_extension": ".py",
88 |    "mimetype": "text/x-python",
89 |    "name": "python",
90 |    "nbconvert_exporter": "python",
91 |    "pygments_lexer": "ipython3",
92 |    "version": "3.11.13"
93 |   }
94 |  },
95 |  "nbformat": 4,
96 |  "nbformat_minor": 5
97 | }
98 | 


--------------------------------------------------------------------------------
/Ch11/Ch11-3-exploring-with-sgkit.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "d1aa5bcd-9b5f-4391-a89f-a6b245f4d8cc",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch11-3-exploring-with-sgkit"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "64c849ae-086f-4208-9f03-032afe7f41a9",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# 1.  Import libraries and get data\n",
 21 |     "import numpy as np \n",
 22 |     "import xarray as xr \n",
 23 |     "import sgkit as sg \n",
 24 |     "from sgkit.io import plink \n",
 25 |     "data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\\t') "
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "id": "fd86462b-0072-4cd2-a342-02d0dca685df",
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "# 2. Get variant stats\n",
 36 |     "variant_stats = sg.variant_stats(data) \n",
 37 |     "variant_stats "
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "id": "07c0a37b-0d57-4026-97d3-0446bb0cb30d",
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "# 3.  Look at variant call rate\n",
 48 |     "variant_stats.variant_call_rate.to_series().describe() "
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "id": "92dc45ca-1b3b-4704-8984-d5f710cde72a",
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "# 4.  Sample statistics\n",
 59 |     "sample_stats = sg.sample_stats(data) \n",
 60 |     "sample_stats "
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "id": "9a47d767-c735-4d06-aa58-6c79442fa4b0",
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "# 5.  Sample call rates\n",
 71 |     "sample_stats.sample_call_rate.to_series().hist() "
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "id": "41b7cb11-bdd8-469b-a262-881cdb5d126b",
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "## End of Notebook ##"
 82 |    ]
 83 |   }
 84 |  ],
 85 |  "metadata": {
 86 |   "kernelspec": {
 87 |    "display_name": "Python 3 (ipykernel)",
 88 |    "language": "python",
 89 |    "name": "python3"
 90 |   },
 91 |   "language_info": {
 92 |    "codemirror_mode": {
 93 |     "name": "ipython",
 94 |     "version": 3
 95 |    },
 96 |    "file_extension": ".py",
 97 |    "mimetype": "text/x-python",
 98 |    "name": "python",
 99 |    "nbconvert_exporter": "python",
100 |    "pygments_lexer": "ipython3",
101 |    "version": "3.11.14"
102 |   }
103 |  },
104 |  "nbformat": 4,
105 |  "nbformat_minor": 5
106 | }
107 | 


--------------------------------------------------------------------------------
/Ch15/galaxy/Ch15-1-introducing-galaxy.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "816fb516-37d9-49b7-a262-1dab6bdb32a6",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch15-1 Introducing Galaxy"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "7cb3ca76-dbce-4b1e-81d5-6ebc07e49d26",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "## In this exercise you will set up an account on usegalaxy.org and learn the Galaxy interface\n",
 21 |     "# You will be running the commands below in the Terminal to set up a Docker container for Galaxy\n",
 22 |     "# These instructions are notes for you to follow and run in your terminal\n",
 23 |     "#      - no actual Jupyter notebook work will be used here"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "id": "2ef173f6",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "### Docker Installation of Galaxy ###\n",
 34 |     "# First make sure you have Docker installed\n",
 35 |     "#  Follow the instructions in Chapter 1-2 \"Installing the required software with Docker\"\n",
 36 |     "# Then Register with Docker here:\n",
 37 |     "#  https://app.docker.com/signup \n",
 38 |     "#  Test that you can log into Docker from the terminal like this: \n",
 39 |     "# docker login "
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "id": "cba11fcf-4910-4828-aa4f-f0901663713a",
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "# Docker Pull of the Galaxy image\n",
 50 |     "# docker pull bgruening/galaxy-stable "
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "id": "7b13f410-1827-46b7-8b6c-572da46a13d1",
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "# Create a directory to store the data for the Docker container:\n",
 61 |     "# mkdir /tmp/galaxy_data "
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "id": "da1f5697-b697-4a0e-a5b0-3abeddf48e8c",
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "# Docker Run\n",
 72 |     "# docker run -d -p 8080:80 --platform linux/amd64 -v /tmp/galaxy_data:/export --name galaxy bgruening/galaxy-stable "
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "id": "d0ac053f-5ef2-4f38-9675-e9f7f06645c6",
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "# Check that Galaxy container is running (this will also show you what port it is running on):\n",
 83 |     "# docker ps "
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "id": "8d41b995-1ed8-45e8-a1ba-406539dcb95e",
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "## End of Notebook ##"
 94 |    ]
 95 |   }
 96 |  ],
 97 |  "metadata": {
 98 |   "kernelspec": {
 99 |    "display_name": "Python 3 (ipykernel)",
100 |    "language": "python",
101 |    "name": "python3"
102 |   },
103 |   "language_info": {
104 |    "codemirror_mode": {
105 |     "name": "ipython",
106 |     "version": 3
107 |    },
108 |    "file_extension": ".py",
109 |    "mimetype": "text/x-python",
110 |    "name": "python",
111 |    "nbconvert_exporter": "python",
112 |    "pygments_lexer": "ipython3",
113 |    "version": "3.12.10"
114 |   }
115 |  },
116 |  "nbformat": 4,
117 |  "nbformat_minor": 5
118 | }
119 | 


--------------------------------------------------------------------------------
/Ch15/snakemake/Snakefile:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | from pathlib import Path
  4 | 
  5 | # Configuration: sample names and directory prefixes referenced by the rules below
  6 | SAMPLES = ["sample1", "sample2", "sample3"]  # sample IDs substituted for {sample} in the expand() calls
  7 | DATA_DIR = "data"  # input FASTQ files are read from DATA_DIR/raw
  8 | RESULTS_DIR = "results"  # prefix of every rule output path (log files go to logs/ instead)
  9 | 
 10 | # Target rule: has only inputs, listing every final file the workflow should produce (the doubled braces keep {sample} literal inside the f-strings so expand() can fill it in)
 11 | rule all:
 12 |     input:
 13 |         # FastQC HTML reports for R1 and R2 of every sample (rule fastqc runs the real tool when it is installed)
 14 |         expand(f"{RESULTS_DIR}/fastqc/{{sample}}_R1_fastqc.html", sample=SAMPLES),
 15 |         expand(f"{RESULTS_DIR}/fastqc/{{sample}}_R2_fastqc.html", sample=SAMPLES),
 16 |         # Outputs written by the helper scripts under scripts/: per-sample BAM and VCF, the MultiQC report and the summary JSON
 17 |         expand(f"{RESULTS_DIR}/alignment/{{sample}}.bam", sample=SAMPLES),
 18 |         expand(f"{RESULTS_DIR}/variants/{{sample}}.vcf", sample=SAMPLES),
 19 |         f"{RESULTS_DIR}/multiqc_report.html",
 20 |         f"{RESULTS_DIR}/pipeline_summary.json"
 21 | 
 22 | # FastQC rule: one job per {sample}_{read} FASTQ file; runs fastqc (stderr to the log) when it is on PATH, otherwise writes a notice to the log and calls scripts/mock_fastqc.py
 23 | rule fastqc:
 24 |     input:
 25 |         fastq=f"{DATA_DIR}/raw/{{sample}}_{{read}}.fastq.gz"
 26 |     output:
 27 |         html=f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.html",
 28 |         zip=f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.zip"
 29 |     params:
 30 |         outdir=f"{RESULTS_DIR}/fastqc"
 31 |     log:
 32 |         "logs/fastqc_{sample}_{read}.log"
 33 |     shell:
 34 |         """
 35 |         # Check if fastqc is available, if not use mock
 36 |         if command -v fastqc >/dev/null 2>&1; then
 37 |             fastqc {input.fastq} -o {params.outdir} --extract 2> {log}
 38 |         else
 39 |             echo "FastQC not found, creating mock output..." > {log}
 40 |             python scripts/mock_fastqc.py {input.fastq} {params.outdir} {wildcards.sample} {wildcards.read}
 41 |         fi
 42 |         """
 43 | 
 44 | # Mock alignment rule: takes each sample's R1/R2 FASTQ pair and calls scripts/mock_alignment.py with the BAM and BAI output paths; the log receives only the echo line
 45 | rule align_reads:
 46 |     input:
 47 |         r1=f"{DATA_DIR}/raw/{{sample}}_R1.fastq.gz",
 48 |         r2=f"{DATA_DIR}/raw/{{sample}}_R2.fastq.gz"
 49 |     output:
 50 |         bam=f"{RESULTS_DIR}/alignment/{{sample}}.bam",
 51 |         bai=f"{RESULTS_DIR}/alignment/{{sample}}.bam.bai"
 52 |     log:
 53 |         "logs/align_{sample}.log"
 54 |     shell:
 55 |         """
 56 |         echo "Mock alignment for {wildcards.sample}" > {log}
 57 |         python scripts/mock_alignment.py {input.r1} {input.r2} {output.bam} {output.bai}
 58 |         """
 59 | 
 60 | # Mock variant calling rule: passes each sample's BAM and the VCF output path to scripts/mock_variants.py; the BAI is a declared input but is not passed on the command line
 61 | rule call_variants:
 62 |     input:
 63 |         bam=f"{RESULTS_DIR}/alignment/{{sample}}.bam",
 64 |         bai=f"{RESULTS_DIR}/alignment/{{sample}}.bam.bai"
 65 |     output:
 66 |         vcf=f"{RESULTS_DIR}/variants/{{sample}}.vcf"
 67 |     log:
 68 |         "logs/variants_{sample}.log"
 69 |     shell:
 70 |         """
 71 |         echo "Mock variant calling for {wildcards.sample}" > {log}
 72 |         python scripts/mock_variants.py {input.bam} {output.vcf}
 73 |         """
 74 | 
 75 | # Mock MultiQC rule: declares every FastQC zip (R1 and R2) and every BAM as inputs, but passes only the report output path to scripts/mock_multiqc.py
 76 | rule multiqc:
 77 |     input:
 78 |         fastqc_reports=expand(f"{RESULTS_DIR}/fastqc/{{sample}}_{{read}}_fastqc.zip", 
 79 |                              sample=SAMPLES, read=["R1", "R2"]),
 80 |         bams=expand(f"{RESULTS_DIR}/alignment/{{sample}}.bam", sample=SAMPLES)
 81 |     output:
 82 |         report=f"{RESULTS_DIR}/multiqc_report.html"
 83 |     log:
 84 |         "logs/multiqc.log"
 85 |     shell:
 86 |         """
 87 |         echo "Mock MultiQC report generation" > {log}
 88 |         python scripts/mock_multiqc.py {output.report}
 89 |         """
 90 | 
 91 | # Pipeline summary rule: declares every per-sample VCF and the MultiQC report as inputs, then calls scripts/generate_summary.py with only the summary JSON output path
 92 | rule pipeline_summary:
 93 |     input:
 94 |         vcfs=expand(f"{RESULTS_DIR}/variants/{{sample}}.vcf", sample=SAMPLES),
 95 |         multiqc=f"{RESULTS_DIR}/multiqc_report.html"
 96 |     output:
 97 |         summary=f"{RESULTS_DIR}/pipeline_summary.json"
 98 |     log:
 99 |         "logs/summary.log"
100 |     shell:
101 |         """
102 |         echo "Generating pipeline summary" > {log}
103 |         python scripts/generate_summary.py {output.summary}
104 |         """
105 | 


--------------------------------------------------------------------------------
/Ch03/Ch03-2-sequence-manipulation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "feab660d-2231-4911-ab09-da9d7c6a5397",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch03-2 Sequence Manipulation"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "id": "4d62f32a-abf2-4964-af3c-c9c9ad8fcd35",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Function Definition\n",
 21 |     "def parse_fasta(file_path):\n",
 22 |     "    \"\"\"\n",
 23 |     "    Parses a FASTA file and returns a dictionary with sequence headers as keys and sequences as values.\n",
 24 |     "    Parameters:\n",
 25 |     "    - file_path (str): Path to the FASTA file.\n",
 26 |     "    Returns:\n",
 27 |     "    - dict: A dictionary where keys are sequence headers and values are sequences.\n",
 28 |     "    \"\"\"\n",
 29 |     "    fasta_dict = {}\n",
 30 |     "    with open(file_path, 'r') as file:\n",
 31 |     "        header = None\n",
 32 |     "        sequence = []\n",
 33 |     "        for line in file:\n",
 34 |     "            line = line.strip()\n",
 35 |     "            if line.startswith(\">\"):  # Header line\n",
 36 |     "                if header:  # Save the previous sequence\n",
 37 |     "                    fasta_dict[header] = ''.join(sequence)\n",
 38 |     "                header = line[1:]  # Remove \">\"\n",
 39 |     "                sequence = []  # Reset sequence list\n",
 40 |     "            else:\n",
 41 |     "                sequence.append(line)\n",
 42 |     "        if header:  # Save the last sequence\n",
 43 |     "            fasta_dict[header] = ''.join(sequence)\n",
 44 |     "    return fasta_dict"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 2,
 50 |    "id": "16c87949-80c7-409b-8fe4-f9db06031c42",
 51 |    "metadata": {},
 52 |    "outputs": [
 53 |     {
 54 |      "name": "stdout",
 55 |      "output_type": "stream",
 56 |      "text": [
 57 |       "Header: seq1\n",
 58 |       "Sequence: ATCGTACGATCGGATCGTACGATC\n",
 59 |       "Header: seq2\n",
 60 |       "Sequence: CGTAGCTAGCTA\n"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "# Example usage:\n",
 66 |     "fasta_file = \"example.fasta\"\n",
 67 |     "fasta_records = parse_fasta(fasta_file)\n",
 68 |     "for header, seq in fasta_records.items():\n",
 69 |     "    print(f\"Header: {header}\")\n",
 70 |     "    print(f\"Sequence: {seq}\")"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "id": "347b2c76-3fe4-4a90-a31b-cb91f7abc157",
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": []
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": null,
 84 |    "id": "695b0e93-5357-4d2d-925e-72c70dfafb07",
 85 |    "metadata": {},
 86 |    "outputs": [],
 87 |    "source": []
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": null,
 92 |    "id": "a41df267-cab9-4ff8-aee5-5efaa8d83982",
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": []
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "id": "b238aba9-097d-4aa6-b424-9280c54a1e8d",
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": []
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "id": "0ee8aae9-1366-4eea-8392-5b8e16a1d84a",
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": []
112 |   }
113 |  ],
114 |  "metadata": {
115 |   "kernelspec": {
116 |    "display_name": "Python 3 (ipykernel)",
117 |    "language": "python",
118 |    "name": "python3"
119 |   },
120 |   "language_info": {
121 |    "codemirror_mode": {
122 |     "name": "ipython",
123 |     "version": 3
124 |    },
125 |    "file_extension": ".py",
126 |    "mimetype": "text/x-python",
127 |    "name": "python",
128 |    "nbconvert_exporter": "python",
129 |    "pygments_lexer": "ipython3",
130 |    "version": "3.11.3"
131 |   }
132 |  },
133 |  "nbformat": 4,
134 |  "nbformat_minor": 5
135 | }
136 | 


--------------------------------------------------------------------------------
/Ch07/Ch07-4-genome-assessment.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "fca2f045",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch07-4 genome assessment"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "8a7ad84e",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install QUAST\n",
 21 |     "! pip install quast"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "9500e3dd",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Check that QUAST is installed\n",
 32 |     "! quast.py --version"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "id": "fecf93af",
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "# Import libraries\n",
 43 |     "import subprocess"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "id": "00032b5e",
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "# Function to run QUAST on a genome assembly\n",
 54 |     "def run_quast(assembly_file, reference_file=None, output_dir=\"quast_output\"):\n",
 55 |     "    \"\"\"\n",
 56 |     "    Runs QUAST to assess the quality of a genome assembly.\n",
 57 |     "\n",
 58 |     "    Parameters:\n",
 59 |     "        assembly_file (str): Path to the assembled genome FASTA file.\n",
 60 |     "        reference_file (str, optional): Path to the reference genome FASTA file. Defaults to None.\n",
 61 |     "        output_dir (str): Directory to save QUAST results. Defaults to \"quast_output\".\n",
 62 |     "\n",
 63 |     "    Returns:\n",
 64 |     "        None\n",
 65 |     "    \"\"\"\n",
 66 |     "    try:\n",
 67 |     "        command = [\"quast.py\", assembly_file, \"-o\", output_dir]\n",
 68 |     "        if reference_file:\n",
 69 |     "            command.extend([\"-r\", reference_file])\n",
 70 |     "\n",
 71 |     "        print(f\"Running QUAST...\\nCommand: {' '.join(command)}\")\n",
 72 |     "        subprocess.run(command, check=True)\n",
 73 |     "        print(f\"QUAST analysis complete. Results saved in: {output_dir}\")\n",
 74 |     "    except FileNotFoundError:\n",
 75 |     "        print(\"QUAST is not installed or not found in the system PATH.\")\n",
 76 |     "    except subprocess.CalledProcessError as e:\n",
 77 |     "        print(f\"Error running QUAST: {e}\")\n",
 78 |     "    except Exception as e:\n",
 79 |     "        print(f\"Unexpected error: {e}\")"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "id": "ab030149",
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "## Main code to run our QUAST function ##\n",
 90 |     "if __name__ == \"__main__\":\n",
 91 |     "    assembly = \"output/ecoli-assembly.fasta\"\n",
 92 |     "    reference = None  # Set to \"reference.fasta\" if available\n",
 93 |     "    output = \"quast_results\"\n",
 94 |     "    run_quast(assembly, reference, output)"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "id": "8a33b21b",
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "# Open the QUAST report\n",
105 |     "! open quast_results/report.html"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "id": "002ea16f",
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "## End of Notebook ##"
116 |    ]
117 |   }
118 |  ],
119 |  "metadata": {
120 |   "kernelspec": {
121 |    "display_name": "Python 3 (ipykernel)",
122 |    "language": "python",
123 |    "name": "python3"
124 |   },
125 |   "language_info": {
126 |    "codemirror_mode": {
127 |     "name": "ipython",
128 |     "version": 3
129 |    },
130 |    "file_extension": ".py",
131 |    "mimetype": "text/x-python",
132 |    "name": "python",
133 |    "nbconvert_exporter": "python",
134 |    "pygments_lexer": "ipython3",
135 |    "version": "3.11.3"
136 |   }
137 |  },
138 |  "nbformat": 4,
139 |  "nbformat_minor": 5
140 | }
141 | 


--------------------------------------------------------------------------------
/Ch03/Ch03-3-read-alignment.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "fe161674-e8b6-41e1-b0a0-ca0fa241b4d4",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch03-3 Read Alignment "
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "86f2be45-730f-49c0-8eb2-bf6af68d2456",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Libraries\n",
 21 |     "from Bio import pairwise2\n",
 22 |     "from Bio.pairwise2 import format_alignment"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "id": "aabb521e-fa09-4904-92df-05eeef0367ef",
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "# Example using Pairwise2\n",
 33 |     "# Define the sequences\n",
 34 |     "seq1 = \"ACGTGCTAGCTAG\"\n",
 35 |     "seq2 = \"ACGTCGATGCTA\"\n",
 36 |     "\n",
 37 |     "# Perform global alignment\n",
 38 |     "alignments = pairwise2.align.globalxx(seq1, seq2)\n",
 39 |     "\n",
 40 |     "# Display the best alignment\n",
 41 |     "print(\"Best alignment:\")\n",
 42 |     "print(format_alignment(*alignments[0]))\n",
 43 |     "\n",
 44 |     "# Perform local alignment\n",
 45 |     "local_alignments = pairwise2.align.localxx(seq1, seq2)\n",
 46 |     "\n",
 47 |     "# Display the best local alignment\n",
 48 |     "print(\"\\nBest local alignment:\")\n",
 49 |     "print(format_alignment(*local_alignments[0]))"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "id": "fbf817f0-ed49-4952-8bcd-04c8ba96812d",
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "# Example with PairwiseAligner"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "id": "f8cd0758-a2b5-4206-9a3f-6439db557f49",
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "# Libraries\n",
 70 |     "from Bio.Align import PairwiseAligner"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "id": "b07b39ea-e3e4-4571-a4af-9ddf07516ead",
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "# Define the sequences\n",
 81 |     "seq1 = \"ACGTGCTAGCTAG\"\n",
 82 |     "seq2 = \"ACGTCGATGCTA\"\n",
 83 |     "\n",
 84 |     "# Initialize the PairwiseAligner\n",
 85 |     "aligner = PairwiseAligner()\n",
 86 |     "\n",
 87 |     "# Set alignment scoring (optional, defaults to match=1, mismatch=0, gap=-1)\n",
 88 |     "aligner.match_score = 1\n",
 89 |     "aligner.mismatch_score = -1\n",
 90 |     "aligner.open_gap_score = -1\n",
 91 |     "aligner.extend_gap_score = -0.5\n",
 92 |     "\n",
 93 |     "# Perform global alignment\n",
 94 |     "global_alignments = aligner.align(seq1, seq2)\n",
 95 |     "\n",
 96 |     "# Display the best global alignment\n",
 97 |     "print(\"Best global alignment:\")\n",
 98 |     "print(global_alignments[0])\n",
 99 |     "print(f\"Score: {global_alignments[0].score}\")\n",
100 |     "\n",
101 |     "# Perform local alignment\n",
102 |     "aligner.mode = 'local'  # Switch to local alignment mode\n",
103 |     "local_alignments = aligner.align(seq1, seq2)\n",
104 |     "\n",
105 |     "# Display the best local alignment\n",
106 |     "print(\"\\nBest local alignment:\")\n",
107 |     "print(local_alignments[0])\n",
108 |     "print(f\"Score: {local_alignments[0].score}\")\n"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "id": "5d6b9d0f-a0a0-4f8e-a65f-9efffc636abe",
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": []
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "id": "c325b1fb-c3f5-44f9-b87d-fef313281353",
123 |    "metadata": {},
124 |    "outputs": [],
125 |    "source": []
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": null,
130 |    "id": "166103c3-5096-4449-830c-ab49988a0981",
131 |    "metadata": {},
132 |    "outputs": [],
133 |    "source": []
134 |   }
135 |  ],
136 |  "metadata": {
137 |   "kernelspec": {
138 |    "display_name": "Python 3 (ipykernel)",
139 |    "language": "python",
140 |    "name": "python3"
141 |   },
142 |   "language_info": {
143 |    "codemirror_mode": {
144 |     "name": "ipython",
145 |     "version": 3
146 |    },
147 |    "file_extension": ".py",
148 |    "mimetype": "text/x-python",
149 |    "name": "python",
150 |    "nbconvert_exporter": "python",
151 |    "pygments_lexer": "ipython3",
152 |    "version": "3.11.3"
153 |   }
154 |  },
155 |  "nbformat": 4,
156 |  "nbformat_minor": 5
157 | }
158 | 


--------------------------------------------------------------------------------
/Ch10/Ch10-6-visualizing-phylogenetics.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "ea8ee1c6-9f19-4571-8242-eeae2df0c867",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch10-6 - Visualizing Phylogenetic data [Updated to use raxml-ng]"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "8bbae47e-f0a7-45b5-b024-41656fab9414",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# 30 & 31.  Load phylogenetic data and Draw Trees\n",
 21 |     "from copy import deepcopy\n",
 22 |     "from Bio import Phylo\n",
 23 |     "# Define the correct RAxML-NG output files\n",
 24 |     "best_tree_file = \"ebola_tree.raxml.bestTreeCollapsed\"  # Best ML tree\n",
 25 |     "# Read the best ML tree\n",
 26 |     "ebola_tree = Phylo.read(best_tree_file, \"newick\")\n",
 27 |     "ebola_tree.name = \"Ebolavirus Tree\"\n",
 28 |     "# Print tree structures for verification\n",
 29 |     "Phylo.draw_ascii(ebola_tree)"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "id": "a26ed965-d007-4bf9-a045-9a1683fc29cf",
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "# 32.  Bio.Phylo [Updated]\n",
 40 |     "import matplotlib.pyplot as plt\n",
 41 |     "from Bio import Phylo\n",
 42 |     "# Define the RAxML-NG output file\n",
 43 |     "simplified_tree_file = \"ebola_tree.raxml.bestTreeCollapsed\"  # Previous output of raxml-ng\n",
 44 |     "# Read the tree (RAxML-NG outputs trees in Newick format)\n",
 45 |     "ebola_simple_tree = Phylo.read(simplified_tree_file, \"newick\")\n",
 46 |     "# Create a figure and axis\n",
 47 |     "fig = plt.figure(figsize=(16, 22))\n",
 48 |     "ax = fig.add_subplot(111)\n",
 49 |     "# Function to conditionally label branches\n",
 50 |     "def label_branches(clade):\n",
 51 |     "    if clade.branch_length and clade.branch_length > 0.02:\n",
 52 |     "        return f\"{clade.branch_length:.3f}\"  # Format to 3 decimal places\n",
 53 |     "    return None\n",
 54 |     "# Draw the tree with branch labels\n",
 55 |     "Phylo.draw(ebola_simple_tree, branch_labels=label_branches, axes=ax)\n",
 56 |     "# Show the plot\n",
 57 |     "plt.show()"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "id": "81667248-d700-451b-aac9-e103a1311c31",
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "# 33.  Plot the complete dataset\n",
 68 |     "fig = plt.figure(figsize=(16, 22)) \n",
 69 |     "ax = fig.add_subplot(111) \n",
 70 |     "from collections import OrderedDict \n",
 71 |     "my_colors = OrderedDict({ \n",
 72 |     "'EBOV_2014': 'red', \n",
 73 |     "'EBOV': 'magenta', \n",
 74 |     "'BDBV': 'cyan', \n",
 75 |     "'SUDV': 'blue', \n",
 76 |     "'RESTV' : 'green', \n",
 77 |     "'TAFV' : 'yellow' \n",
 78 |     "}) \n",
 79 |     "\n",
 80 |     "def get_color(name): \n",
 81 |     "    for pref, color in my_colors.items(): \n",
 82 |     "        if name.find(pref) > -1: \n",
 83 |     "            return color \n",
 84 |     "    return 'grey' \n",
 85 |     "\n",
 86 |     "def color_tree(node, fun_color=get_color): \n",
 87 |     "    if node.is_terminal(): \n",
 88 |     "        node.color = fun_color(node.name) \n",
 89 |     "    else: \n",
 90 |     "        my_children = set() \n",
 91 |     "        for child in node.clades: \n",
 92 |     "            color_tree(child, fun_color) \n",
 93 |     "            my_children.add(child.color.to_hex()) \n",
 94 |     "        if len(my_children) == 1: \n",
 95 |     "            node.color = child.color \n",
 96 |     "        else: \n",
 97 |     "            node.color = 'grey' \n",
 98 |     "\n",
 99 |     "ebola_color_tree = deepcopy(ebola_tree) \n",
100 |     "color_tree(ebola_color_tree.root) \n",
101 |     "Phylo.draw(ebola_color_tree, axes=ax, label_func=lambda x: x.name.split(' ')[0][1:] if x.name is not None else None) "
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "id": "f7b5e7f2-2e4c-4dc9-a982-579a9371f36b",
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "## End of Notebook ##"
112 |    ]
113 |   }
114 |  ],
115 |  "metadata": {
116 |   "kernelspec": {
117 |    "display_name": "Python 3 (ipykernel)",
118 |    "language": "python",
119 |    "name": "python3"
120 |   },
121 |   "language_info": {
122 |    "codemirror_mode": {
123 |     "name": "ipython",
124 |     "version": 3
125 |    },
126 |    "file_extension": ".py",
127 |    "mimetype": "text/x-python",
128 |    "name": "python",
129 |    "nbconvert_exporter": "python",
130 |    "pygments_lexer": "ipython3",
131 |    "version": "3.12.10"
132 |   }
133 |  },
134 |  "nbformat": 4,
135 |  "nbformat_minor": 5
136 | }
137 | 


--------------------------------------------------------------------------------
/Ch03/pycodestyle.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 5,
  6 |    "id": "86c669e3-1d83-459f-8a43-321dda3b7071",
  7 |    "metadata": {},
  8 |    "outputs": [
  9 |     {
 10 |      "name": "stdout",
 11 |      "output_type": "stream",
 12 |      "text": [
 13 |       "Requirement already satisfied: flake8 in /opt/anaconda3/lib/python3.12/site-packages (7.0.0)\n",
 14 |       "Requirement already satisfied: mccabe<0.8.0,>=0.7.0 in /opt/anaconda3/lib/python3.12/site-packages (from flake8) (0.7.0)\n",
 15 |       "Requirement already satisfied: pycodestyle<2.12.0,>=2.11.0 in /opt/anaconda3/lib/python3.12/site-packages (from flake8) (2.11.1)\n",
 16 |       "Requirement already satisfied: pyflakes<3.3.0,>=3.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from flake8) (3.2.0)\n",
 17 |       "Requirement already satisfied: pycodestyle in /opt/anaconda3/lib/python3.12/site-packages (2.11.1)\n",
 18 |       "Requirement already satisfied: pycodestyle_magic in /opt/anaconda3/lib/python3.12/site-packages (0.5)\n"
 19 |      ]
 20 |     }
 21 |    ],
 22 |    "source": [
 23 |     "!pip install flake8\n",
 24 |     "!pip install pycodestyle \n",
 25 |     "!pip install pycodestyle_magic"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 11,
 31 |    "id": "186d6989-aed9-4cc0-b01e-c225ae3e1ab5",
 32 |    "metadata": {},
 33 |    "outputs": [
 34 |     {
 35 |      "name": "stdout",
 36 |      "output_type": "stream",
 37 |      "text": [
 38 |       "The pycodestyle_magic extension is already loaded. To reload it, use:\n",
 39 |       "  %reload_ext pycodestyle_magic\n"
 40 |      ]
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "%load_ext pycodestyle_magic"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 12,
 50 |    "id": "beba9f29-84ad-4b96-a694-24efe2826bc4",
 51 |    "metadata": {},
 52 |    "outputs": [
 53 |     {
 54 |      "name": "stderr",
 55 |      "output_type": "stream",
 56 |      "text": [
 57 |       "2:25: W291 trailing whitespace\n",
 58 |       "3:7: E225 missing whitespace around operator\n",
 59 |       "3:10: W291 trailing whitespace\n",
 60 |       "4:1: W391 blank line at end of file\n"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "%%pycodestyle \n",
 66 |     "# 3.1.2 - Example Code A \n",
 67 |     "my_var=10 "
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 14,
 73 |    "id": "2c42823c-29c5-4960-bcc8-894468d70219",
 74 |    "metadata": {},
 75 |    "outputs": [
 76 |     {
 77 |      "name": "stderr",
 78 |      "output_type": "stream",
 79 |      "text": [
 80 |       "2:10: E401 multiple imports on one line\n",
 81 |       "3:1: E302 expected 2 blank lines, found 0\n",
 82 |       "5:7: E111 indentation is not a multiple of 4\n",
 83 |       "9:1: E305 expected 2 blank lines after class or function definition, found 0\n",
 84 |       "9:80: E501 line too long (92 > 79 characters)\n",
 85 |       "12:1: W391 blank line at end of file\n"
 86 |      ]
 87 |     }
 88 |    ],
 89 |    "source": [
 90 |     "%%pycodestyle \n",
 91 |     "import os, sys  # Unused imports and multiple imports on one line\n",
 92 |     "def example_function(a, b):   # Missing docstring\n",
 93 |     "    if a > b:\n",
 94 |     "      print(\"a is greater than b\")  # Improper indentation\n",
 95 |     "    else:\n",
 96 |     "        print(\"b is greater or equal to a\")  # Extra indentation\n",
 97 |     "# Long line exceeding 80 characters\n",
 98 |     "print(\"This is a really, really, really, really, really, really, really long line of code.\")\n",
 99 |     "\n",
100 |     "example_function(10, 5)  # Function call with no meaningful context\n"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "id": "c5c1e0dc-3037-4296-8a8e-6a0450265359",
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": []
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "id": "76b8fde1-79b6-476d-8611-7e627cc8d646",
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": []
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "id": "b58b633b-4ecb-4a51-94a9-3a290a5b758e",
123 |    "metadata": {},
124 |    "outputs": [],
125 |    "source": []
126 |   }
127 |  ],
128 |  "metadata": {
129 |   "kernelspec": {
130 |    "display_name": "Python 3 (ipykernel)",
131 |    "language": "python",
132 |    "name": "python3"
133 |   },
134 |   "language_info": {
135 |    "codemirror_mode": {
136 |     "name": "ipython",
137 |     "version": 3
138 |    },
139 |    "file_extension": ".py",
140 |    "mimetype": "text/x-python",
141 |    "name": "python",
142 |    "nbconvert_exporter": "python",
143 |    "pygments_lexer": "ipython3",
144 |    "version": "3.12.2"
145 |   }
146 |  },
147 |  "nbformat": 4,
148 |  "nbformat_minor": 5
149 | }
150 | 


--------------------------------------------------------------------------------
/Ch10/Ch10-4-reconstructing-trees.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "35ae8cd1-cffe-4704-850a-92366b2eb71e",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch10-4 - Reconstructing Phylogenetic Trees [Updated to use raxml-ng]"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "f8aa4582-f223-4cb1-a1b5-e8627660e512",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install raxml-ng\n",
 21 |     "! brew install raxml-ng"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "1d77eaff-414f-4aad-bfc6-6d1c1149046e",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# 18.  Use RAxML-NG to reconstruct the genus dataset (takes 15-30 minutes)\n",
 32 |     "import os\n",
 33 |     "import subprocess\n",
 34 |     "# Define input and output paths\n",
 35 |     "data_path = \"trim.fasta\"\n",
 36 |     "output_prefix = \"ebola_tree\"\n",
 37 |     "# Check if the input file exists\n",
 38 |     "if not os.path.exists(data_path):\n",
 39 |     "    raise FileNotFoundError(f\"Error: The file {data_path} does not exist!\")\n",
 40 |     "# Define the RAxML-NG command\n",
 41 |     "cmd = [\n",
 42 |     "    \"raxml-ng\",\n",
 43 |     "    \"--msa\", data_path,    # Input sequence alignment\n",
 44 |     "    \"--model\", \"GTR+G\",    # Substitution model\n",
 45 |     "    \"--prefix\", output_prefix,  # Output file prefix\n",
 46 |     "    \"--search\",  # Perform Maximum Likelihood tree search\n",
 47 |     "]\n",
 48 |     "# Run RAxML-NG\n",
 49 |     "try:\n",
 50 |     "    subprocess.run(cmd, check=True)\n",
 51 |     "    print(f\"RAxML-NG completed successfully. Output files are saved with prefix '{output_prefix}'\")\n",
 52 |     "except subprocess.CalledProcessError as e:\n",
 53 |     "    print(f\"Error running RAxML-NG: {e}\")\n",
 54 |     "# Optional: Clean up RAxML-NG output files (note: this list includes the .raxml.bestTree result, not just temporary files)\n",
 55 |     "for ext in [\".raxml.log\", \".raxml.bestTree\", \".raxml.rba\", \".raxml.rfdist\"]:\n",
 56 |     "    file_path = f\"{output_prefix}{ext}\"\n",
 57 |     "    if os.path.exists(file_path):\n",
 58 |     "        os.remove(file_path)\n",
 59 |     "print(\"Temporary files cleaned up.\")"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "id": "0b212241-685f-41c2-9d61-c6fd7345f36e",
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "# You will see ebola_tree.raxml* files in the working directory "
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": null,
 75 |    "id": "4b569640-903e-4867-a2ef-a7df4a607fcb",
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "# 19.  Save the files - skip this because we already have the files"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "id": "3c1d688c-d48d-4663-a45a-02585abd7b44",
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "# 20.  Visualize the genus tree\n",
 90 |     "import matplotlib.pyplot as plt\n",
 91 |     "from Bio import Phylo\n",
 92 |     "\n",
 93 |     "# Define the correct output tree file from RAxML-NG\n",
 94 |     "tree_file = \"ebola_tree.raxml.bestTreeCollapsed\"  # Based on the raxml-ng output from the previous step\n",
 95 |     "\n",
 96 |     "# Read the tree in Newick format (RAxML-NG default)\n",
 97 |     "my_ebola_tree = Phylo.read(tree_file, \"newick\")\n",
 98 |     "\n",
 99 |     "# Set a name for the tree\n",
100 |     "my_ebola_tree.name = \"Our Ebolavirus Tree\"\n",
101 |     "\n",
102 |     "# Plot the tree\n",
103 |     "fig = plt.figure(figsize=(16, 18))\n",
104 |     "ax = fig.add_subplot(1, 1, 1)\n",
105 |     "Phylo.draw(my_ebola_tree, axes=ax)\n",
106 |     "\n",
107 |     "plt.show()  # Display the tree"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "id": "da4121ba-afdd-4817-b4ae-495630628c3c",
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "# 21 onward Skipped"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "id": "7d166c03-eb19-4eb0-bb44-58fc5627b0c4",
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "## End of Notebook ##"
128 |    ]
129 |   }
130 |  ],
131 |  "metadata": {
132 |   "kernelspec": {
133 |    "display_name": "Python 3 (ipykernel)",
134 |    "language": "python",
135 |    "name": "python3"
136 |   },
137 |   "language_info": {
138 |    "codemirror_mode": {
139 |     "name": "ipython",
140 |     "version": 3
141 |    },
142 |    "file_extension": ".py",
143 |    "mimetype": "text/x-python",
144 |    "name": "python",
145 |    "nbconvert_exporter": "python",
146 |    "pygments_lexer": "ipython3",
147 |    "version": "3.12.10"
148 |   }
149 |  },
150 |  "nbformat": 4,
151 |  "nbformat_minor": 5
152 | }
153 | 


--------------------------------------------------------------------------------
/Ch08/Ch08-1-genbank-ncbi.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "babd109e",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch08-1 Accessing Genbank and Navigating the NCBI"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "7be76015",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# 1. Import modules and configure email\n",
 21 |     "from Bio import Entrez, SeqIO \n",
 22 |     "Entrez.email = 'put@your.email.here' \n",
 23 |     "# 2. Make output dir\n",
 24 |     "! mkdir -p output"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "facc0855",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# 3. Find the CRT gene of Plasmodium\n",
 35 |     "handle = Entrez.esearch(db='nucleotide', term='CRT[Gene Name] AND \"Plasmodium falciparum\"[Organism]') \n",
 36 |     "rec_list = Entrez.read(handle) \n",
 37 |     "if int(rec_list['RetMax']) < int(rec_list['Count']): \n",
 38 |     "    handle = Entrez.esearch(db='nucleotide', term='CRT[Gene Name] AND \"Plasmodium falciparum\"[Organism]', \n",
 39 |     "                            retmax=rec_list['Count'])\n",
 40 |     "    rec_list = Entrez.read(handle) "
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "id": "f7c260aa",
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "# 4. Retrieve records\n",
 51 |     "id_list = rec_list['IdList'] \n",
 52 |     "hdl = Entrez.efetch(db='nucleotide', id=id_list, rettype='gb') "
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "id": "78617567",
 59 |    "metadata": {
 60 |     "scrolled": true
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "# 5. Read and parse results\n",
 65 |     "recs = list(SeqIO.parse(hdl, 'gb')) "
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "944f99cb",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "# 6. Review the record\n",
 76 |     "for rec in recs: \n",
 77 |     "    if rec.name == 'KM288867':\n",
 78 |     "        break \n",
 79 |     "print(rec.name) \n",
 80 |     "print(rec.description) "
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "id": "7dd46b0d",
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "# 7. Extract features\n",
 91 |     "for feature in rec.features: \n",
 92 |     "    if feature.type == 'gene':\n",
 93 |     "        print(feature.qualifiers['gene']) \n",
 94 |     "    elif feature.type == 'exon': \n",
 95 |     "        loc = feature.location \n",
 96 |     "        print(loc.start, loc.end, loc.strand) \n",
 97 |     "    else: \n",
 98 |     "        print('not processed:\\n%s' % feature) "
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "id": "a3f5356f",
105 |    "metadata": {},
106 |    "outputs": [],
107 |    "source": [
108 |     "# 8. Review annotations\n",
109 |     "for name, value in rec.annotations.items(): \n",
110 |     "    print('%s=%s' % (name, value)) "
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "id": "fb87731a",
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "# 9.  Access the Sequence\n",
121 |     "print(len(rec.seq))"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "id": "00dcb7a3",
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "# 10.  There's More... (Pubmed search)\n",
132 |     "from Bio import Medline \n",
133 |     "refs = rec.annotations['references'] \n",
134 |     "for ref in refs: \n",
135 |     "    if ref.pubmed_id != '': \n",
136 |     "        print(ref.pubmed_id) \n",
137 |     "        handle = Entrez.efetch(db='pubmed', id=[ref.pubmed_id], rettype='medline', retmode='text') \n",
138 |     "        records = Medline.parse(handle) \n",
139 |     "        for med_rec in records: \n",
140 |     "            for k, v in med_rec.items(): \n",
141 |     "                print('%s: %s' % (k, v)) "
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "id": "19e2f484",
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "## End of Notebook ##"
152 |    ]
153 |   }
154 |  ],
155 |  "metadata": {
156 |   "kernelspec": {
157 |    "display_name": "Python 3 (ipykernel)",
158 |    "language": "python",
159 |    "name": "python3"
160 |   },
161 |   "language_info": {
162 |    "codemirror_mode": {
163 |     "name": "ipython",
164 |     "version": 3
165 |    },
166 |    "file_extension": ".py",
167 |    "mimetype": "text/x-python",
168 |    "name": "python",
169 |    "nbconvert_exporter": "python",
170 |    "pygments_lexer": "ipython3",
171 |    "version": "3.11.3"
172 |   }
173 |  },
174 |  "nbformat": 4,
175 |  "nbformat_minor": 5
176 | }
177 | 


--------------------------------------------------------------------------------
/Ch09/Ch09-5-proteomics.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "03576b95-0a35-4fbb-b0ec-7f451c854b86",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch09-5 - Proteomics Analysis"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "95cb6809-e14b-446e-8c48-75b66f789bff",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "## Install Packages ##\n",
 21 |     "! pip install biopython matplotlib pandas seaborn pyteomics"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "52e3d25f-8449-4c99-970d-23b36c60a465",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Import Libraries \n",
 32 |     "import pandas as pd\n",
 33 |     "import matplotlib.pyplot as plt\n",
 34 |     "import seaborn as sns\n",
 35 |     "from Bio.SeqUtils.ProtParam import ProteinAnalysis\n",
 36 |     "from pyteomics import parser, mass"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "id": "fe53ba18-a22f-4154-b80f-620aa3c2254c",
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# Define a sample protein sequence (P53 - TP53 Tumor Suppressor)\n",
 47 |     "protein_sequence = (\n",
 48 |     "    \"MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPG\"\n",
 49 |     "    \"PDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT\"\n",
 50 |     "    \"GFVKVGQSTSRHKKLMFKTEGPDSD\"\n",
 51 |     ")"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "id": "ae2732b2-b6a1-492d-8887-415a498e7bd2",
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# Analyze protein properties\n",
 62 |     "protein = ProteinAnalysis(protein_sequence)\n",
 63 |     "molecular_weight = protein.molecular_weight()\n",
 64 |     "hydrophobicity = protein.gravy()\n",
 65 |     "isoelectric_point = protein.isoelectric_point()\n",
 66 |     "amino_acid_composition = protein.count_amino_acids()\n",
 67 |     "\n",
 68 |     "print(f\"Protein Molecular Weight: {molecular_weight:.2f} Da\")\n",
 69 |     "print(f\"Protein Hydrophobicity (GRAVY): {hydrophobicity:.2f}\")\n",
 70 |     "print(f\"Protein Isoelectric Point (pI): {isoelectric_point:.2f}\")"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "id": "962694fa-5081-4aa6-9d13-c829f914cc76",
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "# Perform in-silico trypsin digestion\n",
 81 |     "peptides = sorted(list(parser.cleave(protein_sequence, parser.expasy_rules['trypsin'])))  # Convert set to list"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "id": "bdd68dd6-774e-4d65-972c-35c9778df3be",
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# Calculate peptide masses\n",
 92 |     "peptide_masses = [mass.calculate_mass(sequence=p) for p in peptides]"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "id": "80d4c748-df80-4664-9187-00a673fcb1aa",
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "# Create DataFrame for peptide properties\n",
103 |     "df = pd.DataFrame({'Peptide': peptides, 'Mass (Da)': peptide_masses})\n",
104 |     "\n",
105 |     "# Filter out very small peptides (e.g., <500 Da)\n",
106 |     "df = df[df['Mass (Da)'] > 500]"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "id": "ca73d09f-7673-4226-8b90-ba6e3210cb3f",
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "# Display top 10 peptides\n",
117 |     "print(\"\\nTop 10 Peptides:\")\n",
118 |     "print(df.head(10))"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "id": "b8d288a6-6533-4220-bcbd-ce49e94c022b",
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "# Visualization: Peptide mass distribution\n",
129 |     "plt.figure(figsize=(10, 5))\n",
130 |     "sns.histplot(df['Mass (Da)'], bins=30, kde=True, color=\"blue\")\n",
131 |     "plt.xlabel(\"Peptide Mass (Da)\")\n",
132 |     "plt.ylabel(\"Frequency\")\n",
133 |     "plt.title(\"Peptide Mass Distribution (Trypsin Digest)\")\n",
134 |     "plt.show()"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "id": "58a56f8d-57b3-4ff1-8bf1-f88c2b32c04f",
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "## End of Notebook ##"
145 |    ]
146 |   }
147 |  ],
148 |  "metadata": {
149 |   "kernelspec": {
150 |    "display_name": "Python 3 (ipykernel)",
151 |    "language": "python",
152 |    "name": "python3"
153 |   },
154 |   "language_info": {
155 |    "codemirror_mode": {
156 |     "name": "ipython",
157 |     "version": 3
158 |    },
159 |    "file_extension": ".py",
160 |    "mimetype": "text/x-python",
161 |    "name": "python",
162 |    "nbconvert_exporter": "python",
163 |    "pygments_lexer": "ipython3",
164 |    "version": "3.11.3"
165 |   }
166 |  },
167 |  "nbformat": 4,
168 |  "nbformat_minor": 5
169 | }
170 | 


--------------------------------------------------------------------------------
/Ch07/Ch07-3-long-read-assembly.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "1d3b4ccb",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch07-3 - Long Read Assembly"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "9210885f",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Get nanopore reads for E. coli (Loman lab)\n",
 21 |     "! wget https://nanopore.s3.climb.ac.uk/MAP006-1_2D_pass.fasta\n",
 22 |     "! mv MAP006-1_2D_pass.fasta data/"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "id": "7d11b44f",
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "# Install Raven\n",
 33 |     "! git clone https://github.com/lbcb-sci/raven.git && cd raven\n",
 34 |     "! cmake -S ./ -B./build -DRAVEN_BUILD_EXE=1 -DCMAKE_BUILD_TYPE=Release\n",
 35 |     "!cmake --build build"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "id": "c5e0a7dc",
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "# To put Raven in your path\n",
 46 |     "! export PATH=$PATH:~/work/CookBook/Ch07/raven/build/bin"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "id": "16e6690e",
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "# To add Raven to your .zshrc\n",
 57 |     "! echo 'export PATH=$PATH:~/work/CookBook/Ch07/raven/build/bin' >> ~/.zshrc\n",
 58 |     "! source ~/.zshrc"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "id": "9a8244ca",
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "# Check if raven is working (you may need to restart your Jupyter notebook)\n",
 69 |     "! raven --help"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 1,
 75 |    "id": "608bdf6b-0e59-4780-8eba-0310b42fa548",
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "# Import Libraries\n",
 80 |     "import subprocess"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "id": "f6971be9",
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "# Run Raven assembler\n",
 91 |     "def run_raven(input_fasta, output_fasta):\n",
 92 |     "    \"\"\"\n",
 93 |     "    Runs Raven to assemble a genome from an input FASTA file.\n",
 94 |     "\n",
 95 |     "    Parameters:\n",
 96 |     "        input_fasta (str): Path to the input FASTA file containing long reads.\n",
 97 |     "        output_fasta (str): Path to save the assembled genome in FASTA format.\n",
 98 |     "\n",
 99 |     "    Returns:\n",
100 |     "        None\n",
101 |     "    \"\"\"\n",
102 |     "    try:\n",
103 |     "        print(f\"Running Raven on {input_fasta}...\")\n",
104 |     "        # Use stdout redirection for Raven output\n",
105 |     "        with open(output_fasta, \"w\") as output_file:\n",
106 |     "            command = [\"raven\", input_fasta]\n",
107 |     "            subprocess.run(command, stdout=output_file, check=True)\n",
108 |     "        print(f\"Assembly completed. Output saved to {output_fasta}\")\n",
109 |     "    except FileNotFoundError:\n",
110 |     "        print(\"Error: Raven is not installed or not found in the system PATH.\")\n",
111 |     "    except subprocess.CalledProcessError as e:\n",
112 |     "        print(f\"Error running Raven: {e}\")\n",
113 |     "    except Exception as e:\n",
114 |     "        print(f\"Unexpected error: {e}\")"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "id": "e754c1e6",
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "# Run the Raven function on our long read input data\n",
125 |     "if __name__ == \"__main__\":\n",
126 |     "    input_fasta = \"data/MAP006-1_2D_pass.fasta\"\n",
127 |     "    output_fasta = \"assembly.fasta\"\n",
128 |     "    run_raven(input_fasta, output_fasta)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "id": "68ccff06",
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "# Move the assembly fasta file to the output directory\n",
139 |     "! mkdir -p output\n",
140 |     "! mv assembly.fasta output/ecoli-assembly.fasta"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "51b3ff98",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "## End of Notebook ##"
151 |    ]
152 |   }
153 |  ],
154 |  "metadata": {
155 |   "kernelspec": {
156 |    "display_name": "Python 3 (ipykernel)",
157 |    "language": "python",
158 |    "name": "python3"
159 |   },
160 |   "language_info": {
161 |    "codemirror_mode": {
162 |     "name": "ipython",
163 |     "version": 3
164 |    },
165 |    "file_extension": ".py",
166 |    "mimetype": "text/x-python",
167 |    "name": "python",
168 |    "nbconvert_exporter": "python",
169 |    "pygments_lexer": "ipython3",
170 |    "version": "3.11.3"
171 |   }
172 |  },
173 |  "nbformat": 4,
174 |  "nbformat_minor": 5
175 | }
176 | 


--------------------------------------------------------------------------------
/Ch11/Ch11-2-using-sgkit.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "b2b305b9-9ca7-47b6-a9e3-99628fae7a9a",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch11-2-using-sgkit"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "2e374b2d-e950-4706-9616-fe2a91569257",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install libraries\n",
 21 |     "! pip install sgkit\n",
 22 |     "! pip install 'sgkit[plink]'"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "id": "89d77eba-d7cd-4c7e-b719-d0e89d9e1952",
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "# You may experience compatibility issues with pyarrow - if so uninstall it and update it\n",
 33 |     "! pip uninstall pyarrow -y\n",
 34 |     "! pip install pyarrow --no-cache-dir\n",
 35 |     "# Or try (from Terminal):\n",
 36 |     "# Uninstall from BOTH\n",
 37 |     "# conda uninstall pyarrow -y\n",
 38 |     "# pip uninstall pyarrow -y\n",
 39 |     "# Reinstall with conda ONLY\n",
 40 |     "# conda install -c conda-forge pyarrow -y"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "id": "d6ef6bdb-7b0c-4862-abdc-d49936149a76",
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "# You may also experience Dask incompatibilities; If so try:\n",
 51 |     "! pip install --upgrade dask-ml\n",
 52 |     "! pip install --upgrade dask"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "id": "49fa98d7-b20c-4b7c-8059-b41731966c35",
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# 1.  Load dataset from previous recipe\n",
 63 |     "import numpy as np \n",
 64 |     "from sgkit.io import plink \n",
 65 |     "data = plink.read_plink(path='hapmap10_auto_noofs_ld', fam_sep='\\t') "
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "8882ef96-e1a2-41b7-9796-c8bce3d6d8ad",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "# 2.  Check the structure of the data\n",
 76 |     "data "
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "id": "d898cb18-c020-49dd-896f-9c8c4bd9e629",
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "# 3.  Get summary information\n",
 87 |     "print(data.dims) "
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "id": "11d2e4fc-a090-4794-b2fb-b5272592dc86",
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "# 4.  Extract information about samples\n",
 98 |     "print(len(data.sample_id.values)) \n",
 99 |     "print(data.sample_id.values) \n",
100 |     "print(data.sample_family_id.values) \n",
101 |     "print(data.sample_sex.values) "
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "id": "e9d1cc35-d74a-4e34-a716-d578fc9fc7f7",
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "# 5.  Review contigs\n",
112 |     "print(data.contigs) "
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "id": "a9e73ded-57b6-4dd8-be30-b0235c264af8",
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "# 6. Look at the variants\n",
123 |     "print(len(data.variant_contig.values)) \n",
124 |     "print(data.variant_contig.values) \n",
125 |     "print(data.variant_position.values) \n",
126 |     "print(data.variant_allele.values) \n",
127 |     "print(data.variant_id.values) "
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "id": "9edce820-55b9-4355-81ce-b6dd2b428023",
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "# 7.  Look at Genotype data\n",
138 |     "call_genotype = data.call_genotype.values \n",
139 |     "print(call_genotype.shape) \n",
140 |     "first_individual = call_genotype[:,0,:] \n",
141 |     "first_variant = call_genotype[0,:,:] \n",
142 |     "first_variant_of_first_individual = call_genotype[0,0,:] \n",
143 |     "print(first_variant_of_first_individual) \n",
144 |     "print(data.sample_family_id.values[0], data.sample_id.values[0]) \n",
145 |     "print(data.variant_allele.values[0]) "
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "id": "540f6197-efa9-4ae0-a9ab-b27aa50778ef",
152 |    "metadata": {},
153 |    "outputs": [],
154 |    "source": [
155 |     "## End of Notebook ##"
156 |    ]
157 |   }
158 |  ],
159 |  "metadata": {
160 |   "kernelspec": {
161 |    "display_name": "Python 3 (ipykernel)",
162 |    "language": "python",
163 |    "name": "python3"
164 |   },
165 |   "language_info": {
166 |    "codemirror_mode": {
167 |     "name": "ipython",
168 |     "version": 3
169 |    },
170 |    "file_extension": ".py",
171 |    "mimetype": "text/x-python",
172 |    "name": "python",
173 |    "nbconvert_exporter": "python",
174 |    "pygments_lexer": "ipython3",
175 |    "version": "3.11.14"
176 |   }
177 |  },
178 |  "nbformat": 4,
179 |  "nbformat_minor": 5
180 | }
181 | 


--------------------------------------------------------------------------------
/Ch02/Ch02-3-pandas-memory.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "fbeff717",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch02-3 Pandas Memory"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "71c9ea98-4713-4e8d-9a39-8fa2b764e200",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "########################## Notes & Updates ##############################################################\n",
 21 |     "# If you are using Docker and your data directory is mapped to \"/data\" then you can use the commented-out\n",
 22 |     "#   Docker lines below in place of the primary line (which you will comment out when running)\n",
 23 |     "# You will also find other alternative lines or blocks that can be used to avoid potential issues \n",
 24 |     "#########################################################################################################"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "1e62c334",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# Libraries \n",
 35 |     "import numpy as np \n",
 36 |     "import pandas as pd "
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "id": "dbf171c1",
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# Load data\n",
 47 |     "vdata = pd.read_csv(\"data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\", low_memory=False) \n",
 48 |     "# vdata = pd.read_csv(\"/data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\", low_memory=False) # Docker\n",
 49 |     "vdata.info(memory_usage=\"deep\") "
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "id": "5258bc2e",
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "# Inspect the size of each column\n",
 60 |     "for name in vdata.columns:\n",
 61 |     "    col_bytes = vdata[name].memory_usage(index=False, deep=True) \n",
 62 |     "    col_type = vdata[name].dtype\n",
 63 |     "    print(name, col_type, col_bytes // (1024 ** 2)) "
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "id": "75883fe5",
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "# Review the Died column\n",
 74 |     "vdata.DIED.memory_usage(index=False, deep=True) \n",
 75 |     "vdata.DIED.fillna(False).astype(bool).memory_usage(index=False, deep=True) "
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "id": "551e9be4",
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "# State column\n",
 86 |     "vdata[\"STATE\"] = vdata.STATE.str.upper() \n",
 87 |     "states = list(vdata[\"STATE\"].unique()) \n",
 88 |     "vdata[\"encoded_state\"] = vdata.STATE.apply(lambda state: states.index(state)) \n",
 89 |     "vdata[\"encoded_state\"] = vdata[\"encoded_state\"].astype(np.uint8) \n",
 90 |     "vdata[\"STATE\"].memory_usage(index=False, deep=True) \n",
 91 |     "vdata[\"encoded_state\"].memory_usage(index=False, deep=True) "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "id": "26d93734",
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "# Apply optimizations while loading the data\n",
102 |     "states = list(pd.read_csv(\"vdata_sample.csv.gz\",\n",
103 |     "    converters={\"STATE\": lambda state: state.upper()}, \n",
104 |     "    usecols=[\"STATE\"] \n",
105 |     ")[\"STATE\"].unique()) "
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "id": "bd132f8b",
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "# Reload the data, skipping the SYMPTOM_TEXT column and converting DIED and STATE while reading\n",
116 |     "vdata = pd.read_csv(\"vdata_sample.csv.gz\", index_col=\"VAERS_ID\",\n",
117 |     "    converters={\n",
118 |     "        \"DIED\": lambda died: died == \"Y\", \"STATE\": lambda state: states.index(state.upper())\n",
119 |     "    }, usecols=lambda name: name != \"SYMPTOM_TEXT\")\n",
120 |     "vdata[\"STATE\"] = vdata[\"STATE\"].astype(np.uint8)\n",
121 |     "vdata.info(memory_usage=\"deep\") "
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "id": "28e8a98c-39d0-407f-a3ca-aa37b12bcad0",
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "# Note - it is ok to get a dtype warning on the above"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "id": "a6e2cac3",
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "# End of Notebook #"
142 |    ]
143 |   }
144 |  ],
145 |  "metadata": {
146 |   "kernelspec": {
147 |    "display_name": "Python 3 (ipykernel)",
148 |    "language": "python",
149 |    "name": "python3"
150 |   },
151 |   "language_info": {
152 |    "codemirror_mode": {
153 |     "name": "ipython",
154 |     "version": 3
155 |    },
156 |    "file_extension": ".py",
157 |    "mimetype": "text/x-python",
158 |    "name": "python",
159 |    "nbconvert_exporter": "python",
160 |    "pygments_lexer": "ipython3",
161 |    "version": "3.12.11"
162 |   }
163 |  },
164 |  "nbformat": 4,
165 |  "nbformat_minor": 5
166 | }
167 | 


--------------------------------------------------------------------------------
/Ch03/Ch03-4-test-writing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "0679a213-c3a1-443b-bb61-98b5c7dd980c",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch03-4 Test Writing"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 9,
 16 |    "id": "09272938-f7b9-44f8-8d02-a62b654032b1",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Libraries\n",
 21 |     "import unittest\n",
 22 |     "from Bio.Align import PairwiseAligner"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 10,
 28 |    "id": "95af1088-176c-404e-b1bd-ef19e7fbd11d",
 29 |    "metadata": {},
 30 |    "outputs": [
 31 |     {
 32 |      "name": "stderr",
 33 |      "output_type": "stream",
 34 |      "text": [
 35 |       "FF\n",
 36 |       "======================================================================\n",
 37 |       "FAIL: test_global_alignment (__main__.TestPairwiseAligner.test_global_alignment)\n",
 38 |       "Test global alignment.\n",
 39 |       "----------------------------------------------------------------------\n",
 40 |       "Traceback (most recent call last):\n",
 41 |       "  File \"/var/folders/53/kmyyy3057lndfb0bpwx_2pkr0000gn/T/ipykernel_37172/1953800863.py\", line 24, in test_global_alignment\n",
 42 |       "    self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n",
 43 |       "AssertionError: 6.0 != 9.0 within 1 places (3.0 difference)\n",
 44 |       "\n",
 45 |       "======================================================================\n",
 46 |       "FAIL: test_local_alignment (__main__.TestPairwiseAligner.test_local_alignment)\n",
 47 |       "Test local alignment.\n",
 48 |       "----------------------------------------------------------------------\n",
 49 |       "Traceback (most recent call last):\n",
 50 |       "  File \"/var/folders/53/kmyyy3057lndfb0bpwx_2pkr0000gn/T/ipykernel_37172/1953800863.py\", line 39, in test_local_alignment\n",
 51 |       "    self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n",
 52 |       "AssertionError: 7.0 != 6.0 within 1 places (1.0 difference)\n",
 53 |       "\n",
 54 |       "----------------------------------------------------------------------\n",
 55 |       "Ran 2 tests in 0.005s\n",
 56 |       "\n",
 57 |       "FAILED (failures=2)\n"
 58 |      ]
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "# Define the test Class\n",
 63 |     "class TestPairwiseAligner(unittest.TestCase):\n",
 64 |     "\n",
 65 |     "    def setUp(self):\n",
 66 |     "        \"\"\"Set up test cases with sequences and aligner.\"\"\"\n",
 67 |     "        self.seq1 = \"ACGTGCTAGCTAG\"\n",
 68 |     "        self.seq2 = \"ACGTCGATGCTA\"\n",
 69 |     "        self.aligner = PairwiseAligner()\n",
 70 |     "        self.aligner.match_score = 1\n",
 71 |     "        self.aligner.mismatch_score = -1\n",
 72 |     "        self.aligner.open_gap_score = -1\n",
 73 |     "        self.aligner.extend_gap_score = -0.5\n",
 74 |     "\n",
 75 |     "    def test_global_alignment(self):\n",
 76 |     "        \"\"\"Test global alignment.\"\"\"\n",
 77 |     "        alignments = self.aligner.align(self.seq1, self.seq2)\n",
 78 |     "        best_alignment = alignments[0]\n",
 79 |     "        \n",
 80 |     "        # Expected results\n",
 81 |     "        expected_score = 9.0\n",
 82 |     "        expected_target = \"ACGTGCTAGCTAG\"\n",
 83 |     "        expected_query = \"ACGTCGATGCTA-\"\n",
 84 |     "        \n",
 85 |     "        self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n",
 86 |     "        self.assertEqual(str(best_alignment).splitlines()[0], \"target            0 ACGTGCTAGCTAG 13\")\n",
 87 |     "        self.assertEqual(str(best_alignment).splitlines()[2], \"query             0 ACGTCGATGCTA- 12\")\n",
 88 |     "\n",
 89 |     "    def test_local_alignment(self):\n",
 90 |     "        \"\"\"Test local alignment.\"\"\"\n",
 91 |     "        self.aligner.mode = 'local'\n",
 92 |     "        alignments = self.aligner.align(self.seq1, self.seq2)\n",
 93 |     "        best_alignment = alignments[0]\n",
 94 |     "        \n",
 95 |     "        # Expected results\n",
 96 |     "        expected_score = 6.0\n",
 97 |     "        expected_target = \"GTGCTAG\"\n",
 98 |     "        expected_query = \"GTCGATG\"\n",
 99 |     "        \n",
100 |     "        self.assertAlmostEqual(best_alignment.score, expected_score, places=1)\n",
101 |     "        self.assertIn(expected_target, str(best_alignment))\n",
102 |     "        self.assertIn(expected_query, str(best_alignment))\n",
103 |     "\n",
104 |     "# Run the tests\n",
105 |     "if __name__ == \"__main__\":\n",
106 |     "    unittest.main(argv=[''], exit=False)\n"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "id": "6793ddb1-4ca7-4915-b9a1-f2e81d939e8f",
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": []
116 |   }
117 |  ],
118 |  "metadata": {
119 |   "kernelspec": {
120 |    "display_name": "Python 3 (ipykernel)",
121 |    "language": "python",
122 |    "name": "python3"
123 |   },
124 |   "language_info": {
125 |    "codemirror_mode": {
126 |     "name": "ipython",
127 |     "version": 3
128 |    },
129 |    "file_extension": ".py",
130 |    "mimetype": "text/x-python",
131 |    "name": "python",
132 |    "nbconvert_exporter": "python",
133 |    "pygments_lexer": "ipython3",
134 |    "version": "3.11.3"
135 |   }
136 |  },
137 |  "nbformat": 4,
138 |  "nbformat_minor": 5
139 | }
140 | 


--------------------------------------------------------------------------------
/Ch07/Ch07-1-genomes.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "a2023924",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch07-1-genomes"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "c614152d",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Download T2T reference genome\n",
 21 |     "! mkdir -p data\n",
 22 |     "! wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz\n",
 23 |     "! gunzip GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz\n",
 24 |     "! mv GCA_009914755.4_T2T-CHM13v2.0_genomic.fna data/T2T_genome.fasta"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "4d8c6315",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# Install pyfastx\n",
 35 |     "! pip install pyfastx"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 6,
 41 |    "id": "a7b566a0",
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "# Import libraries\n",
 46 |     "import pyfastx"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 1,
 52 |    "id": "f426a4be",
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "# Input genome file\n",
 57 |     "genome_fasta = \"data/T2T_genome.fasta\""
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 2,
 63 |    "id": "bc05ab68",
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "def compute_genome_size(fasta_file):\n",
 68 |     "    \"\"\"\n",
 69 |     "    Compute the total genome size from a FASTA file.\n",
 70 |     "    \"\"\"\n",
 71 |     "    genome_size = 0\n",
 72 |     "    genome = pyfastx.Fasta(fasta_file, build_index=False)\n",
 73 |     "    for _, seq in genome:  # Use the sequence directly from the tuple\n",
 74 |     "        genome_size += len(seq)\n",
 75 |     "    return genome_size"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 8,
 81 |    "id": "1c39876c",
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "def compute_gc_content(fasta_file):\n",
 86 |     "    \"\"\"\n",
 87 |     "    Compute the overall GC content of the genome.\n",
 88 |     "    \"\"\"\n",
 89 |     "    total_bases = 0\n",
 90 |     "    gc_count = 0\n",
 91 |     "    genome = pyfastx.Fasta(fasta_file, build_index=False)\n",
 92 |     "    for _, seq in genome:  # Use the sequence directly from the tuple\n",
 93 |     "        total_bases += len(seq)\n",
 94 |     "        gc_count += seq.upper().count('G') + seq.upper().count('C')\n",
 95 |     "    return (gc_count / total_bases) * 100 if total_bases > 0 else 0"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 4,
101 |    "id": "05443299",
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "def compute_n50(fasta_file):\n",
106 |     "    \"\"\"\n",
107 |     "    Compute the N50 value for the genome.\n",
108 |     "    \"\"\"\n",
109 |     "    lengths = []\n",
110 |     "    genome = pyfastx.Fasta(fasta_file, build_index=False)\n",
111 |     "    lengths = sorted([len(seq) for _, seq in genome], reverse=True)\n",
112 |     "    \n",
113 |     "    cumulative_length = 0\n",
114 |     "    total_length = sum(lengths)\n",
115 |     "    for length in lengths:\n",
116 |     "        cumulative_length += length\n",
117 |     "        if cumulative_length >= total_length / 2:\n",
118 |     "            return length\n",
119 |     "    return 0"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 9,
125 |    "id": "86d5d764",
126 |    "metadata": {},
127 |    "outputs": [
128 |     {
129 |      "name": "stdout",
130 |      "output_type": "stream",
131 |      "text": [
132 |       "Genome Quality Metrics for data/T2T_genome.fasta:\n",
133 |       "Total Genome Size: 3,117,292,070 bp\n",
134 |       "GC Content: 40.75%\n",
135 |       "N50: 150,617,247 bp\n"
136 |      ]
137 |     }
138 |    ],
139 |    "source": [
140 |     "def assess_quality(fasta_file):\n",
141 |     "    \"\"\"\n",
142 |     "    Assess the quality of a T2T genome by calculating key metrics.\n",
143 |     "    \"\"\"\n",
144 |     "    genome_size = compute_genome_size(fasta_file)\n",
145 |     "    gc_content = compute_gc_content(fasta_file)\n",
146 |     "    n50 = compute_n50(fasta_file)\n",
147 |     "\n",
148 |     "    print(f\"Genome Quality Metrics for {fasta_file}:\")\n",
149 |     "    print(f\"Total Genome Size: {genome_size:,} bp\")\n",
150 |     "    print(f\"GC Content: {gc_content:.2f}%\")\n",
151 |     "    print(f\"N50: {n50:,} bp\")\n",
152 |     "\n",
153 |     "# Run the quality assessment\n",
154 |     "if __name__ == \"__main__\":\n",
155 |     "    assess_quality(genome_fasta)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "id": "a43d8913",
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "## End of Notebook ##"
166 |    ]
167 |   }
168 |  ],
169 |  "metadata": {
170 |   "kernelspec": {
171 |    "display_name": "Python 3 (ipykernel)",
172 |    "language": "python",
173 |    "name": "python3"
174 |   },
175 |   "language_info": {
176 |    "codemirror_mode": {
177 |     "name": "ipython",
178 |     "version": 3
179 |    },
180 |    "file_extension": ".py",
181 |    "mimetype": "text/x-python",
182 |    "name": "python",
183 |    "nbconvert_exporter": "python",
184 |    "pygments_lexer": "ipython3",
185 |    "version": "3.11.3"
186 |   }
187 |  },
188 |  "nbformat": 4,
189 |  "nbformat_minor": 5
190 | }
191 | 


--------------------------------------------------------------------------------
/Ch04/Ch04-3-k-means.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "50013f28",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch04-3 K Means Analysis of Breast Cancer dataset"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "559118f4",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install scipy\n",
 21 |     "! pip install scipy==1.15.3  # Note that the latest conda scipy we could install was 1.14.1 in the YAML file"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "e81ad048",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Libraries\n",
 32 |     "from sklearn.datasets import load_breast_cancer\n",
 33 |     "from sklearn.cluster import KMeans\n",
 34 |     "from sklearn.preprocessing import StandardScaler\n",
 35 |     "from sklearn.decomposition import PCA\n",
 36 |     "import matplotlib.pyplot as plt\n",
 37 |     "import pandas as pd\n",
 38 |     "import numpy as np\n",
 39 |     "from sklearn.metrics import accuracy_score\n",
 40 |     "from scipy.stats import mode"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "id": "2c0b64a8",
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "# Load Breast Cancer dataset\n",
 51 |     "data = load_breast_cancer()\n",
 52 |     "X = data.data  # Features array\n",
 53 |     "y = data.target  # Labels / classes"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "id": "48f15654",
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "# Normalize the data\n",
 64 |     "scaler = StandardScaler()\n",
 65 |     "X_scaled = scaler.fit_transform(X)"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "05349bc4",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "# K-Means clustering\n",
 76 |     "kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)  # We know there are 2 classes; initialize 10 times\n",
 77 |     "clusters = kmeans.fit_predict(X_scaled)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "id": "b155954b",
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# Build a dataframe to analyze\n",
 88 |     "bc_kmeans_df = pd.DataFrame(X_scaled, columns=data.feature_names)\n",
 89 |     "bc_kmeans_df['Cluster'] = clusters\n",
 90 |     "bc_kmeans_df['True Label'] = y"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "id": "e681d95b",
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "# Align the labels to avoid Cluster Flipping\n",
101 |     "def align_labels(true_labels, cluster_labels):\n",
102 |     "    # Map cluster labels to the majority true label in each cluster\n",
103 |     "    new_labels = np.zeros_like(cluster_labels)\n",
104 |     "    for cluster in np.unique(cluster_labels):\n",
105 |     "        mask = cluster_labels == cluster\n",
106 |     "        new_labels[mask] = mode(true_labels[mask], keepdims=False)[0]\n",
107 |     "    return new_labels\n",
108 |     "aligned_clusters = align_labels(y, clusters)"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "id": "461f859d",
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "# Evaluate accuracy of clustering\n",
119 |     "accuracy = accuracy_score(y, aligned_clusters)\n",
120 |     "print(f\"Accuracy of clustering: {accuracy:.2f}\")"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "id": "b1c96bff",
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "# Reduce the dimensions using PCA\n",
131 |     "pca = PCA(n_components=2)  # Use 2 components\n",
132 |     "X_pca = pca.fit_transform(X_scaled)\n",
133 |     "bc_kmeans_df['PC1'] = X_pca[:, 0]\n",
134 |     "bc_kmeans_df['PC2'] = X_pca[:, 1]"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": null,
140 |    "id": "9538605b",
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "# Plot the K-means clustering results\n",
145 |     "plt.figure(figsize=(8, 6))\n",
146 |     "for cluster, color, marker in zip([0, 1], ['red', 'blue'], ['o', '^']):\n",
147 |     "    subset = bc_kmeans_df[bc_kmeans_df['Cluster'] == cluster]\n",
148 |     "    plt.scatter(subset['PC1'], subset['PC2'], c=color, label=f'Cluster {cluster}', marker=marker, alpha=0.7)\n",
149 |     "\n",
150 |     "plt.title('K-Means Clustering on Breast Cancer Dataset')\n",
151 |     "plt.xlabel('Principal Component 1')\n",
152 |     "plt.ylabel('Principal Component 2')\n",
153 |     "plt.legend()\n",
154 |     "plt.grid()\n",
155 |     "plt.show()"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "id": "09d475f1",
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "## End of Notebook ##"
166 |    ]
167 |   }
168 |  ],
169 |  "metadata": {
170 |   "kernelspec": {
171 |    "display_name": "Python 3 (ipykernel)",
172 |    "language": "python",
173 |    "name": "python3"
174 |   },
175 |   "language_info": {
176 |    "codemirror_mode": {
177 |     "name": "ipython",
178 |     "version": 3
179 |    },
180 |    "file_extension": ".py",
181 |    "mimetype": "text/x-python",
182 |    "name": "python",
183 |    "nbconvert_exporter": "python",
184 |    "pygments_lexer": "ipython3",
185 |    "version": "3.12.10"
186 |   }
187 |  },
188 |  "nbformat": 4,
189 |  "nbformat_minor": 5
190 | }
191 | 


--------------------------------------------------------------------------------
/Ch04/Ch04-2-PCA.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "3bca36d9",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch04-2-PCA "
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "0dabaee9",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install scikit-learn\n",
 21 |     "!  pip install scikit-learn==1.7.0"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "283a2653",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Libraries\n",
 32 |     "from sklearn.datasets import load_breast_cancer\n",
 33 |     "from sklearn.decomposition import PCA\n",
 34 |     "from sklearn.preprocessing import StandardScaler\n",
 35 |     "import matplotlib.pyplot as plt\n",
 36 |     "import pandas as pd"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "id": "42defdf1",
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# Load Breast Cancer dataset\n",
 47 |     "bc_data = load_breast_cancer()\n",
 48 |     "X = bc_data.data  # Features\n",
 49 |     "y = bc_data.target  # Target labels (0 = malignant, 1 = benign)"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "id": "3c11d674",
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "type(bc_data)  # See what type of data is returned by sklearn - it is a Bunch"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "id": "572461df",
 66 |    "metadata": {},
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "# Standardize the data so that mean = 0 & variance = 1\n",
 70 |     "scaler = StandardScaler()\n",
 71 |     "X_scaled = scaler.fit_transform(X)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "id": "24527df4",
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "# Perform PCA with 3 components\n",
 82 |     "bc_pca = PCA(n_components=3)\n",
 83 |     "X_bc_pca = bc_pca.fit_transform(X_scaled)"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "id": "bdd8a277",
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "# Load the results into a dataframe\n",
 94 |     "bc_pca_df = pd.DataFrame(X_bc_pca, columns=['PC1', 'PC2', 'PC3'])\n",
 95 |     "bc_pca_df['label'] = y"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "id": "ecd40b92",
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "# 3D Scatter Plot for PC1, PC2, and PC3\n",
106 |     "fig = plt.figure(figsize=(10, 8))\n",
107 |     "ax = fig.add_subplot(111, projection='3d')\n",
108 |     "# Plot each class (label) as its own scatter series with a distinct color and marker\n",
109 |     "for label, color, marker in zip([0, 1], ['red', 'blue'], ['o', '^']):\n",
110 |     "    subset = bc_pca_df[bc_pca_df['label'] == label]\n",
111 |     "    ax.scatter(subset['PC1'], subset['PC2'], subset['PC3'], c=color, label=bc_data.target_names[label], marker=marker, alpha=0.7)\n",
112 |     "# end for\n",
113 |     "ax.set_title('PCA on Breast Cancer Dataset (3D View: PC1, PC2, PC3)')\n",
114 |     "ax.set_xlabel('Principal Component 1')\n",
115 |     "ax.set_ylabel('Principal Component 2')\n",
116 |     "ax.set_zlabel('Principal Component 3')\n",
117 |     "ax.legend()\n",
118 |     "plt.show()"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "id": "5bca3af8",
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "# Get the explained variance ratio\n",
129 |     "explained_variance = bc_pca.explained_variance_ratio_\n",
130 |     "\n",
131 |     "# Print explained variance for each component\n",
132 |     "for i, variance in enumerate(explained_variance, 1):\n",
133 |     "    print(f\"Explained variance for PC{i}: {variance:.2f}\")\n",
134 |     "\n",
135 |     "# Print the total explained variance (sum over all components)\n",
136 |     "cumulative_variance = explained_variance.sum()\n",
137 |     "print(f\"Total variance explained by the first 3 components: {cumulative_variance:.2f}\")"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "id": "882b2889",
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "# Plot cumulative explained variance\n",
148 |     "cumulative_variance = explained_variance.cumsum()\n",
149 |     "plt.figure(figsize=(8, 6))\n",
150 |     "plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--')\n",
151 |     "plt.title('Cumulative Explained Variance by Principal Components')\n",
152 |     "plt.xlabel('Number of Principal Components')\n",
153 |     "plt.ylabel('Cumulative Explained Variance')\n",
154 |     "plt.grid(True)\n",
155 |     "plt.show()"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "id": "01d4c628",
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "## End of Notebook ##"
166 |    ]
167 |   }
168 |  ],
169 |  "metadata": {
170 |   "kernelspec": {
171 |    "display_name": "Python 3 (ipykernel)",
172 |    "language": "python",
173 |    "name": "python3"
174 |   },
175 |   "language_info": {
176 |    "codemirror_mode": {
177 |     "name": "ipython",
178 |     "version": 3
179 |    },
180 |    "file_extension": ".py",
181 |    "mimetype": "text/x-python",
182 |    "name": "python",
183 |    "nbconvert_exporter": "python",
184 |    "pygments_lexer": "ipython3",
185 |    "version": "3.8.6"
186 |   }
187 |  },
188 |  "nbformat": 4,
189 |  "nbformat_minor": 5
190 | }
191 | 


--------------------------------------------------------------------------------
/Ch05/Ch05-1-qc-data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "64c65fdc-8012-4a2d-9585-f89a062504fa",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch05-1-qc-data"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "29cf5d46-8106-42bf-a02c-3072044e3e86",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Get data\n",
 21 |     "! wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR390/SRR390728/SRR390728_1.fastq.gz \n",
 22 |     "! wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR390/SRR390728/SRR390728_2.fastq.gz \n",
 23 |     "! mv SRR390728_1.fastq.gz data/\n",
 24 |     "! mv SRR390728_2.fastq.gz data/"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "eb999d32-b0fe-4eb0-aa68-fae649cf9b45",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# If you want to install FastQC from within your notebook use this\n",
 35 |     "! yes | conda install -c bioconda fastqc"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "id": "08952a4f-473e-49eb-a003-9216a7541bd2",
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "# Install MultiQC if desired\n",
 46 |     "! yes | conda install -c bioconda multiqc"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "id": "82b62e83-37eb-43db-b7da-ea7b5d26cf89",
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "# Import Libraries\n",
 57 |     "import os\n",
 58 |     "import subprocess"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "id": "25367b17-2af5-4eb9-9ff9-45aa32be9ee3",
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "def run_fastqc(input_dir, output_dir):\n",
 69 |     "    \"\"\"\n",
 70 |     "    Function to Run FastQC on all FASTQ files in the input directory.\n",
 71 |     "    \"\"\"\n",
 72 |     "    # Create an output directory for the FastQC reports\n",
 73 |     "    os.makedirs(output_dir, exist_ok=True)\n",
 74 |     "    # List all of the FASTQ files in the input directory\n",
 75 |     "    fastq_files = [f for f in os.listdir(input_dir) if f.endswith((\".fastq\", \".fastq.gz\"))]\n",
 76 |     "    if not fastq_files:\n",
 77 |     "        print(\"Could not find any FASTQ files in the input directory.\")\n",
 78 |     "        return\n",
 79 |     "    print(\"Running FastQC...\")\n",
 80 |     "    # Build the FastQC command\n",
 81 |     "    fastqc_command = [\"fastqc\", \"-o\", output_dir] + [os.path.join(input_dir, f) for f in fastq_files]\n",
 82 |     "    # Execute FastQC\n",
 83 |     "    subprocess.run(fastqc_command)\n",
 84 |     "    print(\"FastQC analysis Completed.\")"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "id": "5ef0cee9-cf95-4d57-844e-74fb7e1ee531",
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "def run_multiqc(input_dir, output_dir):\n",
 95 |     "    \"\"\"\n",
 96 |     "    Run MultiQC for aggregation of FastQC results.\n",
 97 |     "    \"\"\"\n",
 98 |     "    # Create output directory for the reports\n",
 99 |     "    os.makedirs(output_dir, exist_ok=True)\n",
100 |     "    print(\"Running MultiQC...\")\n",
101 |     "    # Build the MultiQC command\n",
102 |     "    multiqc_command = [\"multiqc\", input_dir, \"-o\", output_dir]\n",
103 |     "    # Execute the MultiQC command\n",
104 |     "    subprocess.run(multiqc_command)\n",
105 |     "    print(\"Finished...MultiQC report(s) generated.\")"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "id": "c95cbbfd-96ab-49f8-96da-5e68800f130f",
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "def main():\n",
116 |     "    \"\"\"\n",
117 |     "    Main function to perform quality control using FastQC and MultiQC.\n",
118 |     "    \"\"\"\n",
119 |     "    # Define the input and output directories\n",
120 |     "    input_dir = \"./data\"  # We placed our fastq files in the data subdirectory\n",
121 |     "    fastqc_output_dir = \"fastqc_output\"\n",
122 |     "    multiqc_output_dir = \"multiqc_output\"\n",
123 |     "    # Run FastQC\n",
124 |     "    run_fastqc(input_dir, fastqc_output_dir)\n",
125 |     "    # Run MultiQC\n",
126 |     "    run_multiqc(fastqc_output_dir, multiqc_output_dir)\n",
127 |     "    print(f\"MultiQC report saved in: {os.path.abspath(multiqc_output_dir)}\")\n",
128 |     "if __name__ == \"__main__\":\n",
129 |     "    main()"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 16,
135 |    "id": "0b2af0c2-6dfc-4dad-9d4b-0492f2cf0ed5",
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "# Review the report\n",
140 |     "! open multiqc_output/multiqc_report.html "
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "caaa2bfc-75cc-461c-a84d-a1b27d3b3262",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# End of Notebook #"
151 |    ]
152 |   }
153 |  ],
154 |  "metadata": {
155 |   "kernelspec": {
156 |    "display_name": "Python 3 (ipykernel)",
157 |    "language": "python",
158 |    "name": "python3"
159 |   },
160 |   "language_info": {
161 |    "codemirror_mode": {
162 |     "name": "ipython",
163 |     "version": 3
164 |    },
165 |    "file_extension": ".py",
166 |    "mimetype": "text/x-python",
167 |    "name": "python",
168 |    "nbconvert_exporter": "python",
169 |    "pygments_lexer": "ipython3",
170 |    "version": "3.11.3"
171 |   }
172 |  },
173 |  "nbformat": 4,
174 |  "nbformat_minor": 5
175 | }
176 | 


--------------------------------------------------------------------------------
/Ch12/Ch12-1-cobrapy.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "3badbb9f-6117-46ee-9978-87f10180a0cf",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch12-1 - Metabolic Modelling with CobraPy"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "6722835f-7af1-4edd-964b-6ab25676fc50",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install the CobraPy package\n",
 21 |     "! pip install cobra"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "544f1e2e-b155-48e4-bd35-08f36ca43007",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Download the E. coli SBML model from BiGG\n",
 32 |     "! wget http://bigg.ucsd.edu/static/models/e_coli_core.xml"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "id": "329c2c54-e29e-45e1-90a9-c846f5da2c30",
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "# Load the E. coli core model \n",
 43 |     "import cobra\n",
 44 |     "model = cobra.io.read_sbml_model(\"e_coli_core.xml\")"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": null,
 50 |    "id": "41e0bb39-4e8a-421f-a190-9c4e8a9993ab",
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "# -----------------------------\n",
 55 |     "# Step 1: Run FBA on the Wild‐Type Model\n",
 56 |     "# -----------------------------\n",
 57 |     "solution_wt = model.optimize()\n",
 58 |     "print(\"Wild-type growth rate (objective value):\", solution_wt.objective_value)\n",
 59 |     "print(\"Flux distribution for key reactions:\")\n",
 60 |     "for rxn in model.reactions[:10]:\n",
 61 |     "    print(f\"{rxn.id}: {solution_wt.fluxes[rxn.id]}\")"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "id": "e649b3cf-9ba0-450c-8ab1-aa998272e961",
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "# Review the objective function\n",
 72 |     "from cobra.util.solver import linear_reaction_coefficients\n",
 73 |     "print(\"Objective direction:\", model.objective.direction)   # 'max' or 'min'\n",
 74 |     "for rxn, coef in linear_reaction_coefficients(model).items():\n",
 75 |     "    print(f\"{rxn.id}: {coef}\")"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "id": "30534058-aae7-464a-86fb-93e2370aa71c",
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "# -----------------------------\n",
 86 |     "# Step 2: Simulate a Gene Knockout\n",
 87 |     "# -----------------------------\n",
 88 |     "# For example, let's knock out gene \"b0351\" (this is one of the genes in the E. coli model)\n",
 89 |     "gene_to_knockout = \"b0351\"\n",
 90 |     "with model:\n",
 91 |     "    # Knock out the gene (this automatically adjusts the reactions associated with the gene)\n",
 92 |     "    model.genes.get_by_id(gene_to_knockout).knock_out()\n",
 93 |     "    \n",
 94 |     "    # Optimize the model after knockout\n",
 95 |     "    solution_ko = model.optimize()\n",
 96 |     "    print(f\"\\nGrowth rate after knocking out gene {gene_to_knockout}:\", solution_ko.objective_value)"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "id": "204c324d-2ce1-4f5d-b886-7de9624e17bd",
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "# Review genes you could knock out in your model\n",
107 |     "print(\"Available gene IDs in the model:\")\n",
108 |     "for gene in model.genes:\n",
109 |     "    print(gene.id)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "id": "955ca8b8-9265-4a57-aa43-0901659f6f58",
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "# Step 2b - Try other gene knock outs\n",
120 |     "gene_to_knockout = \"s0001\"\n",
121 |     "with model:\n",
122 |     "    # Knock out the gene (this automatically adjusts the reactions associated with the gene)\n",
123 |     "    model.genes.get_by_id(gene_to_knockout).knock_out()\n",
124 |     "    \n",
125 |     "    # Optimize the model after knockout\n",
126 |     "    solution_ko = model.optimize()\n",
127 |     "    print(f\"\\nGrowth rate after knocking out gene {gene_to_knockout}:\", solution_ko.objective_value)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "id": "a51b8ecb-ff43-4e3e-9895-ce8761d06cd4",
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "# -----------------------------\n",
138 |     "# Step 3: Compare Flux Distributions (Optional)\n",
139 |     "# -----------------------------\n",
140 |     "print(\"\\nChange in fluxes for selected reactions after knockout:\")\n",
141 |     "for rxn in model.reactions[:10]:\n",
142 |     "    flux_change = solution_wt.fluxes[rxn.id] - solution_ko.fluxes[rxn.id]\n",
143 |     "    print(f\"{rxn.id}: Δ flux = {flux_change:.2f}\")"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "id": "abf612f6-ebc9-4b19-ad8d-7b9e18266497",
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "## End of Notebook ##"
154 |    ]
155 |   }
156 |  ],
157 |  "metadata": {
158 |   "kernelspec": {
159 |    "display_name": "Python 3 (ipykernel)",
160 |    "language": "python",
161 |    "name": "python3"
162 |   },
163 |   "language_info": {
164 |    "codemirror_mode": {
165 |     "name": "ipython",
166 |     "version": 3
167 |    },
168 |    "file_extension": ".py",
169 |    "mimetype": "text/x-python",
170 |    "name": "python",
171 |    "nbconvert_exporter": "python",
172 |    "pygments_lexer": "ipython3",
173 |    "version": "3.11.3"
174 |   }
175 |  },
176 |  "nbformat": 4,
177 |  "nbformat_minor": 5
178 | }
179 | 


--------------------------------------------------------------------------------
/Ch05/Ch05-4-variant-calling.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "d481a727-d038-4493-8951-d80384b8196f",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch05-4 - Variant Calling with FreeBayes"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "07842b42-1212-48d7-96dd-7fa67bc32e09",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install FreeBayes\n",
 21 |     "! brew install freebayes"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "deb1070d-c3fc-49b6-aca3-26418b7a0460",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Import Libraries\n",
 32 |     "import subprocess\n",
 33 |     "import os"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "id": "b507cff0-6e62-4d90-b49b-850f28c2ef41",
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "def run_command(cmd):\n",
 44 |     "    \"\"\"\n",
 45 |     "    Run a shell command and check for errors.\n",
 46 |     "    \"\"\"\n",
 47 |     "    print(f\"Running: {' '.join(cmd)}\")\n",
 48 |     "    subprocess.run(cmd, check=True)"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "id": "15054a71-65ef-434c-8ca3-d8bb558dde61",
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "def index_reference(reference_fasta):\n",
 59 |     "    \"\"\"\n",
 60 |     "    Index the reference genome for FreeBayes and Samtools.\n",
 61 |     "    \"\"\"\n",
 62 |     "    print(\"Indexing the reference genome...\")\n",
 63 |     "    # Generate a FASTA index for samtools and FreeBayes\n",
 64 |     "    run_command([\"samtools\", \"faidx\", reference_fasta])\n",
 65 |     "    print(\"Reference indexing complete.\\n\")"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "397d713d-c8f4-450e-a6fd-e436fd8ba8b3",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "def sort_and_index_bam(input_bam, output_sorted_bam):\n",
 76 |     "    \"\"\"\n",
 77 |     "    Sort and index the BAM file using Samtools.\n",
 78 |     "    \"\"\"\n",
 79 |     "    print(\"Sorting and indexing the BAM file...\")\n",
 80 |     "    # Sort the BAM file\n",
 81 |     "    run_command([\"samtools\", \"sort\", \"-o\", output_sorted_bam, input_bam])\n",
 82 |     "    # Index the sorted BAM file\n",
 83 |     "    run_command([\"samtools\", \"index\", output_sorted_bam])\n",
 84 |     "    print(f\"Sorted BAM file: {output_sorted_bam}\\n\")"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "id": "17bc0aea-e8a6-42d5-a1c0-52091f5591d9",
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "def call_variants_with_freebayes(reference_fasta, input_bam, output_vcf):\n",
 95 |     "    \"\"\"\n",
 96 |     "    Call variants using FreeBayes.\n",
 97 |     "    \"\"\"\n",
 98 |     "    print(\"Calling variants with FreeBayes...\")\n",
 99 |     "    cmd = [\n",
100 |     "        \"freebayes\",\n",
101 |     "        \"-f\", reference_fasta,  # Reference genome\n",
102 |     "        input_bam              # Sorted BAM file\n",
103 |     "    ]\n",
104 |     "    # Write the VCF output to a file\n",
105 |     "    with open(output_vcf, \"w\") as vcf_file:\n",
106 |     "        subprocess.run(cmd, stdout=vcf_file, check=True)\n",
107 |     "    print(f\"Variants called successfully. Output VCF: {output_vcf}\\n\")"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "id": "63629749-515d-4b26-9822-76894edbaf94",
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "def main():\n",
118 |     "    \"\"\"\n",
119 |     "    Main function to automate the FreeBayes variant calling workflow.\n",
120 |     "    \"\"\"\n",
121 |     "    # Input files\n",
122 |     "    reference_fasta = \"data/ecoli_genome/ecoli_reference.fasta\"  # Path to the reference genome\n",
123 |     "    input_bam = \"data/output/aligned_reads.sam\"            # Input SAM file (unsorted)\n",
124 |     "    # Output files\n",
125 |     "    output_sorted_bam = \"output/aligned_reads_sorted.bam\"\n",
126 |     "    output_vcf = \"output/variants.vcf\"\n",
127 |     "    # Create output directory\n",
128 |     "    os.makedirs(\"output\", exist_ok=True)\n",
129 |     "    try:\n",
130 |     "        # Step 1: Index the reference genome\n",
131 |     "        index_reference(reference_fasta)\n",
132 |     "        # Step 2: Sort and index the BAM file\n",
133 |     "        sort_and_index_bam(input_bam, output_sorted_bam)\n",
134 |     "        # Step 3: Call variants using FreeBayes\n",
135 |     "        call_variants_with_freebayes(reference_fasta, output_sorted_bam, output_vcf)\n",
136 |     "    except subprocess.CalledProcessError as e:\n",
137 |     "        print(f\"Error occurred while running a command: {e}\")\n",
138 |     "    except Exception as e:\n",
139 |     "        print(f\"Unexpected error: {e}\")\n",
140 |     "\n",
141 |     "if __name__ == \"__main__\":\n",
142 |     "    main()"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "id": "8a30fbe7-e309-4b3e-b591-e40e6ad9feac",
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "## End of Notebook ##"
153 |    ]
154 |   }
155 |  ],
156 |  "metadata": {
157 |   "kernelspec": {
158 |    "display_name": "Python 3 (ipykernel)",
159 |    "language": "python",
160 |    "name": "python3"
161 |   },
162 |   "language_info": {
163 |    "codemirror_mode": {
164 |     "name": "ipython",
165 |     "version": 3
166 |    },
167 |    "file_extension": ".py",
168 |    "mimetype": "text/x-python",
169 |    "name": "python",
170 |    "nbconvert_exporter": "python",
171 |    "pygments_lexer": "ipython3",
172 |    "version": "3.11.3"
173 |   }
174 |  },
175 |  "nbformat": 4,
176 |  "nbformat_minor": 5
177 | }
178 | 


--------------------------------------------------------------------------------
/Ch04/Ch04-6-seaborn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "a4c28fae",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch04-6 - Build a UMAP using Seaborn"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "116b7cac",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install packages\n",
 21 |     "! pip install seaborn\n",
 22 |     "! pip install umap-learn==0.5.7\n",
 23 |     "! pip install ipywidgets==8.1.7"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "id": "43b3220b",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "# Libraries\n",
 34 |     "import numpy as np\n",
 35 |     "import matplotlib.pyplot as plt\n",
 36 |     "import seaborn as sns\n",
 37 |     "from sklearn.datasets import load_breast_cancer\n",
 38 |     "from sklearn.preprocessing import StandardScaler\n",
 39 |     "import umap"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "id": "61307b2e",
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "# Load Breast Cancer dataset\n",
 50 |     "data = load_breast_cancer()\n",
 51 |     "X, y = data.data, data.target"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "id": "00e9f97a-f3d5-4e06-af1c-c0472ccc3914",
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# Normalization\n",
 62 |     "scaler = StandardScaler()\n",
 63 |     "X_scaled = scaler.fit_transform(X)"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "id": "3b2d9615-2fa3-4925-97b5-0ce0ec50cb7c",
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "# Create UMAP embedding\n",
 74 |     "umap_reducer = umap.UMAP(\n",
 75 |     "    n_neighbors=15,  # Controls local vs global structure\n",
 76 |     "    min_dist=0.1,    # Controls how tightly points are packed\n",
 77 |     "    n_components=2,  # 2D visualization\n",
 78 |     "    random_state=42,  # For reproducibility\n",
 79 |     "    n_jobs=1\n",
 80 |     ")\n",
 81 |     "X_umap = umap_reducer.fit_transform(X_scaled)"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "id": "8499af82-6043-4a8f-a251-1a2e3df718e6",
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# Visualize the UMAP embedding\n",
 92 |     "plt.figure(figsize=(10, 8))\n",
 93 |     "for i in [0, 1]:\n",
 94 |     "    mask = y == i\n",
 95 |     "    plt.scatter(\n",
 96 |     "        X_umap[mask, 0], \n",
 97 |     "        X_umap[mask, 1], \n",
 98 |     "        label=data.target_names[i],\n",
 99 |     "        alpha=0.7,\n",
100 |     "        edgecolors='black', \n",
101 |     "        linewidth=0.5\n",
102 |     "    )\n",
103 |     "plt.title('UMAP Visualization of Breast Cancer Dataset', fontsize=16)\n",
104 |     "plt.xlabel('UMAP Dimension 1', fontsize=12)\n",
105 |     "plt.ylabel('UMAP Dimension 2', fontsize=12)\n",
106 |     "plt.legend()\n",
107 |     "plt.grid(True, linestyle='--', alpha=0.7)\n",
108 |     "plt.tight_layout()\n",
109 |     "plt.show()"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "id": "ffb69ab5-e629-4c84-aa71-186b026fa5aa",
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "# Explore different UMAP parameters to improve the Clustering\n",
120 |     "def plot_umap_parameter_comparison():\n",
121 |     "    # Create a figure with subplots for different UMAP configurations\n",
122 |     "    fig, axs = plt.subplots(2, 2, figsize=(16, 16))\n",
123 |     "    \n",
124 |     "    # Different n_neighbors values\n",
125 |     "    neighbors_values = [5, 15, 30, 50]\n",
126 |     "    \n",
127 |     "    for i, n_neighbors in enumerate(neighbors_values):\n",
128 |     "        row = i // 2\n",
129 |     "        col = i % 2\n",
130 |     "        \n",
131 |     "        # Create UMAP embedding with specific n_neighbors\n",
132 |     "        umap_reducer = umap.UMAP(\n",
133 |     "            n_neighbors=n_neighbors,\n",
134 |     "            min_dist=0.1,\n",
135 |     "            n_components=2,\n",
136 |     "            random_state=42,\n",
137 |     "            n_jobs=1\n",
138 |     "        )\n",
139 |     "        X_umap = umap_reducer.fit_transform(X_scaled)\n",
140 |     "        \n",
141 |     "        # Plot\n",
142 |     "        axs[row, col].scatter(\n",
143 |     "            X_umap[:, 0], \n",
144 |     "            X_umap[:, 1], \n",
145 |     "            c=y, \n",
146 |     "            cmap='viridis', \n",
147 |     "            alpha=0.7,\n",
148 |     "            edgecolors='black', \n",
149 |     "            linewidth=0.5\n",
150 |     "        )\n",
151 |     "        axs[row, col].set_title(f'UMAP (n_neighbors = {n_neighbors})')\n",
152 |     "        axs[row, col].set_xlabel('UMAP Dimension 1')\n",
153 |     "        axs[row, col].set_ylabel('UMAP Dimension 2')\n",
154 |     "    \n",
155 |     "    plt.tight_layout()\n",
156 |     "    plt.show()"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": null,
162 |    "id": "0d39763c-4518-4fe5-a8fa-78eee13d5818",
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "# Run parameter comparison\n",
167 |     "plot_umap_parameter_comparison()"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "id": "e3042be9-5bb6-4e4a-8a54-d13994886821",
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "## End of Notebook ##"
178 |    ]
179 |   }
180 |  ],
181 |  "metadata": {
182 |   "kernelspec": {
183 |    "display_name": "Python 3 (ipykernel)",
184 |    "language": "python",
185 |    "name": "python3"
186 |   },
187 |   "language_info": {
188 |    "codemirror_mode": {
189 |     "name": "ipython",
190 |     "version": 3
191 |    },
192 |    "file_extension": ".py",
193 |    "mimetype": "text/x-python",
194 |    "name": "python",
195 |    "nbconvert_exporter": "python",
196 |    "pygments_lexer": "ipython3",
197 |    "version": "3.12.10"
198 |   }
199 |  },
200 |  "nbformat": 4,
201 |  "nbformat_minor": 5
202 | }
203 | 


--------------------------------------------------------------------------------
/Ch10/Ch10-5-recursive-trees.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "81fc78c9-b88d-4371-910a-dfcd2fe49f4d",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch10-5 Playing Recursively with Trees [Updated to use RAxML-NG]"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "e19afca5-8cd9-4ab4-93ec-691cee7f9145",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# 23.  Load the raxml-generated tree for Ebola viruses\n",
 21 |     "import dendropy\n",
 22 |     "# Define the correct tree file output from RAxML-NG\n",
 23 |     "tree_file = \"ebola_tree.raxml.bestTreeCollapsed\"  # Based on raxml-ng output from previous recipe\n",
 24 |     "# Load the tree using Newick format (RAxML-NG outputs trees in Newick)\n",
 25 |     "ebola_raxml = dendropy.Tree.get_from_path(tree_file, schema=\"newick\")\n",
 26 |     "# Optional: Print the tree to verify\n",
 27 |     "print(ebola_raxml.as_string(schema=\"newick\"))"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "id": "25fde98b-6356-4a6f-9248-59665b54596f",
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "# 24. Compute the level of each node\n",
 38 |     "def compute_level(node, level=0): \n",
 39 |     "    for child in node.child_nodes(): \n",
 40 |     "        compute_level(child, level + 1) \n",
 41 |     "    if node.taxon is not None: \n",
 42 |     "        print(\"%s: %d %d\" % (node.taxon, node.level(), level)) \n",
 43 |     "compute_level(ebola_raxml.seed_node) "
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "id": "6ad0c4ef-5321-46cb-b48a-94c49595ac56",
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "# 25.  Compute the height of each node\n",
 54 |     "def compute_height(node): \n",
 55 |     "    children = node.child_nodes() \n",
 56 |     "    if len(children) == 0: \n",
 57 |     "        height = 0 \n",
 58 |     "    else: \n",
 59 |     "        height = 1 + max(map(lambda x: compute_height(x), children)) \n",
 60 |     "    desc = node.taxon or 'Internal' \n",
 61 |     "    print(\"%s: %d %d\" % (desc, height, node.level())) \n",
 62 |     "    return height \n",
 63 |     "compute_height(ebola_raxml.seed_node) "
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "id": "99069105-cdde-4ed7-b8a8-38bb2601cb95",
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "# 26.  Compute the number of offspring for each node\n",
 74 |     "def compute_nofs(node): \n",
 75 |     "    children = node.child_nodes() \n",
 76 |     "    nofs = len(children) \n",
 77 |     "    map(lambda x: compute_nofs(x), children) \n",
 78 |     "    desc = node.taxon or 'Internal' \n",
 79 |     "    print(\"%s: %d %d\" % (desc, nofs, node.level())) \n",
 80 |     "compute_nofs(ebola_raxml.seed_node) "
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "id": "bc73aca6-d055-4eda-9619-a9cf77e87922",
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "# 27.  Print out the leaves\n",
 91 |     "def print_nodes(node): \n",
 92 |     "    for child in node.child_nodes(): \n",
 93 |     "        print_nodes(child) \n",
 94 |     "    if node.taxon is not None: \n",
 95 |     "        print('%s (%d)' % (node.taxon, node.level())) \n",
 96 |     "print_nodes(ebola_raxml.seed_node) "
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "id": "c475fc49-a772-40b6-a83f-4f443970d2c2",
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "# 28.  Print leaf nodes in breadth-first manner\n",
107 |     "from collections import deque \n",
108 |     "def print_breadth(tree): \n",
109 |     "    queue = deque() \n",
110 |     "    queue.append(tree.seed_node) \n",
111 |     "    while len(queue) > 0: \n",
112 |     "        process_node = queue.popleft() \n",
113 |     "        if process_node.taxon is not None: \n",
114 |     "            print('%s (%d)' % (process_node.taxon, process_node.level())) \n",
115 |     "        else: \n",
116 |     "            for child in process_node.child_nodes(): \n",
117 |     "                queue.append(child) \n",
118 |     "print_breadth(ebola_raxml) "
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "id": "1cb1caef-8aaa-4b4f-8531-66d0ebe1dbdf",
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "# 29.  Getting back to the real dataset\n",
129 |     "from copy import deepcopy \n",
130 |     "simple_ebola = deepcopy(ebola_raxml) \n",
131 |     "def simplify_tree(node): \n",
132 |     "    prefs = set() \n",
133 |     "    for leaf in node.leaf_nodes(): \n",
134 |     "        my_toks = leaf.taxon.label.split(' ') \n",
135 |     "        if my_toks[0] == 'EBOV': \n",
136 |     "            prefs.add('EBOV' + my_toks[1]) \n",
137 |     "        else: \n",
138 |     "            prefs.add(my_toks[0]) \n",
139 |     "    if len(prefs) == 1: \n",
140 |     "        print(prefs, len(node.leaf_nodes())) \n",
141 |     "        node.taxon = dendropy.Taxon(label=list(prefs)[0]) \n",
142 |     "        node.set_child_nodes([]) \n",
143 |     "    else: \n",
144 |     "        for child in node.child_nodes(): \n",
145 |     "            simplify_tree(child) \n",
146 |     "simplify_tree(simple_ebola.seed_node) \n",
147 |     "simple_ebola.ladderize() \n",
148 |     "simple_ebola.write_to_path('ebola_simple.nex', 'nexus') "
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "id": "b9caaa6c-4251-4db0-9ca4-47d4fbd9a747",
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "## End of Notebook ##"
159 |    ]
160 |   }
161 |  ],
162 |  "metadata": {
163 |   "kernelspec": {
164 |    "display_name": "Python 3 (ipykernel)",
165 |    "language": "python",
166 |    "name": "python3"
167 |   },
168 |   "language_info": {
169 |    "codemirror_mode": {
170 |     "name": "ipython",
171 |     "version": 3
172 |    },
173 |    "file_extension": ".py",
174 |    "mimetype": "text/x-python",
175 |    "name": "python",
176 |    "nbconvert_exporter": "python",
177 |    "pygments_lexer": "ipython3",
178 |    "version": "3.11.3"
179 |   }
180 |  },
181 |  "nbformat": 4,
182 |  "nbformat_minor": 5
183 | }
184 | 


--------------------------------------------------------------------------------
/Ch09/Ch09-2-molecular-distances.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "9f15edc4",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch 9-2 - Computing molecular distances on a PDB file"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "261dd6e0",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Load model\n",
 21 |     "from Bio import PDB \n",
 22 |     "repository = PDB.PDBList() \n",
 23 |     "parser = PDB.PDBParser() \n",
 24 |     "repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') \n",
 25 |     "p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') "
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "id": "189e72bb",
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "# Get Zincs\n",
 36 |     "zns = []\n",
 37 |     "for atom in p53_1tup.get_atoms(): \n",
 38 |     "    if atom.element == 'ZN': \n",
 39 |     "        zns.append(atom) \n",
 40 |     "for zn in zns: \n",
 41 |     "    print(zn, zn.coord) "
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": null,
 47 |    "id": "45708c7f",
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "# Get the distance between one atom and another set of atoms\n",
 52 |     "import math \n",
 53 |     "def get_closest_atoms(pdb_struct, ref_atom, distance): \n",
 54 |     "    atoms = {} \n",
 55 |     "    rx, ry, rz = ref_atom.coord \n",
 56 |     "    for atom in pdb_struct.get_atoms(): \n",
 57 |     "        if atom == ref_atom: \n",
 58 |     "            continue \n",
 59 |     "        x, y, z = atom.coord \n",
 60 |     "        my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) \n",
 61 |     "        if my_dist < distance: \n",
 62 |     "            atoms[atom] = my_dist \n",
 63 |     "    return atoms "
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "id": "def34d3d",
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "# Compute atoms near zincs\n",
 74 |     "for zn in zns: \n",
 75 |     "    print() \n",
 76 |     "    print(zn.coord) \n",
 77 |     "    atoms = get_closest_atoms(p53_1tup, zn, 4) \n",
 78 |     "    for atom, distance in atoms.items(): \n",
 79 |     "        print(atom.element, distance, atom.coord) "
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": null,
 85 |    "id": "5aeceeae",
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "# How many atoms do we get as we increase the distance?\n",
 90 |     "for distance in [1, 2, 4, 8, 16, 32, 64, 128]: \n",
 91 |     "    my_atoms = [] \n",
 92 |     "    for zn in zns: \n",
 93 |     "        atoms = get_closest_atoms(p53_1tup, zn, distance) \n",
 94 |     "        my_atoms.append(len(atoms)) \n",
 95 |     "    print(distance, my_atoms) "
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "id": "310865b4",
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "# Time the distance function\n",
106 |     "import timeit \n",
107 |     "nexecs = 10 \n",
108 |     "print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], 4.0)', \n",
109 |     "    'from __main__ import get_closest_atoms, p53_1tup, zns', \n",
110 |     "    number=nexecs) / nexecs * 1000) "
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "id": "753d85a2",
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "# A better distance function\n",
121 |     "def get_closest_alternative(pdb_struct, ref_atom, distance): \n",
122 |     "    atoms = {} \n",
123 |     "    rx, ry, rz = ref_atom.coord \n",
124 |     "    for atom in pdb_struct.get_atoms(): \n",
125 |     "        if atom == ref_atom: \n",
126 |     "            continue \n",
127 |     "        x, y, z = atom.coord \n",
128 |     "        if abs(x - rx) > distance or abs(y - ry) > distance or abs(z - rz) > distance: \n",
129 |     "            continue \n",
130 |     "        my_dist = math.sqrt((x - rx)**2 + (y - ry)**2 + (z - rz)**2) \n",
131 |     "        if my_dist < distance: \n",
132 |     "            atoms[atom] = my_dist \n",
133 |     "    return atoms "
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": null,
139 |    "id": "f6e2e5d7",
140 |    "metadata": {},
141 |    "outputs": [],
142 |    "source": [
143 |     "# Now let's time it:\n",
144 |     "print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], 4.0)', \n",
145 |     "    'from __main__ import get_closest_alternative, p53_1tup, zns', \n",
146 |     "    number=nexecs) / nexecs * 1000) "
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "id": "ad69ba33",
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "# Compare the function with different distances\n",
157 |     "print('Standard') \n",
158 |     "for distance in [1, 4, 16, 64, 128]: \n",
159 |     "    print(timeit.timeit('get_closest_atoms(p53_1tup, zns[0], distance)', \n",
160 |     "    'from __main__ import get_closest_atoms, p53_1tup, zns, distance', \n",
161 |     "    number=nexecs) / nexecs * 1000) \n",
162 |     "print('Optimized') \n",
163 |     "for distance in [1, 4, 16, 64, 128]: \n",
164 |     "    print(timeit.timeit('get_closest_alternative(p53_1tup, zns[0], distance)', \n",
165 |     "        'from __main__ import get_closest_alternative, p53_1tup, zns, distance', \n",
166 |     "        number=nexecs) / nexecs * 1000)"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "id": "26d469c5",
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "## End of Notebook ##"
177 |    ]
178 |   }
179 |  ],
180 |  "metadata": {
181 |   "kernelspec": {
182 |    "display_name": "Python 3 (ipykernel)",
183 |    "language": "python",
184 |    "name": "python3"
185 |   },
186 |   "language_info": {
187 |    "codemirror_mode": {
188 |     "name": "ipython",
189 |     "version": 3
190 |    },
191 |    "file_extension": ".py",
192 |    "mimetype": "text/x-python",
193 |    "name": "python",
194 |    "nbconvert_exporter": "python",
195 |    "pygments_lexer": "ipython3",
196 |    "version": "3.11.3"
197 |   }
198 |  },
199 |  "nbformat": 4,
200 |  "nbformat_minor": 5
201 | }
202 | 


--------------------------------------------------------------------------------
/Ch09/Ch09-4-py3dmol.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "65a69498-8fdc-4ff3-9ded-0e93e5abbacb",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch 09-4 py3dmol"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "5709bf95-8fa1-4815-83a6-c27c4c95e02a",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install dependencies\n",
 21 |     "! pip install py3Dmol"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 2,
 27 |    "id": "f54c2935",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Import Libraries\n",
 32 |     "import py3Dmol"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 3,
 38 |    "id": "21aef57c-0cdb-4f48-a0a9-4ed7f7664a28",
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "data": {
 43 |       "application/3dmoljs_load.v0": "<div id=\"3dmolviewer_17535500055953548\"  style=\"position: relative; width: 640px; height: 480px;\">\n        <p id=\"3dmolwarning_17535500055953548\" style=\"background-color:#ffcccc;color:black\">3Dmol.js failed to load for some reason.  Please check your browser console for error messages.<br></p>\n        </div>\n<script>\n\nvar loadScriptAsync = function(uri){\n  return new Promise((resolve, reject) => {\n    //this is to ignore the existence of requirejs amd\n    var savedexports, savedmodule;\n    if (typeof exports !== 'undefined') savedexports = exports;\n    else exports = {}\n    if (typeof module !== 'undefined') savedmodule = module;\n    else module = {}\n\n    var tag = document.createElement('script');\n    tag.src = uri;\n    tag.async = true;\n    tag.onload = () => {\n        exports = savedexports;\n        module = savedmodule;\n        resolve();\n    };\n  var firstScriptTag = document.getElementsByTagName('script')[0];\n  firstScriptTag.parentNode.insertBefore(tag, firstScriptTag);\n});\n};\n\nif(typeof $3Dmolpromise === 'undefined') {\n$3Dmolpromise = null;\n  $3Dmolpromise = loadScriptAsync('https://cdn.jsdelivr.net/npm/3dmol@2.5.1/build/3Dmol-min.js');\n}\n\nvar viewer_17535500055953548 = null;\nvar warn = document.getElementById(\"3dmolwarning_17535500055953548\");\nif(warn) {\n    warn.parentNode.removeChild(warn);\n}\n$3Dmolpromise.then(function() {\nviewer_17535500055953548 = $3Dmol.createViewer(document.getElementById(\"3dmolviewer_17535500055953548\"),{backgroundColor:\"white\"});\n$3Dmol.download(\"pdb:1crn\", viewer_17535500055953548, {}, function() {\nviewer_17535500055953548.zoomTo();\n\tviewer_17535500055953548.setStyle({\"cartoon\": {\"color\": \"spectrum\"}});\n\tviewer_17535500055953548.zoomTo();\nviewer_17535500055953548.render();\n})\n});\n</script>",
 44 |       "text/html": [
 45 |        "<div id=\"3dmolviewer_17535500055953548\"  style=\"position: relative; width: 640px; height: 480px;\">\n",
 46 |        "        <p id=\"3dmolwarning_17535500055953548\" style=\"background-color:#ffcccc;color:black\">3Dmol.js failed to load for some reason.  Please check your browser console for error messages.<br></p>\n",
 47 |        "        </div>\n",
 48 |        "<script>\n",
 49 |        "\n",
 50 |        "var loadScriptAsync = function(uri){\n",
 51 |        "  return new Promise((resolve, reject) => {\n",
 52 |        "    //this is to ignore the existence of requirejs amd\n",
 53 |        "    var savedexports, savedmodule;\n",
 54 |        "    if (typeof exports !== 'undefined') savedexports = exports;\n",
 55 |        "    else exports = {}\n",
 56 |        "    if (typeof module !== 'undefined') savedmodule = module;\n",
 57 |        "    else module = {}\n",
 58 |        "\n",
 59 |        "    var tag = document.createElement('script');\n",
 60 |        "    tag.src = uri;\n",
 61 |        "    tag.async = true;\n",
 62 |        "    tag.onload = () => {\n",
 63 |        "        exports = savedexports;\n",
 64 |        "        module = savedmodule;\n",
 65 |        "        resolve();\n",
 66 |        "    };\n",
 67 |        "  var firstScriptTag = document.getElementsByTagName('script')[0];\n",
 68 |        "  firstScriptTag.parentNode.insertBefore(tag, firstScriptTag);\n",
 69 |        "});\n",
 70 |        "};\n",
 71 |        "\n",
 72 |        "if(typeof $3Dmolpromise === 'undefined') {\n",
 73 |        "$3Dmolpromise = null;\n",
 74 |        "  $3Dmolpromise = loadScriptAsync('https://cdn.jsdelivr.net/npm/3dmol@2.5.1/build/3Dmol-min.js');\n",
 75 |        "}\n",
 76 |        "\n",
 77 |        "var viewer_17535500055953548 = null;\n",
 78 |        "var warn = document.getElementById(\"3dmolwarning_17535500055953548\");\n",
 79 |        "if(warn) {\n",
 80 |        "    warn.parentNode.removeChild(warn);\n",
 81 |        "}\n",
 82 |        "$3Dmolpromise.then(function() {\n",
 83 |        "viewer_17535500055953548 = $3Dmol.createViewer(document.getElementById(\"3dmolviewer_17535500055953548\"),{backgroundColor:\"white\"});\n",
 84 |        "$3Dmol.download(\"pdb:1crn\", viewer_17535500055953548, {}, function() {\n",
 85 |        "viewer_17535500055953548.zoomTo();\n",
 86 |        "\tviewer_17535500055953548.setStyle({\"cartoon\": {\"color\": \"spectrum\"}});\n",
 87 |        "\tviewer_17535500055953548.zoomTo();\n",
 88 |        "viewer_17535500055953548.render();\n",
 89 |        "})\n",
 90 |        "});\n",
 91 |        "</script>"
 92 |       ]
 93 |      },
 94 |      "metadata": {},
 95 |      "output_type": "display_data"
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "# Set up the Py3Dmol viewer for an example protein \n",
100 |     "view = py3Dmol.view(query='pdb:1crn')  # 1crn = Crambin\n",
101 |     "view.setStyle({'cartoon': {'color': 'spectrum'}})\n",
102 |     "view.zoomTo()\n",
103 |     "view.show()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "id": "a29bfd8c-680c-4cf9-8252-7453809a1baa",
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "## End of Notebook ##"
114 |    ]
115 |   }
116 |  ],
117 |  "metadata": {
118 |   "kernelspec": {
119 |    "display_name": "Python 3 (ipykernel)",
120 |    "language": "python",
121 |    "name": "python3"
122 |   },
123 |   "language_info": {
124 |    "codemirror_mode": {
125 |     "name": "ipython",
126 |     "version": 3
127 |    },
128 |    "file_extension": ".py",
129 |    "mimetype": "text/x-python",
130 |    "name": "python",
131 |    "nbconvert_exporter": "python",
132 |    "pygments_lexer": "ipython3",
133 |    "version": "3.12.11"
134 |   }
135 |  },
136 |  "nbformat": 4,
137 |  "nbformat_minor": 5
138 | }
139 | 


--------------------------------------------------------------------------------
/Ch02/Ch02-2-pandas-pitfalls.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "9a6d23f7",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch02-2 - Pitfalls of joining data with pandas"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "cd5b26a0-b105-4219-a1df-b21464aa2675",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "########################## Notes & Updates ##############################################################\n",
 21 |     "# If you are using Docker and your data directory is mapped to \"/data\" then you can use the commented-out\n",
 22 |     "#   Docker lines below in place of the primary line (which you will comment out when running)\n",
 23 |     "# You will also find other alternative lines or blocks that can be used to avoid potential issues \n",
 24 |     "#########################################################################################################"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "4d64b416",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# 1. Import Libraries\n",
 35 |     "import pandas as pd"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "id": "923bc568",
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "# 2. Jumble the data using random sampling\n",
 46 |     "vdata = pd.read_csv(\"data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\") \n",
 47 |     "# vdata = pd.read_csv(\"/data/2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\")  # Docker\n",
 48 |     "vdata.sample(frac=0.9).to_csv(\"vdata_sample.csv.gz\", index=False) \n",
 49 |     "vax = pd.read_csv(\"data/2021VAERSVAX.csv.gz\", encoding=\"iso-8859-1\") \n",
 50 |     "# vax = pd.read_csv(\"/data/2021VAERSVAX.csv.gz\", encoding=\"iso-8859-1\") # Docker \n",
 51 |     "vax.sample(frac=0.9).to_csv(\"vax_sample.csv.gz\", index=False) \n",
 52 |     "# Note - it is ok to get a dtype warning here"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "id": "d0ef3b0d",
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# 3. Inner join on the tables\n",
 63 |     "vdata = pd.read_csv(\"vdata_sample.csv.gz\", low_memory=False) \n",
 64 |     "vax = pd.read_csv(\"vax_sample.csv.gz\", low_memory=False) \n",
 65 |     "vdata_with_vax = vdata.join(vax.set_index(\"VAERS_ID\"), on=\"VAERS_ID\", how=\"inner\") \n",
 66 |     "# vdata_with_vax = vdata.merge(vax, on=\"VAERS_ID\", how=\"inner\") # Docker - Alternate method \n",
 67 |     "len(vdata), len(vax), len(vdata_with_vax) "
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "id": "9ed5a95b",
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "# 4. Find the data not captured by the join\n",
 78 |     "lost_vdata = vdata.loc[~vdata.index.isin(vdata_with_vax.index)] \n",
 79 |     "lost_vdata \n",
 80 |     "lost_vax = vax[~vax[\"VAERS_ID\"].isin(vdata.index)] \n",
 81 |     "lost_vax "
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "id": "13329af6",
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# 5. Left outer join\n",
 92 |     "vdata_with_vax_left = vdata.join(vax.set_index(\"VAERS_ID\"), on=\"VAERS_ID\") \n",
 93 |     "vdata_with_vax_left.groupby(\"VAERS_ID\").size().sort_values() \n",
 94 |     "# vdata_with_vax_left = vdata.merge(vax, on=\"VAERS_ID\", how=\"left\") # Docker - alternate version\n",
 95 |     "# vdata_with_vax_left.groupby(\"VAERS_ID\").size().sort_values() # Docker - alternate version"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "id": "24d2f205",
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "# 6. Right join\n",
106 |     "dead = vdata[vdata.DIED == \"Y\"] \n",
107 |     "vax19 = vax[vax.VAX_TYPE == \"COVID19\"] \n",
108 |     "vax19_dead = vax19.join(dead.set_index(\"VAERS_ID\"), on=\"VAERS_ID\", how=\"right\") \n",
109 |     "len(vax19), len(dead), len(vax19_dead) \n",
110 |     "len(vax19_dead[vax19_dead.VAERS_ID.duplicated()]) \n",
111 |     "len(vax19_dead) - len(dead) "
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "id": "a92b51dc-111a-4b05-a5b4-1124a631bc03",
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "# 6. Right join (alternate method)\n",
122 |     "# dead = vdata[vdata.DIED == \"Y\"] \n",
123 |     "# vax19 = vax[vax.VAX_TYPE == \"COVID19\"] \n",
124 |     "# vax19_dead = vax19.merge(dead, on=\"VAERS_ID\", how=\"right\")\n",
125 |     "# len(vax19), len(dead), len(vax19_dead)\n",
126 |     "# len(vax19_dead[vax19_dead.VAERS_ID.duplicated()])\n",
127 |     "# len(vax19_dead) - len(dead) "
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "id": "b877c4b5",
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "# Problematic lots\n",
138 |     "vax19_dead[\"STATE\"] = vax19_dead[\"STATE\"].str.upper() \n",
139 |     "dead_lot = vax19_dead[[\"VAERS_ID\", \"VAX_LOT\", \"STATE\"]].set_index([\"VAERS_ID\", \"VAX_LOT\"]) \n",
140 |     "dead_lot_clean = dead_lot[~dead_lot.index.duplicated()] \n",
141 |     "dead_lot_clean = dead_lot_clean.reset_index() \n",
142 |     "dead_lot_clean[dead_lot_clean.VAERS_ID.isna()] \n",
143 |     "baddies = dead_lot_clean.groupby(\"VAX_LOT\").size().sort_values(ascending=False) "
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "id": "6b3cc8a5",
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "# Print problematic lots\n",
154 |     "for i, (lot, cnt) in enumerate(baddies.items()):\n",
155 |     "    print(lot, cnt, len(dead_lot_clean[dead_lot_clean.VAX_LOT == lot].groupby(\"STATE\")))\n",
156 |     "    if i == 10:\n",
157 |     "        break"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "id": "d249a4e0",
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "# End of Notebook #"
168 |    ]
169 |   }
170 |  ],
171 |  "metadata": {
172 |   "kernelspec": {
173 |    "display_name": "Python 3 (ipykernel)",
174 |    "language": "python",
175 |    "name": "python3"
176 |   },
177 |   "language_info": {
178 |    "codemirror_mode": {
179 |     "name": "ipython",
180 |     "version": 3
181 |    },
182 |    "file_extension": ".py",
183 |    "mimetype": "text/x-python",
184 |    "name": "python",
185 |    "nbconvert_exporter": "python",
186 |    "pygments_lexer": "ipython3",
187 |    "version": "3.12.11"
188 |   }
189 |  },
190 |  "nbformat": 4,
191 |  "nbformat_minor": 5
192 | }
193 | 


--------------------------------------------------------------------------------
/Ch01/Welcome.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "20b669c8",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# BioInformatics with Python Cookbook - Fourth Edition #"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 1,
 16 |    "id": "32e90c95",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Welcome to the book! #\n",
 21 |     "# 1-1 Welcome #"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 2,
 27 |    "id": "727b23fe",
 28 |    "metadata": {},
 29 |    "outputs": [
 30 |     {
 31 |      "name": "stdout",
 32 |      "output_type": "stream",
 33 |      "text": [
 34 |       "Welcome to the BioInformatics with Python Cookbook Fourth Edition!\n"
 35 |      ]
 36 |     }
 37 |    ],
 38 |    "source": [
 39 |     "print(\"Welcome to the BioInformatics with Python Cookbook Fourth Edition!\")"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 3,
 45 |    "id": "2437ebca-218f-498a-ad08-158f5364face",
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "# Install packages using Conda"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 4,
 55 |    "id": "869bd2c5-2117-493e-a0c9-aefd885b8259",
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "Retrieving notices: done\n",
 63 |       "Collecting package metadata (current_repodata.json): done\n",
 64 |       "Solving environment: unsuccessful initial attempt using frozen solve. Retrying with flexible solve.\n",
 65 |       "Collecting package metadata (repodata.json): \\ WARNING conda.models.version:get_matcher(563): Using .* with relational operator is superfluous and deprecated and will be removed in a future version of conda. Your spec was 1.8.0.*, but conda is ignoring the .* and treating it as 1.8.0\n",
 66 |       "WARNING conda.models.version:get_matcher(563): Using .* with relational operator is superfluous and deprecated and will be removed in a future version of conda. Your spec was 1.9.0.*, but conda is ignoring the .* and treating it as 1.9.0\n",
 67 |       "done\n",
 68 |       "Solving environment: - ^C\n",
 69 |       "unsuccessful initial attempt using frozen solve. Retrying with flexible solve.\n",
 70 |       "\n",
 71 |       "CondaError: KeyboardInterrupt\n",
 72 |       "\n"
 73 |      ]
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "! conda install -y biopython==1.84 jupyterlab==4.3.0 matplotlib==3.9.2 numpy==2.1.0 pandas==2.2.3 scipy==1.14.1 "
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 5,
 83 |    "id": "257691cc-bbe3-438d-89d2-94a537c789ab",
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# Install Jupytext"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 6,
 93 |    "id": "c0e9235a-7f3f-4633-9134-13f50fb17209",
 94 |    "metadata": {},
 95 |    "outputs": [
 96 |     {
 97 |      "name": "stdout",
 98 |      "output_type": "stream",
 99 |      "text": [
100 |       "Requirement already satisfied: jupytext in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (1.17.1)\n",
101 |       "Requirement already satisfied: markdown-it-py>=1.0 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (2.2.0)\n",
102 |       "Requirement already satisfied: mdit-py-plugins in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (0.5.0)\n",
103 |       "Requirement already satisfied: nbformat in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (5.10.4)\n",
104 |       "Requirement already satisfied: packaging in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (25.0)\n",
105 |       "Requirement already satisfied: pyyaml in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupytext) (6.0.2)\n",
106 |       "Requirement already satisfied: mdurl~=0.1 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from markdown-it-py>=1.0->jupytext) (0.1.0)\n",
107 |       "Requirement already satisfied: fastjsonschema>=2.15 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (2.20.0)\n",
108 |       "Requirement already satisfied: jsonschema>=2.6 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (4.25.0)\n",
109 |       "Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (5.8.1)\n",
110 |       "Requirement already satisfied: traitlets>=5.1 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from nbformat->jupytext) (5.14.3)\n",
111 |       "Requirement already satisfied: attrs>=22.2.0 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (24.3.0)\n",
112 |       "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (2023.7.1)\n",
113 |       "Requirement already satisfied: referencing>=0.28.4 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (0.36.2)\n",
114 |       "Requirement already satisfied: rpds-py>=0.7.1 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jsonschema>=2.6->nbformat->jupytext) (0.22.3)\n",
115 |       "Requirement already satisfied: platformdirs>=2.5 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from jupyter-core!=5.0.*,>=4.12->nbformat->jupytext) (4.3.7)\n",
116 |       "Requirement already satisfied: typing-extensions>=4.4.0 in /Users/shanebrubaker/anaconda3/lib/python3.11/site-packages (from referencing>=0.28.4->jsonschema>=2.6->nbformat->jupytext) (4.14.1)\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "! pip install jupytext"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "id": "d5abaf2d",
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "## End of Notebook ##"
132 |    ]
133 |   }
134 |  ],
135 |  "metadata": {
136 |   "jupytext": {
137 |    "formats": "ipynb,py:light"
138 |   },
139 |   "kernelspec": {
140 |    "display_name": "Python 3 (ipykernel)",
141 |    "language": "python",
142 |    "name": "python3"
143 |   },
144 |   "language_info": {
145 |    "codemirror_mode": {
146 |     "name": "ipython",
147 |     "version": 3
148 |    },
149 |    "file_extension": ".py",
150 |    "mimetype": "text/x-python",
151 |    "name": "python",
152 |    "nbconvert_exporter": "python",
153 |    "pygments_lexer": "ipython3",
154 |    "version": "3.11.13"
155 |   }
156 |  },
157 |  "nbformat": 4,
158 |  "nbformat_minor": 5
159 | }
160 | 


--------------------------------------------------------------------------------
/Ch10/Ch10-2-aligning-genetic-data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "0ba48448-2dc8-4c01-bf7b-d73e960ce9dc",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch10-2 - Aligning genetic data [Updated]"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "3d640c71-be77-4c60-885f-9b03489326d3",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install packages\n",
 21 |     "! brew install trimal\n",
 22 |     "! brew install mafft\n",
 23 |     "! brew install muscle"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "id": "63c8069a-c598-4e76-ad49-18051468d11a",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "# 8.  Run MAFFT to align the genomes\n",
 34 |     "#     Note: this takes about 30-60 minutes to run\n",
 35 |     "import subprocess\n",
 36 |     "from Bio.Align.Applications import MafftCommandline\n",
 37 |     "# Define the MAFFT command\n",
 38 |     "mafft_cline = MafftCommandline(input=\"sample.fasta\", ep=0.123, reorder=True, maxiterate=1000, localpair=True)\n",
 39 |     "# Print the command (for debugging purposes)\n",
 40 |     "print(\"Running MAFFT with command:\", mafft_cline)\n",
 41 |     "# Run MAFFT using subprocess\n",
 42 |     "process = subprocess.run(\n",
 43 |     "    str(mafft_cline),  # Convert command to string\n",
 44 |     "    shell=True,        # Run in shell environment\n",
 45 |     "    capture_output=True,  # Capture stdout and stderr\n",
 46 |     "    text=True  # Ensure output is captured as text (string)\n",
 47 |     ")\n",
 48 |     "# Check for errors\n",
 49 |     "if process.returncode != 0:\n",
 50 |     "    print(\"Error running MAFFT:\", process.stderr)\n",
 51 |     "else:\n",
 52 |     "    # Save the aligned output to a file\n",
 53 |     "    with open(\"align.fasta\", \"w\") as w:\n",
 54 |     "        w.write(process.stdout)\n",
 55 |     "print(\"Alignment completed and saved to align.fasta\")"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "id": "63f8fcd5-d243-44b1-9f22-f29d27f48181",
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "# Once the above is completed, you should see the file align.fasta in your working directory"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "a2d30c45-147c-41a8-9935-859b77d7787f",
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "# 9.  Use TrimAl to trim sequences\n",
 76 |     "import os\n",
 77 |     "os.system('trimal -automated1 -in align.fasta -out trim.fasta -fasta') "
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "id": "aa59be90-5b5c-4115-856b-d0a642d0e376",
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# You should see the file trim.fasta as the output"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "id": "29a14d97-e1c0-42c1-a46f-86c3327edce7",
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "# 10.  Run MUSCLE to align the proteins (This uses MUSCLE V5)\n",
 98 |     "import subprocess\n",
 99 |     "import os\n",
100 |     "\n",
101 |     "my_genes = ['NP', 'L', 'VP35', 'VP40'] \n",
102 |     "for gene in my_genes:\n",
103 |     "\n",
104 |     "    input_file = f\"{gene}_P.fasta\"\n",
105 |     "    output_file = f\"{gene}_P_align.fasta\"\n",
106 |     "\n",
107 |     "    # Verify if the input file exists\n",
108 |     "    if not os.path.exists(input_file):\n",
109 |     "        print(f\"Error: Input file '{input_file}' not found.\")\n",
110 |     "    else:\n",
111 |     "        # Construct the correct command for MUSCLE v5+\n",
112 |     "        muscle_cmd = f\"muscle -align {input_file} -output {output_file}\"\n",
113 |     "\n",
114 |     "        print(f\"Running MUSCLE with command: {muscle_cmd}\")\n",
115 |     "\n",
116 |     "        # Run MUSCLE using subprocess\n",
117 |     "        process = subprocess.run(\n",
118 |     "            muscle_cmd, shell=True, capture_output=True, text=True\n",
119 |     "        )\n",
120 |     "\n",
121 |     "        # Check for errors\n",
122 |     "        if process.returncode != 0:\n",
123 |     "            print(\"Error running MUSCLE:\", process.stderr)\n",
124 |     "        else:\n",
125 |     "            print(f\"Alignment completed and saved to {output_file}\")"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "id": "bd0737c6-786f-4c60-b68b-5820f0e1e140",
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "# You should see four output files: NP_P_align.fasta, L_P_align.fasta, VP35_P_align.fasta, VP40_P_align.fasta"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "id": "2f3391e6-c1bd-4196-8ddf-272118e58bc7",
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "# 11.  Align genes by back-translation\n",
146 |     "from Bio import SeqIO \n",
147 |     "from Bio.Seq import Seq \n",
148 |     "from Bio.SeqRecord import SeqRecord \n",
149 |     "for gene in my_genes: \n",
150 |     "    gene_seqs = {} \n",
151 |     "    unal_gene = SeqIO.parse('%s.fasta' % gene, 'fasta') \n",
152 |     "    for rec in unal_gene: \n",
153 |     "        gene_seqs[rec.id] = rec.seq \n",
154 |     "    al_prot = SeqIO.parse('%s_P_align.fasta' % gene, 'fasta') \n",
155 |     "    al_genes = [] \n",
156 |     "    for protein in al_prot: \n",
157 |     "        my_id = protein.id \n",
158 |     "        seq = '' \n",
159 |     "        pos = 0 \n",
160 |     "        for c in protein.seq: \n",
161 |     "            if c == '-': \n",
162 |     "                seq += '---' \n",
163 |     "            else: \n",
164 |     "                seq += str(gene_seqs[my_id][pos:pos + 3]) \n",
165 |     "                pos += 3 \n",
166 |     "        al_genes.append(SeqRecord(Seq(seq), id=my_id)) \n",
167 |     "    SeqIO.write(al_genes, '%s_align.fasta' % gene, 'fasta') "
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "id": "ce889cf2-4c3e-4344-897f-442cc559e599",
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "# You should see 4 output files:  NP_align.fasta, L_align.fasta, VP35_align.fasta, VP40_align.fasta"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": null,
183 |    "id": "56e37a0c-fba0-45f5-8e1d-a73445504e4b",
184 |    "metadata": {},
185 |    "outputs": [],
186 |    "source": [
187 |     "## End of Notebook ##"
188 |    ]
189 |   }
190 |  ],
191 |  "metadata": {
192 |   "kernelspec": {
193 |    "display_name": "Python 3 (ipykernel)",
194 |    "language": "python",
195 |    "name": "python3"
196 |   },
197 |   "language_info": {
198 |    "codemirror_mode": {
199 |     "name": "ipython",
200 |     "version": 3
201 |    },
202 |    "file_extension": ".py",
203 |    "mimetype": "text/x-python",
204 |    "name": "python",
205 |    "nbconvert_exporter": "python",
206 |    "pygments_lexer": "ipython3",
207 |    "version": "3.11.3"
208 |   }
209 |  },
210 |  "nbformat": 4,
211 |  "nbformat_minor": 5
212 | }
213 | 


--------------------------------------------------------------------------------
/Ch03/Ch02-1-pandas-basic.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "57186378",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch02-1-pandas-basic\n",
 11 |     "#  Overview of basic pandas functionality for manipulating data files and tables"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "id": "0bda3b91",
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "# Libraries\n",
 22 |     "import pandas as pd"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "id": "8e82984b",
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "# Read in the Vaccine data\n",
 33 |     "vdata = pd.read_csv(\"2021VAERSDATA.csv.gz\", encoding=\"iso-8859-1\") "
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "id": "8acaabe3",
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "vdata.columns "
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "id": "c0f9e4a3",
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "vdata.dtypes"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "id": "1ca72187",
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "# Get the shape of your data\n",
 64 |     "vdata.shape"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "id": "374fc081",
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "# Access a pandas array using an integer-based location\n",
 75 |     "vdata.iloc[0] "
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "id": "b1e0f929",
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "# Set the index using a column\n",
 86 |     "vdata = vdata.set_index(\"VAERS_ID\") "
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": null,
 92 |    "id": "10cdfb94",
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "# Get the data using a key\n",
 97 |     "vdata.loc[916600] "
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "id": "5e40a3bc",
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# Use head to look at the top part of the data\n",
108 |     "vdata.head(3) "
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "id": "7405daa6",
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "# Retrieve the first 3 rows using an array specification\n",
119 |     "vdata.iloc[:3] "
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "id": "28dde50b",
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "# Restrict the output to certain columns\n",
130 |     "vdata.iloc[:5, 2:4] "
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "id": "93836659",
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "# Compute the maximum age in the dataset\n",
141 |     "vdata[\"AGE_YRS\"].max() "
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "id": "813b90ef",
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "# A different style of notation\n",
152 |     "vdata.AGE_YRS.max() "
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "id": "39bc5c15",
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "# Plot the data\n",
163 |     "vdata[\"AGE_YRS\"].sort_values().plot(use_index=False) "
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "id": "6a686cf9",
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "# Second plot\n",
174 |     "vdata[\"AGE_YRS\"].plot.hist(bins=20) "
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": null,
180 |    "id": "714251b7",
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "# Plot using matplotlib: sorted ages (left panel) and a horizontal age histogram (right panel)\n",
185 |     "import matplotlib.pyplot as plt \n",
186 |     "fig, ax = plt.subplots(1, 2, sharey=True) \n",
187 |     "fig.suptitle(\"Age of adverse events\") \n",
188 |     "vdata[\"AGE_YRS\"].sort_values().plot(use_index=False, ax=ax[0], xlabel=\"Observation\", ylabel=\"Age\") \n",
189 |     "vdata[\"AGE_YRS\"].plot.hist(bins=20, orientation=\"horizontal\", ax=ax[1])  # target the right panel explicitly"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "id": "2a9d6911",
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "# Count events per age (AGE_YRS truncated to whole years)\n",
200 |     "vdata[\"AGE_YRS\"].dropna().apply(lambda x: int(x)).value_counts() "
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "id": "663f65b4",
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "# Count the number of people who died\n",
211 |     "vdata.DIED.value_counts(dropna=False) "
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": null,
217 |    "id": "d88bec1d",
218 |    "metadata": {},
219 |    "outputs": [],
220 |    "source": [
221 |     "# Set the is_dead column\n",
222 |     "vdata[\"is_dead\"] = (vdata.DIED == \"Y\") "
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": null,
228 |    "id": "7eecbe5e",
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "# Associate data about deaths with vaccine involved\n",
233 |     "dead = vdata[vdata.is_dead] \n",
234 |     "vax = pd.read_csv(\"2021VAERSVAX.csv.gz\", encoding=\"iso-8859-1\").set_index(\"VAERS_ID\") \n",
235 |     "vax.groupby(\"VAX_TYPE\").size().sort_values() \n",
236 |     "vax19 = vax[vax.VAX_TYPE == \"COVID19\"] \n",
237 |     "vax19_dead = dead.join(vax19) "
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "id": "a0020e32",
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "# Top 10 covid vaccine lots\n",
248 |     "baddies = vax19_dead.groupby(\"VAX_LOT\").size().sort_values(ascending=False) "
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "id": "5f8f71be",
255 |    "metadata": {},
256 |    "outputs": [],
257 |    "source": [
258 |     "# Print out the top 10 lots: lot, number of deaths, number of distinct states\n",
259 |     "for i, (lot,cnt) in enumerate(baddies.items()):\n",
260 |     "    print(lot, cnt, len(vax19_dead[vax19_dead.VAX_LOT == lot].groupby(\"STATE\")))\n",
261 |     "    if i == 9:  # i is 0-based, so this stops after the 10th lot\n",
262 |     "        break"
263 |    ]
264 |   }
265 |  ],
266 |  "metadata": {
267 |   "kernelspec": {
268 |    "display_name": "Python 3 (ipykernel)",
269 |    "language": "python",
270 |    "name": "python3"
271 |   },
272 |   "language_info": {
273 |    "codemirror_mode": {
274 |     "name": "ipython",
275 |     "version": 3
276 |    },
277 |    "file_extension": ".py",
278 |    "mimetype": "text/x-python",
279 |    "name": "python",
280 |    "nbconvert_exporter": "python",
281 |    "pygments_lexer": "ipython3",
282 |    "version": "3.12.2"
283 |   }
284 |  },
285 |  "nbformat": 4,
286 |  "nbformat_minor": 5
287 | }
288 | 


--------------------------------------------------------------------------------
/Ch08/Ch08-3-pdb-uniprot.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "785942a2",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch08-3 - Using PDB & UniProt"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "16f46a4c",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Import libraries\n",
 21 |     "import requests\n",
 22 |     "import sys\n",
 23 |     "import json"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "id": "af788ba3",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "# 1. Fetch protein data from UniProt given an Accession\n",
 34 |     "def fetch_protein_data_json(accession):\n",
 35 |     "    \"\"\"\n",
 36 |     "    Fetch protein data from the EBI Proteins API in JSON format.\n",
 37 |     "\n",
 38 |     "    Parameters:\n",
 39 |     "        accession (str): Protein accession number (e.g., P21802).\n",
 40 |     "\n",
 41 |     "    Returns:\n",
 42 |     "        dict: The protein data as a JSON object (Python dictionary).\n",
 43 |     "    \"\"\"\n",
 44 |     "    request_url = f\"https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession={accession}\"\n",
 45 |     "    headers = {\"Accept\": \"application/json\"}  # Request JSON format\n",
 46 |     "\n",
 47 |     "    try:\n",
 48 |     "        print(f\"Fetching data for accession: {accession}\")\n",
 49 |     "        response = requests.get(request_url, headers=headers, timeout=30)\n",
 50 |     "        response.raise_for_status()  # Raise an exception for HTTP errors\n",
 51 |     "        return response.json()  # Parse JSON response directly into a Python dictionary\n",
 52 |     "    except requests.exceptions.RequestException as e:\n",
 53 |     "        print(f\"Error fetching protein data: {e}\")\n",
 54 |     "        sys.exit(1)"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "id": "069dfb5f",
 61 |    "metadata": {},
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "# Function to save the JSON data to a file\n",
 65 |     "def save_json_to_file(data, filename):\n",
 66 |     "    \"\"\"\n",
 67 |     "    Save JSON data to a file.\n",
 68 |     "\n",
 69 |     "    Parameters:\n",
 70 |     "        data (dict): The JSON data to save.\n",
 71 |     "        filename (str): The name of the file to save the data to.\n",
 72 |     "\n",
 73 |     "    Returns:\n",
 74 |     "        None\n",
 75 |     "    \"\"\"\n",
 76 |     "    try:\n",
 77 |     "        with open(filename, \"w\") as json_file:\n",
 78 |     "            json.dump(data, json_file, indent=4)\n",
 79 |     "        print(f\"Protein data saved to {filename}\")\n",
 80 |     "    except IOError as e:\n",
 81 |     "        print(f\"Error saving data to file: {e}\")\n",
 82 |     "        sys.exit(1)"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "id": "6966d15e",
 89 |    "metadata": {
 90 |     "scrolled": false
 91 |    },
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "# Execute functions for accession number of interest\n",
 95 |     "def main():\n",
 96 |     "    accession = \"P21802\"  # Example accession\n",
 97 |     "    output_file = \"protein_data.json\"  # File to save the JSON response\n",
 98 |     "\n",
 99 |     "    # Fetch protein data\n",
100 |     "    protein_data = fetch_protein_data_json(accession)\n",
101 |     "\n",
102 |     "    # Print the JSON data as a Python dictionary\n",
103 |     "    print(\"Protein Data (JSON):\")\n",
104 |     "    print(protein_data)\n",
105 |     "\n",
106 |     "    # Save the data to a file\n",
107 |     "    save_json_to_file(protein_data, output_file)\n",
108 |     "\n",
109 |     "if __name__ == \"__main__\":\n",
110 |     "    main()"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "id": "4c2a2954",
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "# Move file to output\n",
121 |     "! mv protein_data.json output/"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "id": "08d994f1",
128 |    "metadata": {},
129 |    "outputs": [],
130 |    "source": [
131 |     "# 2. Query PDB"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "id": "5b72aefe",
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "# Import Libraries\n",
142 |     "import os\n",
143 |     "import requests\n",
144 |     "from Bio import PDB"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "id": "62f76cc5",
151 |    "metadata": {},
152 |    "outputs": [],
153 |    "source": [
154 |     "# Download from PDB given an ID\n",
155 |     "def download_pdb(pdb_id, output_dir=\"output\"):\n",
156 |     "    \"\"\"\n",
157 |     "    Downloads a PDB file and its entry metadata (JSON) from RCSB; a non-200 response is reported by printing.\n",
158 |     "\n",
159 |     "    :param pdb_id: The 4-character PDB ID (e.g., '1A8M')\n",
160 |     "    :param output_dir: Directory where files will be saved\n",
161 |     "    \"\"\"\n",
162 |     "    pdb_id = pdb_id.lower()  # Ensure the PDB ID is lowercase\n",
163 |     "    base_url = \"https://files.rcsb.org/download\"\n",
164 |     "    metadata_url = f\"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}\"\n",
165 |     "    pdb_url = f\"{base_url}/{pdb_id}.pdb\"\n",
166 |     "    \n",
167 |     "    # Create output directory if it doesn't exist\n",
168 |     "    os.makedirs(output_dir, exist_ok=True)\n",
169 |     "    \n",
170 |     "    # Download PDB file (timeout so a stalled connection cannot hang the notebook)\n",
171 |     "    pdb_file_path = os.path.join(output_dir, f\"{pdb_id}.pdb\")\n",
172 |     "    response = requests.get(pdb_url, timeout=30)\n",
173 |     "    if response.status_code == 200:\n",
174 |     "        with open(pdb_file_path, \"w\") as file:\n",
175 |     "            file.write(response.text)\n",
176 |     "        print(f\"PDB file saved at: {pdb_file_path}\")\n",
177 |     "    else:\n",
178 |     "        print(f\"Failed to download PDB file for {pdb_id}.\")\n",
179 |     "    \n",
180 |     "    # Download metadata (same timeout as the PDB file request)\n",
181 |     "    metadata_file_path = os.path.join(output_dir, f\"{pdb_id}_metadata.json\")\n",
182 |     "    response = requests.get(metadata_url, timeout=30)\n",
183 |     "    if response.status_code == 200:\n",
184 |     "        with open(metadata_file_path, \"w\") as file:\n",
185 |     "            file.write(response.text)\n",
186 |     "        print(f\"Metadata saved at: {metadata_file_path}\")\n",
187 |     "    else:\n",
188 |     "        print(f\"Failed to download metadata for {pdb_id}.\")"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "id": "f3dab7d9",
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "# Run the function for our protein\n",
199 |     "pdb_id = \"1A8M\"  \n",
200 |     "download_pdb(pdb_id)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "id": "6167a641",
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "## End of Notebook ##"
211 |    ]
212 |   }
213 |  ],
214 |  "metadata": {
215 |   "kernelspec": {
216 |    "display_name": "Python 3 (ipykernel)",
217 |    "language": "python",
218 |    "name": "python3"
219 |   },
220 |   "language_info": {
221 |    "codemirror_mode": {
222 |     "name": "ipython",
223 |     "version": 3
224 |    },
225 |    "file_extension": ".py",
226 |    "mimetype": "text/x-python",
227 |    "name": "python",
228 |    "nbconvert_exporter": "python",
229 |    "pygments_lexer": "ipython3",
230 |    "version": "3.11.3"
231 |   }
232 |  },
233 |  "nbformat": 4,
234 |  "nbformat_minor": 5
235 | }
236 | 


--------------------------------------------------------------------------------
/Ch09/Ch09-3-geometric-operations.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "e5b41ab1",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch09-3 Geometric Operations"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "b6cfdd54-15f6-4826-9d5b-f46af6ba0b01",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Installations (if not already completed)\n",
 21 |     "# ! pip install biopython"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "993c591b-f603-488d-8c5f-a5f7cabee97d",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# 1. Import Libraries\n",
 32 |     "import numpy as np\n",
 33 |     "from Bio import PDB \n",
 34 |     "import pandas as pd"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "id": "9a4aa2bf",
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# 2. Retrieve data\n",
 45 |     "repository = PDB.PDBList() \n",
 46 |     "parser = PDB.PDBParser() \n",
 47 |     "repository.retrieve_pdb_file('1TUP', pdir='.', file_format='pdb') \n",
 48 |     "p53_1tup = parser.get_structure('P 53', 'pdb1tup.ent') \n",
 49 |     "# Note - it is OK to get warnings about \"Structure Exists\" here"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "id": "17b74123",
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "# 3. Recall residue types\n",
 60 |     "my_residues = set() \n",
 61 |     "for residue in p53_1tup.get_residues(): \n",
 62 |     "    my_residues.add(residue.id[0]) \n",
 63 |     "print(my_residues) "
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": null,
 69 |    "id": "9a7aa5fe",
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "# 4. Compute masses for chains, zincs, and waters\n",
 74 |     "# Sum atom.mass over the atoms accepted by accept_fun (default: accept every atom)\n",
 75 |     "def get_mass(atoms, accept_fun=lambda x: True):\n",
 76 |     "    return sum([atom.mass for atom in atoms if accept_fun(atom)])\n",
 77 |     "# Extract chain names\n",
 78 |     "chain_names = [chain.id for chain in p53_1tup.get_chains()]\n",
 79 |     "# Zero-initialize the mass array; only column 0 (total chain mass) is filled in this cell\n",
 80 |     "my_mass = np.zeros((len(chain_names), 3))\n",
 81 |     "# Iterate over chains to compute mass\n",
 82 |     "for i, chain in enumerate(p53_1tup.get_chains()):\n",
 83 |     "    my_mass[i, 0] = get_mass(chain.get_atoms())\n",
 84 |     "print(\"Mass array:\", my_mass)"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "id": "f89881a3-60ac-4a4c-8c92-ce23dd8427b4",
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "# Alternate for the above with better formatting\n",
 95 |     "import numpy as np\n",
 96 |     "\n",
 97 |     "# Function to compute mass\n",
 98 |     "def get_mass(atoms, accept_fun=lambda x: True):\n",
 99 |     "    \"\"\"Calculate total mass of atoms that pass the acceptance function.\"\"\"\n",
100 |     "    return sum([atom.mass for atom in atoms if accept_fun(atom)])\n",
101 |     "\n",
102 |     "# Filter functions for different atom types\n",
103 |     "def is_not_water(atom):\n",
104 |     "    \"\"\"Returns True if atom is not part of a water molecule.\"\"\"\n",
105 |     "    return atom.get_parent().get_resname() != 'HOH'\n",
106 |     "\n",
107 |     "def is_zinc(atom):\n",
108 |     "    \"\"\"Returns True if atom is zinc.\"\"\"\n",
109 |     "    return atom.element == 'ZN'\n",
110 |     "\n",
111 |     "def is_water(atom):\n",
112 |     "    \"\"\"Returns True if atom is part of a water molecule.\"\"\"\n",
113 |     "    return atom.get_parent().get_resname() == 'HOH'\n",
114 |     "\n",
115 |     "# Extract chain names\n",
116 |     "chain_names = [chain.id for chain in p53_1tup.get_chains()]\n",
117 |     "\n",
118 |     "# Initialize NumPy array for masses (3 columns: No water, Zincs, Water)\n",
119 |     "my_mass = np.zeros((len(chain_names), 3))\n",
120 |     "\n",
121 |     "# Iterate over chains to compute mass for each category\n",
122 |     "for i, chain in enumerate(p53_1tup.get_chains()):\n",
123 |     "    my_mass[i, 0] = get_mass(chain.get_atoms(), is_not_water)  # No water\n",
124 |     "    my_mass[i, 1] = get_mass(chain.get_atoms(), is_zinc)       # Zincs\n",
125 |     "    my_mass[i, 2] = get_mass(chain.get_atoms(), is_water)      # Water\n",
126 |     "\n",
127 |     "# Create nicely formatted table\n",
128 |     "print(\"Mass Distribution by Chain (Daltons)\")\n",
129 |     "print(\"=\" * 45)\n",
130 |     "print(f\"{'Chain':<6} {'No water':<12} {'Zincs':<12} {'Water':<12}\")\n",
131 |     "print(\"-\" * 45)\n",
132 |     "\n",
133 |     "for i, chain_id in enumerate(chain_names):\n",
134 |     "    print(f\"{chain_id:<6} {my_mass[i, 0]:<12.2f} {my_mass[i, 1]:<12.2f} {my_mass[i, 2]:<12.2f}\")\n",
135 |     "\n",
136 |     "print(\"-\" * 45)\n",
137 |     "\n",
138 |     "# Calculate and display totals\n",
139 |     "totals = np.sum(my_mass, axis=0)\n",
140 |     "print(f\"{'Total':<6} {totals[0]:<12.2f} {totals[1]:<12.2f} {totals[2]:<12.2f}\")\n",
141 |     "\n",
142 |     "print(f\"\\nRaw mass array:\\n{my_mass}\")"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": null,
148 |    "id": "088df255",
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "# 5. Compute geometric center and center of mass (atoms whose residue flag is 'W' get weight 0)\n",
153 |     "def get_center(atoms, \n",
154 |     "    weight_fun=lambda atom: 1 if atom.parent.id[0] != 'W' else 0): \n",
155 |     "    xsum = ysum = zsum = 0.0  # weighted coordinate sums\n",
156 |     "    acum = 0.0  # sum of all weights (the denominator)\n",
157 |     "    for atom in atoms: \n",
158 |     "        x, y, z = atom.coord \n",
159 |     "        weight = weight_fun(atom) \n",
160 |     "        acum += weight \n",
161 |     "        xsum += weight * x \n",
162 |     "        ysum += weight * y \n",
163 |     "        zsum += weight * z \n",
164 |     "    return xsum / acum, ysum / acum, zsum / acum \n",
165 |     "print(get_center(p53_1tup.get_atoms()))  # geometric center (weight 1 per atom)\n",
166 |     "print(get_center(p53_1tup.get_atoms(),  # center of mass (weight = atom.mass)\n",
167 |     "    weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0)) "
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "id": "1b5c3c06",
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "# 6. Compute center of mass and geometric center of each chain\n",
178 |     "my_center = np.ndarray((len(chain_names), 6)) \n",
179 |     "for i, chain in enumerate(p53_1tup.get_chains()): \n",
180 |     "    x, y, z = get_center(chain.get_atoms()) \n",
181 |     "    my_center[i, 0] = x \n",
182 |     "    my_center[i, 1] = y \n",
183 |     "    my_center[i, 2] = z \n",
184 |     "    x, y, z = get_center(chain.get_atoms(), \n",
185 |     "        weight_fun=lambda atom: atom.mass if atom.parent.id[0] != 'W' else 0) \n",
186 |     "    my_center[i, 3] = x \n",
187 |     "    my_center[i, 4] = y \n",
188 |     "    my_center[i, 5] = z \n",
189 |     "weights = pd.DataFrame(my_center, index=chain_names, \n",
190 |     "    columns=['X', 'Y', 'Z', 'X (Mass)', 'Y (Mass)', 'Z (Mass)']) \n",
191 |     "print(weights) "
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "id": "d7b3bd59",
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "## End of Notebook ##"
202 |    ]
203 |   }
204 |  ],
205 |  "metadata": {
206 |   "kernelspec": {
207 |    "display_name": "Python 3 (ipykernel)",
208 |    "language": "python",
209 |    "name": "python3"
210 |   },
211 |   "language_info": {
212 |    "codemirror_mode": {
213 |     "name": "ipython",
214 |     "version": 3
215 |    },
216 |    "file_extension": ".py",
217 |    "mimetype": "text/x-python",
218 |    "name": "python",
219 |    "nbconvert_exporter": "python",
220 |    "pygments_lexer": "ipython3",
221 |    "version": "3.12.11"
222 |   }
223 |  },
224 |  "nbformat": 4,
225 |  "nbformat_minor": 5
226 | }
227 | 


--------------------------------------------------------------------------------
/Ch08/Ch08-2-using-sra.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "5259b0e5",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch08-2 Using the Sequence Read Archive (SRA)"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "debe30e4",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# To get the following code to run, you'll need to make sure fasterq-dump is already in your PATH\n",
 21 |     "#  If you have not already done so, you'll want to install it and add it to your PATH\n",
 22 |     "#  Then close out your notebooks and restart jupyter notebook from a terminal where you know you can\n",
 23 |     "#   see fasterq-dump in your path"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "id": "c2f80ae6",
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "# Refer to Ch 5 Recipe 2 for sra tools install\n",
 34 |     "# Run the command below in your terminal to make sure fasterq-dump is in your path and add it to your zshrc file\n",
 35 |     "#   (check that your path is correct, it may not be the same as below)\n",
 36 |     "! echo 'export PATH=$PATH:~/Software/sratoolkit.3.1.1-mac-x86_64/bin' >> ~/.zshrc\n",
 37 |     "! source ~/.zshrc \n",
 38 |     "# check that fasterq-dump is working\n",
 39 |     "! fasterq-dump -h\n",
 40 |     "# Then restart your jupyter notebook"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "id": "9d917dd1",
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "# Install pysradb\n",
 51 |     "! pip install pysradb"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "id": "9a425eee",
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# Import Libraries\n",
 62 |     "import os\n",
 63 |     "import subprocess\n",
 64 |     "from pysradb.sraweb import SRAweb"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "id": "3728a992",
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "def fetch_sra_metadata(sra_accession):\n",
 75 |     "    \"\"\"\n",
 76 |     "    Retrieve metadata for a given SRA accession using pysradb.\n",
 77 |     "    \n",
 78 |     "    Parameters:\n",
 79 |     "        sra_accession (str): SRA study or run accession (e.g., SRP, SRX, SRA, or ERR).\n",
 80 |     "        \n",
 81 |     "    Returns:\n",
 82 |     "        metadata (DataFrame): Metadata table for the SRA accession.\n",
 83 |     "    \"\"\"\n",
 84 |     "    db = SRAweb()\n",
 85 |     "    metadata = db.sra_metadata(sra_accession, detailed=True)\n",
 86 |     "    return metadata"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": null,
 92 |    "id": "4a86b2b1",
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "def download_sra_run(run_accession, output_dir=\"sra_data\"):\n",
 97 |     "    \"\"\"\n",
 98 |     "    Download SRA run data using fasterq-dump.\n",
 99 |     "    \n",
100 |     "    Parameters:\n",
101 |     "        run_accession (str): The specific SRA run accession (e.g., SRR12345678).\n",
102 |     "        output_dir (str): Directory to save the downloaded data.\n",
103 |     "        \n",
104 |     "    Returns:\n",
105 |     "        None\n",
106 |     "    \"\"\"\n",
107 |     "    if not os.path.exists(output_dir):\n",
108 |     "        os.makedirs(output_dir)\n",
109 |     "    \n",
110 |     "    try:\n",
111 |     "        print(f\"Downloading SRA run {run_accession}...\")\n",
112 |     "        # Command to download and convert to FASTQ\n",
113 |     "        subprocess.run(\n",
114 |     "            [\"fasterq-dump\", run_accession, \"--outdir\", output_dir, \"--split-files\"],\n",
115 |     "            check=True\n",
116 |     "        )\n",
117 |     "        print(f\"Download complete. Files saved in {output_dir}\")\n",
118 |     "    except subprocess.CalledProcessError as e:\n",
119 |     "        print(f\"Error downloading {run_accession}: {e}\")"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "id": "2946335e",
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "def main():\n",
130 |     "    \"\"\"Print metadata for a small test accession, then download its first run.\"\"\"\n",
131 |     "    sra_accession = \"SRR536546\"  # Small test dataset\n",
132 |     "    metadata = fetch_sra_metadata(sra_accession)\n",
133 |     "    print(\"Metadata for the accession:\")\n",
134 |     "    print(metadata)\n",
135 |     "\n",
136 |     "    # Guard clause: an empty metadata table means there is nothing to download\n",
137 |     "    if metadata.empty:\n",
138 |     "        print(\"No runs found for this accession.\")\n",
139 |     "        return\n",
140 |     "    # Download the first run as an example\n",
141 |     "    download_sra_run(metadata[\"run_accession\"].iloc[0])\n",
142 |     "if __name__ == \"__main__\":\n",
143 |     "    main()"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "id": "55f23863",
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "# Move folder to output\n",
154 |     "! mv sra_data output/"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": null,
160 |    "id": "26340898",
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "# Use BLAST #"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "id": "373178eb",
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "# Query using BLAST via the NCBI API\n",
175 |     "from Bio.Blast import NCBIWWW\n",
176 |     "from Bio.Blast import NCBIXML\n",
177 |     "\n",
178 |     "# Define a sample FASTA sequence\n",
179 |     "query_sequence = \">test_query\\nATGGCCATTGTAATCATGTTCTAATAGTGTTCA\"\n",
180 |     "\n",
181 |     "# Submit the query to NCBI BLAST (nucleotide BLAST: blastn)\n",
182 |     "result_handle = NCBIWWW.qblast(\"blastn\", \"nt\", query_sequence)\n",
183 |     "\n",
184 |     "# Save the results to a file\n",
185 |     "with open(\"blast_result.xml\", \"w\") as out_file:\n",
186 |     "    out_file.write(result_handle.read())\n",
187 |     "\n",
188 |     "print(\"BLAST search completed! Results saved in 'blast_result.xml'\")"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "id": "c1bc752f",
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "# Parse the BLAST output #\n",
199 |     "# Read BLAST results from XML file\n",
200 |     "with open(\"blast_result.xml\") as result_file:\n",
201 |     "    blast_records = NCBIXML.read(result_file)\n",
202 |     "\n",
203 |     "# Print top hits\n",
204 |     "for alignment in blast_records.alignments[:5]:  # Display top 5 hits\n",
205 |     "    print(f\"Hit: {alignment.title}\")\n",
206 |     "    for hsp in alignment.hsps:\n",
207 |     "        print(f\"  Score: {hsp.score}, E-value: {hsp.expect}\")"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "id": "dfc177b1",
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "# Move the file to the output subdirectory\n",
218 |     "! mv blast_result.xml output/"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "id": "fc02d67c",
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "## End of Notebook ##"
229 |    ]
230 |   }
231 |  ],
232 |  "metadata": {
233 |   "kernelspec": {
234 |    "display_name": "Python 3 (ipykernel)",
235 |    "language": "python",
236 |    "name": "python3"
237 |   },
238 |   "language_info": {
239 |    "codemirror_mode": {
240 |     "name": "ipython",
241 |     "version": 3
242 |    },
243 |    "file_extension": ".py",
244 |    "mimetype": "text/x-python",
245 |    "name": "python",
246 |    "nbconvert_exporter": "python",
247 |    "pygments_lexer": "ipython3",
248 |    "version": "3.12.11"
249 |   }
250 |  },
251 |  "nbformat": 4,
252 |  "nbformat_minor": 5
253 | }
254 | 


--------------------------------------------------------------------------------
/Ch11/Ch11-1-plink.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "24b7b42f-b2e4-44dd-93f4-6a9367318395",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch11-1-plink [Updated]"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "e2e4f87a-2179-4f3b-b561-26f83ada3027",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install Plink\n",
 21 |     "#. Download the appropriate binary for your system from here:\n",
 22 |     "#. https://www.cog-genomics.org/plink/2.0/\n",
 23 |     "# Move the file from your Downloads directory to your Ch11 working directory \n",
 24 |     "# unzip the file\n",
 25 |     "# Test by running: \n",
 26 |     "# ./plink2\n",
 27 |     "# You will most likely get a message saying you cannot trust this file.  If so, do the following:\n",
 28 |     "    # 1. Go into your Mac Settings ->\n",
 29 |     "    # 2. Open System Preferences → Security & Privacy:\n",
 30 |     "    # 3. Click on the General tab.\n",
 31 |     "    # 4. Look for a message saying that \"plink2 was blocked because it is from an unidentified developer.\"\n",
 32 |     "    # 5. Click Allow Anyway\n",
 33 |     "# You should now be able to run ./plink2 \n",
 34 |     "# You may need to click Allow Anyway again and provide the administrator password for your Mac"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "id": "e8ec5e2c-2eca-40cc-ab26-8ff7cb994469",
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "# Download the data\n",
 45 |     "# First go here and download the 2 files:\n",
 46 |     "# https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/plink_format/\n",
 47 |     "# move them to your Ch11/data working directory\n",
 48 |     "# Unzip the files:\n",
 49 |     "! gunzip data/hapmap3_r3_b36_fwd.consensus.qc.poly.map.gz \n",
 50 |     "! gunzip data/hapmap3_r3_b36_fwd.consensus.qc.poly.ped.gz \n",
 51 |     "# Get the relationships file\n",
 52 |     "! wget https://ftp.ncbi.nlm.nih.gov/hapmap/genotypes/hapmap3_r3/relationships_w_pops_041510.txt\n",
 53 |     "! mv relationships_w_pops_041510.txt data/"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "id": "dded2683-c7c3-473d-a54f-d8c3b873765e",
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "# 1 - get metadata for samples\n",
 64 |     "# Import libraries\n",
 65 |     "import os\n",
 66 |     "from collections import defaultdict\n",
 67 |     "pop_ind = defaultdict(list)  # population -> list of (fam_id, ind_id) tuples\n",
 68 |     "offspring = []  # (fam_id, ind_id) of rows whose mom or dad field is not '0'\n",
 69 |     "with open('data/relationships_w_pops_041510.txt') as f:  # closed even if a row fails\n",
 70 |     "    f.readline()  # skip the header line\n",
 71 |     "    for l in f:\n",
 72 |     "        toks = l.rstrip().split('\\t')\n",
 73 |     "        fam_id = toks[0]\n",
 74 |     "        ind_id = toks[1]\n",
 75 |     "        mom = toks[2]\n",
 76 |     "        dad = toks[3]\n",
 77 |     "        if mom != '0' or dad != '0':\n",
 78 |     "            offspring.append((fam_id, ind_id))\n",
 79 |     "        pop = toks[-1]  # population label is the last column\n",
 80 |     "        # Kept inside the loop so every individual is recorded, not just the last row\n",
 81 |     "        pop_ind[pop].append((fam_id, ind_id))"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "id": "1d74e714-f0d8-41be-b19d-599c0be59cd5",
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# 2 - subsample the data\n",
 92 |     "#. Note - replace the path below with your path to plink2\n",
 93 |     "! ./plink2 --pedmap data/hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap10 --thin 0.1 --geno 0.1 --export ped \n",
 94 |     "! ./plink2 --pedmap data/hapmap3_r3_b36_fwd.consensus.qc.poly --out hapmap1 --thin 0.01 --geno 0.1 --export ped "
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "id": "636bfbe0-6c0a-4597-9fc7-32a947225272",
101 |    "metadata": {},
102 |    "outputs": [],
103 |    "source": [
104 |     "# 3. Generate subsets with just the autosomes\n",
105 |     "def get_non_auto_SNPs(map_file, exclude_file):\n",
106 |     "    \"\"\"Write to exclude_file the 2nd column of each map_file row whose 1st column is not an integer.\"\"\"\n",
107 |     "    # Context managers close both files, even if a row fails to parse\n",
108 |     "    with open(map_file) as f, open(exclude_file, 'w') as w:\n",
109 |     "        for l in f:\n",
110 |     "            toks = l.rstrip().split('\\t')\n",
111 |     "            try:\n",
112 |     "                int(toks[0])  # only checking that the chromosome field is numeric\n",
113 |     "            except ValueError:\n",
114 |     "                # Non-numeric chromosome label: list this SNP ID for exclusion\n",
115 |     "                w.write('%s\\n' % toks[1])\n",
116 |     "get_non_auto_SNPs('hapmap1.map', 'exclude1.txt') \n",
117 |     "get_non_auto_SNPs('hapmap10.map', 'exclude10.txt') \n",
118 |     "os.system('./plink2 --pedmap hapmap1 --out hapmap1_auto --exclude exclude1.txt --export ped') \n",
119 |     "os.system('./plink2 --pedmap hapmap10 --out hapmap10_auto --exclude exclude10.txt --export ped') "
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "id": "c9444cbc-36b2-4131-9d6f-e19a6d6ff0b1",
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "# 5. The function that lists SNPs not belonging to autosomes (get_non_auto_SNPs) is defined above"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "id": "e0240969-9150-48e3-9bd9-7384f747adab",
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "# 6. Datasets without offspring\n",
140 |     "os.system('./plink2 --pedmap hapmap10_auto --filter-founders --out hapmap10_auto_noofs --export ped') "
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "e864875b-0d9c-4bcb-9262-76b8f6eaffc3",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# 7. Generate LD-pruned dataset\n",
151 |     "os.system('./plink2 --pedmap hapmap10_auto_noofs --indep-pairwise 50 10 0.1 --out keep --export ped') \n",
152 |     "#os.system('~/work/CookBook/Ch11/plink2 --pedmap hapmap10_auto_noofs --extract keep.prune.in --recode --out hapmap10_auto_noofs_ld --export ped')\n",
153 |     "# Remove --recode from the original command to remove an error: \n",
154 |     "os.system('./plink2 --pedmap hapmap10_auto_noofs --extract keep.prune.in --out hapmap10_auto_noofs_ld --export ped')"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": null,
160 |    "id": "12df8491-9975-433e-a746-7a85536d7581",
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "# 8.  Recode cases in different formats\n",
165 |     "# os.system('~/work/CookBook/Ch11/plink2 --file hapmap10_auto_noofs_ld --recode12 tab --out hapmap10_auto_noofs_ld_12 --export ped 12') \n",
166 |     "#. Note - fixed above original command to not use --file\n",
167 |     "os.system('./plink2 --pedmap hapmap10_auto_noofs_ld --export ped --out hapmap10_auto_noofs_ld_12')\n",
168 |     "# os.system('~/work/CookBook/Ch11/plink2 --make-bed --file hapmap10_auto_noofs_ld --out hapmap10_auto_noofs_ld') \n",
169 |     "# Note - fixed above original command to not use --file\n",
170 |     "os.system('./plink2 --pedmap hapmap10_auto_noofs_ld --make-bed --out hapmap10_auto_noofs_ld')"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "id": "9d74a18a-f332-4dd9-8de0-9239b3fb303e",
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "# 9. Extract a single chromosome for analysis\n",
181 |     "os.system('./plink2 --pedmap hapmap10_auto_noofs --chr 2 --out hapmap10_auto_noofs_2 --export ped') "
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "id": "12902b40-83a5-4f38-bd70-b9e5b4bbbfc1",
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "## End of Notebook ##"
192 |    ]
193 |   }
194 |  ],
195 |  "metadata": {
196 |   "kernelspec": {
197 |    "display_name": "Python 3 (ipykernel)",
198 |    "language": "python",
199 |    "name": "python3"
200 |   },
201 |   "language_info": {
202 |    "codemirror_mode": {
203 |     "name": "ipython",
204 |     "version": 3
205 |    },
206 |    "file_extension": ".py",
207 |    "mimetype": "text/x-python",
208 |    "name": "python",
209 |    "nbconvert_exporter": "python",
210 |    "pygments_lexer": "ipython3",
211 |    "version": "3.11.14"
212 |   }
213 |  },
214 |  "nbformat": 4,
215 |  "nbformat_minor": 5
216 | }
217 | 


--------------------------------------------------------------------------------
/Ch04/Ch04-4-decision-trees.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "5c902a08",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "# Ch04-4 - Using Decision Trees to Explore Breast Cancer data"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "344986b2",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# Install the Seaborn library for graphing\n",
 21 |     "! pip install seaborn==0.13.2"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "id": "bb4bb351",
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# Import necessary libraries\n",
 32 |     "from sklearn.datasets import load_breast_cancer\n",
 33 |     "from sklearn.model_selection import train_test_split\n",
 34 |     "from sklearn.tree import DecisionTreeClassifier, plot_tree\n",
 35 |     "from sklearn.metrics import (\n",
 36 |     "    accuracy_score, \n",
 37 |     "    confusion_matrix, \n",
 38 |     "    classification_report, \n",
 39 |     "    precision_score, \n",
 40 |     "    recall_score, \n",
 41 |     "    f1_score\n",
 42 |     ")\n",
 43 |     "import matplotlib.pyplot as plt\n",
 44 |     "import seaborn as sns\n",
 45 |     "import numpy as np"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "id": "c71173e5",
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "# Load the breast cancer dataset\n",
 56 |     "data = load_breast_cancer()\n",
 57 |     "X, y = data.data, data.target"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "id": "b19fbc89",
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "# Split the data into training and testing sets\n",
 68 |     "X_train, X_test, y_train, y_test = train_test_split(\n",
 69 |     "    X, y, test_size=0.2, random_state=42\n",
 70 |     ")"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "id": "df42c41a",
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "# Create and train the decision tree classifier\n",
 81 |     "dt_classifier = DecisionTreeClassifier(\n",
 82 |     "    random_state=42,  # For reproducibility\n",
 83 |     "    max_depth=5,      # Limit tree depth to prevent overfitting\n",
 84 |     "    criterion='gini'  # Can also use 'entropy'\n",
 85 |     ")\n",
 86 |     "dt_classifier.fit(X_train, y_train)"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": null,
 92 |    "id": "be10774e",
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "# Make predictions\n",
 97 |     "y_pred = dt_classifier.predict(X_test)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "id": "0f16bec7",
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# Performance metrics\n",
108 |     "print(\"Decision Tree Performance Metrics:\")\n",
109 |     "print(\"-\" * 30)\n",
110 |     "print(f\"Accuracy: {accuracy_score(y_test, y_pred):.4f}\")\n",
111 |     "print(f\"Precision: {precision_score(y_test, y_pred):.4f}\")\n",
112 |     "print(f\"Recall: {recall_score(y_test, y_pred):.4f}\")\n",
113 |     "print(f\"F1 Score: {f1_score(y_test, y_pred):.4f}\")"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "id": "c85362d2",
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": [
123 |     "# Detailed classification report\n",
124 |     "print(\"\\nDetailed Classification Report:\")\n",
125 |     "print(classification_report(y_test, y_pred, \n",
126 |     "                            target_names=data.target_names))"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": null,
132 |    "id": "e61f82e4",
133 |    "metadata": {
134 |     "scrolled": true
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "# Confusion Matrix Visualization\n",
139 |     "plt.figure(figsize=(8, 6))\n",
140 |     "cm = confusion_matrix(y_test, y_pred)\n",
141 |     "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n",
142 |     "            xticklabels=data.target_names,\n",
143 |     "            yticklabels=data.target_names)\n",
144 |     "plt.title('Confusion Matrix for Decision Tree')\n",
145 |     "plt.xlabel('Predicted Label')\n",
146 |     "plt.ylabel('True Label')\n",
147 |     "plt.tight_layout()\n",
148 |     "plt.show()"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "id": "1fe7bfba",
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "# Feature Importance Visualization\n",
159 |     "plt.figure(figsize=(10, 6))\n",
160 |     "feature_importance = dt_classifier.feature_importances_\n",
161 |     "sorted_idx = np.argsort(feature_importance)\n",
162 |     "pos = np.arange(sorted_idx.shape[0]) + .5\n",
163 |     "\n",
164 |     "plt.barh(pos, feature_importance[sorted_idx], align='center')\n",
165 |     "plt.yticks(pos, [data.feature_names[i] for i in sorted_idx])\n",
166 |     "plt.xlabel('Feature Importance')\n",
167 |     "plt.title('Decision Tree Feature Importance')\n",
168 |     "plt.tight_layout()\n",
169 |     "plt.show()"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "id": "99250c28",
176 |    "metadata": {
177 |     "scrolled": true
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "# Visualize the Decision Tree\n",
182 |     "plt.figure(figsize=(20,10))\n",
183 |     "plot_tree(dt_classifier, \n",
184 |     "          feature_names=data.feature_names,\n",
185 |     "          class_names=data.target_names,\n",
186 |     "          filled=True, \n",
187 |     "          rounded=True)\n",
188 |     "plt.title('Decision Tree Classifier')\n",
189 |     "plt.show()"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "id": "d7fc8563-a9e9-4694-8f05-51ecfeca5487",
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "# Clearer view of the image\n",
200 |     "import matplotlib.pyplot as plt\n",
201 |     "from sklearn import tree\n",
202 |     "\n",
203 |     "# Create a larger figure with higher resolution\n",
204 |     "plt.figure(figsize=(24, 12), dpi=300)  # Larger size, higher resolution\n",
205 |     "\n",
206 |     "# Plot the decision tree\n",
207 |     "tree.plot_tree(dt_classifier, \n",
208 |     "               feature_names=data.feature_names,\n",
209 |     "               class_names=data.target_names,\n",
210 |     "               filled=True, \n",
211 |     "               rounded=True,\n",
212 |     "               fontsize=14)  # Increase font size\n",
213 |     "\n",
214 |     "# Add a title\n",
215 |     "plt.title('Decision Tree Classifier', fontsize=18)\n",
216 |     "\n",
217 |     "# Show the plot\n",
218 |     "plt.show()"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "id": "ea51a67a",
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "# Optional: Cross-validation for more robust performance estimation\n",
229 |     "from sklearn.model_selection import cross_val_score\n",
230 |     "cv_scores = cross_val_score(dt_classifier, X, y, cv=5)\n",
231 |     "print(\"\\nCross-Validation Scores:\")\n",
232 |     "print(f\"Mean CV Score: {cv_scores.mean():.4f}\")\n",
233 |     "print(f\"Standard Deviation: {cv_scores.std():.4f}\")"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "id": "7f2160aa",
240 |    "metadata": {},
241 |    "outputs": [],
242 |    "source": [
243 |     "## End of Notebook ##"
244 |    ]
245 |   }
246 |  ],
247 |  "metadata": {
248 |   "kernelspec": {
249 |    "display_name": "Python 3 (ipykernel)",
250 |    "language": "python",
251 |    "name": "python3"
252 |   },
253 |   "language_info": {
254 |    "codemirror_mode": {
255 |     "name": "ipython",
256 |     "version": 3
257 |    },
258 |    "file_extension": ".py",
259 |    "mimetype": "text/x-python",
260 |    "name": "python",
261 |    "nbconvert_exporter": "python",
262 |    "pygments_lexer": "ipython3",
263 |    "version": "3.8.6"
264 |   }
265 |  },
266 |  "nbformat": 4,
267 |  "nbformat_minor": 5
268 | }
269 | 


--------------------------------------------------------------------------------