├── .gitignore
├── .travis.yml
├── CODE-OF-CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile.template
├── ISSUE_TEMPLATE.md
├── LICENSE
├── README.md
├── book
├── applications
│ ├── biological-diversity.md
│ ├── images
│ │ ├── basic-unrooted-tree1.jpg
│ │ ├── example_big_dm.png
│ │ ├── unifrac_tree_d0.5.graffle
│ │ ├── unifrac_tree_d0.5.png
│ │ ├── unifrac_tree_d0.graffle
│ │ ├── unifrac_tree_d0.png
│ │ ├── unifrac_tree_d1.graffle
│ │ ├── unifrac_tree_d1.png
│ │ ├── unifrac_tree_with_distances.graffle
│ │ ├── unifrac_tree_with_distances.png
│ │ ├── unifrac_tree_with_distances_ab.graffle
│ │ └── unifrac_tree_with_distances_ab.png
│ ├── index.md
│ └── index.yaml
├── back-matter
│ ├── about-the-author.md
│ ├── glossary.md
│ ├── index.md
│ └── index.yaml
├── config.yaml
├── exercises
│ ├── index.md
│ ├── index.yaml
│ ├── multiple-sequence-alignment.md
│ └── pairwise-alignment.md
├── fundamentals
│ ├── database-searching.md
│ ├── images
│ │ ├── Darwins_tree_of_life_1859.png
│ │ ├── Pace_Big_Tree.png
│ │ ├── alignment.graffle
│ │ ├── alignment.png
│ │ ├── basic-rooted-tree1.jpg
│ │ ├── basic-unrooted-tree1.jpg
│ │ ├── centroid-cluster.1.png
│ │ ├── centroid-cluster.2.png
│ │ ├── centroid-cluster.3.png
│ │ ├── centroid-cluster.4.png
│ │ ├── centroid-cluster.5.png
│ │ ├── centroid-cluster.6.png
│ │ ├── centroid-cluster.graffle
│ │ ├── cluster-legend.graffle
│ │ ├── cluster-legend.png
│ │ ├── cluster-template.graffle
│ │ │ ├── data.plist
│ │ │ └── image1.pdf
│ │ ├── cluster-types.graffle
│ │ │ ├── data.plist
│ │ │ └── image2.pdf
│ │ ├── cluster-types.png
│ │ ├── furthest-neighbor.1.png
│ │ ├── furthest-neighbor.2.png
│ │ ├── furthest-neighbor.3.png
│ │ ├── furthest-neighbor.4.png
│ │ ├── furthest-neighbor.5.png
│ │ ├── furthest-neighbor.6.png
│ │ ├── furthest-neighbor.graffle
│ │ │ ├── data.plist
│ │ │ └── image1.pdf
│ │ ├── msa-tree-a1.graffle
│ │ ├── msa-tree-a1.png
│ │ ├── msa-tree-a2.graffle
│ │ ├── msa-tree-a2.png
│ │ ├── msa-tree-a3.graffle
│ │ ├── msa-tree-a3.png
│ │ ├── msa-tree-final.graffle
│ │ ├── msa-tree-final.png
│ │ ├── msa-tree-input.graffle
│ │ ├── msa-tree-input.png
│ │ ├── nearest-neighbor.1.png
│ │ ├── nearest-neighbor.2.png
│ │ ├── nearest-neighbor.3.png
│ │ ├── nearest-neighbor.4.png
│ │ ├── nearest-neighbor.5.png
│ │ ├── nearest-neighbor.6.png
│ │ ├── nearest-neighbor.graffle
│ │ ├── sequence-evo-tree.graffle
│ │ ├── sequence-evo-tree.png
│ │ ├── spider-tree.png
│ │ ├── tree-monophyly.graffle
│ │ ├── tree-monophyly.png
│ │ ├── tree-polyphyly.graffle
│ │ ├── tree-polyphyly.png
│ │ ├── tree-schematic1.graffle
│ │ ├── tree-schematic1.png
│ │ ├── upgma-tree-final.graffle
│ │ ├── upgma-tree-final.png
│ │ ├── upgma-tree-iter1.graffle
│ │ ├── upgma-tree-iter1.png
│ │ ├── upgma-tree-iter2.graffle
│ │ ├── upgma-tree-iter2.png
│ │ ├── upgma-tree-iter3.graffle
│ │ └── upgma-tree-iter3.png
│ ├── index.md
│ ├── index.yaml
│ ├── machine-learning.md
│ ├── multiple-sequence-alignment.md
│ ├── pairwise-alignment.md
│ ├── phylogeny-reconstruction.md
│ └── sequence-mapping-and-clustering.md
├── getting-started
│ ├── biological-information.md
│ ├── images
│ │ ├── central-dogma.png
│ │ ├── genetic-code.png
│ │ ├── greg-in-telluride.png
│ │ └── trna.png
│ ├── index.md
│ ├── index.yaml
│ └── reading-iab.md
├── images
│ └── logo.png
├── index.md
└── index.yaml
├── custom.css
├── environment.yml
├── iab
├── __init__.py
├── algorithms
│ └── __init__.py
├── data
│ └── __init__.py
└── format
│ ├── __init__.py
│ └── dialog_box.py
├── licenses
├── runipy.txt
└── scikit-bio.txt
├── paper.bib
├── paper.md
├── runipynbs.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # C extensions
4 | *.so
5 |
6 | # Packages
7 | *.egg
8 | *.egg-info
9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | __pycache__
21 |
22 | # Installer logs
23 | pip-log.txt
24 |
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 |
30 | # Translations
31 | *.mo
32 |
33 | # ipynb checkpoint files
34 | .ipynb_checkpoints
35 |
36 | # Mr Developer
37 | .mr.developer.cfg
38 | .project
39 | .pydevproject
40 |
41 | # vim
42 | .*.swp
43 |
44 | # macos
45 | .DS_Store
46 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Taken and modified from
2 | # https://github.com/biocore/scikit-bio/blob/master/.travis.yml
3 | dist: bionic
4 | language: python
5 | services:
6 | - xvfb
7 | env:
8 | - PYTHON_VERSION=3.5
9 | before_install:
10 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
11 | - chmod +x miniconda.sh
12 | - ./miniconda.sh -b
13 | - export PATH=/home/travis/miniconda3/bin:$PATH
14 | install:
15 | - conda env create -n iab -f environment.yml
16 | - source activate iab
17 | - pip install .
18 | - conda install -y nose
19 | - pip install https://github.com/caporaso-lab/build-iab/archive/master.zip
20 | - biab notebook -i book -o ipynb
21 | script:
22 | - nosetests --with-doctest
23 | - cd ipynb
24 | - jupyter nbconvert *ipynb */*ipynb --execute --ExecutePreprocessor.timeout=-1
25 |
--------------------------------------------------------------------------------
/CODE-OF-CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct: An Introduction to Applied Bioinformatics
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at caporaso-lab@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to *An Introduction to Applied Bioinformatics*
2 |
3 | *An Introduction to Applied Bioinformatics* (IAB) is an open source project, and we welcome community contributions. Contributions should generally be made in the form of GitHub pull requests. We've simplified the process of submitting these for changes to the IAB content, as described below.
4 |
5 | You will need a (free) GitHub account to submit a pull request.
6 |
7 | Before considering a contribution to IAB, please read the project [Code of Conduct](https://github.com/applied-bioinformatics/An-Introduction-to-Applied-Bioinformatics/blob/master/CODE-OF-CONDUCT.md).
8 |
9 | ## How IAB is developed
10 |
11 | IAB is written in [markdown](http://commonmark.org/). [build-iab](https://github.com/caporaso-lab/build-iab) is then used to convert that markdown to html (for static online viewing) and ipynb (for interactive use). If you're submitting changes to content, you'll be submitting changes to markdown files. These are much more manageable than submitting changes to IPython notebooks, as it's much easier to diff the content.
12 |
13 | ## Small contributions
14 |
15 | Contributions such as typo fixes, wording changes, and small code updates are relatively easy to submit. Every unit, chapter, section, and subsection heading in IAB has an *Edit on GitHub* link below it. You should click the link corresponding to the section that you want to edit, which will take you to the GitHub online editor. You can make your changes and submit a [pull request](https://help.github.com/articles/using-pull-requests/) from that page. This will initiate the IAB tests, which will ensure that your change hasn't broken anything. After the tests pass, we will review your changes and either merge them, request modifications before merging, or let you know why we won't integrate your proposed changes.
16 |
17 | Watch a five-minute YouTube video illustrating this process [here](https://www.youtube.com/watch?v=s9-aZrX5CY8).
18 |
19 | ## Large contributions
20 |
21 | If you're interested in making contributions involving code refactoring, new chapters or sections, restructuring of content, etc., you should first comment on existing issues (or create new issues) indicating what you want to work on, and wait for us to discuss the changes with you before you get started. This will ensure that the changes you want to make are in line with the direction of the project, compatible with future plans, and that there is no one else already working on related contributions. This will avoid wasted time if your change involves something that we wouldn't ultimately merge. If in doubt, bring it up on the [issue tracker](https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/issues/) first.
22 |
23 | ## Who's contributing?
24 |
25 | See the list of [IAB contributors](https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/graphs/contributors) to find out who is involved with the project. If you submit a pull request that is merged, your GitHub account will be automatically listed on that page.
26 |
27 | ## Technical points
28 |
29 | ### Building IAB locally
30 |
31 | If you're interested in building the IAB html and/or IPython Notebooks locally, you'll need to install IAB and build-iab. You can do this as follows:
32 |
33 | ```
34 | pip install numpy
35 | pip install https://github.com/caporaso-lab/build-iab/archive/master.zip
36 | wget https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/archive/master.zip
37 | unzip master.zip
38 | cd An-Introduction-To-Applied-Bioinformatics-master/
39 | pip install .
40 | ```
41 |
42 | Then, to build the IPython Notebooks, you can run:
43 |
44 | ```
45 | biab notebook -i book -o ipynb
46 | ```
47 |
48 | or to build the HTML version, you can run:
49 |
50 | ```
51 | biab html -i book -o html
52 | ```
53 |
54 | ### Linking to other sections of the text
55 |
56 | All section headings must have ids associated with them. These ids should be generated as follows:
57 |
58 | ```bash
59 | $ biab idgen
60 |
61 | ```
62 |
63 | When you define a section heading, you'd end it with the tag returned from the above command. For example:
64 |
65 | ```markdown
66 | ## Some section <link src='9mM4Bb'/>
67 | ```
68 |
69 | If you then wanted to link to that section from somewhere else in the text, you could do that with a markdown link as follows:
70 |
71 | ```markdown
72 | This concept is discussed in further detail [above](alias://9mM4Bb).
73 | ```
74 |
75 | You should always link using these ids, and never statically link to other sections of the text with URLs (because a section name might change, but its id won't).
76 |
77 | ## License and license changes
78 |
79 | The IAB license is available [here](https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/blob/master/LICENSE). This license may change over time, but the online version of IAB will always be the most current version, and will be available free of charge.
80 |
81 | By contributing to IAB, you are agreeing that Greg Caporaso has sole discretion over the license and any future changes to the license. If a paid (e.g., printed) copy of IAB is ever created, contributors are not entitled to payments or royalties of any kind. Your contribution of content represents your agreement with these terms.
82 |
--------------------------------------------------------------------------------
/Dockerfile.template:
--------------------------------------------------------------------------------
1 | # Distributed under the terms of the Modified BSD License.
2 | #
3 | # This image is expecting to be built alongside the *built* IAB docs,
4 | # e.g. https://github.com/applied-bioinformatics/built-iab,
5 | # and is designed for running on mybinder.org
6 |
7 | FROM jupyter/minimal-notebook:58169ec3cfd3
8 |
9 | LABEL maintainer="Greg Caporaso "
10 |
11 | USER root
12 |
13 | ENV DISPLAY=:99
14 |
15 | RUN apt-get update
16 | RUN apt-get install -y xvfb x11-utils
17 |
18 | COPY IAB-notebooks ${HOME}/IAB-notebooks/
19 | COPY .jupyter/custom/custom.css ${HOME}/.jupyter/custom/custom.css
20 | # `fix-permissions` ships with jupyter/minimal-notebook
21 | RUN fix-permissions ${HOME}
22 | RUN rm -rf work
23 |
24 | USER ${NB_UID}
25 |
26 | COPY environment.yml ${HOME}
27 | RUN conda env update -n base -f environment.yml
28 | RUN rm environment.yml
29 |
30 | # This is almost identical to the `ENTRYPOINT` defined in jupyter/minimal-notebook,
31 | # except we tack on a `xvfb-run` on the end, which ensures that ete3 has X.
32 | ENTRYPOINT ["tini", "-g", "--", "xvfb-run"]
33 |
--------------------------------------------------------------------------------
/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ### Describe the environment
2 |
3 | What version of IAB are you reading (e.g. v0.0.1)?
4 |
5 | ```python
6 | from iab import __version__ as iab_version
7 | print(iab_version)
8 | ```
9 |
10 | ### Describe the problem
11 |
12 | #### Steps to reproduce
13 |
14 | 1. ...
15 | 2. ...
16 | 3. ...
17 |
18 | #### Observed Results
19 |
20 | * What happened? This could be a description, log output, etc.
21 |
22 | #### Expected Results
23 |
24 | * What did you expect to happen?
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # An Introduction To Applied Bioinformatics (1st edition)
2 |
3 | This project has been succeeded by An Introduction to Applied Bioinformatics, 2nd Edition. You can find that book at http://readIAB.org, and the source repository at https://github.com/applied-bioinformatics/iab2. This project (IAB 1st Edition) is no longer under active development.
4 |
--------------------------------------------------------------------------------
/book/applications/images/basic-unrooted-tree1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/basic-unrooted-tree1.jpg
--------------------------------------------------------------------------------
/book/applications/images/example_big_dm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/example_big_dm.png
--------------------------------------------------------------------------------
/book/applications/images/unifrac_tree_d0.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_d0.5.png
--------------------------------------------------------------------------------
/book/applications/images/unifrac_tree_d0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_d0.png
--------------------------------------------------------------------------------
/book/applications/images/unifrac_tree_d1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_d1.png
--------------------------------------------------------------------------------
/book/applications/images/unifrac_tree_with_distances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_with_distances.png
--------------------------------------------------------------------------------
/book/applications/images/unifrac_tree_with_distances_ab.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_with_distances_ab.png
--------------------------------------------------------------------------------
/book/applications/index.md:
--------------------------------------------------------------------------------
1 | # Applications
2 |
--------------------------------------------------------------------------------
/book/applications/index.yaml:
--------------------------------------------------------------------------------
1 | contents:
2 | - biological-diversity
3 |
--------------------------------------------------------------------------------
/book/back-matter/about-the-author.md:
--------------------------------------------------------------------------------
1 | # About the author
2 |
3 | My name is Greg Caporaso. I'm the primary author of *An Introduction to Applied Bioinformatics*, but there are [other contributors](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/graphs/contributors) and I hope that list will grow.
4 |
5 |
6 |
7 |
8 |
9 | I have degrees in Computer Science (B.S., University of Colorado, 2001) and Biochemistry (B.A., University of Colorado, 2004; Ph.D., University of Colorado, 2009). Following my formal training, I joined [Rob Knight's lab](http://knightlab.ucsd.edu), then at the University of Colorado, for approximately 2 years as a post-doctoral scholar. In 2011, I joined the faculty at [Northern Arizona University (NAU)](http://www.nau.edu) where I'm now an Associate Professor. I [teach](http://www.caporasolab.us/teaching/) one course per year in bioinformatics for graduate and undergraduate students of Biology. I also run a [research lab](http://www.caporasolab.us/) in the [Pathogen and Microbiome Institute](http://pmi.nau.edu/), which is focused on developing bioinformatics software and studying microbiomes.
10 |
11 | I'm not the world expert on the topics that I present in IAB, but I have a passion for bioinformatics, open source software, writing, and education. When I'm learning a new bioinformatics concept, for example an algorithm like pairwise alignment or a statistical technique like Monte Carlo simulation, implementing it is usually the best way for me to understand it. This led me to start developing IAB, as I found that my implementations helped my students learn the concepts too. I think that one of my strongest skills is the ability to break complex ideas into accessible components. I do this well for bioinformatics because I remember (and still regularly experience) the challenges of learning it, so can relate to newcomers in the field.
12 |
13 | I'm active in open source bioinformatics software development, and am most widely known for my development and leadership roles on [QIIME](http://qiime.org) and [QIIME 2](https://qiime2.org). I'm involved in other bioinformatics software projects as well (see my [GitHub page](http://github.com/gregcaporaso)). IAB is one of the projects that I'm currently most excited about, and I truly hope that it's as useful for you as it is fun for me.
14 |
15 | For updates on IAB and various other things, you can [follow me on Twitter](https://twitter.com/gregcaporaso).
16 |
--------------------------------------------------------------------------------
/book/back-matter/glossary.md:
--------------------------------------------------------------------------------
1 | # Glossary
2 |
3 | ## Pairwise alignment (noun)
4 |
5 | A hypothesis about which bases or amino acids in two biological sequences are derived from a common ancestral base or amino acid. By definition, the *aligned sequences* will be of equal length with gaps (usually denoted with ``-``, or ``.`` for terminal gaps) indicating hypothesized insertion/deletion events. A pairwise alignment may be represented as follows:
6 |
7 | ```
8 | ACC---GTAC
9 | CCCATCGTAG
10 | ```
11 |
12 | ## kmer (noun)
13 |
14 | A kmer is simply a word (or list of adjacent characters) of length k within a sequence. For example, the overlapping kmers of length five in the sequence ``ACCGTGACCAGTTACCAGTTTGACCAA`` are as follows:
15 |
16 | ```python
17 | >>> import skbio
18 | >>> skbio.DNA('ACCGTGACCAGTTACCAGTTTGACCAA').kmer_frequencies(k=5, overlap=True)
19 | ```
20 |
21 | It is common for bioinformaticians to substitute the value of `k` for the letter _k_ in the word _kmer_. For example, you might hear someone say "we identified all seven-mers in our sequence", to mean they identified all kmers of length seven.
22 |
--------------------------------------------------------------------------------
/book/back-matter/index.md:
--------------------------------------------------------------------------------
1 | # Back Matter
2 |
--------------------------------------------------------------------------------
/book/back-matter/index.yaml:
--------------------------------------------------------------------------------
1 | contents:
2 | - about-the-author
3 | - glossary
4 |
--------------------------------------------------------------------------------
/book/config.yaml:
--------------------------------------------------------------------------------
1 | repo: caporaso-lab/An-Introduction-to-Applied-Bioinformatics
2 | root: book
3 |
--------------------------------------------------------------------------------
/book/exercises/index.md:
--------------------------------------------------------------------------------
1 | # Exercises
2 |
3 | This section contains exercises corresponding to different chapters. In the future these will be structured to link specifically to and from the relevant chapters.
4 |
--------------------------------------------------------------------------------
/book/exercises/index.yaml:
--------------------------------------------------------------------------------
1 | contents:
2 | - pairwise-alignment
3 | - multiple-sequence-alignment
4 |
--------------------------------------------------------------------------------
/book/exercises/pairwise-alignment.md:
--------------------------------------------------------------------------------
1 |
2 | # Local sequence alignment exercises
3 |
4 | ## Purpose
5 |
6 | The purpose of this exercise is to have you combine a few of the topics we have covered in class up to this point, and get you working with code. (Don't panic: you don't have to write code - everything in this assignment can be achieved with cutting and pasting code that we've already written for you.)
7 |
8 | You will need to be familiar with local and global alignments (see the pairwise alignment section) and the process of PCR (see the molecular biology section). The aim is to develop your bioinformatics problem solving skills, while also introducing you to interacting with the IPython Notebook.
9 |
10 | ## Background
11 |
12 | You should start by reading [section 4.3](http://www.ncbi.nlm.nih.gov/books/NBK21129/#A6064) of *Genomes*, by TA Brown. You may find the entire chapter useful.
13 |
14 | A common process in bioinformatics is looking at the composition of microorganisms in a given environment. For instance, we could take a sample from a desk in an office, the gut of a human subject, or the Southern Ocean and ask what microorganisms are present in each of the different samples. The most common way to answer this question is to sequence the 16S rRNA gene. What makes this gene so useful is that it contains several conserved areas, which means that we can isolate it from the full genomes of many organisms using PCR, however it also contains highly variable regions so that we can tell the organisms apart. The sequence of the 16S rRNA therefore serves as a *fingerprint* for a microorganism. If we find it in a sample, that suggests that the organism is present in that sample.
15 |
16 | ## Goals
17 |
18 | In this exercise we will provide you with the full-length 16S rRNA sequences from five different bacterial organisms, and ten candidate primer sequences. Using both global and local sequence alignment, we'll ask you to select what you think is the best single primer pair to use for amplifying 16S rRNA from these five organisms for use in profiling diverse communities of microorganisms.
19 |
20 | Throughout this notebook there will be questions that you are required to answer. They will be written in bold so you know they are required and they should also help you with the overall process.
21 |
22 | ## Hints
23 |
24 | * Read all of the cells containing text very carefully!
25 |
26 | * You may write code or use a text editor if you wish, however all of the tools necessary to answer the questions are present in this notebook.
27 |
28 | * Spend some time thinking about what the question is and how you can go about answering it. This assignment is based largely on problem solving skills and it may take time to develop a good strategy.
29 |
30 | * Get help, that's what office hours are for!
31 |
32 | * You are allowed to discuss the assignment with other students, however your work needs to be your own. Using or looking at code or commands generated by another student is strictly prohibited. If you're in doubt over whether some type of interaction is acceptable for this assignment, ask.
33 |
34 | ## Getting started
35 |
36 | The first thing you will want to do is import a couple of functions that will be necessary for this problem.
37 |
38 | ```python
39 | >>> %pylab inline
40 | >>> from iab.algorithms import sw_align_nt, nw_align_nt
41 | ```
42 |
43 | Next, in order to make sure the functions were imported properly, and to see how they work, run the `help` command on each of them. Read the help text carefully; it will be important to understand exactly what each function does.
44 |
45 | ```python
46 | >>> help(sw_align_nt)
47 | ```
48 |
49 | ```python
50 | >>> help(nw_align_nt)
51 | ```
52 |
53 | This next function, ``slice_sequence``, will let you easily extract segments of a sequence that are of interest to you (for example, the region between where two primers align).
54 |
55 | ```python
56 | >>> def slice_sequence(sequence, start_pos, end_pos):
57 | >>> """ Given a sequence, return the substring from start_pos through end_pos (1-based, inclusive)
58 | ...
59 | >>> Parameters
60 | >>> ----------
61 | >>> sequence: string
62 | >>> The sequence to be sliced; must not be empty
63 | >>> start_pos: int
64 | >>> The 1-based starting position for the new sequence (included in the result)
65 | >>> end_pos: int
66 | >>> The 1-based ending position for the new sequence (included in the result)
67 | ...
68 | >>> Returns
69 | >>> -------
70 | >>> string
71 | >>> A substring of the input string from start_pos through end_pos; ValueError is raised for invalid input
72 | ...
73 | >>> """
74 | >>> if len(sequence) == 0:
75 | >>> raise ValueError("The sequence is empty")
76 | >>> if start_pos < 1:  # positions are 1-based, so 1 is the smallest valid start
77 | >>> raise ValueError("Starting position must be greater than zero.")
78 | >>> if end_pos > len(sequence):
79 | >>> raise ValueError("Ending position cannot be larger than the length of the sequence.")
80 | >>> if start_pos > end_pos:  # start_pos == end_pos is accepted and yields a single character
81 | >>> raise ValueError("The starting position must be less than the ending positions.")
82 | >>> return sequence[start_pos-1:end_pos]  # shift the 1-based inclusive range to a 0-based half-open slice
83 | ```
84 |
85 | The following cell contains the full-length 16S rRNA sequences of five diverse bacterial organisms. Make sure to run this cell in order to load the sequences into memory.
86 |
87 | ```python
88 | >>> # k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Escherichia; s__coli
89 | >>> s1 = ['656881','GGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTTGTTGGTGGGGTAACGGCTCACCAAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAGAATGTGCCTTCGGGAACCGTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTA']
90 | ...
91 | >>> # k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__S24-7; g__; s__
92 | >>> s2 = ['305251','GATGAACGCTAGCGACAGGCTTAACACATGCAAGTCGAGGGGCAGCGAGATTGTGGCAACACGATTGTCGGCGACCGGCGCACTGGTGAGTAACACGTATGCAACCTGCCGCGCACTGGGGGATAATCTTGGGAAACCGAGTCTAATACCCCGTAGGCCTTGTTGCCGCATGGTAATAAGGTAAGAGGAGTGATCCGATGCGCGATGGGCATGCGGCGCATTAGCTAGTTGGCGGGGTAACAGCCCACCAAGGCGACGATGCGTAGGGGTTCTGAGAGGAAGGTCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCAAGTCGCGTGCGGGAGGGAGGCCCTACGGGTCGTAAACCGCTTTTGATGGGGGGTAACCATGCGGACGAGTCCGCATCTGAGAGCACCCATCGAATAAGCATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGCTATTCAAGTCAGCGGTCAAATTGCGGTGCTCAACGCCGTATCGCCGTTGAAACTGAGTTGGCTAGAGTGAGAGTGAGGAAGGCGGAATGCGCGGTGTAGCGGTGAAATGCATAGATATTGCGCAGAACTCCGATTGCGAAGGCAGCTTTCCAATTCTCTACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGGTCACTAGCTGTGCGCCCTGATTAAAGGGAGCGTGGCCGAGCGAAAGCGTTAAGTGACCCACCTTGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTCAAACGCTTGCGGGAGTCTATTTGAAAGGATAGATGCCCTTCGGGGCTGCAAGCGAGGTGCTGCATGGTTGTCGTCAGCTCGTGCCGTGAGGTGTCGGCTTAAGTGCCATAACGAGCGCAACCCCCATCTTCAGTTGCCGTCGGGTAGAGCCGGGCACTCTGGAGAGACTGCCGGCGCAAGCTGTGAGGAAGGCGGGGATGACGTCAAATCAGCACGGCCCTTACGTCCGGGGCGACACACGTGTTACAATGGCGGGGCACAGAGGGAAGCCAGGCGGTGACGTCGAGCGGATCCCGAAAACCCGTCTCAGTTCGGATCGGAGTCTGCAGCTCGACTCCGTGAAGCTGGATTCGCTAGTAATCGCGCATCAGCCATGGCGCGGTGAATACGTTCCCGGGCCT']
93 | ...
94 | >>> # k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Chlorophyta; f__Ulvophyceae; g__; s__
95 | >>> s3 = ['577032', 'ATGAACGCTGGCGGCATGCTTAACACATGCAAGTTGAACGGGTTTAAGTTTATTAAACTTAAACAAGTAGCGGACGGGTGAGTAACGCGTAAGAACCTACCTTTAGGTAAGGAATAACTATTGGAAGCGATAGATAATACCTTATAAGCTTATAGTAAAAGATAAAATCGCCTAAGGATGGGCTTGCGTCTGATTAGCTTGTTGGTGATTTAAAAGATTCACCAAGGCAACGATCAGTAGTTGGTCTAAGAGGATGATCAACCACACTGGGACTGAGACACGGCCCAGACCTCTACGGAGGGCAGCAGTGAGGAATTTTCCGCAATGGGCGAAAGCCTGACGGAGCAATGCCGCGTGGAGGATGAAAGCTTGTGAGTCGTAAACTCCTTTTCTTAGTGAAGAAATAAGACGGTATCTAAGGAATAAGCATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATGCAAGCGTTATTCGGAATTATTGGGCGTAAAGCGTCTGTAGGTGGTTTTTTAAGTCTACTGTTAAATATTAAGGCTTAACCTTAAAAAAGCGGTATGAAACTAAAAAACTTGAGTTTAGTAGAGGTAGAGGGAATTCTCGGTGTAGTGGTGAAATGCGTAGAGGTCGGGAAGAACACCGGTAGCGAAAGCGCTCTACTGGGCTAAAACTGACACTCAGAGACGAAAGCTAAAGTAGCGAATGGGATTAGATACCCCAGTAGTCTTAGCTGTAAACGATGGGTACTAGATGTTGCGCGTATCGATCCGTGCAGTATCGTAGCTAACGCGTTAAGTACCCCGCCTGGGAAGTATGCTCGCAAGAGTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCAGAACTTGACATGTCACAAATTTTCTTGAAAAAGAAAAGTGCCTTAGGGAGTGTGAACACAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTTTTTAGTTGCCATCATTTAGTTGGGAACTCTAAAAAGACTGCCGGTGACAAACCGGAGGAAGGTGAGGATGACGTCAAGTCAGCATGCCCCTTATGTTCTGGGCTACACACGTGCTACAATGATTATGACAAAGGGTAGCGAATTCGCGAGAATCAGCCAATCTCATAAACATAGTCTAAGTTCGGATTGCAGGCTGAAACTCGCCTGCATGAAGCTGGAATCGCTAGTAATCGCCGGTCAGCTATACGGCGGTGAATCCGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGAAGTTGGCTACGCCCGAAGTCGTTATCTTAACCTTTTTGGAGGGAGGCGCCTAAGGTGGAGCCAGTGACTGGGGTGA']
96 | ...
97 | >>> # k__Bacteria; p__GN02; c__BD1-5; o__; f__; g__; s__
98 | >>> s4 = ['200762','AGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGGTAACGGGTGTAGCAATACATGCTGACGAGCGGCGGACGGGTGAGCAATATTTGGGAATCTGCCTATTAGTGGGGGACAACCCGGGGAAACTCGGGCTAATACCGCATACGCTCTACGGAGGAAAGCCGGGGACCGCAAGGCCTGGCGCTAATAGATGAGCCCAAATCGGATTAGCTAGTTGGTGAGGTAAAGGCTCACCAAGGCGACGATCCGTAGCTGGTCTGAGAGGACGACCAGCCACACCGGAACTGAGACACGGTCCGGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATGGAGCGACACCGCGTGAAGGATGAAGCCTTTGTTGGTGTAAACTTCTTTTCTCTGGGAAGATAATGACGGTACCAGAGGAATAAGGGGCGGCAAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGCCCCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGTCTGTAGGTGGTCTGGAAAGTTTCAAGTGAAAGGTCAGGGCTTAACCCTGTACTTGCTTGGAAAACTATCAGACTTGAGTGCGGGAGAGGCAAGCAGAACTGTATGAGTAGGGGTGCAATCCGTTGATACATACAAGAATACCAAAAGCGAAGGCAGCTTGCTGGAACGCTACTGACACTGAGAGACGAAAGCGTGGGGAGCAAAAGGGATTAGATACCCCTGTAGTCCACGCCCTAAACGATGGATGCTAAATGTCGGCGCAAGCCGGTGTTTCAAGCTAACGCATTAAGCATCCCGCCTGAGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATAGACGGGGACCCGCACAAGCAGTGGATCATGTGGTTTAATTCGACACTAAACGAGGAACCTCACCTAGGCTTGACATTGATAGAATTTGCTGGAAACAGCGAAGTGCCTGCAAGGGAACTTGAAAACAGGCGCTGCATGGTTGTCGTCAGCTCGTGCCTTGAGGTGTTCGGTTAAGTCCGTTAACGAGCGCAACCCATGTCGTTAGTTATTATGTCTAACGAGACTGCTCGAGTTAATCGAGAGGAAGGTGTGGATGACGTCAAATCAGCATGGCCCTTATGCCTAGGGCTACACACATGATACAATGGTCGGTACAAAGGGTTGCCAAGTGGTAACACGGAGCCAATCCCAGAAAGCCGATCTCAGTCCAGATTGAGGGCTGCAACTCGCCCTCATGAAGTTGGAATTGCTAGTAATCGTGAATCAGCTATGTCACGGTGAATCTGTTCCCGGGTCTTGTACTCACCGCCCGTCAAACCATGGGAGGTGTGCGTACCTGAAGTCCTTCGAGTAATACGGAGGCCCACGGTAAACACACTGACTGGGGTTAAGTCGTAACAAGGTA']
99 | ...
100 | >>> # k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__; s__
101 | >>> s5 = ['3728119','CTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGATGACGGGAGCTTGCTCCTTGATTCAGCGGCGGACGGGTGAGTAATGCCTAGGAATCTGCCTGGTAGTGGGGGACAACGTTTCGAAAGGAACGCTAATACCGCATACGTCCTACGGGAGAAAGCAGGGGACCTTCGGGCCTTGCGCTATCAGATGAGCCTAGGTCGGATTAGCTAGTAGGTGAGGTAATGGCTCACCTAGGCGACGATCCGTAACTGGTCTGAGAGGATGATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGTCTTCGGATTGTAAAGCACTTTAAGTTGGGAGGAAGGGCAGTAAGCTAATACCTTGCTGTTTTGACGTTACCGACAGAATAAGCACCGGCTAACTCTGTGCCAGCAGCCGCGGTAATACGGCAGGGATGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGCTAGGCTGGTTCGTTAAGTCTGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTCGAAACTGGCAGAGCTAGAGTATTGTAGAGGGTGGTGGAATTTCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAAGGAATACCGGTGGCGAAGGCGACCCCCTGGACATGATACTGACGCTCATGTCGTCTTAGCGATAATGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGTTTGGCTTCCGGAGCTAACGCGTTAAGTTGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTACTCTTGACATCCAGAGAACTTTCCAGAGATGGATTGGTGCCTTCGGGAACTCTGACACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTTTGGCCGGGAACTCATAGGAAACTGCCAGTGATCAACTGGAAGAAAGTGGGGATGACCTCCAGTCATCATGGCCCTTACCAGTATGGCTACACACGTGCTACAATGGCGCATACCAAGAGAATCGACCTCGCGAGAGCGAGCGGACTTTATCAAGTGCGTCGTAATCCGGATTGGAGTCTGCCACTCTCACTCGATGAAGTCCGAATCGCTAGTAATCGTGGATTCAGAATTGCTTCGGTGTGAATATCGTTCCCGGGCCTTTGTACACACCCGCCCGGTCACACCATGGG']
102 | ...
103 | >>> sequences = [s1, s2, s3, s4, s5]
104 | ```
105 |
106 | You can now use ``slice_sequence`` to extract a region of interest from a sequence - for example, the region between the starting positions of two primer hits. The example below would extract the region between positions 50 and 250 from sequence 1.
107 |
108 | ```python
109 | >>> slice_sequence(sequences[0][1], 50, 250)
110 | ```
111 |
112 | **Hint:** The numbers here may look a little bit funny. That's because the first item in a list in Python has index 0, not 1. So, if you want the first entry from the list `sequences` you would type `sequences[0]`, and if you want the second sequence you would type `sequences[1]`. Try this in the cell below until you are comfortable getting the sequence you want. Then try to slice a sequence to the region between positions 25 and 50.
113 |
114 | ```python
115 | ...
116 | ```
117 |
118 | This cell contains the potential primers that we will use to amplify specific regions.
119 |
120 | ```python
121 | >>> primers = [('p1', 'TTCCGGTTGATCCNGCCGGA'), # F21
122 | >>> ('p2', 'ACNGCTCAGTAACACGT'), # F109
123 | >>> ('p3', 'GCTGCCTCCCGTAGGAGT'), # F338
124 | >>> ('p4', 'TACGGNAGGCAGCAG'), # F343
125 | >>> ('p5', 'GTGCCAGCNGCCGCGGTAA'), # F515
126 | >>> ('p6', 'ATTAGATACCCNGGTAGTCC'), # F770
127 | >>> ('p7', 'ATTAGATACCCNNGTAGTCC'), # R806 (reverse complement)
128 | >>> ('p8', 'AGGAATTGGCGGGGCAGCAC'), # R915 (reverse complement)
129 | >>> ('p9', 'AAACTNAAAGGAATTGACGG'), # F926
130 | >>> ('p10', 'AGGTNNGNATGCCCCNAA')] # R1240 (reverse complement)
131 | ```
132 |
133 | If you want to locally align a primer against a primer, a primer against a sequence, or a sequence against a sequence, you could run the following:
134 |
135 | ```python
136 | >>> aln1, aln2, score, start1, start2 = sw_align_nt(primers[0][1], primers[1][1])
137 | >>> print(aln1)
138 | >>> print(aln2)
139 | >>> print(score)
140 | >>> print(start1)
141 | >>> print(start2)
142 | ```
143 |
144 | ```python
145 | >>> aln1, aln2, score, start1, start2 = sw_align_nt(primers[0][1], sequences[0][1])
146 | >>> print(aln1)
147 | >>> print(aln2)
148 | >>> print(score)
149 | >>> print(start1)
150 | >>> print(start2)
151 | ```
152 |
153 | ```python
154 | >>> aln1, aln2, score, start1, start2 = sw_align_nt(primers[0][1], sequences[0][1])
155 | >>> print(aln1)
156 | >>> print(aln2)
157 | >>> print(score)
158 | ```
159 |
160 | If you want to globally align a primer against a primer, you could run the following:
161 |
162 | ```python
163 | >>> aln1, aln2, score = nw_align_nt(primers[0][1], primers[1][1])
164 | >>> print(aln1)
165 | >>> print(aln2)
166 | >>> print(score)
167 | ```
168 |
169 | Notice that ``sw_align_nt`` returns two more values than ``nw_align_nt`` does. Look at the help for each function to figure out what these values are. Why does it make sense to get them from ``sw_align_nt``, but not from ``nw_align_nt``?
170 |
171 | In the cell below, try to globally align a primer against a sequence. Then try to globally align a different primer against a different sequence.
172 |
173 | ```python
174 | ...
175 | ```
176 |
177 | Hint: if you want to normalize an alignment score by the alignment's length, you can do the following. This may come in handy when comparing alignments.
178 |
179 | ```python
180 | >>> aln1, aln2, score = nw_align_nt(primers[0][1], sequences[0][1])
181 | >>> print(score / len(aln1))
182 | ```
183 |
184 | At this point you have the necessary functions to complete the assignment. Your goal is to pick the best pair of 16S primers, a forward and a reverse primer, given the above five input sequences.
185 |
186 | The best primers will:
187 |
188 | 1. Anneal to all of the 16S rRNA sequences well. This will be determined by achieving a high alignment score between the primer and all of the sequences (though it is OK for there to be some mismatches).
189 | 2. Amplify a region that is 100-400 base pairs long, due to limitations of current sequencing technology.
190 | 3. Finally, the region that is amplified (i.e., between the primers) should be very different across all species, to allow for accurate *fingerprinting* of the different species.
191 |
192 | ## Question 1
193 |
194 | What is the difference between a local and global alignment in terms of what is aligned? What are the differences in the algorithms that support this? (One paragraph)
195 |
196 | ## Question 2
197 |
198 |
199 | What is the sequence in s2 from position 500 to 505?
200 |
201 | **Hint:** the first base in a sequence is position 1; the first item in a Python list is index 0.
202 |
203 | ```python
204 | ...
205 | ```
206 |
207 | ## Question 3
208 |
209 | What is the Smith-Waterman alignment score of primer `p6` against sequence `s4`? Where in `s4` does the alignment start?
210 |
211 | **Hint:** copy and paste from other cells; that way you only have to make a couple of small changes.
212 |
213 | ```python
214 | ...
215 | ```
216 |
217 | ## Question 4
218 |
219 | What is the best pair of primers from the list of available primers to use for amplifying the 16S region for sequencing for the purposes of identifying the organisms present?
220 |
221 | ## More hints
222 |
223 | The best pair of primers will align well to (and therefore be likely to anneal with) the 16S sequences from all of the organisms present in the list.
224 |
225 | The best pair of primers will amplify a region of DNA that is between 100 and 400 base pairs long.
226 |
227 | The best pair of primers will amplify a region that is highly variable (in other words, the amplified regions across the organisms should not align well).
228 |
229 | Think about whether you want to use global or local alignments for the different steps. Are there times when you would want to use a gap penalty other than the default?
230 |
231 | The `N` character present in some of the primer sequences signifies a "degenerate" base, meaning it could be an 'A', 'T', 'G' or 'C'. You shouldn't worry about these for this exercise.
232 |
233 | Good Luck!
234 |
235 | ```python
236 | ...
237 | ```
238 |
--------------------------------------------------------------------------------
/book/fundamentals/images/Darwins_tree_of_life_1859.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/Darwins_tree_of_life_1859.png
--------------------------------------------------------------------------------
/book/fundamentals/images/Pace_Big_Tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/Pace_Big_Tree.png
--------------------------------------------------------------------------------
/book/fundamentals/images/alignment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/alignment.png
--------------------------------------------------------------------------------
/book/fundamentals/images/basic-rooted-tree1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/basic-rooted-tree1.jpg
--------------------------------------------------------------------------------
/book/fundamentals/images/basic-unrooted-tree1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/basic-unrooted-tree1.jpg
--------------------------------------------------------------------------------
/book/fundamentals/images/centroid-cluster.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.1.png
--------------------------------------------------------------------------------
/book/fundamentals/images/centroid-cluster.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.2.png
--------------------------------------------------------------------------------
/book/fundamentals/images/centroid-cluster.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.3.png
--------------------------------------------------------------------------------
/book/fundamentals/images/centroid-cluster.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.4.png
--------------------------------------------------------------------------------
/book/fundamentals/images/centroid-cluster.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.5.png
--------------------------------------------------------------------------------
/book/fundamentals/images/centroid-cluster.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.6.png
--------------------------------------------------------------------------------
/book/fundamentals/images/cluster-legend.graffle:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ActiveLayerIndex
6 | 0
7 | ApplicationVersion
8 |
9 | com.omnigroup.OmniGraffle
10 | 139.18.0.187838
11 |
12 | AutoAdjust
13 |
14 | BackgroundGraphic
15 |
16 | Bounds
17 | {{0, 0}, {576.00002670288086, 733}}
18 | Class
19 | SolidGraphic
20 | ID
21 | 2
22 | Style
23 |
24 | shadow
25 |
26 | Draws
27 | NO
28 |
29 | stroke
30 |
31 | Draws
32 | NO
33 |
34 |
35 |
36 | BaseZoom
37 | 0
38 | CanvasOrigin
39 | {0, 0}
40 | ColumnAlign
41 | 1
42 | ColumnSpacing
43 | 36
44 | CreationDate
45 | 2014-04-08 02:25:49 +0000
46 | Creator
47 | Greg Caporaso
48 | DisplayScale
49 | 1 0/72 in = 1 0/72 in
50 | GraphDocumentVersion
51 | 8
52 | GraphicsList
53 |
54 |
55 | Class
56 | Group
57 | Graphics
58 |
59 |
60 | Bounds
61 | {{12, 202}, {167.5, 42}}
62 | Class
63 | ShapedGraphic
64 | FitText
65 | Vertical
66 | Flow
67 | Resize
68 | ID
69 | 139
70 | Magnets
71 |
72 | {0, 1}
73 | {0, -1}
74 | {1, 0}
75 | {-1, 0}
76 |
77 | Shape
78 | Rectangle
79 | Style
80 |
81 | fill
82 |
83 | Draws
84 | NO
85 |
86 | shadow
87 |
88 | Draws
89 | NO
90 |
91 | stroke
92 |
93 | Draws
94 | NO
95 |
96 |
97 | Text
98 |
99 | Text
100 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf400
101 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
102 | {\colortbl;\red255\green255\blue255;}
103 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc
104 |
105 | \f0\fs28 \cf0 A sequence and dissimilarity range.}
106 | VerticalPad
107 | 4
108 |
109 | TextPlacement
110 | 0
111 |
112 |
113 | Class
114 | Group
115 | Graphics
116 |
117 |
118 | AllowConnections
119 | NO
120 | AllowLabelDrop
121 |
122 | AllowToConnect
123 |
124 | Class
125 | LineGraphic
126 | ID
127 | 141
128 | Points
129 |
130 | {90.607142857142833, 100.85714285714289}
131 | {167.74999726031507, 100.85714285714289}
132 |
133 | Style
134 |
135 | stroke
136 |
137 | HeadArrow
138 | 0
139 | Legacy
140 |
141 | TailArrow
142 | FilledBall
143 | TailScale
144 | 0.5
145 |
146 |
147 |
148 |
149 | Bounds
150 | {{23.75, 29.5}, {144, 144}}
151 | Class
152 | ShapedGraphic
153 | ID
154 | 142
155 | Shape
156 | Circle
157 | Style
158 |
159 | fill
160 |
161 | Color
162 |
163 | a
164 | 0.31
165 | b
166 | 0.670097
167 | g
168 | 0.670086
169 | r
170 | 0.670106
171 |
172 | Draws
173 | NO
174 |
175 | shadow
176 |
177 | Draws
178 | NO
179 |
180 | stroke
181 |
182 | Pattern
183 | 1
184 |
185 |
186 | Text
187 |
188 | VerticalPad
189 | 0
190 |
191 |
192 |
193 | ID
194 | 140
195 |
196 |
197 | ID
198 | 138
199 |
200 |
201 | Class
202 | Group
203 | Graphics
204 |
205 |
206 | Bounds
207 | {{193.25001335144043, 202}, {167.5, 42}}
208 | Class
209 | ShapedGraphic
210 | FitText
211 | Vertical
212 | Flow
213 | Resize
214 | ID
215 | 144
216 | Magnets
217 |
218 | {0, 1}
219 | {0, -1}
220 | {1, 0}
221 | {-1, 0}
222 |
223 | Shape
224 | Rectangle
225 | Style
226 |
227 | fill
228 |
229 | Draws
230 | NO
231 |
232 | shadow
233 |
234 | Draws
235 | NO
236 |
237 | stroke
238 |
239 | Draws
240 | NO
241 |
242 |
243 | Text
244 |
245 | Text
246 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf400
247 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
248 | {\colortbl;\red255\green255\blue255;}
249 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc
250 |
251 | \f0\fs28 \cf0 An OTU definition based on a single sequence.}
252 | VerticalPad
253 | 4
254 |
255 | TextPlacement
256 | 0
257 |
258 |
259 | Class
260 | Group
261 | Graphics
262 |
263 |
264 | AllowConnections
265 | NO
266 | AllowLabelDrop
267 |
268 | AllowToConnect
269 |
270 | Class
271 | LineGraphic
272 | ID
273 | 146
274 | Points
275 |
276 | {271.85715620858326, 100.85714285714289}
277 | {349.0000106117555, 100.85714285714289}
278 |
279 | Style
280 |
281 | stroke
282 |
283 | HeadArrow
284 | 0
285 | Legacy
286 |
287 | TailArrow
288 | FilledBall
289 | TailScale
290 | 0.5
291 |
292 |
293 |
294 |
295 | Bounds
296 | {{205.00001335144043, 29.5}, {144, 144}}
297 | Class
298 | ShapedGraphic
299 | ID
300 | 147
301 | Shape
302 | Circle
303 | Style
304 |
305 | fill
306 |
307 | Color
308 |
309 | a
310 | 0.31
311 | b
312 | 0.670097
313 | g
314 | 0.670086
315 | r
316 | 0.670106
317 |
318 |
319 | shadow
320 |
321 | Draws
322 | NO
323 |
324 | stroke
325 |
326 | Pattern
327 | 1
328 |
329 |
330 | Text
331 |
332 | VerticalPad
333 | 0
334 |
335 |
336 |
337 | ID
338 | 145
339 |
340 |
341 | ID
342 | 143
343 |
344 |
345 | Class
346 | Group
347 | Graphics
348 |
349 |
350 | Bounds
351 | {{385.75, 202}, {167.5, 42}}
352 | Class
353 | ShapedGraphic
354 | FitText
355 | Vertical
356 | Flow
357 | Resize
358 | ID
359 | 149
360 | Magnets
361 |
362 | {0, 1}
363 | {0, -1}
364 | {1, 0}
365 | {-1, 0}
366 |
367 | Shape
368 | Rectangle
369 | Style
370 |
371 | fill
372 |
373 | Draws
374 | NO
375 |
376 | shadow
377 |
378 | Draws
379 | NO
380 |
381 | stroke
382 |
383 | Draws
384 | NO
385 |
386 |
387 | Text
388 |
389 | Text
390 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf400
391 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
392 | {\colortbl;\red255\green255\blue255;}
393 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc
394 |
395 | \f0\fs28 \cf0 An OTU definition based on two sequences.}
396 | VerticalPad
397 | 4
398 |
399 | TextPlacement
400 | 0
401 |
402 |
403 | Class
404 | Group
405 | Graphics
406 |
407 |
408 | Class
409 | Group
410 | Graphics
411 |
412 |
413 | AllowConnections
414 | NO
415 | AllowLabelDrop
416 |
417 | AllowToConnect
418 |
419 | Class
420 | LineGraphic
421 | ID
422 | 152
423 | Points
424 |
425 | {486.85714285714283, 82.35714285714289}
426 | {563.99999726031501, 82.35714285714289}
427 |
428 | Style
429 |
430 | stroke
431 |
432 | HeadArrow
433 | 0
434 | Legacy
435 |
436 | TailArrow
437 | FilledBall
438 | TailScale
439 | 0.5
440 |
441 |
442 |
443 |
444 | Bounds
445 | {{420, 11}, {144, 144}}
446 | Class
447 | ShapedGraphic
448 | ID
449 | 153
450 | Shape
451 | Circle
452 | Style
453 |
454 | fill
455 |
456 | Color
457 |
458 | a
459 | 0.31
460 | b
461 | 0.670097
462 | g
463 | 0.670086
464 | r
465 | 0.670106
466 |
467 |
468 | shadow
469 |
470 | Draws
471 | NO
472 |
473 | stroke
474 |
475 | Pattern
476 | 1
477 |
478 |
479 | Text
480 |
481 | VerticalPad
482 | 0
483 |
484 |
485 |
486 | ID
487 | 151
488 |
489 |
490 | Class
491 | Group
492 | Graphics
493 |
494 |
495 | AllowConnections
496 | NO
497 | AllowLabelDrop
498 |
499 | AllowToConnect
500 |
501 | Class
502 | LineGraphic
503 | ID
504 | 155
505 | Points
506 |
507 | {441.85714285714283, 119.35714285714289}
508 | {518.99999726031501, 119.35714285714289}
509 |
510 | Style
511 |
512 | stroke
513 |
514 | HeadArrow
515 | 0
516 | Legacy
517 |
518 | TailArrow
519 | FilledBall
520 | TailScale
521 | 0.5
522 |
523 |
524 |
525 |
526 | Bounds
527 | {{375, 48}, {144, 144}}
528 | Class
529 | ShapedGraphic
530 | ID
531 | 156
532 | Shape
533 | Circle
534 | Style
535 |
536 | fill
537 |
538 | Color
539 |
540 | a
541 | 0.31
542 | b
543 | 0.670097
544 | g
545 | 0.670086
546 | r
547 | 0.670106
548 |
549 |
550 | shadow
551 |
552 | Draws
553 | NO
554 |
555 | stroke
556 |
557 | Pattern
558 | 1
559 |
560 |
561 | Text
562 |
563 | VerticalPad
564 | 0
565 |
566 |
567 |
568 | ID
569 | 154
570 |
571 |
572 | ID
573 | 150
574 |
575 |
576 | ID
577 | 148
578 |
579 |
580 | GridInfo
581 |
582 | GuidesLocked
583 | NO
584 | GuidesVisible
585 | YES
586 | HPages
587 | 1
588 | ImageCounter
589 | 1
590 | KeepToScale
591 |
592 | Layers
593 |
594 |
595 | Lock
596 | NO
597 | Name
598 | Layer 1
599 | Print
600 | YES
601 | View
602 | YES
603 |
604 |
605 | LayoutInfo
606 |
607 | Animate
608 | NO
609 | circoMinDist
610 | 18
611 | circoSeparation
612 | 0.0
613 | layoutEngine
614 | dot
615 | neatoSeparation
616 | 0.0
617 | twopiSeparation
618 | 0.0
619 |
620 | LinksVisible
621 | NO
622 | MagnetsVisible
623 | NO
624 | MasterSheets
625 |
626 | ModificationDate
627 | 2014-04-08 02:56:00 +0000
628 | Modifier
629 | Greg Caporaso
630 | NotesVisible
631 | NO
632 | Orientation
633 | 2
634 | OriginVisible
635 | NO
636 | PageBreaks
637 | YES
638 | PrintInfo
639 |
640 | NSBottomMargin
641 |
642 | float
643 | 41
644 |
645 | NSHorizonalPagination
646 |
647 | coded
648 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG
649 |
650 | NSLeftMargin
651 |
652 | float
653 | 18
654 |
655 | NSPaperSize
656 |
657 | size
658 | {612.00002670288086, 792}
659 |
660 | NSPrintReverseOrientation
661 |
662 | int
663 | 0
664 |
665 | NSPrinter
666 |
667 | coded
668 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAlOU1ByaW50ZXIAhIQITlNPYmplY3QAhZKEhIQITlNTdHJpbmcBlIQBKxNDYW5vbiBNRjQ1MDAgU2VyaWVzhoY=
669 |
670 | NSPrinterName
671 |
672 | string
673 | Canon MF4500 Series
674 |
675 | NSRightMargin
676 |
677 | float
678 | 18
679 |
680 | NSTopMargin
681 |
682 | float
683 | 18
684 |
685 |
686 | PrintOnePage
687 |
688 | ReadOnly
689 | NO
690 | RowAlign
691 | 1
692 | RowSpacing
693 | 36
694 | SheetTitle
695 | Canvas 1
696 | SmartAlignmentGuidesActive
697 | YES
698 | SmartDistanceGuidesActive
699 | YES
700 | UniqueID
701 | 1
702 | UseEntirePage
703 |
704 | VPages
705 | 1
706 | WindowInfo
707 |
708 | CurrentSheet
709 | 0
710 | ExpandedCanvases
711 |
712 |
713 | name
714 | Canvas 1
715 |
716 |
717 | Frame
718 | {{432, 151}, {828, 872}}
719 | ListView
720 |
721 | OutlineWidth
722 | 142
723 | RightSidebar
724 |
725 | ShowRuler
726 |
727 | Sidebar
728 |
729 | SidebarWidth
730 | 120
731 | VisibleRegion
732 | {{-58, 0}, {693, 733}}
733 | Zoom
734 | 1
735 | ZoomValues
736 |
737 |
738 | Canvas 1
739 | 1
740 | 1
741 |
742 |
743 |
744 |
745 |
746 |
--------------------------------------------------------------------------------
/book/fundamentals/images/cluster-legend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-legend.png
--------------------------------------------------------------------------------
/book/fundamentals/images/cluster-template.graffle/data.plist:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ActiveLayerIndex
6 | 0
7 | ApplicationVersion
8 |
9 | com.omnigroup.OmniGraffle
10 | 139.18.0.187838
11 |
12 | AutoAdjust
13 |
14 | BackgroundGraphic
15 |
16 | Bounds
17 | {{0, 0}, {576.00002670288086, 733}}
18 | Class
19 | SolidGraphic
20 | ID
21 | 2
22 | Style
23 |
24 | shadow
25 |
26 | Draws
27 | NO
28 |
29 | stroke
30 |
31 | Draws
32 | NO
33 |
34 |
35 |
36 | BaseZoom
37 | 0
38 | CanvasOrigin
39 | {0, 0}
40 | ColumnAlign
41 | 1
42 | ColumnSpacing
43 | 36
44 | CreationDate
45 | 2014-04-08 02:14:28 +0000
46 | Creator
47 | Greg Caporaso
48 | DisplayScale
49 | 1 0/72 in = 1 0/72 in
50 | GraphDocumentVersion
51 | 8
52 | GraphicsList
53 |
54 |
55 | Bounds
56 | {{167, 381}, {143, 143}}
57 | Class
58 | ShapedGraphic
59 | ID
60 | 123
61 | ImageID
62 | 1
63 | ManualSizeImage
64 | YES
65 | Offset
66 | {0.23333333325606806, 0.24444444436349985}
67 | Shape
68 | Circle
69 | Style
70 |
71 | fill
72 |
73 | Draws
74 | NO
75 |
76 | image fill
77 |
78 | ImageSizing
79 | 0
80 | Offset
81 | {0.23333333325606806, 0.24444444436349985}
82 |
83 | shadow
84 |
85 | Draws
86 | NO
87 |
88 | stroke
89 |
90 | Draws
91 | NO
92 |
93 |
94 |
95 |
96 | Class
97 | Group
98 | Graphics
99 |
100 |
101 | AllowConnections
102 | NO
103 | AllowLabelDrop
104 |
105 | AllowToConnect
106 |
107 | Class
108 | LineGraphic
109 | ID
110 | 97
111 | Points
112 |
113 | {313.85714285714278, 114.49999564034604}
114 | {390.99999726031507, 114.49999564034604}
115 |
116 | Style
117 |
118 | stroke
119 |
120 | HeadArrow
121 | 0
122 | Legacy
123 |
124 | TailArrow
125 | FilledBall
126 | TailScale
127 | 0.5
128 |
129 |
130 |
131 |
132 | Bounds
133 | {{247, 43.142852783203097}, {144.00000000000003, 144.00000000000003}}
134 | Class
135 | ShapedGraphic
136 | ID
137 | 98
138 | Shape
139 | Circle
140 | Style
141 |
142 | fill
143 |
144 | Color
145 |
146 | a
147 | 0.31
148 | b
149 | 0.670097
150 | g
151 | 0.670086
152 | r
153 | 0.670106
154 |
155 | Draws
156 | NO
157 |
158 | shadow
159 |
160 | Draws
161 | NO
162 |
163 | stroke
164 |
165 | Pattern
166 | 1
167 |
168 |
169 | Text
170 |
171 | VerticalPad
172 | 0
173 |
174 |
175 |
176 | ID
177 | 96
178 |
179 |
180 | Class
181 | Group
182 | Graphics
183 |
184 |
185 | AllowConnections
186 | NO
187 | AllowLabelDrop
188 |
189 | AllowToConnect
190 |
191 | Class
192 | LineGraphic
193 | ID
194 | 86
195 | Points
196 |
197 | {118.85714285714283, 114.49999999999977}
198 | {195.99999726031507, 114.49999999999977}
199 |
200 | Style
201 |
202 | stroke
203 |
204 | HeadArrow
205 | 0
206 | Legacy
207 |
208 | TailArrow
209 | FilledBall
210 | TailScale
211 | 0.5
212 |
213 |
214 |
215 |
216 | Bounds
217 | {{52, 43.142857142856883}, {144, 144}}
218 | Class
219 | ShapedGraphic
220 | ID
221 | 87
222 | Shape
223 | Circle
224 | Style
225 |
226 | fill
227 |
228 | Color
229 |
230 | a
231 | 0.31
232 | b
233 | 0.670097
234 | g
235 | 0.670086
236 | r
237 | 0.670106
238 |
239 |
240 | shadow
241 |
242 | Draws
243 | NO
244 |
245 | stroke
246 |
247 | Pattern
248 | 1
249 |
250 |
251 | Text
252 |
253 | VerticalPad
254 | 0
255 |
256 |
257 |
258 | ID
259 | 85
260 |
261 |
262 | GridInfo
263 |
264 | GuidesLocked
265 | NO
266 | GuidesVisible
267 | YES
268 | HPages
269 | 1
270 | ImageCounter
271 | 2
272 | ImageLinkBack
273 |
274 |
275 | ApplicationURL
276 | http://www.omnigroup.com/applications/OmniGraffle
277 | appData
278 |
279 | Color
280 |
281 | w
282 | 1
283 |
284 | DocumentSettings
285 |
286 | ApplicationVersion
287 |
288 | com.omnigroup.OmniGraffle
289 | 139.18.0.187838
290 |
291 | CreationDate
292 | 2014-04-07 21:49:27 +0000
293 | Creator
294 | Greg Caporaso
295 | FileName
296 | cluster-types.graffle
297 | GraphDocumentVersion
298 | 8
299 | ModelCount
300 | 1
301 | ModelIndex
302 | 0
303 | ModificationDate
304 | 2014-04-08 02:08:05 +0000
305 | Modifier
306 | Greg Caporaso
307 | SheetTitle
308 | Canvas 1
309 |
310 | GraphicsList
311 |
312 |
313 | Bounds
314 | {{449.5, 311.01062596097904}, {142.97872340425533, 142.97872340425533}}
315 | Class
316 | ShapedGraphic
317 | ID
318 | 122
319 | LayerIndex
320 | 0
321 | Shape
322 | Circle
323 | Style
324 |
325 | fill
326 |
327 | Color
328 |
329 | a
330 | 0.31
331 | b
332 | 0.670097
333 | g
334 | 0.670086
335 | r
336 | 0.670106
337 |
338 |
339 | shadow
340 |
341 | Draws
342 | NO
343 |
344 | stroke
345 |
346 | Draws
347 | NO
348 | Pattern
349 | 1
350 |
351 |
352 | Text
353 |
354 | VerticalPad
355 | 0
356 |
357 |
358 |
359 | Layers
360 |
361 |
362 | Lock
363 | NO
364 | Name
365 | Layer 1
366 | Print
367 | YES
368 | View
369 | YES
370 |
371 |
372 | ZoomLevel
373 | 1
374 |
375 | bundleId
376 | com.omnigroup.OmniGraffle
377 | refresh
378 | 0.0
379 | serverAppName
380 | OmniGraffle
381 | serverName
382 | OmniGraffle
383 | version
384 | A
385 |
386 |
387 | ImageList
388 |
389 | image1.pdf
390 |
391 | KeepToScale
392 |
393 | Layers
394 |
395 |
396 | Lock
397 | NO
398 | Name
399 | Layer 1
400 | Print
401 | YES
402 | View
403 | YES
404 |
405 |
406 | LayoutInfo
407 |
408 | Animate
409 | NO
410 | circoMinDist
411 | 18
412 | circoSeparation
413 | 0.0
414 | layoutEngine
415 | dot
416 | neatoSeparation
417 | 0.0
418 | twopiSeparation
419 | 0.0
420 |
421 | LinksVisible
422 | NO
423 | MagnetsVisible
424 | NO
425 | MasterSheets
426 |
427 | ModificationDate
428 | 2014-04-08 02:15:07 +0000
429 | Modifier
430 | Greg Caporaso
431 | NotesVisible
432 | NO
433 | Orientation
434 | 2
435 | OriginVisible
436 | NO
437 | PageBreaks
438 | YES
439 | PrintInfo
440 |
441 | NSBottomMargin
442 |
443 | float
444 | 41
445 |
446 | NSHorizonalPagination
447 |
448 | coded
449 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG
450 |
451 | NSLeftMargin
452 |
453 | float
454 | 18
455 |
456 | NSPaperSize
457 |
458 | size
459 | {612.00002670288086, 792}
460 |
461 | NSPrintReverseOrientation
462 |
463 | int
464 | 0
465 |
466 | NSPrinter
467 |
468 | coded
469 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAlOU1ByaW50ZXIAhIQITlNPYmplY3QAhZKEhIQITlNTdHJpbmcBlIQBKxNDYW5vbiBNRjQ1MDAgU2VyaWVzhoY=
470 |
471 | NSPrinterName
472 |
473 | string
474 | Canon MF4500 Series
475 |
476 | NSRightMargin
477 |
478 | float
479 | 18
480 |
481 | NSTopMargin
482 |
483 | float
484 | 18
485 |
486 |
487 | PrintOnePage
488 |
489 | ReadOnly
490 | NO
491 | RowAlign
492 | 1
493 | RowSpacing
494 | 36
495 | SheetTitle
496 | Canvas 1
497 | SmartAlignmentGuidesActive
498 | YES
499 | SmartDistanceGuidesActive
500 | YES
501 | UniqueID
502 | 1
503 | UseEntirePage
504 |
505 | VPages
506 | 1
507 | WindowInfo
508 |
509 | CurrentSheet
510 | 0
511 | ExpandedCanvases
512 |
513 |
514 | name
515 | Canvas 1
516 |
517 |
518 | Frame
519 | {{210, 193}, {711, 872}}
520 | ListView
521 |
522 | OutlineWidth
523 | 142
524 | RightSidebar
525 |
526 | ShowRuler
527 |
528 | Sidebar
529 |
530 | SidebarWidth
531 | 120
532 | VisibleRegion
533 | {{0, 0}, {576, 733}}
534 | Zoom
535 | 1
536 | ZoomValues
537 |
538 |
539 | Canvas 1
540 | 1
541 | 1
542 |
543 |
544 |
545 |
546 |
547 |
--------------------------------------------------------------------------------
/book/fundamentals/images/cluster-template.graffle/image1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-template.graffle/image1.pdf
--------------------------------------------------------------------------------
/book/fundamentals/images/cluster-types.graffle/image2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-types.graffle/image2.pdf
--------------------------------------------------------------------------------
/book/fundamentals/images/cluster-types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-types.png
--------------------------------------------------------------------------------
/book/fundamentals/images/furthest-neighbor.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.1.png
--------------------------------------------------------------------------------
/book/fundamentals/images/furthest-neighbor.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.2.png
--------------------------------------------------------------------------------
/book/fundamentals/images/furthest-neighbor.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.3.png
--------------------------------------------------------------------------------
/book/fundamentals/images/furthest-neighbor.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.4.png
--------------------------------------------------------------------------------
/book/fundamentals/images/furthest-neighbor.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.5.png
--------------------------------------------------------------------------------
/book/fundamentals/images/furthest-neighbor.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.6.png
--------------------------------------------------------------------------------
/book/fundamentals/images/furthest-neighbor.graffle/image1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.graffle/image1.pdf
--------------------------------------------------------------------------------
/book/fundamentals/images/msa-tree-a1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-a1.png
--------------------------------------------------------------------------------
/book/fundamentals/images/msa-tree-a2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-a2.png
--------------------------------------------------------------------------------
/book/fundamentals/images/msa-tree-a3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-a3.png
--------------------------------------------------------------------------------
/book/fundamentals/images/msa-tree-final.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-final.png
--------------------------------------------------------------------------------
/book/fundamentals/images/msa-tree-input.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-input.png
--------------------------------------------------------------------------------
/book/fundamentals/images/nearest-neighbor.1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.1.png
--------------------------------------------------------------------------------
/book/fundamentals/images/nearest-neighbor.2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.2.png
--------------------------------------------------------------------------------
/book/fundamentals/images/nearest-neighbor.3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.3.png
--------------------------------------------------------------------------------
/book/fundamentals/images/nearest-neighbor.4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.4.png
--------------------------------------------------------------------------------
/book/fundamentals/images/nearest-neighbor.5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.5.png
--------------------------------------------------------------------------------
/book/fundamentals/images/nearest-neighbor.6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.6.png
--------------------------------------------------------------------------------
/book/fundamentals/images/sequence-evo-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/sequence-evo-tree.png
--------------------------------------------------------------------------------
/book/fundamentals/images/spider-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/spider-tree.png
--------------------------------------------------------------------------------
/book/fundamentals/images/tree-monophyly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/tree-monophyly.png
--------------------------------------------------------------------------------
/book/fundamentals/images/tree-polyphyly.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/tree-polyphyly.png
--------------------------------------------------------------------------------
/book/fundamentals/images/tree-schematic1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/tree-schematic1.png
--------------------------------------------------------------------------------
/book/fundamentals/images/upgma-tree-final.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-final.png
--------------------------------------------------------------------------------
/book/fundamentals/images/upgma-tree-iter1.graffle:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ActiveLayerIndex
6 | 0
7 | ApplicationVersion
8 |
9 | com.omnigroup.OmniGraffle
10 | 139.18.0.187838
11 |
12 | AutoAdjust
13 |
14 | BackgroundGraphic
15 |
16 | Bounds
17 | {{0, 0}, {576.00002670288086, 733}}
18 | Class
19 | SolidGraphic
20 | ID
21 | 2
22 | Style
23 |
24 | shadow
25 |
26 | Draws
27 | NO
28 |
29 | stroke
30 |
31 | Draws
32 | NO
33 |
34 |
35 |
36 | BaseZoom
37 | 0
38 | CanvasOrigin
39 | {0, 0}
40 | ColumnAlign
41 | 1
42 | ColumnSpacing
43 | 36
44 | CreationDate
45 | 2014-02-25 16:00:53 +0000
46 | Creator
47 | Greg Caporaso
48 | DisplayScale
49 | 1 0/72 in = 1.0000 in
50 | GraphDocumentVersion
51 | 8
52 | GraphicsList
53 |
54 |
55 | Bounds
56 | {{270.43200826644897, 213.91587829589844}, {24, 20}}
57 | Class
58 | ShapedGraphic
59 | FitText
60 | YES
61 | Flow
62 | Resize
63 | ID
64 | 342
65 | Magnets
66 |
67 | {0, 1}
68 | {0, -1}
69 | {1, 0}
70 | {-1, 0}
71 |
72 | Shape
73 | Rectangle
74 | Style
75 |
76 | fill
77 |
78 | Draws
79 | NO
80 |
81 | shadow
82 |
83 | Draws
84 | NO
85 |
86 | stroke
87 |
88 | Draws
89 | NO
90 |
91 |
92 | Text
93 |
94 | Text
95 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370
96 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
97 | {\colortbl;\red255\green255\blue255;}
98 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc
99 |
100 | \f0\fs20 \cf0 0.5}
101 | VerticalPad
102 | 4
103 |
104 | TextPlacement
105 | 0
106 | Wrap
107 | NO
108 |
109 |
110 | Bounds
111 | {{270, 259.5}, {24, 20}}
112 | Class
113 | ShapedGraphic
114 | FitText
115 | YES
116 | Flow
117 | Resize
118 | ID
119 | 341
120 | Magnets
121 |
122 | {0, 1}
123 | {0, -1}
124 | {1, 0}
125 | {-1, 0}
126 |
127 | Shape
128 | Rectangle
129 | Style
130 |
131 | fill
132 |
133 | Draws
134 | NO
135 |
136 | shadow
137 |
138 | Draws
139 | NO
140 |
141 | stroke
142 |
143 | Draws
144 | NO
145 |
146 |
147 | Text
148 |
149 | Text
150 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370
151 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
152 | {\colortbl;\red255\green255\blue255;}
153 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc
154 |
155 | \f0\fs20 \cf0 0.5}
156 | VerticalPad
157 | 4
158 |
159 | TextPlacement
160 | 0
161 | Wrap
162 | NO
163 |
164 |
165 | Bounds
166 | {{306, 262}, {25, 25}}
167 | Class
168 | ShapedGraphic
169 | FitText
170 | YES
171 | Flow
172 | Resize
173 | FontInfo
174 |
175 | Font
176 | Helvetica
177 | Size
178 | 10
179 |
180 | ID
181 | 337
182 | Magnets
183 |
184 | {0, 1}
185 | {0, -1}
186 | {1, 0}
187 | {-1, 0}
188 |
189 | Shape
190 | Rectangle
191 | Style
192 |
193 | fill
194 |
195 | Draws
196 | NO
197 |
198 | shadow
199 |
200 | Draws
201 | NO
202 |
203 | stroke
204 |
205 | Draws
206 | NO
207 |
208 |
209 | Text
210 |
211 | Text
212 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370
213 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
214 | {\colortbl;\red255\green255\blue255;}
215 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc
216 |
217 | \f0\fs28 \cf0 s5}
218 | VerticalPad
219 | 4
220 |
221 | TextPlacement
222 | 0
223 | Wrap
224 | NO
225 |
226 |
227 | Bounds
228 | {{306, 216.5}, {25, 25}}
229 | Class
230 | ShapedGraphic
231 | FitText
232 | YES
233 | Flow
234 | Resize
235 | FontInfo
236 |
237 | Font
238 | Helvetica
239 | Size
240 | 10
241 |
242 | ID
243 | 336
244 | Magnets
245 |
246 | {0, 1}
247 | {0, -1}
248 | {1, 0}
249 | {-1, 0}
250 |
251 | Shape
252 | Rectangle
253 | Style
254 |
255 | fill
256 |
257 | Draws
258 | NO
259 |
260 | shadow
261 |
262 | Draws
263 | NO
264 |
265 | stroke
266 |
267 | Draws
268 | NO
269 |
270 |
271 | Text
272 |
273 | Text
274 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370
275 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
276 | {\colortbl;\red255\green255\blue255;}
277 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc
278 |
279 | \f0\fs28 \cf0 s4}
280 | VerticalPad
281 | 4
282 |
283 | TextPlacement
284 | 0
285 | Wrap
286 | NO
287 |
288 |
289 | Class
290 | LineGraphic
291 | ID
292 | 324
293 | Points
294 |
295 | {259.12761506276161, 277.23850901192492}
296 | {304.61715481171564, 277.23850901192492}
297 | {304.61715481171564, 277.23850901192492}
298 |
299 | Style
300 |
301 | stroke
302 |
303 | HeadArrow
304 | 0
305 | Legacy
306 |
307 | LineType
308 | 1
309 | TailArrow
310 | 0
311 | Width
312 | 2
313 |
314 |
315 |
316 |
317 | Class
318 | LineGraphic
319 | ID
320 | 325
321 | Points
322 |
323 | {257.86401673640177, 230.91587904250571}
324 | {303.35355648535568, 230.91587904250571}
325 | {303.35355648535568, 230.91587904250571}
326 |
327 | Style
328 |
329 | stroke
330 |
331 | HeadArrow
332 | 0
333 | Legacy
334 |
335 | LineType
336 | 1
337 | TailArrow
338 | 0
339 | Width
340 | 2
341 |
342 |
343 |
344 |
345 | Class
346 | LineGraphic
347 | ID
348 | 326
349 | Points
350 |
351 | {257.86405464435143, 276.60395243700134}
352 | {257.86401673640171, 230.91587904250582}
353 | {257.86401673640171, 230.91587904250582}
354 |
355 | Rotation
356 | 90
357 | Style
358 |
359 | stroke
360 |
361 | HeadArrow
362 | 0
363 | Legacy
364 |
365 | LineType
366 | 1
367 | TailArrow
368 | 0
369 | Width
370 | 2
371 |
372 |
373 |
374 |
375 | GridInfo
376 |
377 | GuidesLocked
378 | NO
379 | GuidesVisible
380 | YES
381 | HPages
382 | 1
383 | ImageCounter
384 | 1
385 | KeepToScale
386 |
387 | Layers
388 |
389 |
390 | Lock
391 | NO
392 | Name
393 | Layer 1
394 | Print
395 | YES
396 | View
397 | YES
398 |
399 |
400 | LayoutInfo
401 |
402 | Animate
403 | NO
404 | circoMinDist
405 | 18
406 | circoSeparation
407 | 0.0
408 | layoutEngine
409 | dot
410 | neatoSeparation
411 | 0.0
412 | twopiSeparation
413 | 0.0
414 |
415 | LinksVisible
416 | NO
417 | MagnetsVisible
418 | NO
419 | MasterSheets
420 |
421 | ModificationDate
422 | 2014-02-25 16:08:12 +0000
423 | Modifier
424 | Greg Caporaso
425 | NotesVisible
426 | NO
427 | Orientation
428 | 2
429 | OriginVisible
430 | NO
431 | PageBreaks
432 | YES
433 | PrintInfo
434 |
435 | NSBottomMargin
436 |
437 | float
438 | 41
439 |
440 | NSHorizonalPagination
441 |
442 | coded
443 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG
444 |
445 | NSLeftMargin
446 |
447 | float
448 | 18
449 |
450 | NSPaperSize
451 |
452 | size
453 | {612.00002670288086, 792}
454 |
455 | NSPrintReverseOrientation
456 |
457 | int
458 | 0
459 |
460 | NSPrinter
461 |
462 | coded
463 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAlOU1ByaW50ZXIAhIQITlNPYmplY3QAhZKEhIQITlNTdHJpbmcBlIQBKxNDYW5vbiBNRjQ1MDAgU2VyaWVzhoY=
464 |
465 | NSPrinterName
466 |
467 | string
468 | Canon MF4500 Series
469 |
470 | NSRightMargin
471 |
472 | float
473 | 18
474 |
475 | NSTopMargin
476 |
477 | float
478 | 18
479 |
480 |
481 | PrintOnePage
482 |
483 | ReadOnly
484 | NO
485 | RowAlign
486 | 1
487 | RowSpacing
488 | 36
489 | SheetTitle
490 | Canvas 1
491 | SmartAlignmentGuidesActive
492 | YES
493 | SmartDistanceGuidesActive
494 | YES
495 | UniqueID
496 | 1
497 | UseEntirePage
498 |
499 | VPages
500 | 1
501 | WindowInfo
502 |
503 | CurrentSheet
504 | 0
505 | ExpandedCanvases
506 |
507 |
508 | name
509 | Canvas 1
510 |
511 |
512 | Frame
513 | {{410, 54}, {711, 872}}
514 | ListView
515 |
516 | OutlineWidth
517 | 142
518 | RightSidebar
519 |
520 | ShowRuler
521 |
522 | Sidebar
523 |
524 | SidebarWidth
525 | 120
526 | VisibleRegion
527 | {{0, 0}, {576, 733}}
528 | Zoom
529 | 1
530 | ZoomValues
531 |
532 |
533 | Canvas 1
534 | 1
535 | 1
536 |
537 |
538 |
539 |
540 |
541 |
--------------------------------------------------------------------------------
/book/fundamentals/images/upgma-tree-iter1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-iter1.png
--------------------------------------------------------------------------------
/book/fundamentals/images/upgma-tree-iter2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-iter2.png
--------------------------------------------------------------------------------
/book/fundamentals/images/upgma-tree-iter3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-iter3.png
--------------------------------------------------------------------------------
/book/fundamentals/index.md:
--------------------------------------------------------------------------------
1 | # Fundamentals
2 |
--------------------------------------------------------------------------------
/book/fundamentals/index.yaml:
--------------------------------------------------------------------------------
1 | contents:
2 | - pairwise-alignment
3 | - database-searching
4 | - multiple-sequence-alignment
5 | - phylogeny-reconstruction
6 | - sequence-mapping-and-clustering
7 | - machine-learning
8 |
--------------------------------------------------------------------------------
/book/fundamentals/machine-learning.md:
--------------------------------------------------------------------------------
1 | # Machine learning in bioinformatics (work-in-progress)
2 |
3 | **This chapter is currently a work-in-progress, and is incomplete.**
4 |
5 | Machine learning algorithms are commonly used in bioinformatics for a variety of tasks. Typically, the common thread in these tasks is that the user would like the algorithm to assist in the identification of patterns in a complex data set. In this chapter we'll implement a few machine learning algorithms so we can gain an in-depth understanding of how they work. In practice though, there are many mature machine learning libraries that you'd want to use. [scikit-learn](http://scikit-learn.org/) is a popular and well-documented Python library for machine learning which many bioinformatics researchers and software developers use in their work.
6 |
7 | These algorithms generally work beginning with a collection of samples and some user-defined features of those samples. These data are typically represented in a matrix, where samples are the rows and features are the columns. There are a few different high-level tasks that are common in machine learning, including classification, regression, and dimensionality reduction. In a classification task, a user provides examples of data that fall into certain discrete classes (for example, _healthy_ and _disease_), and tries to have the computer develop a model that can differentiate those classes based on the defined features. If successful, the resulting model could be applied to data where the class isn't known ahead of time, in an attempt to predict the class from the features. A regression task is similar, except that a continuous value will be predicted rather than a discrete value. Dimensionality reduction tasks, on the other hand, generally don't have classes or labels assigned ahead of time, and the user is hoping to identify which samples are most similar to each other based on new features that are defined by the algorithm. The goal here might be to reduce the number of features from thousands or more to around two or three that explain most of the variation in the data. This allows the user to explore the samples visually, for example in a scatter plot, which would not be feasible if there were thousands of features.
8 |
9 | In this chapter we'll explore two classification algorithms and one dimensionality reduction task in the context of some real-world examples.
10 |
11 | ## Defining a classification problem
12 |
13 | We'll explore machine learning classifiers in the context of a familiar topic: taxonomic classification of 16S rRNA sequences. We previously explored this problem in [Sequence Homology Searching](alias://d22e6b), so it is likely worth spending a few minutes skimming that chapter if it's not fresh in your mind.
14 |
15 | Briefly, the problem that we are going to address here is as follows. We have a query sequence ($q_i$) which is not taxonomically annotated (meaning we don't know the taxonomy of the organism whose genome it is found in), and a reference database ($R$) of taxonomically annotated sequences ($r_1, r_2, r_3, ... r_n$). We want to infer a taxonomic annotation for $q_i$. We'll again work with the [Greengenes](http://greengenes.secondgenome.com/) database, which we'll access using the [QIIME default reference project](https://github.com/biocore/qiime-default-reference). Greengenes is a database of 16S rRNA gene sequences. (This should all sound very familiar - if not, I again suggest that you review [Sequence Homology Searching](alias://d22e6b).)
16 |
17 | This time, instead of using sequence alignment to identify the most likely taxonomic origin of a sequence, we'll train classifiers by building [kmer](alias://C7hMX5)-based models of the 16S sequences of taxa in our reference database. We'll then run our query sequences through those models to identify the most likely taxonomic origin of each query sequence. Since we know the taxonomic origin of our query sequences in this case, we can evaluate the accuracy of our classifiers by seeing how often they return the known taxonomy assignment. If our training and testing approaches are well-designed, the performance on our tests will inform us of how accurate we can expect our classifier to be on data where the actual taxonomic origin is unknown.
18 |
19 | Let's jump in...
20 |
21 | ### Naive Bayes classifiers
22 |
23 | The first classifier we'll explore is the popular and relatively simple Naive Bayes classifier. This classifier uses Bayes' Theorem to determine the most likely label for an unknown input based on a probabilistic model it has constructed from training data. (_The preceding text needs work._) The model that is constructed is based on user-defined features of the sequences. The most commonly used features for sequence classification tasks such as this are overlapping [kmers](alias://C7hMX5).
24 |
25 | We'll begin by importing some libraries that we'll use in this chapter, and then [preparing our reference database and query sequences as we did previously](alias://gAKBxE).
26 |
27 | ```python
28 | >>> %pylab inline
29 | ...
30 | >>> from IPython.core import page
31 | >>> page.page = print
32 | ...
33 | >>> import pandas as pd
34 | >>> import skbio
35 | >>> import numpy as np
36 | >>> import itertools
37 | >>> import collections
38 | ```
39 |
40 | ```python
41 | >>> from iab.algorithms import load_taxonomy_reference_database
42 | ...
43 | >>> %psource load_taxonomy_reference_database
44 | ```
45 |
46 | ```python
47 | >>> reference_taxonomy, reference_db = load_taxonomy_reference_database()
48 | ```
49 |
50 | ```python
51 | >>> reference_db[0]
52 | ```
53 |
54 | ```python
55 | >>> reference_db[-1]
56 | ```
57 |
58 | We'll select a random subset of the reference database to work with here.
59 |
60 | ```python
61 | >>> reference_db = np.random.choice(reference_db, 500, replace=False)
62 | >>> print("%s sequences are present in the subsampled database." % len(reference_db))
63 | ```
64 |
65 | The first thing our Naive Bayes classifier will need is the set of all possible words of length ``k``. This will be dependent on the value of ``k`` and the characters in our alphabet (i.e., the characters that we should expect to find in the reference database). This set is referred to as ``W``, and can be computed as follows. Given the following alphabet, how many kmers of length 2 are there (i.e., 2-mers)? How many 7-mers are there? How many 7-mers are there if there are twenty characters in our alphabet (as would be the case if we were working with protein sequences instead of DNA sequences)?
66 |
67 | ```python
68 | >>> alphabet = skbio.DNA.nondegenerate_chars
69 | >>> k = 2
70 | ...
71 | >>> def compute_W(alphabet, k):
72 | >>> return set(map(''.join, itertools.product(alphabet, repeat=k)))
73 | ...
74 | >>> W = compute_W(alphabet, k)
75 | >>> print('Alphabet contains the characters: %s' % ', '.join(alphabet))
76 | >>> print('For an alphabet size of %d, W contains %d length-%d kmers.' % (len(alphabet), len(W), k))
77 | ```
78 |
79 | scikit-bio provides methods for identifying all kmers in a ``skbio.DNA`` sequence object, and for computing the kmer frequencies. This information can be obtained for one of our reference sequences as follows:
80 |
81 | ```python
82 | >>> kmers = reference_db[0].iter_kmers(k=k)
83 | >>> for kmer in kmers:
84 | ... print(kmer, end=' ')
85 | ```
86 |
87 | ```python
88 | >>> print(reference_db[0].kmer_frequencies(k=k))
89 | ```
90 |
91 | This information can be convenient to store in a pandas ``Series`` object:
92 |
93 | ```python
94 | >>> pd.Series(reference_db[0].kmer_frequencies(k=k), name=reference_db[0].metadata['id'])
95 | ```
96 |
97 | To train our taxonomic classifier, we next need to define a few things. First, at what level of taxonomic specificity do we want to classify our sequences? We should expect to achieve higher accuracy at less specific taxonomic levels such as phylum or class, but these are likely to be less informative biologically than more specific levels such as genus or species. Let's start classifying at the phylum level to keep our task simple, since we're working with a small subset of the reference database here. In Greengenes, phylum is the second level of the taxonomy.
98 |
99 | Next, how long should our kmers be? We don't have a good idea of this to start with. The longer our kmers, the more likely they are to be specific to certain taxa, which is good because that will help with classification. However, if they get too long it becomes less likely that we'll observe those kmers in sequences that aren't represented in our database, because the longer the kmer is, the more likely we are to see variation across other organisms that are assigned to the same taxonomy. Based on some of my own work in this area, I'll start us out with 7-mers (i.e., kmers of length 7).
100 |
101 | Finally, we'll need to know the value of `W`, defined above as the set of all possible kmers given our alphabet and the value of `k`.
102 |
103 | As an exercise, I recommend exploring the impact of the value of `k` and `taxonomic_level` on the accuracy of our classifier after reading this chapter.
104 |
105 | ```python
106 | >>> taxonomic_level = 2
107 | >>> k = 7
108 | >>> alphabet = skbio.DNA.nondegenerate_chars
109 | ```
110 |
111 | Next, we'll compute a table of the per-sequence kmer counts for all kmers in `W` for all sequences in our reference database. We'll also store the taxonomic label of each of our reference sequences at our specified taxonomic level. We can store this information in a pandas `DataFrame`, and then view the first 25 rows of that table.
112 |
113 | ```python
114 | >>> def get_taxon_at_level(taxon, level):
115 | ... taxon = [l.strip() for l in taxon.split(';')]
116 | ... return '; '.join(taxon[:level])
117 | ...
118 | >>> W = compute_W(alphabet, k)
119 | ...
120 | >>> per_sequence_kmer_counts = []
121 | >>> for reference_sequence in reference_db:
122 | ... taxon = get_taxon_at_level(reference_sequence.metadata['taxonomy'], taxonomic_level)
123 | ... kmer_counts = dict.fromkeys(W, 0)
124 | ... kmer_counts.update(reference_sequence.kmer_frequencies(k=k))
125 | ... per_sequence_kmer_counts.append(pd.Series(kmer_counts, name=taxon))
126 | ...
127 | >>> per_sequence_kmer_counts = pd.DataFrame(data=per_sequence_kmer_counts).fillna(0).T
128 | >>> per_sequence_kmer_counts[:25]
129 | ```
130 |
131 | With this information, we'll next compute our "kmer probability table" (EXISTING NAME FOR THIS?). The content of this table will be the probability of observing each kmer in W given a taxon. This is computed based on a few values:
132 |
133 | $N$ : The total number of sequences in the training set.
134 |
135 | $n(w_i)$ : The number of total sequences containing kmer _i_.
136 |
137 | $P_i$ : The probability of observing kmer _i_. Initially it might seem as though this would be computed as $n(w_i) / N$, but this neglects the possibility that a kmer observed in a query sequence might not be represented in our reference database, so a small pseudocount is added to the numerator and denominator.
138 |
139 | $P(w_i | taxon)$ : The probability of observing a kmer given a taxon. Again, it would seem that this would be computed as the proportion of sequences in the taxon containing the kmer, but this would neglect that we'll likely observe kmers in our query sequences that are not represented in our reference database. A pseudocount is therefore added again to the numerator and denominator. This time the pseudocount in the numerator is scaled by how frequent the kmer is in the reference database as a whole: specifically, it is $P_i$.
140 |
141 | Our "kmer probability table" is $P(w_i | taxon)$ computed for all kmers in W and all taxa represented in our reference database. We'll compute that and again look at the first 25 rows.
142 |
143 | ```python
144 | >>> def compute_kmer_probability_table(per_sequence_kmer_counts):
145 | ... N = len(per_sequence_kmer_counts) # number of training sequences
146 | ...
147 | ... # number of sequences containing kmer wi
148 | ... n_wi = per_sequence_kmer_counts.astype(bool).sum(axis=1)
149 | ... n_wi.name = 'n(w_i)'
150 | ...
151 | ... # probabilities of observing each kmer
152 | ... Pi = (n_wi + 0.5) / (N + 1)
153 | ... Pi.name = 'P_i'
154 | ...
155 | ... # number of times each taxon appears in training set
156 | ... taxon_counts = collections.Counter(per_sequence_kmer_counts.columns)
157 | ... n_taxon_members_containing_kmer = per_sequence_kmer_counts.astype(bool).groupby(level=0, axis=1).sum()
158 | ...
159 | ... # probabilities of observing each kmer in each taxon
160 | ... p_wi_t = []
161 | ... for taxon, count in taxon_counts.items():
162 | ... p_wi_t.append(pd.Series((n_taxon_members_containing_kmer[taxon] + Pi) / (count + 1), name=taxon))
163 | ...
164 | ... return pd.DataFrame(p_wi_t).T
165 | ```
166 |
167 | ```python
168 | >>> kmer_probability_table = compute_kmer_probability_table(per_sequence_kmer_counts)
169 | ```
170 |
171 | ```python
172 | >>> kmer_probability_table[:25]
173 | ```
174 |
175 | With our kmer probability table we are now ready to classify unknown sequences. We'll begin by defining some query sequences. We'll pull these at random from our reference sequences, which means that some of the query sequences will be represented in our reference database and some won't be. This is the situation that is typically encountered in practice. To simulate real-world 16S taxonomy classification tasks, we'll also trim out 200 bases of our reference sequences since (as of this writing) we typically don't obtain full-length 16S sequences from a DNA sequencing instrument.
176 |
177 | ```python
178 | >>> from iab.algorithms import load_taxonomy_query_sequences
179 | ...
180 | >>> %psource load_taxonomy_query_sequences
181 | ```
182 |
183 | ```python
184 | >>> import random
185 | ...
186 | >>> queries = load_taxonomy_query_sequences()
187 | >>> queries = random.sample(queries, k=50)
188 | ```
189 |
190 | ```python
191 | >>> queries[0]
192 | ```
193 |
194 | For a given query sequence, its taxonomy will be classified as follows. First, the set of all kmers will be extracted from the sequence. This is referred to as $V$. Then, for all taxa in the kmer probability table, the probability of observing the query sequence will be computed given that taxon: $P(query | taxon)$. This is computed as the product of all its kmer probabilities for the given taxon. (It should be clear based on this formula why it was necessary to add pseudocounts when computing our kmer probability table - if not, kmer probabilities of zero would result in a zero probability of the sequence being derived from that taxon at this step.)
195 |
196 | After computing $P(query | taxon)$ for all taxa, the taxonomy assignment returned is simply the one achieving the maximum probability. Here we'll classify a sequence and look at the resulting taxonomy assignment.
197 |
198 | ```python
199 | >>> def classify_V(V, kmer_probability_table):
200 | ... P_S_t = [] # probability of the sequence given the taxon
201 | ... for taxon in kmer_probability_table:
202 | ... kmer_probabilities = kmer_probability_table[taxon]
203 | ... probability = 1.0
204 | ... for v_i in V:
205 | ... probability *= kmer_probabilities[v_i]
206 | ... P_S_t.append((probability, taxon))
207 | ... return max(P_S_t)[1], V
208 | ...
209 | >>> def classify_sequence(query_sequence, kmer_probability_table, k):
210 | ... V = list(map(str, query_sequence.iter_kmers(k=k)))
211 | ... return classify_V(V, kmer_probability_table)
212 | ```
213 |
214 | ```python
215 | >>> taxon_assignment, V = classify_sequence(queries[0], kmer_probability_table, k)
216 | >>> print(taxon_assignment)
217 | ```
218 |
219 | Since we know the actual taxonomy assignment for this sequence, we can look that up in our reference database. Was your assignment correct? Try this with a few query sequences and keep track of how many times the classifier achieved the correct assignment.
220 |
221 | ```python
222 | >>> get_taxon_at_level(reference_taxonomy[queries[0].metadata['id']], taxonomic_level)
223 | ```
224 |
225 | Because the query and reference sequences that we're working with were randomly selected from the full reference database, each time you run this notebook you should observe different results. Chances are however that if you run the above steps multiple times you'll get the wrong taxonomy assignment at least some of the time. Up to this point, we've left out an important piece of information: how confident should we be in our assignment, or in other words, how dependent is our taxonomy assignment on our specific query? If there were slight differences in our query (e.g., because we observed a very closely related organism, such as one of the same species but a different strain, or because we sequenced a different region of the 16S sequence) would we obtain the same taxonomy assignment? If so, we should have higher confidence in our assignment. If not, we should have lower confidence in our assignment.
226 |
227 | We can quantify confidence using an approach called bootstrapping. With a bootstrap approach, we'll get our taxonomy assignment as we did above, but then for some user-specified number of times, we'll create random subsets of V sampled with replacement (meaning that each kmer is returned to the pool after it is drawn, so the same kmer can be selected more than once in a subset). We'll then assign taxonomy to each random subset of V, and count the number of times the resulting taxonomy assignment is the same that we achieved when assigning taxonomy to V. The count divided by the number of iterations we've chosen to run will be our confidence value. If the assignments are often the same we'll have a high confidence value. If the assignments are often different, we'll have a low confidence value.
228 |
229 | Let's now assign taxonomy and compute a confidence for that assignment.
230 |
231 | ```python
232 | >>> def classify_sequence_with_confidence(sequence, kmer_probability_table, k,
233 | ... confidence_iterations=100):
234 | ... taxon, V = classify_sequence(sequence, kmer_probability_table, k)
235 | ...
236 | ... count_same_taxon = 0
237 | ... subsample_size = int(len(V) * 0.1)
238 | ... for i in range(confidence_iterations):
239 | ... subsample_V = np.random.choice(V, subsample_size, replace=True)
240 | ... subsample_taxon, _ = classify_V(subsample_V, kmer_probability_table)
241 | ... if taxon == subsample_taxon:
242 | ... count_same_taxon += 1
243 | ... confidence = count_same_taxon / confidence_iterations
244 | ...
245 | ... return (taxon, confidence)
246 | ```
247 |
248 | ```python
249 | >>> taxon_assignment, confidence = classify_sequence_with_confidence(queries[0], kmer_probability_table, k)
250 | >>> print(taxon_assignment)
251 | >>> print(confidence)
252 | ```
253 |
254 | How did the computed confidence compare to the accuracy of the taxonomy assignment?
255 |
256 | We don't have an _a priori_ idea for what good versus bad confidence scores are, but we can use our reference database to explore that. We might want this information so we can come up with a confidence threshold, above which we would accept a taxonomy assignment and below which we might reject it. To explore this, let's compute taxonomy assignments and confidence for all of our query sequences and then see what the distributions of confidence scores look like for correct assignments and incorrect assignments.
257 |
258 | ```python
259 | >>> correct_assignment_confidences = []
260 | >>> incorrect_assignment_confidences = []
261 | >>> summary = []
262 | ...
263 | >>> for query in queries:
264 | ... predicted_taxonomy, confidence = classify_sequence_with_confidence(query, kmer_probability_table, k)
265 | ... actual_taxonomy = get_taxon_at_level(reference_taxonomy[query.metadata['id']], taxonomic_level)
266 | ... if actual_taxonomy == predicted_taxonomy:
267 | ... correct_assignment_confidences.append(confidence)
268 | ... else:
269 | ... incorrect_assignment_confidences.append(confidence)
270 | ...
271 | ... summary.append([predicted_taxonomy, actual_taxonomy, confidence])
272 | >>> summary = pd.DataFrame(summary, columns=['Predicted taxonomy', 'Actual taxonomy', 'Confidence'])
273 | ```
274 |
275 | ```python
276 | >>> import seaborn as sns
277 | ...
278 | >>> ax = sns.boxplot(data=[correct_assignment_confidences, incorrect_assignment_confidences])
279 | >>> ax = sns.swarmplot(data=[correct_assignment_confidences, incorrect_assignment_confidences], color="black")
280 | >>> _ = ax.set_xticklabels(['Correct assignments', 'Incorrect assignments'])
281 | >>> _ = ax.set_ylabel('Confidence')
282 | ```
283 |
284 | What does this plot tell you about how well setting a confidence threshold is likely to work? If you never wanted to reject a correct assignment, how often would you accept an incorrect assignment? If you never wanted to accept an incorrect assignment, how often would you reject a correct assignment?
285 |
286 | ```python
287 | >>> summary # maybe explore whether certain taxa are more frequently wrong than others...
288 | ```
289 |
290 | ### Random Forest classifiers
291 |
292 | Coming soon...
293 |
294 | ## Defining a dimensionality reduction problem
295 |
296 | [This content](alias://b1cdbe) will be adapted and ported here.
297 |
--------------------------------------------------------------------------------
/book/fundamentals/multiple-sequence-alignment.md:
--------------------------------------------------------------------------------
1 |
2 | # Generalized dynamic programming for multiple sequence alignment
3 |
4 | Until now we worked with alignments between two sequences, but it is likely that you will want to align many sequences at the same time. For example, if you are trying to gain insight on the evolutionary relationships between all of the 16S bacterial genes in a given sample, it would be time consuming and very inefficient to compare them two at a time. It would be more efficient and useful to compare all of the 16S sequences from the bacteria in the same alignment.
5 | In the pairwise sequence alignment chapter, we went over dynamic programming algorithms. It's possible to generalize Smith-Waterman and Needleman-Wunsch, the dynamic programming algorithms that we explored for pairwise sequence alignment, to identify the optimal alignment of more than two sequences. Remember that our scoring scheme for pairwise alignment with Needleman-Wunsch looked like the following:
6 |
7 | $$
8 | \begin{align}
9 | & F(0, 0) = 0\\
10 | & F(i, 0) = F(i-1, 0) - d\\
11 | & F(0, j) = F(0, j-1) - d\\
12 | \\
13 | & F(i, j) = max \begin{pmatrix}
14 | & F(i-1, j-1) + s(c_i, c_j)\\
15 | & F(i-1, j) - d\\
16 | & F(i, j-1) - d\\
17 | \end{pmatrix}
18 | \end{align}
19 | $$
20 |
21 | To generalize this to three sequences, we could create three-dimensional scoring, dynamic programming, and traceback matrices. Our scoring scheme would then look like the following:
22 |
23 | $$
24 | \begin{align}
25 | & F(0, 0, 0) = 0\\
26 | & F(i, 0, 0) = F(i-1, 0, 0) - d\\
27 | & F(0, j, 0) = F(0, j-1, 0) - d\\
28 | & F(0, 0, k) = F(0, 0, k-1) - d\\
29 | \\
30 | & F(i, j, k) = max \begin{pmatrix}
31 | F(i-1, j-1, k-1) + s(c_i, c_j) + s(c_i, c_k) + s(c_j, c_k)\\
32 | F(i, j-1, k-1) + s(c_j, c_k) - d\\
33 | F(i-1, j, k-1) + s(c_i, c_k) - d\\
34 | F(i-1, j-1, k) + s(c_i, c_j) - d\\
35 | F(i, j, k-1) - 2d\\
36 | F(i, j-1, k) - 2d\\
37 | F(i-1, j, k) - 2d\\
38 | \end{pmatrix}
39 | \end{align}
40 | $$
41 |
42 | However the complexity of this algorithm is much worse than for pairwise alignment. For pairwise alignment, remember that if aligning two sequences of lengths $m$ and $n$, the runtime of the algorithm will be proportional to $m \times n$. If $n$ is longer than or as long as $m$, we simplify the statement to say that the runtime of the algorithm will be proportional to $n^2$. This curve has a pretty scary trajectory: runtime for pairwise alignment with dynamic programming is said to scale quadratically.
43 |
44 | ```python
45 | >>> %pylab inline
46 | >>> from functools import partial
47 | >>> from IPython.core import page
48 | >>> page.page = print
49 | ```
50 |
51 | ```python
52 | >>> import matplotlib.pyplot as plt
53 | ...
54 | >>> seq_lengths = range(25)
55 | >>> s2_times = [t ** 2 for t in range(25)]
56 | ...
57 | >>> plt.plot(range(25), s2_times)
58 | >>> plt.xlabel('Sequence Length')
59 | >>> plt.ylabel('Runtime (s)')
60 | ```
61 |
62 | The exponent in the $n^2$ term comes from the fact that, in pairwise alignment, if we assume our sequences are both of length $n$, there are $n \times n$ cells to fill in in the dynamic programming matrix. If we were to generalize either Smith-Waterman or Needleman-Wunsch to three sequences, we would need to create a 3 dimensional array to score and trace back the alignment. For sequences of length $n$, we would therefore have $n \times n \times n$ cells to fill in, and our runtime versus sequence length curve would look like the following.
63 |
64 | ```python
65 | >>> s3_times = [t ** 3 for t in range(25)]
66 | ...
67 | >>> plt.plot(range(25), s3_times)
68 | >>> plt.xlabel('Sequence Length')
69 | >>> plt.ylabel('Runtime (s)')
70 | ```
71 |
72 | That curve looks steeper than the curve for pairwise alignment, and the values on the y-axis are bigger, but it's not really clear how much of a problem this is until we plot runtime for three sequences in the context of the run times for pairwise alignment.
73 |
74 | ```python
75 | >>> plt.plot(range(25), s2_times)
76 | >>> plt.plot(range(25), s3_times)
77 | >>> plt.xlabel('Sequence Length')
78 | >>> plt.ylabel('Runtime (s)')
79 | ```
80 |
81 | And for four sequences:
82 |
83 | ```python
84 | >>> s4_times = [t ** 4 for t in range(25)]
85 | ...
86 | >>> plt.plot(range(25), s2_times)
87 | >>> plt.plot(range(25), s3_times)
88 | >>> plt.plot(range(25), s4_times)
89 | >>> plt.xlabel('Sequence Length')
90 | >>> plt.ylabel('Runtime (s)')
91 | ```
92 |
93 | We clearly have a problem here, and that is that the runtime for multiple sequence alignment using full dynamic programming algorithms grows exponentially with the number of sequences to be aligned. If $n$ is our sequence length, and $s$ is the number of sequences, that means that runtime is proportional to $n^s$. In pairwise alignment, $s$ is always equal to 2, so the problem is more manageable. However, for the general case of $s$ sequences, we really can't even consider Smith-Waterman or Needleman-Wunsch for more than just a few sequences. The pattern in the plots above should illustrate why.
94 |
95 | As we explored with database searching, we need to figure out how to align fewer sequences. This is where *progressive alignment* comes in.
96 |
97 | ## Progressive alignment
98 |
99 | In progressive alignment, the problem of exponential growth of runtime and space is managed by selectively aligning pairs of sequences, and aligning alignments of sequences. What we typically do is identify a pair of closely related sequences, and align those. Then, we identify the next most closely related sequence to that initial pair, and align that sequence to the alignment. This concept of aligning a sequence to an alignment is new, and we'll come back to it in just a few minutes. The other concept of identifying the most closely related sequences, and then the next most closely related sequence, and so on should sound familiar. It effectively means that we're traversing a tree. And herein lies our problem: we need a tree to efficiently align multiple sequences, but we need an alignment to build a good tree.
100 |
101 | You probably have two burning questions in your mind right now:
102 |
103 | 1. How do we build a tree to guide the alignment process, if we need an alignment to build a good tree?
104 | 2. How do we align a sequence to an alignment, or an alignment to an alignment?
105 |
106 | We'll explore both of those throughout the rest of this notebook. First, let's cover the process of progressive multiple sequence alignment, just assuming for a moment that we know how to do both of those things.
107 |
108 | The process of progressive multiple sequence alignment could look like the following. First, we start with some sequences and a tree representing the relationship between those sequences. We'll call this our guide tree, because it's going to guide us through the process of multiple sequence alignment. In progressive multiple sequence alignment, we build a multiple sequence alignment for each internal node of the tree, where the alignment at a given internal node contains all of the sequences in the clade defined by that node.
109 |
110 |
111 |
112 | Starting from the root node, descend the bottom branch of the tree until you get to an internal node. If an alignment hasn't been constructed for that node yet, continue descending the tree until you get to a pair of nodes. In this case, we follow the two branches to the tips. We then align the sequences at that pair of tips (usually with Needleman-Wunsch, for multiple sequence alignment), and assign that alignment to the node connecting those tips.
113 |
114 |
115 |
116 | Next, we want to find what to align the resulting alignment to, so start from the root node and descend the top branch of the tree. When you get to the next node, determine if an alignment has already been created for that node. If not, our job is to build that alignment so we have something to align against. In this case, that means that we need to align `s1`, `s2`, and `s3`. We can achieve this by aligning `s1` and `s3` first, to get the alignment at the internal node connecting them.
117 |
118 |
119 |
120 | We can next align the alignment of `s1` and `s3` with `s2`, to get the alignment at the internal node connecting those clades.
121 |
122 |
123 |
124 | And finally, we can compute the alignment at the root node of the tree, by aligning the alignment of `s1`, `s2`, and `s3` with the alignment of `s4` and `s5`.
125 |
126 |
127 |
128 | The alignment at the root node is our multiple sequence alignment.
129 |
130 | ### Building the guide tree
131 |
132 | Let's address the first of our outstanding questions. I mentioned above that *we need an alignment to build a good tree*. The key word here is *good*. We can build a very rough tree - one that we would never want to present as representing the actual relationships between the sequences in question - without first aligning the sequences. Remember that building a UPGMA tree requires only a distance matrix, so if we can find a non-alignment-dependent way to compute distances between the sequences, we can build a rough UPGMA tree from them.
133 |
134 | Let's compute distances between the sequences based on their *word* composition. We'll define a *word* here as `k` adjacent characters in the sequence. We can then define a function that will return all of the words in a sequence as follows. These words can be defined as being overlapping, or non-overlapping. We'll go with overlapping for this example, as the more words we have, the better our guide tree should be.
135 |
136 | ```python
137 | >>> from skbio import DNA
138 | >>> %psource DNA.iter_kmers
139 | ```
140 |
141 | ```python
142 | >>> for e in DNA("ACCGGTGACCAGTTGACCAGTA").iter_kmers(3):
143 | ... print(e)
144 | ```
145 |
146 | ```python
147 | >>> for e in DNA("ACCGGTGACCAGTTGACCAGTA").iter_kmers(7):
148 | ... print(e)
149 | ```
150 |
151 | ```python
152 | >>> for e in DNA("ACCGGTGACCAGTTGACCAGTA").iter_kmers(3, overlap=False):
153 | ... print(e)
154 | ```
155 |
156 | If we then have two sequences, we can compute the word counts for each and define a distance between the sequences as the fraction of words that are unique to either sequence.
157 |
158 | ```python
159 | >>> from iab.algorithms import kmer_distance
160 | >>> %psource kmer_distance
161 | ```
162 |
163 | We can then use this as a distance function...
164 |
165 | ```python
166 | >>> s1 = DNA("ACCGGTGACCAGTTGACCAGT")
167 | >>> s2 = DNA("ATCGGTACCGGTAGAAGT")
168 | >>> s3 = DNA("GGTACCAAATAGAA")
169 | ...
170 | >>> print(s1.distance(s2, kmer_distance))
171 | >>> print(s1.distance(s3, kmer_distance))
172 | ```
173 |
174 | If we wanted to override the default to create (for example) a 5-mer distance function, we could use ``functools.partial``.
175 |
176 | ```python
177 | >>> fivemer_distance = partial(kmer_distance, k=5)
178 | ...
179 | >>> s1 = DNA("ACCGGTGACCAGTTGACCAGT")
180 | >>> s2 = DNA("ATCGGTACCGGTAGAAGT")
181 | >>> s3 = DNA("GGTACCAAATAGAA")
182 | ...
183 | >>> print(s1.distance(s2, fivemer_distance))
184 | >>> print(s1.distance(s3, fivemer_distance))
185 | ```
186 |
187 | We can now apply one of these functions to build a distance matrix for a set of sequences that we want to align.
188 |
189 | ```python
190 | >>> query_sequences = [DNA("ACCGGTGACCAGTTGACCAGT", {"id": "s1"}),
191 | ... DNA("ATCGGTACCGGTAGAAGT", {"id": "s2"}),
192 | ... DNA("GGTACCAAATAGAA", {"id": "s3"}),
193 | ... DNA("GGCACCAAACAGAA", {"id": "s4"}),
194 | ... DNA("GGCCCACTGAT", {"id": "s5"})]
195 | ```
196 |
197 | ```python
198 | >>> from skbio import DistanceMatrix
199 | ...
200 | >>> guide_dm = DistanceMatrix.from_iterable(query_sequences, metric=kmer_distance, key='id')
201 | ```
202 |
203 | scikit-bio also has some basic visualization functionality for these objects. For example, we can easily visualize this object as a heatmap.
204 |
205 | ```python
206 | >>> fig = guide_dm.plot(cmap='Greens')
207 | ```
208 |
209 | We can next use some functionality from SciPy to cluster the sequences with UPGMA, and print out a dendrogram.
210 |
211 | ```python
212 | >>> from scipy.cluster.hierarchy import average, dendrogram, to_tree
213 | ...
214 | >>> for q in query_sequences:
215 | ... print(q)
216 | ...
217 | >>> guide_lm = average(guide_dm.condensed_form())
218 | >>> guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
219 | ... link_color_func=lambda x: 'black')
220 | >>> guide_tree = to_tree(guide_lm)
221 | ```
222 |
223 | ```python
224 | >>> from iab.algorithms import guide_tree_from_sequences
225 | >>> %psource guide_tree_from_sequences
226 | ```
227 |
228 | ```python
229 | >>> t = guide_tree_from_sequences(query_sequences, display_tree=True)
230 | ```
231 |
232 | We now have a guide tree, so we can move on to the next step of progressive alignment.
233 |
234 | ### Generalization of Needleman-Wunsch (with affine gap scoring) for progressive multiple sequence alignment
235 |
236 | Next, we'll address our second burning question: aligning alignments. As illustrated above, there are basically three different types of pairwise alignment we need to support for progressive multiple sequence alignment with Needleman-Wunsch. These are:
237 |
238 | 1. Alignment of a pair of sequences.
239 | 2. Alignment of a sequence and an alignment.
240 | 3. Alignment of a pair of alignments.
241 |
242 | Standard Needleman-Wunsch supports the first, and it is very easy to generalize it to support the latter two. The only change that is necessary is in how the alignment of two non-gap characters is scored. Recall that we previously scored an alignment of two characters by looking up the score of substitution from one to the other in a substitution matrix. To adapt this for aligning a sequence to an alignment, or for aligning an alignment to an alignment, we compute this substitution as the average score of aligning the pairs of characters.
243 |
244 | For example, if we want to align the alignment column from $aln1$:
245 |
246 | ```
247 | A
248 | C
249 | ```
250 |
251 | to the alignment column from $aln2$:
252 |
253 | ```
254 | T
255 | G
256 | ```
257 |
258 | we could compute the substitution score using the matrix $m$ as:
259 |
260 | $$
261 | s = \frac{m[A][T] + m[A][G] + m[C][T] + m[C][G]}{aln1_{length} \times aln2_{length}}
262 | $$
263 |
264 | The following code adapts our implementation of Needleman-Wunsch to support aligning a sequence to an alignment, or aligning an alignment to an alignment.
265 |
266 | ```python
267 | >>> from iab.algorithms import format_dynamic_programming_matrix, format_traceback_matrix
268 | >>> from skbio.alignment._pairwise import _compute_score_and_traceback_matrices
269 | ...
270 | >>> %psource _compute_score_and_traceback_matrices
271 | ```
272 |
273 | ```python
274 | >>> from skbio.alignment._pairwise import _traceback
275 | >>> %psource _traceback
276 | ```
277 |
278 | ```python
279 | >>> from skbio.alignment import global_pairwise_align_nucleotide
280 | >>> %psource global_pairwise_align_nucleotide
281 | ```
282 |
283 | For the sake of the examples below, I'm going to override one of the ``global_pairwise_align_nucleotide`` defaults to penalize terminal gaps. This effectively tells the algorithm that we know we have a collection of sequences that are homologous from beginning to end.
284 |
285 | ```python
286 | >>> global_pairwise_align_nucleotide = partial(global_pairwise_align_nucleotide, penalize_terminal_gaps=True)
287 | ```
288 |
289 | For example, we can still use this code to align pairs of sequences (but note that we now need to pass those sequences in as a pair of one-item lists):
290 |
291 | ```python
292 | >>> aln1, _, _ = global_pairwise_align_nucleotide(query_sequences[0], query_sequences[1])
293 | >>> print(aln1)
294 | ```
295 |
296 | We can align that alignment to one of our other sequences.
297 |
298 | ```python
299 | >>> aln1, _, _ = global_pairwise_align_nucleotide(aln1, query_sequences[2])
300 | >>> print(aln1)
301 | ```
302 |
303 | Alternatively, we can align another pair of sequences:
304 |
305 | ```python
306 | >>> aln2, _, _ = global_pairwise_align_nucleotide(query_sequences[2], query_sequences[3])
307 | >>> print(aln2)
308 | ```
309 |
310 | And then align that alignment against our previous alignment:
311 |
312 | ```python
313 | >>> aln3, _, _ = global_pairwise_align_nucleotide(aln1, aln2)
314 | >>> print(aln3)
315 | ```
316 |
317 | ### Putting it all together: progressive multiple sequence alignment
318 |
319 | We can now combine all of these steps to take a set of query sequences, build a guide tree, perform progressive multiple sequence alignment, and return the guide tree (as a SciPy linkage matrix) and the alignment.
320 |
321 | ```python
322 | >>> from skbio import TreeNode
323 | >>> guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids)
324 | ```
325 |
326 | We can view the guide tree in [Newick format](http://scikit-bio.org/docs/latest/generated/skbio.io.newick.html) as follows:
327 |
328 | ```python
329 | >>> print(guide_tree)
330 | ```
331 |
332 | ```python
333 | >>> from iab.algorithms import progressive_msa
334 | >>> %psource progressive_msa
335 | ```
336 |
337 | ```python
338 | >>> msa = progressive_msa(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, guide_tree=guide_tree)
339 | >>> print(msa)
340 | ```
341 |
342 | We can now build a (hopefully) improved tree from our multiple sequence alignment. First we'll look at our original distance matrix again, and then the distance matrix generated from the progressive multiple sequence alignment.
343 |
344 | ```python
345 | >>> fig = guide_dm.plot(cmap='Greens')
346 | ```
347 |
348 | ```python
349 | >>> msa_dm = DistanceMatrix.from_iterable(msa, metric=kmer_distance)
350 | >>> fig = msa_dm.plot(cmap='Greens')
351 | ```
352 |
353 | The UPGMA trees that result from these alignments are very different. First we'll look at the guide tree, and then the tree resulting from the progressive multiple sequence alignment.
354 |
355 | ```python
356 | >>> d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right',
357 | ... link_color_func=lambda x: 'black')
358 | ```
359 |
360 | ```python
361 | >>> msa_lm = average(msa_dm.condensed_form())
362 | >>> d = dendrogram(msa_lm, labels=msa_dm.ids, orientation='right',
363 | ... link_color_func=lambda x: 'black')
364 | ```
365 |
366 | And we can wrap this all up in a single convenience function:
367 |
368 | ```python
369 | >>> from iab.algorithms import progressive_msa_and_tree
370 | >>> %psource progressive_msa_and_tree
371 | ```
372 |
373 | ```python
374 | >>> msa = progressive_msa(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, guide_tree=guide_tree)
375 | ```
376 |
377 | ```python
378 | >>> msa, tree = progressive_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide,
379 | ... display_tree=True, display_aln=True)
380 | ```
381 |
382 | ## Progressive alignment versus iterative alignment
383 |
384 | In an iterative alignment, the output tree from the above progressive alignment is used as a guide tree, and the full process is repeated. This is performed to reduce errors that result from a low-quality guide tree.
385 |
386 | ```python
387 | >>> from iab.algorithms import iterative_msa_and_tree
388 | >>> %psource iterative_msa_and_tree
389 | ```
390 |
391 | ```python
392 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=1, display_aln=True, display_tree=True)
393 | ```
394 |
395 | ```python
396 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=2, display_aln=True, display_tree=True)
397 | ```
398 |
399 | ```python
400 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=3, display_aln=True, display_tree=True)
401 | ```
402 |
403 | ```python
404 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=5, display_aln=True, display_tree=True)
405 | ```
406 |
407 | Some references that I used in assembling these notes include [1](http://statweb.stanford.edu/~nzhang/345_web/sequence_slides3.pdf), [2](http://math.mit.edu/classes/18.417/Slides/alignment.pdf), [3](http://www.sciencedirect.com/science/article/pii/0378111988903307), [4](http://bioinformatics.oxfordjournals.org/content/23/21/2947.full), and [5](http://nar.oxfordjournals.org/content/32/5/1792.full).
408 |
--------------------------------------------------------------------------------
/book/getting-started/biological-information.md:
--------------------------------------------------------------------------------
1 | # Biological Information
2 |
3 | Biological systems and computer systems are analogous in ways that may not be immediately apparent. Before we dive into using computers to study biology, let's briefly explore a relationship between the two: information processing is one of the most fundamental functions of both.
4 |
5 | ## Central Dogma of Molecular Biology
6 |
7 | The Central Dogma of Molecular Biology describes information flow in biological systems. It begins with DNA, a relatively long-lived information storage molecule, from which information typically flows in two directions: into new DNA molecules, during the process of replication, or into messenger RNA (mRNA), during the process of transcription. mRNA is a relatively short-lived molecule that transfers information that is used to synthesize protein molecules by the ribosome. Proteins are often thought of as the building blocks of life. They serve a variety of purposes, ranging from molecular machines such as transmembrane ion transporters, to structural molecules like myosin, a major component of muscle fibers. There are some uncommon circumstances where information flows differently, for example some viruses can reverse transcribe RNA to DNA, but proteins seem to be a terminal output of this information flow. Once a protein has been created, we are aware of no process that can work backwards to re-create the RNA or DNA that encoded it.
8 |
9 | We'll revisit these ideas at the end of this chapter, but first let's establish some concepts that will help us to understand and even quantify information. These ideas have their roots in Boolean algebra and Information Theory. Bear with me while I introduce some concepts that may be new to you, and may initially seem unrelated.
10 |
11 |
12 |
13 | Figure 1: The central dogma of molecular biology represents information flow in biological systems. Blue pathways are generally observed in cellular life. Red pathways are observed in special cases such as RNA viruses. (Figure attribution: Narayanese at English Wikipedia [Public domain], via Wikimedia Commons.)
14 |
15 |
16 |
17 | ## Binary and decimal numerical systems
18 |
19 | Humans most frequently use a _base 10_ or decimal numerical system for representing numbers. _Base 10_ means that there are ten available digits including zero. These are the digits 0, 1, 2, 3, 4, 5, 6, 7, 8, and 9. We represent numbers larger than 9 using multiple places: the _ones_ place, the _tens_ place, the _hundreds_ place, and so on. These are the exponents of 10: the ones place is $10^{0}$, the tens place is $10^{1}$, the hundreds place is $10^{2}$, and so on. When we write a decimal number with multiple places, such as 42, what we're representing is a four in the tens place plus a two in the ones place, or $4 \times 10^{1} + 2 \times 10^{0} = 42$.
20 |
21 | You've probably heard that computers use a _base 2_ or binary numerical system to represent numbers. The _base_ again describes the number of available digits, so in a base 2 or binary system, there are two digits, 0 and 1. These are defined as the binary digits. As in the decimal system, numbers larger than 1 are represented using multiple places. The places in a binary number are again based on exponents, but this time they are the exponents of 2. Instead of a ones place, a tens place, and a hundreds place, the first three places in a binary number are the ones place ($2^0$), the _twos_ place ($2^1$), and the _fours_ place ($2^2$). Thus the interpretation of the binary number `011` is $0 \times 2^2 + 1 \times 2^1 + 1 \times 2^0 = 3$.
22 |
23 | When working with numbers that may be other than base 10, by convention numbers would be written as $(n)_b$, where $n$ is the number, and $b$ is the base of the number. For example, $(11)_{10}$ represents the decimal number 11, because the base is 10. $(11)_2$ represents the decimal number 3: because the base is 2, we know that this is a binary number.
24 |
25 | Here are some binary numbers and formulas for translating them to their decimal equivalents.
26 |
27 | * $(0)_2$ is the decimal number 0 ($0 \times 2^0$)
28 | * $(1)_2$ is the decimal number 1 ($1 \times 2^0$)
29 | * $(01)_2$ is also the decimal number 1 ($0 \times 2^1 + 1 \times 2^0$)
30 | * $(11)_2$ is the decimal number 3 ($1 \times 2^1 + 1 \times 2^0$)
31 | * $(110)_2$ is the decimal number 6 ($1 \times 2^2 + 1 \times 2^1 + 0 \times 2^0$)
32 | * $(111)_2$ is the decimal number 7 ($1 \times 2^2 + 1 \times 2^1 + 1 \times 2^0$)
33 |
34 | A single **bi**nary digi**t** (a zero or one) is referred to as a _bit_, and bits can be used to encode a lot more than just numbers.
35 |
36 | ## Encoding messages in bits
37 |
38 | The messages encoded by bits can be nearly anything, provided that the sender of the message and the recipient of the message have agreed on a coding scheme which describes how a message can be encoded in bits or decoded from bits. The number of messages that can be sent using bits is a simple function of the number of places that are used. For example, if the only messages I want to transmit to you are "yes" and "no", I could achieve that by transmitting a single bit of information. You and I could agree that "yes" will be represented by the bit 1, and "no" will be represented by the bit 0.
39 |
40 | Internally, computers send and receive messages that are encoded using electrical currents. To reduce errors in message transmission, the electrical currents are interpreted only as being off or on, such that a message may be transmitted as off-on-on. These two states, off and on, are often interpreted by computers as binary numbers, where zero is synonymous with off (no current) and one is synonymous with on (current). So our message of off-on-on could be read as the binary number 011, or the decimal number 3.
41 |
42 | To illustrate a useful system that operates on the transmission of one bit of information, I'll describe a photosensor for an outdoor spotlight. In this example the photosensor is the sender of the message and the spotlight is the receiver of the message. The transmission of a zero from the photosensor to the spotlight could mean that it is currently light outside, and the transmission of a one could mean that it is currently dark outside. (The meanings of zero and one could be reversed: all that matters is that the sender and the receiver know what each value means.) The photosensor can monitor the available light, and send a message to the spotlight once per minute. If it is currently light outside, the photosensor will send a zero to the spotlight and the spotlight will turn off or remain off. If it is currently dark outside, the photosensor will send a one to the spotlight, and the spotlight will turn on or remain on. The photosensor is functioning as an on/off switch for the spotlight, transmitting one bit of information every minute.
43 |
44 | There are a couple of important things to consider in this example. First, the meaning of "currently light outside" and "currently dark outside" are embodied in the photosensor. It must make a decision on whether it is light or dark on its own, because it is only transmitting one bit of information (zero equals light and one equals dark). The message it sends isn't complex enough to describe how light or dark it currently is outside - it's effectively only flipping a switch on or off.
45 |
46 | To enable the transmission of more complex messages more bits can be used. One bit allows us to transmit two messages: 0 and 1, which in our photosensor example are interpreted as _off_ and _on_, respectively. If our message is based on two bits we can transmit four messages, 00, 01, 10, or 11. A real-world example of this could be a light switch with four states: off, low brightness, medium brightness, and high brightness. If our message is based on three bits, we can transmit eight messages, 000, 001, 010, 011, 100, 101, 110, or 111. There is a pattern emerging here. If `n` is the number of bits that you have available to send a message, the number of distinct messages that you can send is $2^n$. To generalize this formula further, if the number of available digits in the system is `b`, and the number of places you can use in your message is `n`, then the number of messages that can be sent is $b^n$.
47 |
48 | In computer systems, the bit is the most fundamental unit of information. The next largest unit is the byte, which is composed of eight bits. How many messages can be encoded in one byte?
49 |
50 | ## Protein sequences are encoded in a base 4 system
51 |
52 | The building blocks of DNA are four chemical compounds called adenine, cytosine, guanine, and thymine. We often represent these compounds with the abbreviations A, C, G and T, respectively. One of the primary roles of DNA in biological organisms is to encode the primary structure, or amino acid sequence, of proteins. As with computer systems, this information is represented based on discrete states, but in biological systems there are four states rather than two. Each position or place in an exon of a protein-coding DNA sequence can contain one of these compounds, and the linear order of the compounds can encode a message.
53 |
54 | When first translated, proteins are composed of simpler units, the amino acids, and most organisms use 20 different amino acids to build proteins. Because there are four DNA bases (A, C, G, and T) and twenty amino acids, we need more than one base to transmit the message of what amino acid comes next in a protein from DNA to the ribosome. How many DNA bases we need depends on how many messages we want to be able to send, which in this case is 20 (for the twenty amino acids). So, how many DNA bases are needed to encode the 20 canonical amino acids?
55 |
56 | As mentioned above, we can determine the number of messages we can send in a given numerical system with a given number of places using the formula $b^n$. For messages encoded in DNA, $b$ is four, so with one place (or one DNA base) we can send four messages. Since four is less than twenty, we'll need longer messages to encode the twenty amino acids. If our message were composed of two bases, we could send $4^2=16$ messages - that's still less than twenty, so we'll need more bases. If our message were composed of three bases, we could send $4^3=64$ messages. This is more than twenty, which means that we can encode all of the amino acids (with some messages to spare) in three bases. It's important to note that the number of places we can use must be a whole number - "2.5 bases of DNA" is not a meaningful quantity.
57 |
58 | Amino acids are in fact encoded by three nucleotide bases, and the three base messages are referred to as _codons_. The mapping of codons to amino acids is referred to as the _genetic code_. Each codon represents exactly one amino acid, with the exception of some, the _stop codons_, which indicate the end of a message. Because there are 64 codons but only twenty-one messages that need to be transmitted (the twenty amino acids and the "stop" signal), some amino acids and the stop signal are represented by more than one codon. This is referred to as the redundancy of the genetic code.
59 |
60 |
61 |
62 | Figure 2: The vertebrate RNA genetic code. The corresponding DNA genetic code is identical, except that Us are replaced with Ts. (Figure attribution: NIH [Public domain], via Wikimedia Commons.)
63 |
64 |
65 |
66 | The scikit-bio Python library that was mentioned in the previous chapter has functionality for translating DNA sequences into protein sequences. We can apply that as follows:
67 |
68 | ```python
69 | >>> import skbio
70 | >>> # the following sequence is NCBI reference sequence NM_005368.3
71 | ... protein = skbio.DNA(
72 | ... "AAACCCCAGCTGTTGGGGCCAGGACACCCAGTGAGCCCATACTTGCTCTTTTTGTCTTCTTCAGACTGCGCCATGGG"
73 | ... "GCTCAGCGACGGGGAATGGCAGTTGGTGCTGAACGTCTGGGGGAAGGTGGAGGCTGACATCCCAGGCCATGGGCAGG"
74 | ... "AAGTCCTCATCAGGCTCTTTAAGGGTCACCCAGAGACTCTGGAGAAGTTTGACAAGTTCAAGCACCTGAAGTCAGAG"
75 | ... "GACGAGATGAAGGCGTCTGAGGACTTAAAGAAGCATGGTGCCACCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAA"
76 | ... "GAAGGGGCATCATGAGGCAGAGATTAAGCCCCTGGCACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACC"
77 | ... "TGGAGTTCATCTCGGAATGCATCATCCAGGTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGGGG"
78 | ... "GCCATGAACAAGGCCCTGGAGCTGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCAGGGCTAGGC"
79 | ... "CCCTGCCGCTCCCACCCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTGTAGCCATATAG"
80 | ... "AGTTTGCTTCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCTGGGGCTGGGGTGTTGAAG"
81 | ... "TTGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCACCCTGGGAACCGGGAGTGGCCCTTGGCTCAC"
82 | ... "TGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTCTTCTAAATCCCAACCGAACTTCTTCCAACCTCCAAAC"
83 | ... "TGGCTGTAACCCCAAATCCAAGCCATTAACTACACCTGACAGTAGCAATTGTCTGATTAATCACTGGCCCCTTGAAG"
84 | ... "ACAGCAGAATGTCCCTTTGCAATGAGGAGGAGATCTGGGCTGGGCGGGCCAGCTGGGGAAGCATTTGACTATCTGGA"
85 | ... "ACTTGTGTGTGCCTCCTCAGGTATGGCAGTGACTCACCTGGTTTTAATAAAACAACCTGCAACATCTCA"
86 | ... ).translate(stop='require')
87 | ```
88 |
89 | The above step translated a DNA sequence to a protein sequence, but it didn't show the result. To view a nicely formatted summary of the result, we can view a representation of the variable as follows:
90 |
91 | ```python
92 | >>> protein
93 | Protein
94 | ---------------------------------------------------------------------
95 | Stats:
96 | length: 178
97 | has gaps: False
98 | has degenerates: False
99 | has non-degenerates: True
100 | has stops: False
101 | ---------------------------------------------------------------------
102 | 0 KPQLLGPGHP VSPYLLFLSS SDCAMGLSDG EWQLVLNVWG KVEADIPGHG QEVLIRLFKG
103 | 60 HPETLEKFDK FKHLKSEDEM KASEDLKKHG ATVLTALGGI LKKKGHHEAE IKPLAQSHAT
104 | 120 KHKIPVKYLE FISECIIQVL QSKHPGDFGA DAQGAMNKAL ELFRKDMASN YKELGFQG
105 | ```
106 |
107 | If we just want the protein sequence itself, we can call `print` on the variable. How might you determine what this sequence is?
108 |
109 | ```python
110 | >>> print(protein)
111 | ```
112 |
113 | Genomes contain messages other than protein sequences, so in reality the messages encoded by DNA in our genomes are more complex than a base 4 numerical system. For example, the structure that a chromosomal region adopts can impact whether genes in that region are expressed or not, which can have profound phenotypic impacts. This is a higher-level message that is encoded in our genomes. So, while in some ways we can relate the information contained in our genomes to the way information is stored in computers, our genomes are not just programs that are executed. Even the simplest cellular organisms are far more complex than the most complex machines of humankind! "An airplane is nothing if you compare it to a pelican," observed Herman Dune.
114 |
115 | Also ignored in this discussion is that additional characters are sometimes used to represent ambiguity in our knowledge of a DNA sequence, or to concisely represent more than one sequence. The [IUPAC nucleic acid notation](https://en.wikipedia.org/wiki/Nucleic_acid_notation) is what we've been using in this chapter, where A, C, G, and T represent adenine, cytosine, guanine, and thymine, respectively. Other characters are defined in this notation. For example, N is defined to mean either A, C, G, or T, and is thus commonly encountered in readouts of DNA sequences at positions where the base couldn't be determined. These _degenerate_ characters couldn't be represented in the base 4 numerical system we've been discussing here. They also don't exist in nature - we just use them to talk about DNA sequences.
116 |
117 | ## Quantifying information
118 |
119 | Information is a quantifiable concept, an idea that has its roots in Boolean algebra and in Claude Shannon's work on Information Theory. The most basic unit of information is the _binary digit_, or _bit_, which has two possible states. Depending on the domain, the symbols representing these two states might be `0` and `1`, `yes` and `no`, `+` and `-`, `true` and `false`, or `on` and `off`. When you answer a "yes/no" question in a conversation, or a "true/false" question on an exam, you're providing one bit of information.
120 |
121 | _Information_ is technically defined as a sequence of symbols that can be interpreted as a message. To put these terms in the context of our examples above, our _message_ is a decimal number, our _symbols_ are 0 and 1, and the sequence is the ordered collection of symbols, such as `011`. The number of places (let's call that $p$), and the number of symbols (let's call that $n_{symbols}$) define the number of different messages ($n_{messages}$) that can be encoded as: $n_{messages} = n_{symbols}^p$.
122 |
123 | Let's apply this formula to determine how many messages can be sent with one byte of information:
124 |
125 | ```python
126 | >>> n_symbols = 2 # we'll use the two available binary numbers, 0 and 1
127 | >>> p = 8 # because there are 8 places, or bits, in a byte
128 | ...
129 | >>> print(n_symbols**p)
130 | ```
131 |
132 | Since bases in a DNA sequence are represented with four characters, each position in a sequence contains two bits of information. We know this because we could represent all four bases using two places in a binary number. (For example, 00 could represent A, 01 could represent C, 10 could represent G, and 11 could represent T. These assignments of binary numbers to DNA bases are arbitrary.) In other words, if we have two symbols and two places, we can send four messages ($2^2=4$), so one base of DNA represents 2 bits of information. A DNA sequence that is 100 bases long would therefore contain 200 bits of information.
133 |
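134 | More generally, if we send a message using a numerical system with $s$ symbols, and our message is $p$ places long, the number of bits that are sent would be $n$ in the following equation: $s^p = 2^n$. We could solve for $n$ as: $n = \log_{2}(s^p) = p \log_{2}s$. For example, a DNA sequence ($s = 4$) that is 100 bases long ($p = 100$) contains $100 \times \log_{2}4 = 200$ bits of information.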
135 |
136 | ## The genetic code
137 |
138 | As mentioned above, the genetic code describes the mapping of codons to amino acids. This mapping is embodied in an organism's [transfer RNA (tRNA)](http://pdb101.rcsb.org/motm/15) molecules. As illustrated in Figure 3, one end of the folded tRNA contains the "anticodon loop", which is the complementary sequence to the mRNA's codon. On the other end of the tRNA is the acceptor stem, which contains the amino acid attachment site. Through interaction with an [aminoacyl tRNA synthetase](https://pdb101.rcsb.org/motm/16), the amino acid corresponding to the anticodon is covalently linked to the acceptor stem. During translation, a tRNA's anticodon pairs with a codon in a messenger RNA (mRNA) inside the ribosome and thereby provides the next amino acid needed for protein synthesis.
139 |
140 |
141 |
142 | Figure 3: The secondary and tertiary structure of a transfer RNA (tRNA) molecule. (Figure attribution: This image was obtained from OpenStax Microbiology, a free microbiology text book, and is licensed under CC-BY. OpenStax Microbiology can be accessed for free at https://openstax.org/books/microbiology/pages/1-introduction.)
143 |
144 |
145 |
146 | Figure 3 also illustrates two different views of the structure of a tRNA molecule. The secondary ($2^{\circ}$) structure is presented on the left, and the tertiary ($3^{\circ}$) structure is presented on the right. For a nucleic acid molecule, like tRNA, the secondary structure refers to the base pairing interactions in a folded molecule. The tertiary structure refers to the three-dimensional structure that the molecule takes inside of an organism. The primary ($1^{\circ}$) structure, which isn't illustrated here, refers to the linear sequence of nucleotides in the tRNA molecule. The primary structure of a phenylalanine tRNA (i.e., tRNAPhe) from yeast in [FASTA format](https://en.wikipedia.org/wiki/FASTA_format), for example, is as follows.
147 |
148 | ```
149 | >4TNA:A|PDBID|CHAIN|SEQUENCE
150 | GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA
151 | ```
152 |
153 | Each level of structure contains some information about the other levels. For example, if we were to examine the primary structure of tRNAPhe, we might find that there are stretches of certain bases that could form stable base pairing interactions with each other. That could help us to make a prediction about the secondary structure of tRNAPhe. We could use that information, along with knowledge about the physics of nucleic acid molecules to make predictions about how the molecule would fold inside of a cell (i.e., its tertiary structure). If we knew the tertiary structure of a molecule with similar primary structure to tRNAPhe, that information would also be very helpful in predicting the tertiary structure of tRNAPhe because we expect nucleic acids or proteins with similar primary structures to also have similar secondary and tertiary structures, though this isn't always the case.
154 |
155 | The primary and secondary structure of a molecule alone doesn't currently allow us to make perfect predictions about the tertiary structure that a molecule will adopt. Personally, I think additional information is needed so this type of prediction won't ever be entirely reliable. But this is a classic problem in bioinformatics.
156 |
157 | ## Summary
158 |
159 | In this section we explored different numerical systems, and discussed relationships between how computers and organisms represent information. In the next section we'll dive into the fundamentals of bioinformatics.
160 |
--------------------------------------------------------------------------------
/book/getting-started/images/central-dogma.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/central-dogma.png
--------------------------------------------------------------------------------
/book/getting-started/images/genetic-code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/genetic-code.png
--------------------------------------------------------------------------------
/book/getting-started/images/greg-in-telluride.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/greg-in-telluride.png
--------------------------------------------------------------------------------
/book/getting-started/images/trna.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/trna.png
--------------------------------------------------------------------------------
/book/getting-started/index.md:
--------------------------------------------------------------------------------
1 | # Getting started
2 |
--------------------------------------------------------------------------------
/book/getting-started/index.yaml:
--------------------------------------------------------------------------------
1 | contents:
2 | - reading-iab
3 | - biological-information
4 |
--------------------------------------------------------------------------------
/book/getting-started/reading-iab.md:
--------------------------------------------------------------------------------
1 | # Reading An Introduction to Applied Bioinformatics
2 |
3 | **Bioinformatics, as I see it, is the application of the tools of computer science (things like programming languages, algorithms, and databases) to address biological problems (for example, inferring the evolutionary relationship between a group of organisms based on fragments of their genomes, or understanding if or how the community of microorganisms that live in my gut changes if I modify my diet).** Bioinformatics is a rapidly growing field, largely in response to the vast increase in the quantity of data that biologists now grapple with. Students from varied disciplines (e.g., biology, computer science, statistics, and biochemistry) and stages of their educational careers (undergraduate, graduate, or postdoctoral) are becoming interested in bioinformatics.
4 |
5 | *An **I**ntroduction to **A**pplied **B**ioinformatics*, or **IAB**, is a free, open access bioinformatics text available at http://readIAB.org. **It introduces readers to the core concepts of bioinformatics in the context of their implementation and application to real-world problems and data.** IAB makes extensive use of the [scikit-bio](http://www.scikit-bio.org) Python package, which provides production-ready implementations of core bioinformatics algorithms and data structures. As readers are learning a concept, for example, pairwise sequence alignment, they are presented with its scikit-bio implementation directly in the text. scikit-bio code is well annotated (adhering to the [pep8](https://www.python.org/dev/peps/pep-0008/) and [numpydoc](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt) conventions), so readers can use it to assist with their understanding of the concept. Readers of IAB also therefore learn the concepts in the context of tools they can use to develop their own bioinformatics software and pipelines, enabling them to rapidly get started on their own projects. While some theory is discussed, the focus of IAB is on what readers need to know to be effective, practicing bioinformaticians.
6 |
7 | IAB is **completely open access**, with all software being BSD-licensed, and all text being licensed under Creative Commons Attribution-NonCommercial-ShareAlike 4.0 (i.e., CC BY-NC-SA 4.0). All development and publication is coordinated under [public revision control](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics).
8 |
9 | [My](alias://fedd13) goal for IAB is for it to make bioinformatics as accessible as possible to students from varied backgrounds, and to get more and diverse people into this hugely exciting field. I'm very interested in hearing from readers and instructors who are using IAB, so get in touch if you have corrections, suggestions for how to improve the content, or any other thoughts or comments on the text. In the spirit of openness, I'd prefer to be contacted via the [IAB issue tracker](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/issues/). I'll respond to direct e-mail as well, but I'm often backlogged on e-mail (just ask my students), so e-mail responses are likely to be slower.
10 |
11 | I hope you find IAB useful, and that you enjoy reading it!
12 |
13 | ## Who should read IAB?
14 |
15 | IAB is written for scientists, software developers, and students interested in understanding and applying bioinformatics methods, and ultimately in developing their own bioinformatics analysis pipelines or software.
16 |
17 | IAB was initially developed for an undergraduate course cross-listed in computer science and biology with no pre-requisites. It therefore assumes little background in biology or computer science; however, some basic background is very helpful. For example, an understanding of the roles of and relationship between DNA and protein in a cell, and the ability to read and follow well-annotated python code, are both helpful (but not necessary) to get started.
18 |
19 | In the *Getting started with [Biology](alias://cf88ac) and [Computer Science](alias://6ad7e1)* sections below I provide some suggestions for other texts that will help you to get started.
20 |
21 | ## How to read IAB
22 |
23 | IAB can be read interactively as a series of [Jupyter Notebooks](https://jupyter.org) or read statically. Due to popular demand, a print version may ultimately be available for a fee, but the full and most recent version of IAB will always be available for free on the [project website](http://readIAB.org). The recommended way to read IAB is interactively as this allows readers to execute code directly in the text. For example, when learning pairwise alignment, users can align sequences provided in IAB (or their own sequences) and modify parameters (or even the algorithm itself) to see how changes affect the resulting alignments.
24 |
25 | IAB is constantly being updated. As I teach with it, I will often update text or add new chapters in an effort to keep up with advances in the field. The project website contains the most up-to-date recommendations on how to read IAB or teach with IAB, including strategies for dealing with changing content. (For example, if you're teaching with IAB, you can fork the IAB repository and only pull updates into your fork when you're ready for them. If _forking repositories_ and _pulling updates_ are terms that don't mean anything to you right now, you can safely ignore this!)
26 |
27 | IAB is split into four different sections: *Getting started*, *Fundamentals*, *Applications*, and *Wrapping up*. You should start reading IAB by working through the *Getting started* and *Fundamentals* chapters in order. You should then read the *Applications* chapters and *Wrapping up* in any order, based on your own interest.
28 |
29 | ## Using Jupyter Notebooks to read IAB interactively
30 |
31 | IAB can be read interactively as a series of Jupyter Notebooks. The main source for information about Jupyter Notebooks is the [Jupyter website](https://jupyter.org). You can find information there on how to use Jupyter Notebooks as well as setting up and running a Jupyter Notebook server (for example, if you'd like to make one available to your students).
32 |
33 | Most of the code that is used in IAB comes from the [scikit-bio](http://scikit-bio.org) package, or other Python scientific computing tools. You can access these in the same way that you would in a Python script. For example:
34 |
35 | ```python
36 | >>> import skbio
37 | >>> from IPython.core import page
38 | >>> page.page = print
39 | ```
40 |
41 | We can then access functions, variables, and classes from these modules.
42 |
43 | ```python
44 | >>> print(skbio.title)
45 | >>> print(skbio.art)
46 | ```
47 |
48 | We'll inspect a lot of source code in IAB as we explore bioinformatics algorithms. If you're ever interested in seeing the source code for some functionality that we're using, you can do that using Jupyter's ``psource`` magic.
49 |
50 | ```python
51 | >>> from skbio.alignment import TabularMSA
52 | >>> %psource TabularMSA.conservation
53 | ```
54 |
55 | The documentation for scikit-bio is also very extensive. You can view the documentation for the `TabularMSA` object, for example, [here](http://scikit-bio.org/docs/latest/generated/skbio.alignment.TabularMSA.html). These documents will be invaluable for learning how to use the objects.
56 |
57 | ## Reading list
58 |
59 | ### Getting started with Biology
60 |
61 | If you're new to biology, these are some books and resources that will help you get started.
62 |
63 | * [The Processes of Life](http://amzn.to/1P0dc2E) by Lawrence Hunter. *An introduction to biology for computer scientists.*
64 |
65 | * The [NIH Bookshelf](http://www.ncbi.nlm.nih.gov/books/) A lot of free biology texts, some obviously better than others.
66 |
67 | ### Getting started with Computer Science and programming
68 |
69 | If you're new to Computer Science and programming, these are some books and resources that will help you get started.
70 |
71 | * [Software Carpentry](http://www.software-carpentry.org) *Online resources for learning scientific computing skills, and regular in-person workshops all over the world. Taking a Software Carpentry workshop **will** pay off for biology students interested in a career in research.*
72 |
73 | * [Practical Computing for Biologists](http://amzn.to/1Ukx5S6) by Steven Haddock and Casey Dunn. *A great introduction to many computational skills that are required of modern biologists. I **highly** recommend this book to all Biology undergraduate and graduate students.*
74 |
75 | * [Practical Programming: An Introduction to Computer Science Using Python](http://amzn.to/1P0dmqM) by Jennifer Campbell, Paul Gries, Jason Montojo, Greg Wilson. *An introduction to the Python programming language and basic computer science. This is a great first programming book for people interested in bioinformatics or scientific computing in general.*
76 |
77 | * [The Pragmatic Programmer](http://amzn.to/1P0dl6i) by Andrew Hunt. *A more advanced book on becoming a better programmer. This book is excellent, and I highly recommend it for anyone developing bioinformatics software. You should know how to program and have done some software development before jumping into this.*
78 |
79 | ### Philosophy of biology and popular science books
80 |
81 | These are some books that I've enjoyed, that have also helped me think about biological systems. These are generally written for a more popular audience, so should be accessible to any readers of *An Introduction to Applied Bioinformatics*.
82 |
83 | * [The Selfish Gene](http://amzn.to/1UkyQ1R) by Richard Dawkins.
84 |
85 | * [Ever Since Darwin](http://amzn.to/1Ukzdt7) by Stephen Jay Gould. *This is the first book in a series of collections of short essays.*
86 |
87 | * [The Demon Haunted World](http://amzn.to/1UkyIzi) by Carl Sagan.
88 |
89 | * [Sex and Death](http://amzn.to/1UkySXg) by Kim Sterelny.
90 |
91 | * [Gödel, Escher, Bach](http://amzn.to/1UkzxYL) by Douglas Hofstadter.
92 |
93 | ## Need help?
94 |
95 | If you're having issues getting *An Introduction to Applied Bioinformatics* running on your computer, or you have corrections or suggestions on the content, you should get in touch through the [IAB issue tracker](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/issues). This will generally be much faster than e-mailing the author directly, as there are multiple people who monitor the issue tracker. It also helps us manage our technical support load if we can consolidate all requests and responses in one place.
96 |
97 | ## Contributing and Code of Conduct
98 |
99 | If you're interested in contributing content or features to IAB, you should start by reviewing the project's [Code of Conduct](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/blob/master/CODE-OF-CONDUCT.md) and [Contributing Guide](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/blob/master/CONTRIBUTING.md).
100 |
101 | ## Acknowledgements
102 |
103 | *An Introduction to Applied Bioinformatics* was funded in part by the [Alfred P. Sloan Foundation](http://www.sloan.org). Initial prototyping was funded by [Arizona's Technology and Research Initiative Fund](http://nau.edu/Research/Funding/Technology-Research-Initiative-Fund/). The style of the project was inspired by [Bayesian Methods for Hackers](http://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/).
104 |
105 | See the repository's [contributors page](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/graphs/contributors) for information on who has contributed to the project.
106 |
--------------------------------------------------------------------------------
/book/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/images/logo.png
--------------------------------------------------------------------------------
/book/index.md:
--------------------------------------------------------------------------------
1 |
2 | # An Introduction To Applied Bioinformatics
3 |
--------------------------------------------------------------------------------
/book/index.yaml:
--------------------------------------------------------------------------------
1 | contents:
2 | - getting-started
3 | - fundamentals
4 | - applications
5 | - exercises
6 | - back-matter
7 |
--------------------------------------------------------------------------------
/custom.css:
--------------------------------------------------------------------------------
1 | /* Custom styling to help align things, and correct section numbers */
2 |
3 | h4, h5, h6 {
4 | font-size: 120% !important;
5 | }
6 |
7 | .iab-edit {
8 | float: right;
9 | font-size: 14px;
10 | }
11 |
12 | .anchor-link {
13 | display: none !important;
14 | }
15 |
16 | /* from SO: https://stackoverflow.com/a/4098296/579416 */
17 | .cell ol {
18 | counter-reset: item;
19 | }
20 | .cell li {
21 | display: block;
22 | }
23 | .cell li:before {
24 | content: counters(item, ".") " ";
25 | counter-increment: item ;
26 | }
27 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - etetoolkit
3 | - conda-forge
4 | - defaults
5 | dependencies:
6 | - python=3.6
7 | - pip
8 | - notebook
9 | - scikit-bio >= 0.5.5, < 0.6.0
10 | - networkx = 2.3.0
11 | - ete3
12 | - ete_toolchain
13 | - pip:
14 | - https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/archive/master.zip
15 |
--------------------------------------------------------------------------------
/iab/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -----------------------------------------------------------------------------
4 | # This work is licensed under the Creative Commons
5 | # Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
6 | # copy of this license, visit
7 | # http://creativecommons.org/licenses/by-nc-sa/4.0/.
8 | # -----------------------------------------------------------------------------
9 |
10 | import pkg_resources
11 |
# Report the version of the installed distribution as the package version.
# The distribution name is the one passed as ``name`` to ``setup()`` in
# setup.py.
__version__ = pkg_resources.get_distribution(
    'An-Introduction-To-Applied-Bioinformatics').version
14 |
--------------------------------------------------------------------------------
/iab/format/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/iab/format/__init__.py
--------------------------------------------------------------------------------
/iab/format/dialog_box.py:
--------------------------------------------------------------------------------
1 | import markdown2
2 | from IPython.display import HTML
3 |
4 |
5 | def make_box(section_text='', header_background_color='none',
6 | header_text='', header_text_color='none',
7 | icon='', section_background_color='none',
8 | section_text_color='none', style=''):
9 | """Generic function to create dialog box.
10 |
11 | This is a generic function that displays a string as HTML that is
12 | displayed inline in an IPython notebook. The HTML is automatically
13 | displayed when the function is called.
14 |
15 | Parameters
16 | ----------
17 | section_text : str
18 | The text that will be displayed in the main section of the dialog
19 | box
20 | header_background_color : str
21 | The color of the header background.
22 | header_text : str
23 | The text to be displayed in the header.
24 | header_text_color : str
25 | The color of the text in the header.
26 | icon : str
27 | The tag for the image or icon that will be displayed next to the
28 | header text.
29 | section_background_color : str
30 | The background color of the main text section.
31 | section_text_color : str
32 | The color of the main section text.
33 | style : str
34 | The name of the style used in the css class. This prevents the dialog
35 | box from overwriting other HTML objects and should therefore be
36 | unique.
37 |
38 | Returns
39 | -------
40 | IPython.core.display.HTML
41 | IPython object that displays the HTML inline in the IPython notebook
42 |
43 | Examples
44 | --------
45 | >>> from iab.format.dialog_box import make_box
46 | >>> make_box('foo',\
47 | header_background_color='#000',\
48 | header_text='Developer Note',\
49 | header_text_color='#76EE00',\
50 | icon="",\
52 | section_background_color='#e5e5e5',\
53 | section_text_color='#000',\
54 | style='developer_note')
55 |
56 |
57 |
58 | """
59 | return HTML("""
60 |
61 |
63 |
96 |
97 |
98 |
109 | """ % {"style": style,
110 | "header_background_color": header_background_color,
111 | "header_text_color": header_text_color,
112 | "section_background_color": section_background_color,
113 | "section_text_color": section_text_color,
114 | "icon": icon,
115 | "header_text": header_text,
116 | "section_text": markdown2.markdown(section_text)})
117 |
118 |
def link(section_text):
    """Render ``section_text`` inside an "Additional Resources" dialog box.

    Thin wrapper around ``make_box`` that fixes the header text, the blue
    colour scheme and the ``link_box`` style name.

    Parameters
    ----------
    section_text : str
        Text passed to ``make_box`` as the main section of the box.

    Returns
    -------
    Whatever ``make_box`` returns for these settings.
    """
    box_settings = {
        'header_text': 'Additional Resources',
        'header_text_color': '#fff',
        'header_background_color': 'dodgerblue',
        'section_text_color': 'dodgerblue',
        'section_background_color': '#e8f3ff',
        'icon': '',
        'style': 'link_box',
    }
    return make_box(section_text=section_text, **box_settings)
128 |
129 |
def warning(section_text):
    """Render ``section_text`` inside a "Warning!" dialog box.

    Thin wrapper around ``make_box`` that fixes the header text, the
    yellow/dark-red colour scheme and the ``warning_box`` style name.

    Parameters
    ----------
    section_text : str
        Text passed to ``make_box`` as the main section of the box.

    Returns
    -------
    Whatever ``make_box`` returns for these settings.
    """
    box_settings = {
        'header_text': 'Warning!',
        'header_text_color': 'darkred',
        'header_background_color': '#FFCC00',
        'section_text_color': 'darkred',
        'section_background_color': '#FFF9E5',
        'icon': '',
        'style': "warning_box",
    }
    return make_box(section_text=section_text, **box_settings)
139 |
140 |
def additional_info(section_text):
    """Render ``section_text`` inside an "Additional Information" dialog box.

    Thin wrapper around ``make_box`` that fixes the header text, the purple
    colour scheme and the ``additional_box`` style name.

    Parameters
    ----------
    section_text : str
        Text passed to ``make_box`` as the main section of the box.

    Returns
    -------
    Whatever ``make_box`` returns for these settings.
    """
    box_settings = {
        'header_text': 'Additional Information',
        'header_text_color': '#fff',
        'header_background_color': '#590059',
        'section_text_color': '#590059',
        'section_background_color': '#eee5ee',
        'icon': '',
        'style': 'additional_box',
    }
    return make_box(section_text=section_text, **box_settings)
150 |
151 |
def developer_note(section_text):
    """Render ``section_text`` inside a "Developer Note" dialog box.

    Thin wrapper around ``make_box`` that fixes the header text, the
    black/green header colours, the grey section colours and the
    ``developer_note`` style name.

    Parameters
    ----------
    section_text : str
        Text passed to ``make_box`` as the main section of the box.

    Returns
    -------
    Whatever ``make_box`` returns for these settings.
    """
    # NOTE(review): ``icon`` is an empty triple-quoted string as written
    # here -- confirm that no icon markup was intended.
    return make_box(section_text=section_text,
                    header_background_color='#000',
                    header_text='Developer Note',
                    header_text_color='#76EE00',
                    icon="""""",
                    section_background_color='#e5e5e5',
                    section_text_color='#000',
                    style='developer_note')
162 |
--------------------------------------------------------------------------------
/licenses/runipy.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013, Paul Butler
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice, this
10 | list of conditions and the following disclaimer in the documentation and/or
11 | other materials provided with the distribution.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
16 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
17 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
18 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
19 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
21 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
22 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 |
--------------------------------------------------------------------------------
/licenses/scikit-bio.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013--, scikit-bio development team.
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification,
5 | are permitted provided that the following conditions are met:
6 |
7 | * Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 |
10 | * Redistributions in binary form must reproduce the above copyright notice, this
11 | list of conditions and the following disclaimer in the documentation and/or
12 | other materials provided with the distribution.
13 |
14 | * Neither the name of the {organization} nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/paper.bib:
--------------------------------------------------------------------------------
1 |
2 | % Generated by Paperpile. Check out http://paperpile.com for more information.
3 | % BibTeX export options can be customized via Settings -> BibTeX.
4 |
5 | @BOOK{Dunn2010-ik,
6 | title = "Practical Computing for Biologists",
7 | author = "Dunn, Casey and Haddock, Steven HD",
8 | abstract = "Increasingly, scientists find themselves facing exponentially
9 | larger data sets and analyses without suitable tools to deal
10 | with them. Many biologists end up using spreadsheet programs for
11 | most of their data-processing tasks and spend hours clicking
12 | around or copying and pasting, and then repeating the process
13 | for other data files. Practical Computing for Biologists shows
14 | you how to use many freely available computing tools to work
15 | more powerfully and effectively. The book was born out of the
16 | authors' own experience in developing tools for their research
17 | and helping other biologists with their computational problems.
18 | Although many of the techniques are relevant to molecular
19 | bioinformatics, the motivation for the book is much broader,
20 | focusing on topics and techniques that are applicable to a range
21 | of scientific endeavors. Twenty-two chapters organized into six
22 | parts address these topics and more: Searching with regular
23 | expressions The Unix command line Python programming and
24 | debugging Creating and editing graphics Databases Performing
25 | analyses on remote servers Working with electronics While most
26 | of the concepts and examples apply to any operating system, the
27 | main narrative focuses on Mac OS X. Where there are differences
28 | for Windows and Linux users, parallel instructions are provided
29 | in the margin and in an appendix. The book is designed to be
30 | used as a self-guided resource for researchers, a companion book
31 | in a course, or as a primary textbook. Practical Computing for
32 | Biologists will free you from the most frustrating and
33 | time-consuming aspects of data processing so you can focus on
34 | the pleasures of scientific inquiry.",
35 | publisher = "Sinauer Associates, Inc.",
36 | edition = "First edition",
37 | month = nov,
38 | year = 2010,
39 | url = "http://practicalcomputing.org/"
40 | }
41 |
42 | @ARTICLE{Wilson2016-kh,
43 | title = "Software Carpentry: lessons learned",
44 | author = "Wilson, Greg",
45 | abstract = "Since its start in 1998, Software Carpentry has evolved from a
46 | week-long training course at the US national laboratories into a
47 | worldwide volunteer effort to improve researchers' computing
48 | skills. This paper explains what we have learned along the way,
49 | the challenges we now face, and our plans for the future.",
50 | journal = "F1000 Research",
51 | volume = 3,
52 | month = jan,
53 | year = 2016,
54 | doi = "10.12688/F1000RESEARCH.3-62.V2"
55 | }
56 |
57 | @ARTICLE{Searls2014-ac,
58 | title = "A new online computational biology curriculum",
59 | author = "Searls, David B",
60 | abstract = "A recent proliferation of Massive Open Online Courses (MOOCs) and
61 | other web-based educational resources has greatly increased the
62 | potential for effective self-study in many fields. This article
63 | introduces a catalog of several hundred free video courses of
64 | potential interest to those wishing to expand their knowledge of
65 | bioinformatics and computational biology. The courses are
66 | organized into eleven subject areas modeled on university
67 | departments and are accompanied by commentary and career advice.",
68 | journal = "PLoS Computational Biology",
69 | volume = 10,
70 | number = 6,
71 | pages = "e1003662",
72 | month = jun,
73 | year = 2014,
74 | doi = "10.1371/journal.pcbi.1003662"
75 | }
76 |
77 | @BOOK{Felsenstein2003-wm,
78 | title = "Inferring Phylogenies",
79 | author = "Felsenstein, Joseph",
80 | abstract = "Phylogenies (evolutionary trees) are basic to thinking about and
81 | analyzing differences between species. Statistical,
82 | computational, and algorithmic work on them has been ongoing for
83 | four decades, with great advances in understanding. Yet no book
84 | has summarized this work until now. Inferring Phylogenies
85 | explains clearly the assumptions and logic of making inferences
86 | about phylogenies, and using them to make inferences about
87 | evolutionary processes. It is an essential text and reference
88 | for anyone who wants to understand how phylogenies are
89 | reconstructed and how they are used. As phylogenies are inferred
90 | with various kinds of data, this book concentrates on some of
91 | the central ones: discretely coded characters, molecular
92 | sequences, gene frequencies, and quantitative traits. Also
93 | covered are restriction sites, RAPDs, and microsatellites.
94 | Inferring Phylogenies is intended for graduate-level courses,
95 | assuming some knowledge of statistics, mathematics (calculus and
96 | fundamental matrix algebra), molecular sequences, and
97 | quantitative genetics.",
98 | publisher = "Sinauer Associates",
99 | edition = "Second edition",
100 | month = sep,
101 | year = 2003
102 | }
103 |
104 | @BOOK{Durbin1998-ru,
105 | title = "Biological Sequence Analysis: Probabilistic Models of Proteins
106 | and Nucleic Acids",
107 | author = "Durbin, Richard and Eddy, Sean R and Krogh, Anders and
108 | Mitchison, Graeme",
109 | abstract = "Probabilistic models are becoming increasingly important in
110 | analyzing the huge amount of data being produced by large-scale
111 | DNA-sequencing efforts such as the Human Genome Project. For
112 | example, hidden Markov models are used for analyzing biological
113 | sequences, linguistic-grammar-based probabilistic models for
114 | identifying RNA secondary structure, and probabilistic
115 | evolutionary models for inferring phylogenies of sequences from
116 | different organisms. This book gives a unified, up-to-date and
117 | self-contained account, with a Bayesian slant, of such methods,
118 | and more generally to probabilistic methods of sequence
119 | analysis. Written by an interdisciplinary team of authors, it is
120 | accessible to molecular biologists, computer scientists, and
121 | mathematicians with no formal knowledge of the other fields, and
122 | at the same time presents the state of the art in this new and
123 | important field.",
124 | publisher = "Cambridge University Press",
125 | edition = "First edition",
126 | month = may,
127 | year = 1998,
128 | }
129 |
130 | @ARTICLE{Searls2012-ab,
131 | title = "An online bioinformatics curriculum",
132 | author = "Searls, David B",
133 | abstract = "Online learning initiatives over the past decade have become
134 | increasingly comprehensive in their selection of courses and
135 | sophisticated in their presentation, culminating in the recent
136 | announcement of a number of consortium and startup activities
137 | that promise to make a university education on the internet, free
138 | of charge, a real possibility. At this pivotal moment it is
139 | appropriate to explore the potential for obtaining comprehensive
140 | bioinformatics training with currently existing free video
141 | resources. This article presents such a bioinformatics curriculum
142 | in the form of a virtual course catalog, together with editorial
143 | commentary, and an assessment of strengths, weaknesses, and
144 | likely future directions for open online learning in this field.",
145 | journal = "PLoS Computational Biology",
146 | volume = 8,
147 | number = 9,
148 | pages = "e1002632",
149 | month = sep,
150 | year = 2012,
151 | doi = "10.1371/journal.pcbi.1002632"
152 | }
153 |
--------------------------------------------------------------------------------
/paper.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'An Introduction to Applied Bioinformatics: a free, open, and interactive text.'
3 | tags:
4 | - bioinformatics
5 | - python
6 | authors:
7 | - name: Evan Bolyen
8 | orcid: 0000-0002-5362-6782
9 | affiliation: 1
10 | - name: Jai Ram Rideout
11 | affiliation: 1
12 | - name: John Chase
13 | affiliation: 1
14 | - name: T. Anders Pitman
15 | affiliation: 1
16 | - name: Arron Shiffer
17 | affiliation: "1, 2"
18 | - name: Willow Mercurio
19 | affiliation: 1
20 | - name: Matthew R Dillon
21 | orcid: 0000-0002-7713-1952
22 | affiliation: 1
23 | - name: J Gregory Caporaso
24 | orcid: 0000-0002-8865-1670
25 | affiliation: "1, 2"
26 | affiliations:
27 | - name: Pathogen and Microbiome Institute, Northern Arizona University, Flagstaff, AZ, USA.
28 | index: 1
29 | - name: Department of Biological Sciences, Northern Arizona University, Flagstaff, AZ, USA.
30 | index: 2
31 | date: 17 July 2018
32 | bibliography: paper.bib
33 | ---
34 |
35 | # Summary
36 |
37 | _Statement of need_: Due to the increasing rate of biological data generation, bioinformatics is rapidly growing as a field and is now an essential part of scientific advances in human health and environmental sciences. Online and publicly accessible resources for learning bioinformatics exist (e.g., [Rosalind](http://rosalind.info), [@Searls2012-ab; @Searls2014-ac]), and there are excellent textbooks and courses in the area, some focused heavily on theory [@Felsenstein2003-wm; @Durbin1998-ru], and others geared toward learning specific skills such as Python programming or the Unix shell [@Dunn2010-ik; @Wilson2016-kh]. An Introduction to Applied Bioinformatics (IAB) is a free, online bioinformatics text that bridges the gap between theory and application by teaching fundamentals of bioinformatics in the context of their implementation, using an interactive framework based on highly relevant tools including Python 3, Jupyter Notebooks, and GitHub.
38 |
39 | IAB is geared toward students who are completely new to bioinformatics, though introductory courses (or books) in both Computer Science and Biology are useful prerequisites. IAB readers begin on the [project website](http://readIAB.org). While it is possible to view the content statically from this page, we recommend that readers work interactively by installing IAB. Readers progress through chapters that introduce fundamental topics, such as sequence homology searching and multiple sequence alignment, and present their Python 3 implementation. Because the content is presented in Jupyter Notebooks, students can edit and execute the code, for example to explore how changing k-word size or an alignment gap penalty might impact the results of a database search. The Python code that readers interact with is intended for educational purposes, where the implementation is made as simple as possible, sometimes at the cost of computational efficiency. Chapters therefore also include examples of performing the same analyses with [scikit-bio](http://scikit-bio.org), a production-quality bioinformatics Python 3 library. This enables a rapid transition from learning theory, or how an algorithm works, to applying techniques in a real-world setting.
40 |
41 | IAB additionally contains Wikipedia-style “Edit” links in each section of the text. When one of these links is followed, the reader is taken to the GitHub online editor where they can submit a pull request to modify content or code. Readers are therefore introduced to GitHub through a user-friendly web interface, and can begin building their GitHub activity history (commonly reviewed by bioinformatics hiring managers). Finally, every time a change is proposed via GitHub, all of the executable content of IAB is automatically tested. This continuous integration testing ensures that IAB example code remains functional as changes are introduced, solving an issue that plagues printed applied computational texts (for example because they describe an outdated software interface).
42 |
43 | IAB evolved from lecture materials developed by Dr. Caporaso for an introductory bioinformatics course targeted toward computer science and biology undergraduates (typically juniors or seniors) at Northern Arizona University. Since the early stages of its development, it has been used to teach at least ten courses and short (e.g., one day) bioinformatics workshops. As it became clear that the content and format were useful for teaching bioinformatics, Dr. Caporaso applied for and received grants from the Arizona Technology and Research Initiative and the Alfred P Sloan Foundation to further develop the resource. The content was originally written in Jupyter Notebooks, but as the project grew, it became difficult to maintain the notebooks and in particular to review submissions from others. The Jupyter Notebooks were transitioned to markdown files which are now the source for static HTML and Jupyter Notebook renderings of the content.
44 |
45 | The current version of IAB contains six chapters covering fundamental concepts and their applications. It is a dynamic resource that will be expanded, revised and updated over time. Its lifecycle is thus more similar to an active software project than a textbook: a practical approach to education in a rapidly changing field.
46 |
47 | # Acknowledgements
48 | An Introduction to Applied Bioinformatics is funded by a grant from the Alfred P Sloan Foundation to JGC. Initial prototyping was funded by a grant from the Arizona Technology and Research Initiative to JGC. EB, JC, and JGC had partial salary support from National Cancer Institute of the National Institutes of Health under the awards for the Partnership of Native American Cancer Prevention U54CA143924 (UACC) and U54CA143925 (NAU). We would like to thank the students in our courses, whose questions and feedback have shaped IAB.
49 |
50 | # Author Contributions
51 | JGC is the primary author of the IAB content. EB, JRR, and JC have developed components of the underlying framework. AP, AS, and WM have provided useful feedback on the content.
52 |
53 | # References
54 |
--------------------------------------------------------------------------------
/runipynbs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -----------------------------------------------------------------------------
4 | # This work is licensed under the Creative Commons
5 | # Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
6 | # copy of this license, visit
7 | # http://creativecommons.org/licenses/by-nc-sa/4.0/.
8 | # -----------------------------------------------------------------------------
9 |
10 | import logging
11 | import os
12 | import sys
13 |
14 | from runipy.notebook_runner import NotebookRunner, NotebookError
15 | import nbformat
16 |
17 | # Taken and modified from
18 | # https://github.com/paulgb/runipy/blob/master/runipy/main.py
19 | # See licenses/ directory for runipy license.
def main():
    """Execute the notebook(s) found at the path given on the command line.

    The first command-line argument is the start path; when it is absent
    the current directory is used. A file is passed straight to
    ``run_notebook``. A directory is walked in sorted order, skipping
    ``.git`` and ``.ipynb_checkpoints``, and every ``*.ipynb`` file found is
    passed to ``run_notebook``. Exits with status 1 when the start path does
    not exist.
    """
    skipped_dirs = ('.git', '.ipynb_checkpoints')

    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    target = sys.argv[1] if len(sys.argv) > 1 else '.'

    if not os.path.exists(target):
        logging.error("Directory or file '%s' does not exist.", target)
        sys.exit(1)

    if os.path.isfile(target):
        run_notebook(target)
        return

    for current_dir, subdirs, filenames in os.walk(target):
        # Assign through the slice so os.walk sees the pruned, sorted list
        # and neither descends into skipped directories nor visits the
        # rest out of order.
        subdirs[:] = [d for d in sorted(subdirs) if d not in skipped_dirs]

        for filename in sorted(filenames):
            if not filename.endswith('.ipynb'):
                continue
            run_notebook(os.path.normpath(os.path.join(current_dir,
                                                       filename)))
50 |
51 |
def run_notebook(nbpath):
    """Read the notebook at ``nbpath`` and execute it with runipy.

    The notebook is read as nbformat version 3 and handed to a
    ``NotebookRunner``. If execution raises ``NotebookError``, the failure
    is logged and the process exits with status 1.

    Parameters
    ----------
    nbpath : str
        Path to the ``.ipynb`` file to execute.
    """
    logging.info("Reading notebook '%s'", nbpath)
    # Notebook files are UTF-8 encoded JSON; state the encoding explicitly
    # instead of relying on the platform's locale default.
    with open(nbpath, encoding='utf-8') as nbfile:
        notebook = nbformat.read(nbfile, as_version=3)

    runner = NotebookRunner(notebook)

    try:
        runner.run_notebook()
    except NotebookError:
        logging.error("An error occurred while executing notebook '%s'. "
                      "Exiting with nonzero exit status", nbpath)
        sys.exit(1)
65 |
66 |
# Run the notebooks only when this file is executed as a script, not when
# it is imported.
if __name__ == '__main__':
    main()
69 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # -----------------------------------------------------------------------------
4 | # This work is licensed under the Creative Commons
5 | # Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
6 | # copy of this license, visit
7 | # http://creativecommons.org/licenses/by-nc-sa/4.0/.
8 | # -----------------------------------------------------------------------------
9 |
10 | __version__ = '0.1.4.dev0'
11 |
12 | from setuptools import find_packages, setup
13 | import sys
14 |
# The build itself currently completes under Python 2, but the notebooks
# won't work there, so stop early on anything other than Python 3.
python_version = sys.version_info[0]
if not python_version == 3:
    sys.exit("IAB can only be used with Python 3. "
             "You are currently running Python %d." % python_version)
22 |
# Trove classifiers, one per line; blank lines are dropped below.
classes = """
Development Status :: 1 - Planning
Framework :: IPython
Intended Audience :: Developers
Intended Audience :: Education
Intended Audience :: Science/Research
Natural Language :: English
Operating System :: MacOS :: MacOS X
Operating System :: POSIX
Operating System :: Unix
Programming Language :: Python
Programming Language :: Python :: 3.5
Topic :: Scientific/Engineering
Topic :: Scientific/Engineering :: Bio-Informatics
"""
# Keep every non-empty line and strip its surrounding whitespace.
classifiers = list(map(str.strip, filter(None, classes.split('\n'))))
39 |
# Short summary used for both ``description`` and ``long_description`` in
# the package metadata (typo "lessions" corrected to "lessons").
description = ("An Introduction To Applied Bioinformatics (IAB): "
               "Interactive lessons in bioinformatics.")
42 |
# Package metadata. ``name`` is the distribution name that iab/__init__.py
# looks up to obtain ``__version__``; the same ``description`` text is
# reused as the long description.
setup(name='An-Introduction-To-Applied-Bioinformatics',
      version=__version__,
      license='CC BY-NC-SA 4.0',
      description=description,
      long_description=description,
      author='Greg Caporaso',
      author_email='gregcaporaso@gmail.com',
      maintainer='Greg Caporaso',
      maintainer_email='gregcaporaso@gmail.com',
      url='http://readIAB.org',
      packages=find_packages(),
      # The scikit-bio and networkx pins match those in environment.yml.
      install_requires=['scikit-bio >= 0.5.5, < 0.6.0',
                        'jupyter', 'seaborn',
                        'qiime-default-reference >= 0.1.3, < 0.2.0',
                        'pandas',
                        'markdown2 >= 2.3.0',
                        'tabulate',
                        'networkx == 2.3.0',
                        'ete3',
                        'ipymd >= 0.1.2'],
      classifiers=classifiers)
64 |
--------------------------------------------------------------------------------