├── .gitignore ├── .travis.yml ├── CODE-OF-CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile.template ├── ISSUE_TEMPLATE.md ├── LICENSE ├── README.md ├── book ├── applications │ ├── biological-diversity.md │ ├── images │ │ ├── basic-unrooted-tree1.jpg │ │ ├── example_big_dm.png │ │ ├── unifrac_tree_d0.5.graffle │ │ ├── unifrac_tree_d0.5.png │ │ ├── unifrac_tree_d0.graffle │ │ ├── unifrac_tree_d0.png │ │ ├── unifrac_tree_d1.graffle │ │ ├── unifrac_tree_d1.png │ │ ├── unifrac_tree_with_distances.graffle │ │ ├── unifrac_tree_with_distances.png │ │ ├── unifrac_tree_with_distances_ab.graffle │ │ └── unifrac_tree_with_distances_ab.png │ ├── index.md │ └── index.yaml ├── back-matter │ ├── about-the-author.md │ ├── glossary.md │ ├── index.md │ └── index.yaml ├── config.yaml ├── exercises │ ├── index.md │ ├── index.yaml │ ├── multiple-sequence-alignment.md │ └── pairwise-alignment.md ├── fundamentals │ ├── database-searching.md │ ├── images │ │ ├── Darwins_tree_of_life_1859.png │ │ ├── Pace_Big_Tree.png │ │ ├── alignment.graffle │ │ ├── alignment.png │ │ ├── basic-rooted-tree1.jpg │ │ ├── basic-unrooted-tree1.jpg │ │ ├── centroid-cluster.1.png │ │ ├── centroid-cluster.2.png │ │ ├── centroid-cluster.3.png │ │ ├── centroid-cluster.4.png │ │ ├── centroid-cluster.5.png │ │ ├── centroid-cluster.6.png │ │ ├── centroid-cluster.graffle │ │ ├── cluster-legend.graffle │ │ ├── cluster-legend.png │ │ ├── cluster-template.graffle │ │ │ ├── data.plist │ │ │ └── image1.pdf │ │ ├── cluster-types.graffle │ │ │ ├── data.plist │ │ │ └── image2.pdf │ │ ├── cluster-types.png │ │ ├── furthest-neighbor.1.png │ │ ├── furthest-neighbor.2.png │ │ ├── furthest-neighbor.3.png │ │ ├── furthest-neighbor.4.png │ │ ├── furthest-neighbor.5.png │ │ ├── furthest-neighbor.6.png │ │ ├── furthest-neighbor.graffle │ │ │ ├── data.plist │ │ │ └── image1.pdf │ │ ├── msa-tree-a1.graffle │ │ ├── msa-tree-a1.png │ │ ├── msa-tree-a2.graffle │ │ ├── msa-tree-a2.png │ │ ├── msa-tree-a3.graffle │ │ ├── msa-tree-a3.png │ │ ├── 
msa-tree-final.graffle │ │ ├── msa-tree-final.png │ │ ├── msa-tree-input.graffle │ │ ├── msa-tree-input.png │ │ ├── nearest-neighbor.1.png │ │ ├── nearest-neighbor.2.png │ │ ├── nearest-neighbor.3.png │ │ ├── nearest-neighbor.4.png │ │ ├── nearest-neighbor.5.png │ │ ├── nearest-neighbor.6.png │ │ ├── nearest-neighbor.graffle │ │ ├── sequence-evo-tree.graffle │ │ ├── sequence-evo-tree.png │ │ ├── spider-tree.png │ │ ├── tree-monophyly.graffle │ │ ├── tree-monophyly.png │ │ ├── tree-polyphyly.graffle │ │ ├── tree-polyphyly.png │ │ ├── tree-schematic1.graffle │ │ ├── tree-schematic1.png │ │ ├── upgma-tree-final.graffle │ │ ├── upgma-tree-final.png │ │ ├── upgma-tree-iter1.graffle │ │ ├── upgma-tree-iter1.png │ │ ├── upgma-tree-iter2.graffle │ │ ├── upgma-tree-iter2.png │ │ ├── upgma-tree-iter3.graffle │ │ └── upgma-tree-iter3.png │ ├── index.md │ ├── index.yaml │ ├── machine-learning.md │ ├── multiple-sequence-alignment.md │ ├── pairwise-alignment.md │ ├── phylogeny-reconstruction.md │ └── sequence-mapping-and-clustering.md ├── getting-started │ ├── biological-information.md │ ├── images │ │ ├── central-dogma.png │ │ ├── genetic-code.png │ │ ├── greg-in-telluride.png │ │ └── trna.png │ ├── index.md │ ├── index.yaml │ └── reading-iab.md ├── images │ └── logo.png ├── index.md └── index.yaml ├── custom.css ├── environment.yml ├── iab ├── __init__.py ├── algorithms │ └── __init__.py ├── data │ └── __init__.py └── format │ ├── __init__.py │ └── dialog_box.py ├── licenses ├── runipy.txt └── scikit-bio.txt ├── paper.bib ├── paper.md ├── runipynbs.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | 
.coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # ipynb checkpoint files 34 | .ipynb_checkpoints 35 | 36 | # Mr Developer 37 | .mr.developer.cfg 38 | .project 39 | .pydevproject 40 | 41 | # vim 42 | .*.swp 43 | 44 | # macos 45 | .DS_Store 46 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Taken and modified from 2 | # https://github.com/biocore/scikit-bio/blob/master/.travis.yml 3 | dist: bionic 4 | language: python 5 | services: 6 | - xvfb 7 | env: 8 | - PYTHON_VERSION=3.5 9 | before_install: 10 | - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 11 | - chmod +x miniconda.sh 12 | - ./miniconda.sh -b 13 | - export PATH=/home/travis/miniconda3/bin:$PATH 14 | install: 15 | - conda env create -n iab -f environment.yml 16 | - source activate iab 17 | - pip install . 18 | - conda install -y nose 19 | - pip install https://github.com/caporaso-lab/build-iab/archive/master.zip 20 | - biab notebook -i book -o ipynb 21 | script: 22 | - nosetests --with-doctest 23 | - cd ipynb 24 | - jupyter nbconvert *ipynb */*ipynb --execute --ExecutePreprocessor.timeout=-1 25 | -------------------------------------------------------------------------------- /CODE-OF-CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct: An Introduction to Applied Bioinformatics 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, 
religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at caporaso-lab@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to *An Introduction to Applied Bioinformatics* 2 | 3 | *An Introduction to Applied Bioinformatics* (IAB), is an open source project and we welcome community contributions. Contributions should generally be made in the form of GitHub pull requests. We've simplified the process of submitting these for changes to the IAB content, as described below. 4 | 5 | You will need a (free) GitHub account to submit a pull request. 
6 | 7 | Before considering a contribution to IAB, please read the project [Code of Conduct](https://github.com/applied-bioinformatics/An-Introduction-to-Applied-Bioinformatics/blob/master/CODE-OF-CONDUCT.md). 8 | 9 | ## How IAB is developed 10 | 11 | IAB is written in [markdown](http://commonmark.org/). [build-iab](https://github.com/caporaso-lab/build-iab) is then used to convert that markdown to html (for static online viewing) and ipynb (for interactive use). If you're submitting changes to content, you'll be submitting changes to markdown files. These are much more manageable than submitting changes to IPython notebooks, as it's much easier to diff the content. 12 | 13 | ## Small contributions 14 | 15 | Contributions such as typo fixes, wording changes, and small code updates are relatively easy to submit. Every unit, chapter, section, and subsection heading in IAB has an *Edit on GitHub* link below it. You should click the link corresponding to the section that you want to edit, which will take you to the GitHub online editor. You can make your changes and submit a [pull request](https://help.github.com/articles/using-pull-requests/) from that page. This will initiate the IAB tests, which will ensure that your change hasn't broken anything. After the tests pass, we will review your changes and either merge them, request modifications before merging, or let you know why we won't integrate your proposed changes. 16 | 17 | Watch a five-minute YouTube video illustrating this process [here](https://www.youtube.com/watch?v=s9-aZrX5CY8). 18 | 19 | ## Large contributions 20 | 21 | If you're interested in making contributions involving code refactoring, new chapters or sections, restructuring of content, etc., you should first comment on existing issues (or create new issues) indicating what you want to work on, and wait for us to discuss the changes with you before you get started. 
This will ensure that the changes you want to make are in line with the direction of the project, compatible with future plans, and that there is no one else already working on related contributions. This will avoid wasted time if your change involves something that wouldn't ultimately be merged. If in doubt, bring it up on the [issue tracker](https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/issues/) first. 22 | 23 | ## Who's contributing? 24 | 25 | See the list of [IAB contributors](https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/graphs/contributors) to find out who is involved with the project. If you submit a pull request that is merged, your GitHub account will be automatically listed on that page. 26 | 27 | ## Technical points 28 | 29 | ### Building IAB locally 30 | 31 | If you're interested in building the IAB html and/or IPython Notebooks locally, you'll need to install IAB and build-iab. You can do this as follows: 32 | 33 | ``` 34 | pip install numpy 35 | pip install https://github.com/caporaso-lab/build-iab/archive/master.zip 36 | wget https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/archive/master.zip 37 | unzip master.zip 38 | cd An-Introduction-To-Applied-Bioinformatics-master/ 39 | pip install . 40 | ``` 41 | 42 | Then, to build the IPython Notebooks, you can run: 43 | 44 | ``` 45 | biab notebook -i book -o ipynb 46 | ``` 47 | 48 | or to build the HTML version, you can run: 49 | 50 | ``` 51 | biab html -i book -o html 52 | ``` 53 | 54 | ### Linking to other sections of the text 55 | 56 | All section headings must have ids associated with them. These ids should be generated as follows: 57 | 58 | ```bash 59 | $ biab idgen 60 | 61 | ``` 62 | 63 | When you define a section heading, you'd end it with the tag returned from the above command. 
For example: 64 | 65 | ```markdown 66 | ## Some section 67 | ``` 68 | 69 | If you then wanted to link to that section from somewhere else in the text, you could do that with a markdown link as follows: 70 | 71 | ```markdown 72 | This concept is discussed in further detail [above](alias://9mM4Bb). 73 | ``` 74 | 75 | You should always link using these ids, and never statically link to other sections of the text with URLs (because a section name might change, but its id won't). 76 | 77 | ## License and license changes 78 | 79 | The IAB license is available [here](https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/blob/master/LICENSE). This license may change over time, but the online version of IAB will always be the most current version, and will be available free of charge. 80 | 81 | By contributing to IAB, you are agreeing that Greg Caporaso has sole discretion over the license and any future changes to the license. If a paid (e.g., printed) copy of IAB is ever created, contributors are not entitled to payments or royalties of any kind. Your contribution of content represents your agreement with these terms. 82 | -------------------------------------------------------------------------------- /Dockerfile.template: -------------------------------------------------------------------------------- 1 | # Distributed under the terms of the Modified BSD License. 2 | # 3 | # This image is expecting to be built alongside the *built* IAB docs, 4 | # e.g. 
https://github.com/applied-bioinformatics/built-iab, 5 | # and is designed for running on mybinder.org 6 | 7 | FROM jupyter/minimal-notebook:58169ec3cfd3 8 | 9 | LABEL maintainer="Greg Caporaso " 10 | 11 | USER root 12 | 13 | ENV DISPLAY=:99 14 | 15 | RUN apt-get update 16 | RUN apt-get install -y xvfb x11-utils 17 | 18 | COPY IAB-notebooks ${HOME}/IAB-notebooks/ 19 | COPY .jupyter/custom/custom.css ${HOME}/.jupyter/custom/custom.css 20 | # `fix-permissions` ships with jupyter/minimal-notebook 21 | RUN fix-permissions ${HOME} 22 | RUN rm -rf work 23 | 24 | USER ${NB_UID} 25 | 26 | COPY environment.yml ${HOME} 27 | RUN conda env update -n base -f environment.yml 28 | RUN rm environment.yml 29 | 30 | # This is almost identical to the `ENTRYPOINT` defined in jupyter/minimal-notebook, 31 | # except we tack on a `xvfb-run` on the end, which ensures that ete3 has X. 32 | ENTRYPOINT ["tini", "-g", "--", "xvfb-run"] 33 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Describe the environment 2 | 3 | What version of IAB are you reading (e.g. v0.0.1)? 4 | 5 | ```python 6 | from iab import __version__ as iab_version 7 | print(iab_version) 8 | ``` 9 | 10 | ### Describe the problem 11 | 12 | #### Steps to reproduce 13 | 14 | 1. ... 15 | 2. ... 16 | 3. ... 17 | 18 | #### Observed Results 19 | 20 | * What happened? This could be a description, log output, etc. 21 | 22 | #### Expected Results 23 | 24 | * What did you expect to happen? 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # An Introduction To Applied Bioinformatics (1st edition) 2 | 3 | This project has been succeeded by An Introduction to Applied Bioinformatics, 2nd Edition. You can find that book at http://readIAB.org, and the source repository at https://github.com/applied-bioinformatics/iab2. This project (IAB 1st Edition) is no longer under active development. 4 | -------------------------------------------------------------------------------- /book/applications/images/basic-unrooted-tree1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/basic-unrooted-tree1.jpg -------------------------------------------------------------------------------- /book/applications/images/example_big_dm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/example_big_dm.png -------------------------------------------------------------------------------- /book/applications/images/unifrac_tree_d0.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_d0.5.png -------------------------------------------------------------------------------- /book/applications/images/unifrac_tree_d0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_d0.png -------------------------------------------------------------------------------- /book/applications/images/unifrac_tree_d1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_d1.png -------------------------------------------------------------------------------- /book/applications/images/unifrac_tree_with_distances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_with_distances.png -------------------------------------------------------------------------------- /book/applications/images/unifrac_tree_with_distances_ab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/applications/images/unifrac_tree_with_distances_ab.png -------------------------------------------------------------------------------- /book/applications/index.md: -------------------------------------------------------------------------------- 1 | # Applications 2 | -------------------------------------------------------------------------------- /book/applications/index.yaml: -------------------------------------------------------------------------------- 1 | contents: 2 | - biological-diversity 3 | -------------------------------------------------------------------------------- /book/back-matter/about-the-author.md: 
-------------------------------------------------------------------------------- 1 | # About the author 2 | 3 | My name is Greg Caporaso. I'm the primary author of *An Introduction to Applied Bioinformatics*, but there are [other contributors](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/graphs/contributors) and I hope that list will grow. 4 | 5 |
6 | 7 |
8 | 9 | I have degrees in Computer Science (B.S., University of Colorado, 2001) and Biochemistry (B.A., University of Colorado, 2004; Ph.D., University of Colorado, 2009). Following my formal training, I joined [Rob Knight's lab](http://knightlab.ucsd.edu), then at the University of Colorado, for approximately 2 years as a post-doctoral scholar. In 2011, I joined the faculty at [Northern Arizona University (NAU)](http://www.nau.edu) where I'm now an Associate Professor. I [teach](http://www.caporasolab.us/teaching/) one course per year in bioinformatics for graduate and undergraduate students of Biology. I also run a [research lab](http://www.caporasolab.us/) in the [Pathogen and Microbiome Institute](http://pmi.nau.edu/), which is focused on developing bioinformatics software and studying microbiomes. 10 | 11 | I'm not the world expert on the topics that I present in IAB, but I have a passion for bioinformatics, open source software, writing, and education. When I'm learning a new bioinformatics concept, for example an algorithm like pairwise alignment or a statistical technique like Monte Carlo simulation, implementing it is usually the best way for me to understand it. This led me to start developing IAB, as I found that my implementations helped my students learn the concepts too. I think that one of my strongest skills is the ability to break complex ideas into accessible components. I do this well for bioinformatics because I remember (and still regularly experience) the challenges of learning it, so can relate to newcomers in the field. 12 | 13 | I'm active in open source bioinformatics software development, and am most widely known for my development and leadership roles on [QIIME](http://qiime.org) and [QIIME 2](https://qiime2.org). I'm involved in other bioinformatics software projects as well (see my [GitHub page](http://github.com/gregcaporaso)). 
IAB is one of the projects that I'm currently most excited about, and I truly hope that it's as useful for you as it is fun for me. 14 | 15 | For updates on IAB and various other things, you can [follow me on Twitter](https://twitter.com/gregcaporaso). 16 | -------------------------------------------------------------------------------- /book/back-matter/glossary.md: -------------------------------------------------------------------------------- 1 | # Glossary 2 | 3 | ## Pairwise alignment (noun) 4 | 5 | A hypothesis about which bases or amino acids in two biological sequences are derived from a common ancestral base or amino acid. By definition, the *aligned sequences* will be of equal length with gaps (usually denoted with ``-``, or ``.`` for terminal gaps) indicating hypothesized insertion or deletion events. A pairwise alignment may be represented as follows: 6 | 7 | ``` 8 | ACC---GTAC 9 | CCCATCGTAG 10 | ``` 11 | 12 | ## kmer (noun) 13 | 14 | A kmer is simply a word (or list of adjacent characters) of length k in a sequence. For example, the overlapping kmers in the sequence ``ACCGTGACCAGTTACCAGTTTGACCAA`` are as follows: 15 | 16 | ```python 17 | >>> import skbio 18 | >>> skbio.DNA('ACCGTGACCAGTTACCAGTTTGACCAA').kmer_frequencies(k=5, overlap=True) 19 | ``` 20 | 21 | It is common for bioinformaticians to substitute the value of `k` for the letter _k_ in the word _kmer_. For example, you might hear someone say "we identified all seven-mers in our sequence", to mean they identified all kmers of length seven. 
22 | -------------------------------------------------------------------------------- /book/back-matter/index.md: -------------------------------------------------------------------------------- 1 | # Back Matter 2 | -------------------------------------------------------------------------------- /book/back-matter/index.yaml: -------------------------------------------------------------------------------- 1 | contents: 2 | - about-the-author 3 | - glossary 4 | -------------------------------------------------------------------------------- /book/config.yaml: -------------------------------------------------------------------------------- 1 | repo: caporaso-lab/An-Introduction-to-Applied-Bioinformatics 2 | root: book 3 | -------------------------------------------------------------------------------- /book/exercises/index.md: -------------------------------------------------------------------------------- 1 | # Exercises 2 | 3 | This section contains exercises corresponding to different chapters. In the future these will be structured to link specifically to and from the relevant chapters. 4 | -------------------------------------------------------------------------------- /book/exercises/index.yaml: -------------------------------------------------------------------------------- 1 | contents: 2 | - pairwise-alignment 3 | - multiple-sequence-alignment 4 | -------------------------------------------------------------------------------- /book/exercises/pairwise-alignment.md: -------------------------------------------------------------------------------- 1 | 2 | # Local sequence alignment exercises 3 | 4 | ## Purpose 5 | 6 | The purpose of this exercise is to have you combine a few of the topics we have covered in class up to this point, and get you working with code. (Don't panic: you don't have to write code - everything in this assignment can be achieved with cutting and pasting code that we've already written for you.) 
7 | 8 | You will need to be familiar with local and global alignments (see the pairwise alignment section) and the process of PCR (see the molecular biology section). The aim is to develop your bioinformatics problem solving skills, while also introducing you to interacting with the IPython Notebook. 9 | 10 | ## Background 11 | 12 | You should start by reading section 4.3 of *Genomes*, by TA Brown, available [here](http://www.ncbi.nlm.nih.gov/books/NBK21129/#A6064). You may find the entire chapter useful. 13 | 14 | A common process in bioinformatics is looking at the composition of microorganisms in a given environment. For instance, we could take a sample from a desk in an office, the gut of a human subject, or the Southern Ocean and ask what microorganisms are present in each of the different samples. The most common way to answer this question is to sequence the 16S rRNA gene. What makes this gene so useful is that it contains several conserved areas, which means that we can isolate it from the full genomes of many organisms using PCR; however, it also contains highly variable regions so that we can tell the organisms apart. The sequence of the 16S rRNA therefore serves as a *fingerprint* for a microorganism. If we find it in a sample, that suggests that the organism is present in that sample. 15 | 16 | ## Goals 17 | 18 | In this exercise we will provide you with the full-length 16S rRNA sequences from five different bacterial organisms, and ten candidate primer sequences. Using both global and local sequence alignment, we'll ask you to select what you think is the best single primer pair to use for amplifying 16S rRNA from these five organisms for use in profiling diverse communities of microorganisms. 19 | 20 | Throughout this notebook there will be questions that you are required to answer. They will be written in bold so you know they are required and they should also help you with the overall process. 
21 | 22 | ## Hints 23 | 24 | * Read all of the cells containing text very carefully! 25 | 26 | * You may write code or use a text editor if you wish; however, all of the tools necessary to answer the questions are present in this notebook. 27 | 28 | * Spend some time thinking about what the question is and how you can go about answering it. This assignment is based largely on problem solving skills and it may take time to develop a good strategy. 29 | 30 | * Get help, that's what office hours are for! 31 | 32 | * You are allowed to discuss the assignment with other students, however your work needs to be your own. Using or looking at code or commands generated by another student is strictly prohibited. If you're in doubt over whether some type of interaction is acceptable for this assignment, ask. 33 | 34 | ## Getting started 35 | 36 | The first thing you will want to do is import a couple of functions that will be necessary for this problem. 37 | 38 | ```python 39 | >>> %pylab inline 40 | >>> from iab.algorithms import sw_align_nt, nw_align_nt 41 | ``` 42 | 43 | Next, in order to make sure the functions were imported properly, and to see how they work, run the `help` command on each of them. Read the help text carefully; it will be important to understand exactly what each function does. 44 | 45 | ```python 46 | >>> help(sw_align_nt) 47 | ``` 48 | 49 | ```python 50 | >>> help(nw_align_nt) 51 | ``` 52 | 53 | This next function, ``slice_sequence``, will let you easily extract segments of a sequence that are of interest to you (for example, the region between where two primers align). 54 | 55 | ```python 56 | >>> def slice_sequence(sequence, start_pos, end_pos): 57 | >>> """ Given a sequence, return the substring between start_pos and end_pos 58 | ... 
59 | >>> Parameters 60 | >>> ---------- 61 | >>> sequence: string 62 | >>> The sequence to be sliced 63 | >>> start_pos: int 64 | >>> The starting position for the new sequence 65 | >>> end_pos: int 66 | >>> The ending position for the new sequence 67 | ... 68 | >>> Returns 69 | >>> ------- 70 | >>> string 71 | >>> A substring of the input string between start_pos and end_pos 72 | ... 73 | >>> """ 74 | >>> if len(sequence) == 0: 75 | >>> raise ValueError("The sequence is empty") 76 | >>> if start_pos < 1: 77 | >>> raise ValueError("Starting position must be greater than zero.") 78 | >>> if end_pos > len(sequence): 79 | >>> raise ValueError("Ending position cannot be larger than the length of the sequence.") 80 | >>> if start_pos > end_pos: 81 | >>> raise ValueError("The starting position must be less than the ending positions.") 82 | >>> return sequence[start_pos-1:end_pos] 83 | ``` 84 | 85 | The following cell contains the full-length 16S rRNA sequences of five diverse bacterial organisms. Make sure to run this cell in order to load the sequences into memory. 
86 | 87 | ```python 88 | >>> # k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Escherichia; s__coli 89 | >>> s1 = ['656881','GGGTGAGTAATGTCTGGGAAACTGCCTGATGGAGGGGGATAACTACTGGAAACGGTAGCTAATACCGCATAACGTCGCAAGACCAAAGAGGGGGACCTTCGGGCCTCTTGCCATCGGATGTGCCCAGATGGGATTAGCTTGTTGGTGGGGTAACGGCTCACCAAGGCGACGATCCCTAGCTGGTCTGAGAGGATGACCAGCCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCCTGATGCAGCCATGCCGCGTGTATGAAGAAGGCCTTCGGGTTGTAAAGTACTTTCAGCGGGGAGGAAGGGAGTAAAGTTAATACCTTTGCTCATTGACGTTACCCGCAGAAGAAGCACCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGTTAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCTGATACTGGCAAGCTTGAGTCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACGAAGACTGACGCTCAGGTGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGCGTGGCTTCCGGAGCTAACGCGTTAAGTCGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGTCTTGACATCCACGGAAGTTTTCAGAGATGAGAATGTGCCTTCGGGAACCGTGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTCCGGCCGGGAACTCAAAGGAGACTGCCAGTGATAAACTGGAGGAAGGTGGGGATGACGTCAAGTCATCATGGCCCTTACGACCAGGGCTACACACGTGCTACAATGGCGCATACAAAGAGAAGCGACCTCGCGAGAGCAAGCGGACCTCATAAAGTGCGTCGTAGTCCGGATTGGAGTCTGCAACTCGACTCCATGAAGTCGGAATCGCTAGTAATCGTGGATCAGAATGCCACGGTGAATACGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTA'] 90 | ... 
91 | >>> # k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__S24-7; g__; s__ 92 | >>> s2 = ['305251','GATGAACGCTAGCGACAGGCTTAACACATGCAAGTCGAGGGGCAGCGAGATTGTGGCAACACGATTGTCGGCGACCGGCGCACTGGTGAGTAACACGTATGCAACCTGCCGCGCACTGGGGGATAATCTTGGGAAACCGAGTCTAATACCCCGTAGGCCTTGTTGCCGCATGGTAATAAGGTAAGAGGAGTGATCCGATGCGCGATGGGCATGCGGCGCATTAGCTAGTTGGCGGGGTAACAGCCCACCAAGGCGACGATGCGTAGGGGTTCTGAGAGGAAGGTCCCCCACACTGGTACTGAGACACGGACCAGACTCCTACGGGAGGCAGCAGTGAGGAATATTGGTCAATGGGCGGAAGCCTGAACCAGCCAAGTCGCGTGCGGGAGGGAGGCCCTACGGGTCGTAAACCGCTTTTGATGGGGGGTAACCATGCGGACGAGTCCGCATCTGAGAGCACCCATCGAATAAGCATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGCAGGCGGCTATTCAAGTCAGCGGTCAAATTGCGGTGCTCAACGCCGTATCGCCGTTGAAACTGAGTTGGCTAGAGTGAGAGTGAGGAAGGCGGAATGCGCGGTGTAGCGGTGAAATGCATAGATATTGCGCAGAACTCCGATTGCGAAGGCAGCTTTCCAATTCTCTACTGACGCTCATGCACGAAAGCGTGGGTATCGAACAGGATTAGATACCCTGGTAGTCCACGCGGTAAACGATGGTCACTAGCTGTGCGCCCTGATTAAAGGGAGCGTGGCCGAGCGAAAGCGTTAAGTGACCCACCTTGGGAGTACGCCGGCAACGGTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGAGGAACATGTGGTTTAATTCGATGATACGCGAGGAACCTTACCCGGGCTCAAACGCTTGCGGGAGTCTATTTGAAAGGATAGATGCCCTTCGGGGCTGCAAGCGAGGTGCTGCATGGTTGTCGTCAGCTCGTGCCGTGAGGTGTCGGCTTAAGTGCCATAACGAGCGCAACCCCCATCTTCAGTTGCCGTCGGGTAGAGCCGGGCACTCTGGAGAGACTGCCGGCGCAAGCTGTGAGGAAGGCGGGGATGACGTCAAATCAGCACGGCCCTTACGTCCGGGGCGACACACGTGTTACAATGGCGGGGCACAGAGGGAAGCCAGGCGGTGACGTCGAGCGGATCCCGAAAACCCGTCTCAGTTCGGATCGGAGTCTGCAGCTCGACTCCGTGAAGCTGGATTCGCTAGTAATCGCGCATCAGCCATGGCGCGGTGAATACGTTCCCGGGCCT'] 93 | ... 
94 | >>> # k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Chlorophyta; f__Ulvophyceae; g__; s__ 95 | >>> s3 = ['577032', 'ATGAACGCTGGCGGCATGCTTAACACATGCAAGTTGAACGGGTTTAAGTTTATTAAACTTAAACAAGTAGCGGACGGGTGAGTAACGCGTAAGAACCTACCTTTAGGTAAGGAATAACTATTGGAAGCGATAGATAATACCTTATAAGCTTATAGTAAAAGATAAAATCGCCTAAGGATGGGCTTGCGTCTGATTAGCTTGTTGGTGATTTAAAAGATTCACCAAGGCAACGATCAGTAGTTGGTCTAAGAGGATGATCAACCACACTGGGACTGAGACACGGCCCAGACCTCTACGGAGGGCAGCAGTGAGGAATTTTCCGCAATGGGCGAAAGCCTGACGGAGCAATGCCGCGTGGAGGATGAAAGCTTGTGAGTCGTAAACTCCTTTTCTTAGTGAAGAAATAAGACGGTATCTAAGGAATAAGCATCGGCTAACTCCGTGCCAGCAGCCGCGGTAATACGGAGGATGCAAGCGTTATTCGGAATTATTGGGCGTAAAGCGTCTGTAGGTGGTTTTTTAAGTCTACTGTTAAATATTAAGGCTTAACCTTAAAAAAGCGGTATGAAACTAAAAAACTTGAGTTTAGTAGAGGTAGAGGGAATTCTCGGTGTAGTGGTGAAATGCGTAGAGGTCGGGAAGAACACCGGTAGCGAAAGCGCTCTACTGGGCTAAAACTGACACTCAGAGACGAAAGCTAAAGTAGCGAATGGGATTAGATACCCCAGTAGTCTTAGCTGTAAACGATGGGTACTAGATGTTGCGCGTATCGATCCGTGCAGTATCGTAGCTAACGCGTTAAGTACCCCGCCTGGGAAGTATGCTCGCAAGAGTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCAGAACTTGACATGTCACAAATTTTCTTGAAAAAGAAAAGTGCCTTAGGGAGTGTGAACACAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTGTTTTTAGTTGCCATCATTTAGTTGGGAACTCTAAAAAGACTGCCGGTGACAAACCGGAGGAAGGTGAGGATGACGTCAAGTCAGCATGCCCCTTATGTTCTGGGCTACACACGTGCTACAATGATTATGACAAAGGGTAGCGAATTCGCGAGAATCAGCCAATCTCATAAACATAGTCTAAGTTCGGATTGCAGGCTGAAACTCGCCTGCATGAAGCTGGAATCGCTAGTAATCGCCGGTCAGCTATACGGCGGTGAATCCGTTCCCGGGCCTTGTACACACCGCCCGTCACACCATGGAAGTTGGCTACGCCCGAAGTCGTTATCTTAACCTTTTTGGAGGGAGGCGCCTAAGGTGGAGCCAGTGACTGGGGTGA'] 96 | ... 
97 | >>> # k__Bacteria; p__GN02; c__BD1-5; o__; f__; g__; s__ 98 | >>> s4 = ['200762','AGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCGTGCCTAACACATGCAAGTCGAGCGGTAACGGGTGTAGCAATACATGCTGACGAGCGGCGGACGGGTGAGCAATATTTGGGAATCTGCCTATTAGTGGGGGACAACCCGGGGAAACTCGGGCTAATACCGCATACGCTCTACGGAGGAAAGCCGGGGACCGCAAGGCCTGGCGCTAATAGATGAGCCCAAATCGGATTAGCTAGTTGGTGAGGTAAAGGCTCACCAAGGCGACGATCCGTAGCTGGTCTGAGAGGACGACCAGCCACACCGGAACTGAGACACGGTCCGGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATGGAGCGACACCGCGTGAAGGATGAAGCCTTTGTTGGTGTAAACTTCTTTTCTCTGGGAAGATAATGACGGTACCAGAGGAATAAGGGGCGGCAAACTTCGTGCCAGCAGCCGCGGTAATACGAAGGCCCCAAGCGTTATCCGGAATTACTGGGCGTAAAGCGTCTGTAGGTGGTCTGGAAAGTTTCAAGTGAAAGGTCAGGGCTTAACCCTGTACTTGCTTGGAAAACTATCAGACTTGAGTGCGGGAGAGGCAAGCAGAACTGTATGAGTAGGGGTGCAATCCGTTGATACATACAAGAATACCAAAAGCGAAGGCAGCTTGCTGGAACGCTACTGACACTGAGAGACGAAAGCGTGGGGAGCAAAAGGGATTAGATACCCCTGTAGTCCACGCCCTAAACGATGGATGCTAAATGTCGGCGCAAGCCGGTGTTTCAAGCTAACGCATTAAGCATCCCGCCTGAGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATAGACGGGGACCCGCACAAGCAGTGGATCATGTGGTTTAATTCGACACTAAACGAGGAACCTCACCTAGGCTTGACATTGATAGAATTTGCTGGAAACAGCGAAGTGCCTGCAAGGGAACTTGAAAACAGGCGCTGCATGGTTGTCGTCAGCTCGTGCCTTGAGGTGTTCGGTTAAGTCCGTTAACGAGCGCAACCCATGTCGTTAGTTATTATGTCTAACGAGACTGCTCGAGTTAATCGAGAGGAAGGTGTGGATGACGTCAAATCAGCATGGCCCTTATGCCTAGGGCTACACACATGATACAATGGTCGGTACAAAGGGTTGCCAAGTGGTAACACGGAGCCAATCCCAGAAAGCCGATCTCAGTCCAGATTGAGGGCTGCAACTCGCCCTCATGAAGTTGGAATTGCTAGTAATCGTGAATCAGCTATGTCACGGTGAATCTGTTCCCGGGTCTTGTACTCACCGCCCGTCAAACCATGGGAGGTGTGCGTACCTGAAGTCCTTCGAGTAATACGGAGGCCCACGGTAAACACACTGACTGGGGTTAAGTCGTAACAAGGTA'] 99 | ... 
100 | >>> # k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__; s__ 101 | >>> s5 = ['3728119','CTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGATGACGGGAGCTTGCTCCTTGATTCAGCGGCGGACGGGTGAGTAATGCCTAGGAATCTGCCTGGTAGTGGGGGACAACGTTTCGAAAGGAACGCTAATACCGCATACGTCCTACGGGAGAAAGCAGGGGACCTTCGGGCCTTGCGCTATCAGATGAGCCTAGGTCGGATTAGCTAGTAGGTGAGGTAATGGCTCACCTAGGCGACGATCCGTAACTGGTCTGAGAGGATGATCAGTCACACTGGAACTGAGACACGGTCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGGACAATGGGCGAAAGCCTGATCCAGCCATGCCGCGTGTGTGAAGAAGGTCTTCGGATTGTAAAGCACTTTAAGTTGGGAGGAAGGGCAGTAAGCTAATACCTTGCTGTTTTGACGTTACCGACAGAATAAGCACCGGCTAACTCTGTGCCAGCAGCCGCGGTAATACGGCAGGGATGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGCTAGGCTGGTTCGTTAAGTCTGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTCGAAACTGGCAGAGCTAGAGTATTGTAGAGGGTGGTGGAATTTCAGGTGTAGCGGTGAAATGCGTAGATATCTGGAAGGAATACCGGTGGCGAAGGCGACCCCCTGGACATGATACTGACGCTCATGTCGTCTTAGCGATAATGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGTCGACTTGGAGGTTGTGCCCTTGAGGTTTGGCTTCCGGAGCTAACGCGTTAAGTTGACCGCCTGGGGAGTACGGCCGCAAGGTTAAAACTCAAATGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTACTCTTGACATCCAGAGAACTTTCCAGAGATGGATTGGTGCCTTCGGGAACTCTGACACAGGTGCTGCATGGCTGTCGTCAGCTCGTGTTGTGAAATGTTGGGTTAAGTCCCGCAACGAGCGCAACCCTTATCCTTTGTTGCCAGCGGTTTGGCCGGGAACTCATAGGAAACTGCCAGTGATCAACTGGAAGAAAGTGGGGATGACCTCCAGTCATCATGGCCCTTACCAGTATGGCTACACACGTGCTACAATGGCGCATACCAAGAGAATCGACCTCGCGAGAGCGAGCGGACTTTATCAAGTGCGTCGTAATCCGGATTGGAGTCTGCCACTCTCACTCGATGAAGTCCGAATCGCTAGTAATCGTGGATTCAGAATTGCTTCGGTGTGAATATCGTTCCCGGGCCTTTGTACACACCCGCCCGGTCACACCATGGG'] 102 | ... 103 | >>> sequences = [s1, s2, s3, s4, s5] 104 | ``` 105 | 106 | You can now use ``slice_sequence`` to extract a region of interest from a sequence - for example, the region between the starting position of two primer hits. The example below would extract the region between 50 and 250 from sequence 1. 
107 | 108 | ```python 109 | >>> slice_sequence(sequences[0][1], 50, 250) 110 | ``` 111 | 112 | **Hint:** The numbers here may look a little bit funny. That's because the first item in a Python list is at index 0, not 1. So, if you want the first entry from the list `sequences` you would type `sequences[0]`, and if you want the second sequence you would type `sequences[1]`. Try this in the cell below until you are comfortable getting the sequence you want. Then try to slice a sequence to the region between positions 25 and 50. 113 | 114 | ```python 115 | ... 116 | ``` 117 | 118 | This cell contains the potential primers that we will use to amplify specific regions. 119 | 120 | ```python 121 | >>> primers = [('p1', 'TTCCGGTTGATCCNGCCGGA'), # F21 122 | >>> ('p2', 'ACNGCTCAGTAACACGT'), # F109 123 | >>> ('p3', 'GCTGCCTCCCGTAGGAGT'), # F338 124 | >>> ('p4', 'TACGGNAGGCAGCAG'), # F343 125 | >>> ('p5', 'GTGCCAGCNGCCGCGGTAA'), # F515 126 | >>> ('p6', 'ATTAGATACCCNGGTAGTCC'), # F770 127 | >>> ('p7', 'ATTAGATACCCNNGTAGTCC'), # R806 (reverse complement) 128 | >>> ('p8', 'AGGAATTGGCGGGGCAGCAC'), # R915 (reverse complement) 129 | >>> ('p9', 'AAACTNAAAGGAATTGACGG'), # F926 130 | >>> ('p10', 'AGGTNNGNATGCCCCNAA')] # R1240 (reverse complement) 131 | ``` 132 | 133 | If you want to locally align a primer against a primer, a primer against a sequence, or a sequence against a sequence, you could run the following: 134 | 135 | ```python 136 | >>> aln1, aln2, score, start1, start2 = sw_align_nt(primers[0][1], primers[1][1]) 137 | >>> print(aln1) 138 | >>> print(aln2) 139 | >>> print(score) 140 | >>> print(start1) 141 | >>> print(start2) 142 | ``` 143 | 144 | ```python 145 | >>> aln1, aln2, score, start1, start2 = sw_align_nt(primers[0][1], sequences[0][1]) 146 | >>> print(aln1) 147 | >>> print(aln2) 148 | >>> print(score) 149 | >>> print(start1) 150 | >>> print(start2) 151 | ``` 152 | 153 | ```python 154 | >>> aln1, aln2, score, start1, start2 = sw_align_nt(primers[0][1], sequences[0][1]) 155
| >>> print(aln1) 156 | >>> print(aln2) 157 | >>> print(score) 158 | ``` 159 | 160 | If you want to globally align a primer against a primer, you could run the following: 161 | 162 | ```python 163 | >>> aln1, aln2, score = nw_align_nt(primers[0][1], primers[1][1]) 164 | >>> print(aln1) 165 | >>> print(aln2) 166 | >>> print(score) 167 | ``` 168 | 169 | Notice that ``sw_align_nt`` returns two more values than ``nw_align_nt`` does. Look at the help for each function to figure out what these values are. Why does it make sense to get them from ``sw_align_nt``, but not from ``nw_align_nt``? 170 | 171 | In the cell below, try to globally align a primer against a sequence. Then try to globally align a different primer against a different sequence. 172 | 173 | ```python 174 | ... 175 | ``` 176 | 177 | Hint: if you want to normalize an alignment score by the length of the alignment, you can do the following. This may come in handy when comparing alignments. 178 | 179 | ```python 180 | >>> aln1, aln2, score = nw_align_nt(primers[0][1], sequences[0][1]) 181 | >>> print(score / len(aln1)) 182 | ``` 183 | 184 | At this point you have the necessary functions to complete the assignment. Your goal is to pick the best pair of 16S primers, a forward and a reverse primer, given the above five input sequences. 185 | 186 | The best primers will: 187 | 188 | 1. Anneal to all of the 16S rRNA sequences well. This will be determined by achieving a high alignment score between the primer and all of the sequences (though it is OK for there to be some mismatches). 189 | 2. Amplify a region that is 100-400 base pairs long, due to limitations of current sequencing technology. 190 | 3. Amplify a region (i.e., the sequence between the primers) that is very different across all species, to allow for accurate *fingerprinting* of the different species. 
191 | 192 | ## Question 1 193 | 194 | What is the difference between a local and global alignment in terms of what is aligned? What are the differences in the algorithms that support this? (One paragraph) 195 | 196 | ## Question 2 197 | 198 | 199 | What is the sequence in s2 from position 500 to 505? 200 | 201 | **Hint:** the first base in a sequence is position 1; the first item in a Python list is index 0. 202 | 203 | ```python 204 | ... 205 | ``` 206 | 207 | ## Question 3 208 | 209 | What is the Smith-Waterman alignment score of primer `p6` against sequence `s4`? Where in `s4` does the alignment start? 210 | 211 | **Hint:** copy and paste from other cells, that way you only have to make a couple small changes. 212 | 213 | ```python 214 | ... 215 | ``` 216 | 217 | ## Question 4 218 | 219 | What is the best pair of primers from the list of available primers to use for amplifying the 16S region for sequencing for the purposes of identifying the organisms present? 220 | 221 | ## More hints 222 | 223 | The best pair of primers will align well to (and therefore be likely to anneal with) the 16S sequences from all of the organisms present in the list. 224 | 225 | The best pair of primers will amplify a region of DNA that is between 100 and 400 base pairs long. 226 | 227 | The best pair of primers will amplify a region that is highly variable (in other words, the amplified regions across the organisms should not align well). 228 | 229 | Think about whether you want to use global or local alignments for the different steps. Are there times when you would want to use a gap penalty other than the default? 230 | 231 | The `N` character present in some of the primer sequences signifies a "degenerate" base, meaning it could be an 'A', 'T', 'G' or 'C'. You shouldn't worry about these for this exercise. 232 | 233 | Good Luck! 234 | 235 | ```python 236 | ... 
237 | ``` 238 | -------------------------------------------------------------------------------- /book/fundamentals/images/Darwins_tree_of_life_1859.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/Darwins_tree_of_life_1859.png -------------------------------------------------------------------------------- /book/fundamentals/images/Pace_Big_Tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/Pace_Big_Tree.png -------------------------------------------------------------------------------- /book/fundamentals/images/alignment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/alignment.png -------------------------------------------------------------------------------- /book/fundamentals/images/basic-rooted-tree1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/basic-rooted-tree1.jpg -------------------------------------------------------------------------------- /book/fundamentals/images/basic-unrooted-tree1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/basic-unrooted-tree1.jpg -------------------------------------------------------------------------------- /book/fundamentals/images/centroid-cluster.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.1.png -------------------------------------------------------------------------------- /book/fundamentals/images/centroid-cluster.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.2.png -------------------------------------------------------------------------------- /book/fundamentals/images/centroid-cluster.3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.3.png -------------------------------------------------------------------------------- /book/fundamentals/images/centroid-cluster.4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.4.png -------------------------------------------------------------------------------- /book/fundamentals/images/centroid-cluster.5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.5.png -------------------------------------------------------------------------------- /book/fundamentals/images/centroid-cluster.6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/centroid-cluster.6.png -------------------------------------------------------------------------------- /book/fundamentals/images/cluster-legend.graffle: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ActiveLayerIndex 6 | 0 7 | ApplicationVersion 8 | 9 | com.omnigroup.OmniGraffle 10 | 139.18.0.187838 11 | 12 | AutoAdjust 13 | 14 | BackgroundGraphic 15 | 16 | Bounds 17 | {{0, 0}, {576.00002670288086, 733}} 18 | Class 19 | SolidGraphic 20 | ID 21 | 2 22 | Style 23 | 24 | shadow 25 | 26 | Draws 27 | NO 28 | 29 | stroke 30 | 31 | Draws 32 | NO 33 | 34 | 35 | 36 | BaseZoom 37 | 0 38 | CanvasOrigin 39 | {0, 0} 40 | ColumnAlign 41 | 1 42 | ColumnSpacing 43 | 36 44 | CreationDate 45 | 2014-04-08 02:25:49 +0000 46 | Creator 47 | Greg Caporaso 48 | DisplayScale 49 | 1 0/72 in = 1 0/72 in 50 | GraphDocumentVersion 51 | 8 52 | GraphicsList 53 | 54 | 55 | Class 56 | Group 57 | Graphics 58 | 59 | 60 | Bounds 61 | {{12, 202}, {167.5, 42}} 62 | Class 63 | ShapedGraphic 64 | FitText 65 | Vertical 66 | Flow 67 | Resize 68 | ID 69 | 139 70 | Magnets 71 | 72 | {0, 1} 73 | {0, -1} 74 | {1, 0} 75 | {-1, 0} 76 | 77 | Shape 78 | Rectangle 79 | Style 80 | 81 | fill 82 | 83 | Draws 84 | NO 85 | 86 | shadow 87 | 88 | Draws 89 | NO 90 | 91 | stroke 92 | 93 | Draws 94 | NO 95 | 96 | 97 | Text 98 | 99 | Text 
100 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf400 101 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 102 | {\colortbl;\red255\green255\blue255;} 103 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc 104 | 105 | \f0\fs28 \cf0 A sequence and dissimilarity range.} 106 | VerticalPad 107 | 4 108 | 109 | TextPlacement 110 | 0 111 | 112 | 113 | Class 114 | Group 115 | Graphics 116 | 117 | 118 | AllowConnections 119 | NO 120 | AllowLabelDrop 121 | 122 | AllowToConnect 123 | 124 | Class 125 | LineGraphic 126 | ID 127 | 141 128 | Points 129 | 130 | {90.607142857142833, 100.85714285714289} 131 | {167.74999726031507, 100.85714285714289} 132 | 133 | Style 134 | 135 | stroke 136 | 137 | HeadArrow 138 | 0 139 | Legacy 140 | 141 | TailArrow 142 | FilledBall 143 | TailScale 144 | 0.5 145 | 146 | 147 | 148 | 149 | Bounds 150 | {{23.75, 29.5}, {144, 144}} 151 | Class 152 | ShapedGraphic 153 | ID 154 | 142 155 | Shape 156 | Circle 157 | Style 158 | 159 | fill 160 | 161 | Color 162 | 163 | a 164 | 0.31 165 | b 166 | 0.670097 167 | g 168 | 0.670086 169 | r 170 | 0.670106 171 | 172 | Draws 173 | NO 174 | 175 | shadow 176 | 177 | Draws 178 | NO 179 | 180 | stroke 181 | 182 | Pattern 183 | 1 184 | 185 | 186 | Text 187 | 188 | VerticalPad 189 | 0 190 | 191 | 192 | 193 | ID 194 | 140 195 | 196 | 197 | ID 198 | 138 199 | 200 | 201 | Class 202 | Group 203 | Graphics 204 | 205 | 206 | Bounds 207 | {{193.25001335144043, 202}, {167.5, 42}} 208 | Class 209 | ShapedGraphic 210 | FitText 211 | Vertical 212 | Flow 213 | Resize 214 | ID 215 | 144 216 | Magnets 217 | 218 | {0, 1} 219 | {0, -1} 220 | {1, 0} 221 | {-1, 0} 222 | 223 | Shape 224 | Rectangle 225 | Style 226 | 227 | fill 228 | 229 | Draws 230 | NO 231 | 232 | shadow 233 | 234 | Draws 235 | NO 236 | 237 | stroke 238 | 239 | Draws 240 | NO 241 | 242 | 243 | Text 244 | 245 | Text 246 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf400 247 | 
\cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 248 | {\colortbl;\red255\green255\blue255;} 249 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc 250 | 251 | \f0\fs28 \cf0 An OTU definition based on a single sequence.} 252 | VerticalPad 253 | 4 254 | 255 | TextPlacement 256 | 0 257 | 258 | 259 | Class 260 | Group 261 | Graphics 262 | 263 | 264 | AllowConnections 265 | NO 266 | AllowLabelDrop 267 | 268 | AllowToConnect 269 | 270 | Class 271 | LineGraphic 272 | ID 273 | 146 274 | Points 275 | 276 | {271.85715620858326, 100.85714285714289} 277 | {349.0000106117555, 100.85714285714289} 278 | 279 | Style 280 | 281 | stroke 282 | 283 | HeadArrow 284 | 0 285 | Legacy 286 | 287 | TailArrow 288 | FilledBall 289 | TailScale 290 | 0.5 291 | 292 | 293 | 294 | 295 | Bounds 296 | {{205.00001335144043, 29.5}, {144, 144}} 297 | Class 298 | ShapedGraphic 299 | ID 300 | 147 301 | Shape 302 | Circle 303 | Style 304 | 305 | fill 306 | 307 | Color 308 | 309 | a 310 | 0.31 311 | b 312 | 0.670097 313 | g 314 | 0.670086 315 | r 316 | 0.670106 317 | 318 | 319 | shadow 320 | 321 | Draws 322 | NO 323 | 324 | stroke 325 | 326 | Pattern 327 | 1 328 | 329 | 330 | Text 331 | 332 | VerticalPad 333 | 0 334 | 335 | 336 | 337 | ID 338 | 145 339 | 340 | 341 | ID 342 | 143 343 | 344 | 345 | Class 346 | Group 347 | Graphics 348 | 349 | 350 | Bounds 351 | {{385.75, 202}, {167.5, 42}} 352 | Class 353 | ShapedGraphic 354 | FitText 355 | Vertical 356 | Flow 357 | Resize 358 | ID 359 | 149 360 | Magnets 361 | 362 | {0, 1} 363 | {0, -1} 364 | {1, 0} 365 | {-1, 0} 366 | 367 | Shape 368 | Rectangle 369 | Style 370 | 371 | fill 372 | 373 | Draws 374 | NO 375 | 376 | shadow 377 | 378 | Draws 379 | NO 380 | 381 | stroke 382 | 383 | Draws 384 | NO 385 | 386 | 387 | Text 388 | 389 | Text 390 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf400 391 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 392 | {\colortbl;\red255\green255\blue255;} 393 | 
\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc 394 | 395 | \f0\fs28 \cf0 An OTU definition based on two sequences.} 396 | VerticalPad 397 | 4 398 | 399 | TextPlacement 400 | 0 401 | 402 | 403 | Class 404 | Group 405 | Graphics 406 | 407 | 408 | Class 409 | Group 410 | Graphics 411 | 412 | 413 | AllowConnections 414 | NO 415 | AllowLabelDrop 416 | 417 | AllowToConnect 418 | 419 | Class 420 | LineGraphic 421 | ID 422 | 152 423 | Points 424 | 425 | {486.85714285714283, 82.35714285714289} 426 | {563.99999726031501, 82.35714285714289} 427 | 428 | Style 429 | 430 | stroke 431 | 432 | HeadArrow 433 | 0 434 | Legacy 435 | 436 | TailArrow 437 | FilledBall 438 | TailScale 439 | 0.5 440 | 441 | 442 | 443 | 444 | Bounds 445 | {{420, 11}, {144, 144}} 446 | Class 447 | ShapedGraphic 448 | ID 449 | 153 450 | Shape 451 | Circle 452 | Style 453 | 454 | fill 455 | 456 | Color 457 | 458 | a 459 | 0.31 460 | b 461 | 0.670097 462 | g 463 | 0.670086 464 | r 465 | 0.670106 466 | 467 | 468 | shadow 469 | 470 | Draws 471 | NO 472 | 473 | stroke 474 | 475 | Pattern 476 | 1 477 | 478 | 479 | Text 480 | 481 | VerticalPad 482 | 0 483 | 484 | 485 | 486 | ID 487 | 151 488 | 489 | 490 | Class 491 | Group 492 | Graphics 493 | 494 | 495 | AllowConnections 496 | NO 497 | AllowLabelDrop 498 | 499 | AllowToConnect 500 | 501 | Class 502 | LineGraphic 503 | ID 504 | 155 505 | Points 506 | 507 | {441.85714285714283, 119.35714285714289} 508 | {518.99999726031501, 119.35714285714289} 509 | 510 | Style 511 | 512 | stroke 513 | 514 | HeadArrow 515 | 0 516 | Legacy 517 | 518 | TailArrow 519 | FilledBall 520 | TailScale 521 | 0.5 522 | 523 | 524 | 525 | 526 | Bounds 527 | {{375, 48}, {144, 144}} 528 | Class 529 | ShapedGraphic 530 | ID 531 | 156 532 | Shape 533 | Circle 534 | Style 535 | 536 | fill 537 | 538 | Color 539 | 540 | a 541 | 0.31 542 | b 543 | 0.670097 544 | g 545 | 0.670086 546 | r 547 | 0.670106 548 | 549 | 550 | shadow 551 | 552 | Draws 553 | NO 554 | 
555 | stroke 556 | 557 | Pattern 558 | 1 559 | 560 | 561 | Text 562 | 563 | VerticalPad 564 | 0 565 | 566 | 567 | 568 | ID 569 | 154 570 | 571 | 572 | ID 573 | 150 574 | 575 | 576 | ID 577 | 148 578 | 579 | 580 | GridInfo 581 | 582 | GuidesLocked 583 | NO 584 | GuidesVisible 585 | YES 586 | HPages 587 | 1 588 | ImageCounter 589 | 1 590 | KeepToScale 591 | 592 | Layers 593 | 594 | 595 | Lock 596 | NO 597 | Name 598 | Layer 1 599 | Print 600 | YES 601 | View 602 | YES 603 | 604 | 605 | LayoutInfo 606 | 607 | Animate 608 | NO 609 | circoMinDist 610 | 18 611 | circoSeparation 612 | 0.0 613 | layoutEngine 614 | dot 615 | neatoSeparation 616 | 0.0 617 | twopiSeparation 618 | 0.0 619 | 620 | LinksVisible 621 | NO 622 | MagnetsVisible 623 | NO 624 | MasterSheets 625 | 626 | ModificationDate 627 | 2014-04-08 02:56:00 +0000 628 | Modifier 629 | Greg Caporaso 630 | NotesVisible 631 | NO 632 | Orientation 633 | 2 634 | OriginVisible 635 | NO 636 | PageBreaks 637 | YES 638 | PrintInfo 639 | 640 | NSBottomMargin 641 | 642 | float 643 | 41 644 | 645 | NSHorizonalPagination 646 | 647 | coded 648 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG 649 | 650 | NSLeftMargin 651 | 652 | float 653 | 18 654 | 655 | NSPaperSize 656 | 657 | size 658 | {612.00002670288086, 792} 659 | 660 | NSPrintReverseOrientation 661 | 662 | int 663 | 0 664 | 665 | NSPrinter 666 | 667 | coded 668 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAlOU1ByaW50ZXIAhIQITlNPYmplY3QAhZKEhIQITlNTdHJpbmcBlIQBKxNDYW5vbiBNRjQ1MDAgU2VyaWVzhoY= 669 | 670 | NSPrinterName 671 | 672 | string 673 | Canon MF4500 Series 674 | 675 | NSRightMargin 676 | 677 | float 678 | 18 679 | 680 | NSTopMargin 681 | 682 | float 683 | 18 684 | 685 | 686 | PrintOnePage 687 | 688 | ReadOnly 689 | NO 690 | RowAlign 691 | 1 692 | RowSpacing 693 | 36 694 | SheetTitle 695 | Canvas 1 696 | SmartAlignmentGuidesActive 697 | YES 698 | SmartDistanceGuidesActive 699 | YES 700 | UniqueID 701 | 1 702 | UseEntirePage 703 | 704 | 
VPages 705 | 1 706 | WindowInfo 707 | 708 | CurrentSheet 709 | 0 710 | ExpandedCanvases 711 | 712 | 713 | name 714 | Canvas 1 715 | 716 | 717 | Frame 718 | {{432, 151}, {828, 872}} 719 | ListView 720 | 721 | OutlineWidth 722 | 142 723 | RightSidebar 724 | 725 | ShowRuler 726 | 727 | Sidebar 728 | 729 | SidebarWidth 730 | 120 731 | VisibleRegion 732 | {{-58, 0}, {693, 733}} 733 | Zoom 734 | 1 735 | ZoomValues 736 | 737 | 738 | Canvas 1 739 | 1 740 | 1 741 | 742 | 743 | 744 | 745 | 746 | -------------------------------------------------------------------------------- /book/fundamentals/images/cluster-legend.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-legend.png -------------------------------------------------------------------------------- /book/fundamentals/images/cluster-template.graffle/data.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ActiveLayerIndex 6 | 0 7 | ApplicationVersion 8 | 9 | com.omnigroup.OmniGraffle 10 | 139.18.0.187838 11 | 12 | AutoAdjust 13 | 14 | BackgroundGraphic 15 | 16 | Bounds 17 | {{0, 0}, {576.00002670288086, 733}} 18 | Class 19 | SolidGraphic 20 | ID 21 | 2 22 | Style 23 | 24 | shadow 25 | 26 | Draws 27 | NO 28 | 29 | stroke 30 | 31 | Draws 32 | NO 33 | 34 | 35 | 36 | BaseZoom 37 | 0 38 | CanvasOrigin 39 | {0, 0} 40 | ColumnAlign 41 | 1 42 | ColumnSpacing 43 | 36 44 | CreationDate 45 | 2014-04-08 02:14:28 +0000 46 | Creator 47 | Greg Caporaso 48 | DisplayScale 49 | 1 0/72 in = 1 0/72 in 50 | GraphDocumentVersion 51 | 8 52 | GraphicsList 53 | 54 | 55 | Bounds 56 | {{167, 381}, {143, 143}} 57 | Class 58 | ShapedGraphic 59 | ID 60 | 123 61 | ImageID 62 | 1 63 | ManualSizeImage 64 | YES 65 | Offset 66 | {0.23333333325606806, 0.24444444436349985} 67 | Shape 68 | Circle 
69 | Style 70 | 71 | fill 72 | 73 | Draws 74 | NO 75 | 76 | image fill 77 | 78 | ImageSizing 79 | 0 80 | Offset 81 | {0.23333333325606806, 0.24444444436349985} 82 | 83 | shadow 84 | 85 | Draws 86 | NO 87 | 88 | stroke 89 | 90 | Draws 91 | NO 92 | 93 | 94 | 95 | 96 | Class 97 | Group 98 | Graphics 99 | 100 | 101 | AllowConnections 102 | NO 103 | AllowLabelDrop 104 | 105 | AllowToConnect 106 | 107 | Class 108 | LineGraphic 109 | ID 110 | 97 111 | Points 112 | 113 | {313.85714285714278, 114.49999564034604} 114 | {390.99999726031507, 114.49999564034604} 115 | 116 | Style 117 | 118 | stroke 119 | 120 | HeadArrow 121 | 0 122 | Legacy 123 | 124 | TailArrow 125 | FilledBall 126 | TailScale 127 | 0.5 128 | 129 | 130 | 131 | 132 | Bounds 133 | {{247, 43.142852783203097}, {144.00000000000003, 144.00000000000003}} 134 | Class 135 | ShapedGraphic 136 | ID 137 | 98 138 | Shape 139 | Circle 140 | Style 141 | 142 | fill 143 | 144 | Color 145 | 146 | a 147 | 0.31 148 | b 149 | 0.670097 150 | g 151 | 0.670086 152 | r 153 | 0.670106 154 | 155 | Draws 156 | NO 157 | 158 | shadow 159 | 160 | Draws 161 | NO 162 | 163 | stroke 164 | 165 | Pattern 166 | 1 167 | 168 | 169 | Text 170 | 171 | VerticalPad 172 | 0 173 | 174 | 175 | 176 | ID 177 | 96 178 | 179 | 180 | Class 181 | Group 182 | Graphics 183 | 184 | 185 | AllowConnections 186 | NO 187 | AllowLabelDrop 188 | 189 | AllowToConnect 190 | 191 | Class 192 | LineGraphic 193 | ID 194 | 86 195 | Points 196 | 197 | {118.85714285714283, 114.49999999999977} 198 | {195.99999726031507, 114.49999999999977} 199 | 200 | Style 201 | 202 | stroke 203 | 204 | HeadArrow 205 | 0 206 | Legacy 207 | 208 | TailArrow 209 | FilledBall 210 | TailScale 211 | 0.5 212 | 213 | 214 | 215 | 216 | Bounds 217 | {{52, 43.142857142856883}, {144, 144}} 218 | Class 219 | ShapedGraphic 220 | ID 221 | 87 222 | Shape 223 | Circle 224 | Style 225 | 226 | fill 227 | 228 | Color 229 | 230 | a 231 | 0.31 232 | b 233 | 0.670097 234 | g 235 | 0.670086 236 | r 237 | 0.670106 238 | 
239 | 240 | shadow 241 | 242 | Draws 243 | NO 244 | 245 | stroke 246 | 247 | Pattern 248 | 1 249 | 250 | 251 | Text 252 | 253 | VerticalPad 254 | 0 255 | 256 | 257 | 258 | ID 259 | 85 260 | 261 | 262 | GridInfo 263 | 264 | GuidesLocked 265 | NO 266 | GuidesVisible 267 | YES 268 | HPages 269 | 1 270 | ImageCounter 271 | 2 272 | ImageLinkBack 273 | 274 | 275 | ApplicationURL 276 | http://www.omnigroup.com/applications/OmniGraffle 277 | appData 278 | 279 | Color 280 | 281 | w 282 | 1 283 | 284 | DocumentSettings 285 | 286 | ApplicationVersion 287 | 288 | com.omnigroup.OmniGraffle 289 | 139.18.0.187838 290 | 291 | CreationDate 292 | 2014-04-07 21:49:27 +0000 293 | Creator 294 | Greg Caporaso 295 | FileName 296 | cluster-types.graffle 297 | GraphDocumentVersion 298 | 8 299 | ModelCount 300 | 1 301 | ModelIndex 302 | 0 303 | ModificationDate 304 | 2014-04-08 02:08:05 +0000 305 | Modifier 306 | Greg Caporaso 307 | SheetTitle 308 | Canvas 1 309 | 310 | GraphicsList 311 | 312 | 313 | Bounds 314 | {{449.5, 311.01062596097904}, {142.97872340425533, 142.97872340425533}} 315 | Class 316 | ShapedGraphic 317 | ID 318 | 122 319 | LayerIndex 320 | 0 321 | Shape 322 | Circle 323 | Style 324 | 325 | fill 326 | 327 | Color 328 | 329 | a 330 | 0.31 331 | b 332 | 0.670097 333 | g 334 | 0.670086 335 | r 336 | 0.670106 337 | 338 | 339 | shadow 340 | 341 | Draws 342 | NO 343 | 344 | stroke 345 | 346 | Draws 347 | NO 348 | Pattern 349 | 1 350 | 351 | 352 | Text 353 | 354 | VerticalPad 355 | 0 356 | 357 | 358 | 359 | Layers 360 | 361 | 362 | Lock 363 | NO 364 | Name 365 | Layer 1 366 | Print 367 | YES 368 | View 369 | YES 370 | 371 | 372 | ZoomLevel 373 | 1 374 | 375 | bundleId 376 | com.omnigroup.OmniGraffle 377 | refresh 378 | 0.0 379 | serverAppName 380 | OmniGraffle 381 | serverName 382 | OmniGraffle 383 | version 384 | A 385 | 386 | 387 | ImageList 388 | 389 | image1.pdf 390 | 391 | KeepToScale 392 | 393 | Layers 394 | 395 | 396 | Lock 397 | NO 398 | Name 399 | Layer 1 400 | Print 401 | 
YES 402 | View 403 | YES 404 | 405 | 406 | LayoutInfo 407 | 408 | Animate 409 | NO 410 | circoMinDist 411 | 18 412 | circoSeparation 413 | 0.0 414 | layoutEngine 415 | dot 416 | neatoSeparation 417 | 0.0 418 | twopiSeparation 419 | 0.0 420 | 421 | LinksVisible 422 | NO 423 | MagnetsVisible 424 | NO 425 | MasterSheets 426 | 427 | ModificationDate 428 | 2014-04-08 02:15:07 +0000 429 | Modifier 430 | Greg Caporaso 431 | NotesVisible 432 | NO 433 | Orientation 434 | 2 435 | OriginVisible 436 | NO 437 | PageBreaks 438 | YES 439 | PrintInfo 440 | 441 | NSBottomMargin 442 | 443 | float 444 | 41 445 | 446 | NSHorizonalPagination 447 | 448 | coded 449 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG 450 | 451 | NSLeftMargin 452 | 453 | float 454 | 18 455 | 456 | NSPaperSize 457 | 458 | size 459 | {612.00002670288086, 792} 460 | 461 | NSPrintReverseOrientation 462 | 463 | int 464 | 0 465 | 466 | NSPrinter 467 | 468 | coded 469 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAlOU1ByaW50ZXIAhIQITlNPYmplY3QAhZKEhIQITlNTdHJpbmcBlIQBKxNDYW5vbiBNRjQ1MDAgU2VyaWVzhoY= 470 | 471 | NSPrinterName 472 | 473 | string 474 | Canon MF4500 Series 475 | 476 | NSRightMargin 477 | 478 | float 479 | 18 480 | 481 | NSTopMargin 482 | 483 | float 484 | 18 485 | 486 | 487 | PrintOnePage 488 | 489 | ReadOnly 490 | NO 491 | RowAlign 492 | 1 493 | RowSpacing 494 | 36 495 | SheetTitle 496 | Canvas 1 497 | SmartAlignmentGuidesActive 498 | YES 499 | SmartDistanceGuidesActive 500 | YES 501 | UniqueID 502 | 1 503 | UseEntirePage 504 | 505 | VPages 506 | 1 507 | WindowInfo 508 | 509 | CurrentSheet 510 | 0 511 | ExpandedCanvases 512 | 513 | 514 | name 515 | Canvas 1 516 | 517 | 518 | Frame 519 | {{210, 193}, {711, 872}} 520 | ListView 521 | 522 | OutlineWidth 523 | 142 524 | RightSidebar 525 | 526 | ShowRuler 527 | 528 | Sidebar 529 | 530 | SidebarWidth 531 | 120 532 | VisibleRegion 533 | {{0, 0}, {576, 733}} 534 | Zoom 535 | 1 536 | ZoomValues 537 | 538 | 539 | Canvas 1 540 | 1 541 | 
1 542 | 543 | 544 | 545 | 546 | 547 | -------------------------------------------------------------------------------- /book/fundamentals/images/cluster-template.graffle/image1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-template.graffle/image1.pdf -------------------------------------------------------------------------------- /book/fundamentals/images/cluster-types.graffle/image2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-types.graffle/image2.pdf -------------------------------------------------------------------------------- /book/fundamentals/images/cluster-types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/cluster-types.png -------------------------------------------------------------------------------- /book/fundamentals/images/furthest-neighbor.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.1.png -------------------------------------------------------------------------------- /book/fundamentals/images/furthest-neighbor.2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.2.png -------------------------------------------------------------------------------- /book/fundamentals/images/furthest-neighbor.3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.3.png -------------------------------------------------------------------------------- /book/fundamentals/images/furthest-neighbor.4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.4.png -------------------------------------------------------------------------------- /book/fundamentals/images/furthest-neighbor.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.5.png -------------------------------------------------------------------------------- /book/fundamentals/images/furthest-neighbor.6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.6.png -------------------------------------------------------------------------------- /book/fundamentals/images/furthest-neighbor.graffle/image1.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/furthest-neighbor.graffle/image1.pdf -------------------------------------------------------------------------------- /book/fundamentals/images/msa-tree-a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-a1.png -------------------------------------------------------------------------------- /book/fundamentals/images/msa-tree-a2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-a2.png -------------------------------------------------------------------------------- /book/fundamentals/images/msa-tree-a3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-a3.png -------------------------------------------------------------------------------- /book/fundamentals/images/msa-tree-final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-final.png -------------------------------------------------------------------------------- /book/fundamentals/images/msa-tree-input.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/msa-tree-input.png -------------------------------------------------------------------------------- /book/fundamentals/images/nearest-neighbor.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.1.png -------------------------------------------------------------------------------- /book/fundamentals/images/nearest-neighbor.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.2.png -------------------------------------------------------------------------------- /book/fundamentals/images/nearest-neighbor.3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.3.png -------------------------------------------------------------------------------- /book/fundamentals/images/nearest-neighbor.4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.4.png -------------------------------------------------------------------------------- /book/fundamentals/images/nearest-neighbor.5.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.5.png -------------------------------------------------------------------------------- /book/fundamentals/images/nearest-neighbor.6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/nearest-neighbor.6.png -------------------------------------------------------------------------------- /book/fundamentals/images/sequence-evo-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/sequence-evo-tree.png -------------------------------------------------------------------------------- /book/fundamentals/images/spider-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/spider-tree.png -------------------------------------------------------------------------------- /book/fundamentals/images/tree-monophyly.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/tree-monophyly.png -------------------------------------------------------------------------------- /book/fundamentals/images/tree-polyphyly.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/tree-polyphyly.png -------------------------------------------------------------------------------- /book/fundamentals/images/tree-schematic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/tree-schematic1.png -------------------------------------------------------------------------------- /book/fundamentals/images/upgma-tree-final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-final.png -------------------------------------------------------------------------------- /book/fundamentals/images/upgma-tree-iter1.graffle: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ActiveLayerIndex 6 | 0 7 | ApplicationVersion 8 | 9 | com.omnigroup.OmniGraffle 10 | 139.18.0.187838 11 | 12 | AutoAdjust 13 | 14 | BackgroundGraphic 15 | 16 | Bounds 17 | {{0, 0}, {576.00002670288086, 733}} 18 | Class 19 | SolidGraphic 20 | ID 21 | 2 22 | Style 23 | 24 | shadow 25 | 26 | Draws 27 | NO 28 | 29 | stroke 30 | 31 | Draws 32 | NO 33 | 34 | 35 | 36 | BaseZoom 37 | 0 38 | CanvasOrigin 39 | {0, 0} 40 | ColumnAlign 41 | 1 42 | ColumnSpacing 43 | 36 44 | CreationDate 45 | 2014-02-25 16:00:53 +0000 46 | Creator 47 | Greg Caporaso 48 | DisplayScale 49 | 1 0/72 in = 1.0000 in 50 | GraphDocumentVersion 51 | 8 52 | GraphicsList 53 | 54 | 55 | Bounds 56 | {{270.43200826644897, 213.91587829589844}, 
{24, 20}} 57 | Class 58 | ShapedGraphic 59 | FitText 60 | YES 61 | Flow 62 | Resize 63 | ID 64 | 342 65 | Magnets 66 | 67 | {0, 1} 68 | {0, -1} 69 | {1, 0} 70 | {-1, 0} 71 | 72 | Shape 73 | Rectangle 74 | Style 75 | 76 | fill 77 | 78 | Draws 79 | NO 80 | 81 | shadow 82 | 83 | Draws 84 | NO 85 | 86 | stroke 87 | 88 | Draws 89 | NO 90 | 91 | 92 | Text 93 | 94 | Text 95 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370 96 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 97 | {\colortbl;\red255\green255\blue255;} 98 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc 99 | 100 | \f0\fs20 \cf0 0.5} 101 | VerticalPad 102 | 4 103 | 104 | TextPlacement 105 | 0 106 | Wrap 107 | NO 108 | 109 | 110 | Bounds 111 | {{270, 259.5}, {24, 20}} 112 | Class 113 | ShapedGraphic 114 | FitText 115 | YES 116 | Flow 117 | Resize 118 | ID 119 | 341 120 | Magnets 121 | 122 | {0, 1} 123 | {0, -1} 124 | {1, 0} 125 | {-1, 0} 126 | 127 | Shape 128 | Rectangle 129 | Style 130 | 131 | fill 132 | 133 | Draws 134 | NO 135 | 136 | shadow 137 | 138 | Draws 139 | NO 140 | 141 | stroke 142 | 143 | Draws 144 | NO 145 | 146 | 147 | Text 148 | 149 | Text 150 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370 151 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 152 | {\colortbl;\red255\green255\blue255;} 153 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc 154 | 155 | \f0\fs20 \cf0 0.5} 156 | VerticalPad 157 | 4 158 | 159 | TextPlacement 160 | 0 161 | Wrap 162 | NO 163 | 164 | 165 | Bounds 166 | {{306, 262}, {25, 25}} 167 | Class 168 | ShapedGraphic 169 | FitText 170 | YES 171 | Flow 172 | Resize 173 | FontInfo 174 | 175 | Font 176 | Helvetica 177 | Size 178 | 10 179 | 180 | ID 181 | 337 182 | Magnets 183 | 184 | {0, 1} 185 | {0, -1} 186 | {1, 0} 187 | {-1, 0} 188 | 189 | Shape 190 | Rectangle 191 | Style 192 | 193 | fill 194 | 195 | Draws 196 | NO 197 | 198 | shadow 199 | 200 | Draws 201 
| NO 202 | 203 | stroke 204 | 205 | Draws 206 | NO 207 | 208 | 209 | Text 210 | 211 | Text 212 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370 213 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 214 | {\colortbl;\red255\green255\blue255;} 215 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc 216 | 217 | \f0\fs28 \cf0 s5} 218 | VerticalPad 219 | 4 220 | 221 | TextPlacement 222 | 0 223 | Wrap 224 | NO 225 | 226 | 227 | Bounds 228 | {{306, 216.5}, {25, 25}} 229 | Class 230 | ShapedGraphic 231 | FitText 232 | YES 233 | Flow 234 | Resize 235 | FontInfo 236 | 237 | Font 238 | Helvetica 239 | Size 240 | 10 241 | 242 | ID 243 | 336 244 | Magnets 245 | 246 | {0, 1} 247 | {0, -1} 248 | {1, 0} 249 | {-1, 0} 250 | 251 | Shape 252 | Rectangle 253 | Style 254 | 255 | fill 256 | 257 | Draws 258 | NO 259 | 260 | shadow 261 | 262 | Draws 263 | NO 264 | 265 | stroke 266 | 267 | Draws 268 | NO 269 | 270 | 271 | Text 272 | 273 | Text 274 | {\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370 275 | \cocoascreenfonts1{\fonttbl\f0\fswiss\fcharset0 Helvetica;} 276 | {\colortbl;\red255\green255\blue255;} 277 | \pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\qc 278 | 279 | \f0\fs28 \cf0 s4} 280 | VerticalPad 281 | 4 282 | 283 | TextPlacement 284 | 0 285 | Wrap 286 | NO 287 | 288 | 289 | Class 290 | LineGraphic 291 | ID 292 | 324 293 | Points 294 | 295 | {259.12761506276161, 277.23850901192492} 296 | {304.61715481171564, 277.23850901192492} 297 | {304.61715481171564, 277.23850901192492} 298 | 299 | Style 300 | 301 | stroke 302 | 303 | HeadArrow 304 | 0 305 | Legacy 306 | 307 | LineType 308 | 1 309 | TailArrow 310 | 0 311 | Width 312 | 2 313 | 314 | 315 | 316 | 317 | Class 318 | LineGraphic 319 | ID 320 | 325 321 | Points 322 | 323 | {257.86401673640177, 230.91587904250571} 324 | {303.35355648535568, 230.91587904250571} 325 | {303.35355648535568, 230.91587904250571} 326 | 327 | Style 328 | 
329 | stroke 330 | 331 | HeadArrow 332 | 0 333 | Legacy 334 | 335 | LineType 336 | 1 337 | TailArrow 338 | 0 339 | Width 340 | 2 341 | 342 | 343 | 344 | 345 | Class 346 | LineGraphic 347 | ID 348 | 326 349 | Points 350 | 351 | {257.86405464435143, 276.60395243700134} 352 | {257.86401673640171, 230.91587904250582} 353 | {257.86401673640171, 230.91587904250582} 354 | 355 | Rotation 356 | 90 357 | Style 358 | 359 | stroke 360 | 361 | HeadArrow 362 | 0 363 | Legacy 364 | 365 | LineType 366 | 1 367 | TailArrow 368 | 0 369 | Width 370 | 2 371 | 372 | 373 | 374 | 375 | GridInfo 376 | 377 | GuidesLocked 378 | NO 379 | GuidesVisible 380 | YES 381 | HPages 382 | 1 383 | ImageCounter 384 | 1 385 | KeepToScale 386 | 387 | Layers 388 | 389 | 390 | Lock 391 | NO 392 | Name 393 | Layer 1 394 | Print 395 | YES 396 | View 397 | YES 398 | 399 | 400 | LayoutInfo 401 | 402 | Animate 403 | NO 404 | circoMinDist 405 | 18 406 | circoSeparation 407 | 0.0 408 | layoutEngine 409 | dot 410 | neatoSeparation 411 | 0.0 412 | twopiSeparation 413 | 0.0 414 | 415 | LinksVisible 416 | NO 417 | MagnetsVisible 418 | NO 419 | MasterSheets 420 | 421 | ModificationDate 422 | 2014-02-25 16:08:12 +0000 423 | Modifier 424 | Greg Caporaso 425 | NotesVisible 426 | NO 427 | Orientation 428 | 2 429 | OriginVisible 430 | NO 431 | PageBreaks 432 | YES 433 | PrintInfo 434 | 435 | NSBottomMargin 436 | 437 | float 438 | 41 439 | 440 | NSHorizonalPagination 441 | 442 | coded 443 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG 444 | 445 | NSLeftMargin 446 | 447 | float 448 | 18 449 | 450 | NSPaperSize 451 | 452 | size 453 | {612.00002670288086, 792} 454 | 455 | NSPrintReverseOrientation 456 | 457 | int 458 | 0 459 | 460 | NSPrinter 461 | 462 | coded 463 | BAtzdHJlYW10eXBlZIHoA4QBQISEhAlOU1ByaW50ZXIAhIQITlNPYmplY3QAhZKEhIQITlNTdHJpbmcBlIQBKxNDYW5vbiBNRjQ1MDAgU2VyaWVzhoY= 464 | 465 | NSPrinterName 466 | 467 | string 468 | Canon MF4500 Series 469 | 470 | NSRightMargin 471 | 
472 | float 473 | 18 474 | 475 | NSTopMargin 476 | 477 | float 478 | 18 479 | 480 | 481 | PrintOnePage 482 | 483 | ReadOnly 484 | NO 485 | RowAlign 486 | 1 487 | RowSpacing 488 | 36 489 | SheetTitle 490 | Canvas 1 491 | SmartAlignmentGuidesActive 492 | YES 493 | SmartDistanceGuidesActive 494 | YES 495 | UniqueID 496 | 1 497 | UseEntirePage 498 | 499 | VPages 500 | 1 501 | WindowInfo 502 | 503 | CurrentSheet 504 | 0 505 | ExpandedCanvases 506 | 507 | 508 | name 509 | Canvas 1 510 | 511 | 512 | Frame 513 | {{410, 54}, {711, 872}} 514 | ListView 515 | 516 | OutlineWidth 517 | 142 518 | RightSidebar 519 | 520 | ShowRuler 521 | 522 | Sidebar 523 | 524 | SidebarWidth 525 | 120 526 | VisibleRegion 527 | {{0, 0}, {576, 733}} 528 | Zoom 529 | 1 530 | ZoomValues 531 | 532 | 533 | Canvas 1 534 | 1 535 | 1 536 | 537 | 538 | 539 | 540 | 541 | -------------------------------------------------------------------------------- /book/fundamentals/images/upgma-tree-iter1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-iter1.png -------------------------------------------------------------------------------- /book/fundamentals/images/upgma-tree-iter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-iter2.png -------------------------------------------------------------------------------- /book/fundamentals/images/upgma-tree-iter3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/fundamentals/images/upgma-tree-iter3.png -------------------------------------------------------------------------------- /book/fundamentals/index.md: -------------------------------------------------------------------------------- 1 | # Fundamentals 2 | -------------------------------------------------------------------------------- /book/fundamentals/index.yaml: -------------------------------------------------------------------------------- 1 | contents: 2 | - pairwise-alignment 3 | - database-searching 4 | - multiple-sequence-alignment 5 | - phylogeny-reconstruction 6 | - sequence-mapping-and-clustering 7 | - machine-learning 8 | -------------------------------------------------------------------------------- /book/fundamentals/machine-learning.md: -------------------------------------------------------------------------------- 1 | # Machine learning in bioinformatics (work-in-progress) 2 | 3 | **This chapter is currently a work-in-progress, and is incomplete.** 4 | 5 | Machine learning algorithms are commonly used in bioinformatics for a variety of tasks. Typically, the common thread in these tasks is that the user would like the algorithm to assist in the identification of patterns in a complex data set. In this chapter we'll implement a few machine learning algorithms so we can gain an in-depth understanding of how they work. In practice though, there are many mature machine learning libraries that you'd want to use. [scikit-learn](http://scikit-learn.org/) is a popular and well-documented Python library for machine learning which many bioinformatics researchers and software developers use in their work. 6 | 7 | These algorithms generally work beginning with a collection of samples and some user-defined features of those samples. 
These data are typically represented in a matrix, where samples are the rows and features are the columns. There are a few different high-level tasks that are common in machine learning, including classification, regression, and dimensionality reduction. In a classification task, a user provides examples of data that fall into certain discrete classes (for example, _healthy_ and _disease_), and tries to have the computer develop a model that can differentiate those classes based on the defined features. If successful, the resulting model could be applied to data where the class isn't known ahead of time, in an attempt to predict the class from the features. A regression task is similar, except that a continuous value will be predicted rather than a discrete value. Dimensionality reduction tasks, on the other hand, generally don't have classes or labels assigned ahead of time, and the user is hoping to identify which samples are most similar to each other based on new features that are defined by the algorithm. The goal here might be to reduce the number of features from thousands or more to around two or three that explain most of the variation in the data. This allows the user to explore the samples visually, for example in a scatter plot, which would not be feasible if there were thousands of features. 8 | 9 | In this chapter we'll explore two classification algorithms and one dimensionality reduction task in the context of some real-world examples. 10 | 11 | ## Defining a classification problem 12 | 13 | We'll explore machine learning classifiers in the context of a familiar topic: taxonomic classification of 16S rRNA sequences. We previously explored this problem in [Sequence Homology Searching](alias://d22e6b), so it is likely worth spending a few minutes skimming that chapter if it's not fresh in your mind. 14 | 15 | Briefly, the problem that we are going to address here is as follows.
We have a query sequence ($q_i$) which is not taxonomically annotated (meaning we don't know the taxonomy of the organism whose genome it is found in), and a reference database ($R$) of taxonomically annotated sequences ($r_1, r_2, r_3, ... r_n$). We want to infer a taxonomic annotation for $q_i$. We'll again work with the [Greengenes](http://greengenes.secondgenome.com/) database, which we'll access using the [QIIME default reference project](https://github.com/biocore/qiime-default-reference). Greengenes is a database of 16S rRNA gene sequences. (This should all sound very familiar - if not, I again suggest that you review [Sequence Homology Searching](alias://d22e6b).) 16 | 17 | This time, instead of using sequence alignment to identify the most likely taxonomic origin of a sequence, we'll train classifiers by building [kmer](alias://C7hMX5)-based models of the 16S sequences of taxa in our reference database. We'll then run our query sequences through those models to identify the most likely taxonomic origin of each query sequence. Since we know the taxonomic origin of our query sequences in this case, we can evaluate the accuracy of our classifiers by seeing how often they return the known taxonomy assignment. If our training and testing approaches are well-designed, the performance on our tests will inform us of how accurate we can expect our classifier to be on data where the actual taxonomic origin is unknown. 18 | 19 | Let's jump in... 20 | 21 | ### Naive Bayes classifiers 22 | 23 | The first classifier we'll explore is the popular and relatively simple Naive Bayes classifier. This classifier uses Bayes' Theorem to determine the most likely label for an unknown input based on a probabilistic model it has constructed from training data. (_The preceding text needs work._) The model that is constructed is based on user-defined features of the sequences.
The most commonly used features for sequence classification tasks such as this are overlapping [kmers](alias://C7hMX5). 24 | 25 | We'll begin by importing some libraries that we'll use in this chapter, and then [preparing our reference database and query sequences as we did previously](alias://gAKBxE). 26 | 27 | ```python 28 | >>> %pylab inline 29 | ... 30 | >>> from IPython.core import page 31 | >>> page.page = print 32 | ... 33 | >>> import pandas as pd 34 | >>> import skbio 35 | >>> import numpy as np 36 | >>> import itertools 37 | >>> import collections 38 | ``` 39 | 40 | ```python 41 | >>> from iab.algorithms import load_taxonomy_reference_database 42 | ... 43 | >>> %psource load_taxonomy_reference_database 44 | ``` 45 | 46 | ```python 47 | >>> reference_taxonomy, reference_db = load_taxonomy_reference_database() 48 | ``` 49 | 50 | ```python 51 | >>> reference_db[0] 52 | ``` 53 | 54 | ```python 55 | >>> reference_db[-1] 56 | ``` 57 | 58 | We'll select a random subset of the reference database to work with here. 59 | 60 | ```python 61 | >>> reference_db = np.random.choice(reference_db, 500, replace=False) 62 | >>> print("%s sequences are present in the subsampled database." % len(reference_db)) 63 | ``` 64 | 65 | The first thing our Naive Bayes classifier will need is the set of all possible words of length ``k``. This will be dependent on the value of ``k`` and the characters in our alphabet (i.e., the characters that we should expect to find in the reference database). This set is referred to as ``W``, and can be computed as follows. Given the following alphabet, how many kmers of length 2 are there (i.e., 2-mers)? How many 7-mers are there? How many 7-mers are there if there are twenty characters in our alphabet (as would be the case if we were working with protein sequences instead of DNA sequences)? 66 | 67 | ```python 68 | >>> alphabet = skbio.DNA.nondegenerate_chars 69 | >>> k = 2 70 | ...
71 | >>> def compute_W(alphabet, k): 72 | >>> return set(map(''.join, itertools.product(alphabet, repeat=k))) 73 | ... 74 | >>> W = compute_W(alphabet, k) 75 | >>> print('Alphabet contains the characters: %s' % ', '.join(alphabet)) 76 | >>> print('For an alphabet size of %d, W contains %d length-%d kmers.' % (len(alphabet), len(W), k)) 77 | ``` 78 | 79 | scikit-bio provides methods for identifying all kmers in a ``skbio.DNA`` sequence object, and for computing the kmer frequencies. This information can be obtained for one of our reference sequences as follows: 80 | 81 | ```python 82 | >>> kmers = reference_db[0].iter_kmers(k=k) 83 | >>> for kmer in kmers: 84 | ... print(kmer, end=' ') 85 | ``` 86 | 87 | ```python 88 | >>> print(reference_db[0].kmer_frequencies(k=k)) 89 | ``` 90 | 91 | This information can be convenient to store in a pandas ``Series`` object: 92 | 93 | ```python 94 | >>> pd.Series(reference_db[0].kmer_frequencies(k=k), name=reference_db[0].metadata['id']) 95 | ``` 96 | 97 | To train our taxonomic classifier, we next need to define a few things. First, at what level of taxonomic specificity do we want to classify our sequences? We should expect to achieve higher accuracy at less specific taxonomic levels such as phylum or class, but these are likely to be less informative biologically than more specific levels such as genus or species. Let's start classifying at the phylum level to keep our task simple, since we're working with a small subset of the reference database here. In Greengenes, phylum is the second level of the taxonomy. 98 | 99 | Next, how long should our kmers be? We don't have a good idea of this to start with. The longer our kmers, the more likely they are to be specific to certain taxa, which is good because that will help with classification. 
However, if they get too long it becomes less likely that we'll observe those kmers in sequences that aren't represented in our database because the longer the sequence is the more likely we are to see variation across other organisms that are assigned to the same taxonomy. Based on some of my own work in this area, I'll start us out with 7-mers (i.e., kmers of length 7). 100 | 101 | Finally, we'll need to know the value of `W`, defined above as the set of all possible kmers given our alphabet and the value of `k`. 102 | 103 | As an exercise, I recommend exploring the impact of the value of `k` and `taxonomic_level` on the accuracy of our classifier after reading this chapter. 104 | 105 | ```python 106 | >>> taxonomic_level = 2 107 | >>> k = 7 108 | >>> alphabet = skbio.DNA.nondegenerate_chars 109 | ``` 110 | 111 | Next, we'll compute a table of the per-sequence kmer counts for all kmers in `W` for all sequences in our reference database. We'll also store the taxonomic label of each of our reference sequences at our specified taxonomic level. We can store this information in a pandas `DataFrame`, and then view the first 25 rows of that table. 112 | 113 | ```python 114 | >>> def get_taxon_at_level(taxon, level): 115 | ... taxon = [l.strip() for l in taxon.split(';')] 116 | ... return '; '.join(taxon[:level]) 117 | ... 118 | >>> W = compute_W(alphabet, k) 119 | ... 120 | >>> per_sequence_kmer_counts = [] 121 | >>> for reference_sequence in reference_db: 122 | ... taxon = get_taxon_at_level(reference_sequence.metadata['taxonomy'], taxonomic_level) 123 | ... kmer_counts = dict.fromkeys(W, 0) 124 | ... kmer_counts.update(reference_sequence.kmer_frequencies(k=k)) 125 | ... per_sequence_kmer_counts.append(pd.Series(kmer_counts, name=taxon)) 126 | ... 
127 | >>> per_sequence_kmer_counts = pd.DataFrame(data=per_sequence_kmer_counts).fillna(0).T 128 | >>> per_sequence_kmer_counts[:25] 129 | ``` 130 | 131 | With this information, we'll next compute our "kmer probability table" (EXISTING NAME FOR THIS?). The content of this table will be the probability of observing each kmer in W given a taxon. This is computed based on a few values: 132 | 133 | $N$ : The total number of sequences in the training set. 134 | 135 | $n(w_i)$ : The total number of sequences containing kmer _i_. 136 | 137 | $P_i$ : The probability of observing kmer _i_. Initially it might seem as though this would be computed as $n(w_i) / N$, but this neglects the possibility that a kmer observed in a query sequence might not be represented in our reference database, so a small pseudocount is added to the numerator and denominator. 138 | 139 | $P(w_i | taxon)$ : The probability of observing a kmer given a taxon. Again, it would seem that this would be computed as the proportion of sequences in the taxon containing the kmer, but this would neglect that we'll likely observe kmers in our query sequences that are not represented in our reference database. A pseudocount is therefore added again to the numerator and denominator. This time the pseudocount in the numerator is scaled by how frequent the kmer is in the reference database as a whole: specifically, it is $P_i$. 140 | 141 | Our "kmer probability table" is $P(w_i | taxon)$ computed for all kmers in W and all taxa represented in our reference database. We'll compute that and again look at the first 25 rows. 142 | 143 | ```python 144 | >>> def compute_kmer_probability_table(per_sequence_kmer_counts): 145 | ... N = len(per_sequence_kmer_counts) # number of training sequences 146 | ... 147 | ... # number of sequences containing kmer wi 148 | ... n_wi = per_sequence_kmer_counts.astype(bool).sum(axis=1) 149 | ... n_wi.name = 'n(w_i)' 150 | ... 151 | ... # probabilities of observing each kmer 152 | ...
Pi = (n_wi + 0.5) / (N + 1) 153 | ... Pi.name = 'P_i' 154 | ... 155 | ... # number of times each taxon appears in training set 156 | ... taxon_counts = collections.Counter(per_sequence_kmer_counts.columns) 157 | ... n_taxon_members_containing_kmer = per_sequence_kmer_counts.astype(bool).groupby(level=0, axis=1).sum() 158 | ... 159 | ... # probabilities of observing each kmer in each taxon 160 | ... p_wi_t = [] 161 | ... for taxon, count in taxon_counts.items(): 162 | ... p_wi_t.append(pd.Series((n_taxon_members_containing_kmer[taxon] + Pi) / (count + 1), name=taxon)) 163 | ... 164 | ... return pd.DataFrame(p_wi_t).T 165 | ``` 166 | 167 | ```python 168 | >>> kmer_probability_table = compute_kmer_probability_table(per_sequence_kmer_counts) 169 | ``` 170 | 171 | ```python 172 | >>> kmer_probability_table[:25] 173 | ``` 174 | 175 | With our kmer probability table we are now ready to classify unknown sequences. We'll begin by defining some query sequences. We'll pull these at random from our reference sequences, which means that some of the query sequences will be represented in our reference database and some won't be. This is the situation that is typically encountered in practice. To simulate real-world 16S taxonomy classification tasks, we'll also trim out 200 bases of our reference sequences since (as of this writing) we typically don't obtain full-length 16S sequences from a DNA sequencing instrument. 176 | 177 | ```python 178 | >>> from iab.algorithms import load_taxonomy_query_sequences 179 | ... 180 | >>> %psource load_taxonomy_query_sequences 181 | ``` 182 | 183 | ```python 184 | >>> import random 185 | ... 186 | >>> queries = load_taxonomy_query_sequences() 187 | >>> queries = random.sample(queries, k=50) 188 | ``` 189 | 190 | ```python 191 | >>> queries[0] 192 | ``` 193 | 194 | For a given query sequence, its taxonomy will be classified as follows. First, the set of all kmers will be extracted from the sequence. This is referred to as $V$. 
Then, for all taxa in the kmer probability table, the probability of observing the query sequence will be computed given that taxon: $P(query | taxon)$. This is computed as the product of all its kmer probabilities for the given taxon. (It should be clear based on this formula why it was necessary to add pseudocounts when computing our kmer probability table - if not, kmer probabilities of zero would result in a zero probability of the sequence being derived from that taxon at this step.) 195 | 196 | After computing $P(query | taxon)$ for all taxa, the taxonomy assignment returned is simply the one achieving the maximum probability. Here we'll classify a sequence and look at the resulting taxonomy assignment. 197 | 198 | ```python 199 | >>> def classify_V(V, kmer_probability_table): 200 | ... P_S_t = [] # probability of the sequence given the taxon 201 | ... for taxon in kmer_probability_table: 202 | ... kmer_probabilities = kmer_probability_table[taxon] 203 | ... probability = 1.0 204 | ... for v_i in V: 205 | ... probability *= kmer_probabilities[v_i] 206 | ... P_S_t.append((probability, taxon)) 207 | ... return max(P_S_t)[1], V 208 | ... 209 | >>> def classify_sequence(query_sequence, kmer_probability_table, k): 210 | ... V = list(map(str, query_sequence.iter_kmers(k=k))) 211 | ... return classify_V(V, kmer_probability_table) 212 | ``` 213 | 214 | ```python 215 | >>> taxon_assignment, V = classify_sequence(queries[0], kmer_probability_table, k) 216 | >>> print(taxon_assignment) 217 | ``` 218 | 219 | Since we know the actual taxonomy assignment for this sequence, we can look that up in our reference database. Was your assignment correct? Try this with a few query sequences and keep track of how many times the classifier achieved the correct assignment. 
220 | 221 | ```python 222 | >>> get_taxon_at_level(reference_taxonomy[queries[0].metadata['id']], taxonomic_level) 223 | ``` 224 | 225 | Because the query and reference sequences that we're working with were randomly selected from the full reference database, each time you run this notebook you should observe different results. Chances are however that if you run the above steps multiple times you'll get the wrong taxonomy assignment at least some of the time. Up to this point, we've left out an important piece of information: how confident should we be in our assignment, or in other words, how dependent is our taxonomy assignment on our specific query? If there were slight differences in our query (e.g., because we observed a very closely related organism, such as one of the same species but a different strain, or because we sequenced a different region of the 16S sequence) would we obtain the same taxonomy assignment? If so, we should have higher confidence in our assignment. If not, we should have lower confidence in our assignment. 226 | 227 | We can quantify confidence using an approach called bootstrapping. With a bootstrap approach, we'll get our taxonomy assignment as we did above, but then for some user-specified number of times, we'll create random subsets of V sampled with replacement (DEFINE THIS). We'll then assign taxonomy to each random subset of V, and count the number of times the resulting taxonomy assignment is the same as the one we achieved when assigning taxonomy to V. The count divided by the number of iterations we've chosen to run will be our confidence value. If the assignments are often the same we'll have a high confidence value. If the assignments are often different, we'll have a low confidence value. 228 | 229 | Let's now assign taxonomy and compute a confidence for that assignment. 230 | 231 | ```python 232 | >>> def classify_sequence_with_confidence(sequence, kmer_probability_table, k, 233 | ... confidence_iterations=100): 234 | ... 
taxon, V = classify_sequence(sequence, kmer_probability_table, k) 235 | ... 236 | ... count_same_taxon = 0 237 | ... subsample_size = int(len(V) * 0.1) 238 | ... for i in range(confidence_iterations): 239 | ... subsample_V = np.random.choice(V, subsample_size, replace=True) 240 | ... subsample_taxon, _ = classify_V(subsample_V, kmer_probability_table) 241 | ... if taxon == subsample_taxon: 242 | ... count_same_taxon += 1 243 | ... confidence = count_same_taxon / confidence_iterations 244 | ... 245 | ... return (taxon, confidence) 246 | ``` 247 | 248 | ```python 249 | >>> taxon_assignment, confidence = classify_sequence_with_confidence(queries[0], kmer_probability_table, k) 250 | >>> print(taxon_assignment) 251 | >>> print(confidence) 252 | ``` 253 | 254 | How did the computed confidence compare to the accuracy of the taxonomy assignment? 255 | 256 | We don't have an _a priori_ idea for what good versus bad confidence scores are, but we can use our reference database to explore that. We might want this information so we can come up with a confidence threshold, above which we would accept a taxonomy assignment and below which we might reject it. To explore this, let's compute taxonomy assignments and confidence for all of our query sequences and then see what the distributions of confidence scores look like for correct assignments and incorrect assignments. 257 | 258 | ```python 259 | >>> correct_assignment_confidences = [] 260 | >>> incorrect_assignment_confidences = [] 261 | >>> summary = [] 262 | ... 263 | >>> for query in queries: 264 | ... predicted_taxonomy, confidence = classify_sequence_with_confidence(query, kmer_probability_table, k) 265 | ... actual_taxonomy = get_taxon_at_level(reference_taxonomy[query.metadata['id']], taxonomic_level) 266 | ... if actual_taxonomy == predicted_taxonomy: 267 | ... correct_assignment_confidences.append(confidence) 268 | ... else: 269 | ... incorrect_assignment_confidences.append(confidence) 270 | ... 271 | ... 
summary.append([predicted_taxonomy, actual_taxonomy, confidence]) 272 | >>> summary = pd.DataFrame(summary, columns=['Predicted taxonomy', 'Actual taxonomy', 'Confidence']) 273 | ``` 274 | 275 | ```python 276 | >>> import seaborn as sns 277 | ... 278 | >>> ax = sns.boxplot(data=[correct_assignment_confidences, incorrect_assignment_confidences]) 279 | >>> ax = sns.swarmplot(data=[correct_assignment_confidences, incorrect_assignment_confidences], color="black") 280 | >>> _ = ax.set_xticklabels(['Correct assignments', 'Incorrect assignments']) 281 | >>> _ = ax.set_ylabel('Confidence') 282 | ``` 283 | 284 | What does this plot tell you about how well setting a confidence threshold is likely to work? If you never wanted to reject a correct assignment, how often would you accept an incorrect assignment? If you never wanted to accept an incorrect assignment, how often would you reject a correct assignment? 285 | 286 | ```python 287 | >>> summary # maybe explore whether certain taxa are more frequently wrong than others... 288 | ``` 289 | 290 | ### Random Forest classifiers 291 | 292 | Coming soon... 293 | 294 | ## Defining a dimensionality reduction problem 295 | 296 | [This content](alias://b1cdbe) will be adapted and ported here. 297 | -------------------------------------------------------------------------------- /book/fundamentals/multiple-sequence-alignment.md: -------------------------------------------------------------------------------- 1 | 2 | # Generalized dynamic programming for multiple sequence alignment 3 | 4 | Until now we worked with alignments between two sequences, but it is likely that you will want to align many sequences at the same time. For example, if you are trying to gain insight on the evolutionary relationships between all of the 16S bacterial genes in a given sample, it would be time consuming and very inefficient to compare them two at a time. 
It would be more efficient and useful to compare all of the 16S sequences from the bacteria in the same alignment. 5 | In the pairwise sequence alignment chapter, we went over dynamic programming algorithms. It's possible to generalize Smith-Waterman and Needleman-Wunsch, the dynamic programming algorithms that we explored for pairwise sequence alignment, to identify the optimal alignment of more than two sequences. Remember that our scoring scheme for pairwise alignment with Smith-Waterman looked like the following: 6 | 7 | $$ 8 | \begin{align} 9 | & F(0, 0) = 0\\ 10 | & F(i, 0) = F(i-1, 0) - d\\ 11 | & F(0, j) = F(0, j-1) - d\\ 12 | \\ 13 | & F(i, j) = max \begin{pmatrix} 14 | & F(i-1, j-1) + s(c_i, c_j)\\ 15 | & F(i-1, j) - d\\ 16 | & F(i, j-1) - d\\ 17 | \end{pmatrix} 18 | \end{align} 19 | $$ 20 | 21 | To generalize this to three sequences, we could create three-dimensional scoring, dynamic programming, and traceback matrices. Our scoring scheme would then look like the following: 22 | 23 | $$ 24 | \begin{align} 25 | & F(0, 0, 0) = 0\\ 26 | & F(i, 0, 0) = F(i-1, 0, 0) - d\\ 27 | & F(0, j, 0) = F(0, j-1, 0) - d\\ 28 | & F(0, 0, k) = F(0, 0, k-1) - d\\ 29 | \\ 30 | & F(i, j, k) = max \begin{pmatrix} 31 | F(i-1, j-1, k-1) + s(c_i, c_j) + s(c_i, c_k) + s(c_j, c_k)\\ 32 | F(i, j-1, k-1) + s(c_j, c_k) - d\\ 33 | F(i-1, j, k-1) + s(c_i, c_k) - d\\ 34 | F(i-1, j-1, k) + s(c_i, c_j) - d\\ 35 | F(i, j, k-1) - 2d\\ 36 | F(i, j-1, k) - 2d\\ 37 | F(i-1, j, k) - 2d\\ 38 | \end{pmatrix} 39 | \end{align} 40 | $$ 41 | 42 | However the complexity of this algorithm is much worse than for pairwise alignment. For pairwise alignment, remember that if aligning two sequences of lengths $m$ and $n$, the runtime of the algorithm will be proportional to $m \times n$. If $n$ is longer than or as long as $m$, we simplify the statement to say that the runtime of the algorithm will be proportional to $n^2$. 
This curve has a pretty scary trajectory: runtime for pairwise alignment with dynamic programming is said to scale quadratically. 43 | 44 | ```python 45 | >>> %pylab inline 46 | >>> from functools import partial 47 | >>> from IPython.core import page 48 | >>> page.page = print 49 | ``` 50 | 51 | ```python 52 | >>> import matplotlib.pyplot as plt 53 | ... 54 | >>> seq_lengths = range(25) 55 | >>> s2_times = [t ** 2 for t in range(25)] 56 | ... 57 | >>> plt.plot(range(25), s2_times) 58 | >>> plt.xlabel('Sequence Length') 59 | >>> plt.ylabel('Runtime (s)') 60 | ``` 61 | 62 | The exponent in the $n^2$ term comes from the fact that, in pairwise alignment, if we assume our sequences are both of length $n$, there are $n \times n$ cells to fill in in the dynamic programming matrix. If we were to generalize either Smith-Waterman or Needleman-Wunsch to three sequences, we would need to create a 3 dimensional array to score and trace back the alignment. For sequences of length $n$, we would therefore have $n \times n \times n$ cells to fill in, and our runtime versus sequence length curve would look like the following. 63 | 64 | ```python 65 | >>> s3_times = [t ** 3 for t in range(25)] 66 | ... 67 | >>> plt.plot(range(25), s3_times) 68 | >>> plt.xlabel('Sequence Length') 69 | >>> plt.ylabel('Runtime (s)') 70 | ``` 71 | 72 | That curve looks steeper than the curve for pairwise alignment, and the values on the y-axis are bigger, but it's not really clear how much of a problem this is until we plot runtime for three sequences in the context of the run times for pairwise alignment. 73 | 74 | ```python 75 | >>> plt.plot(range(25), s2_times) 76 | >>> plt.plot(range(25), s3_times) 77 | >>> plt.xlabel('Sequence Length') 78 | >>> plt.ylabel('Runtime (s)') 79 | ``` 80 | 81 | And for four sequences: 82 | 83 | ```python 84 | >>> s4_times = [t ** 4 for t in range(25)] 85 | ... 
86 | >>> plt.plot(range(25), s2_times) 87 | >>> plt.plot(range(25), s3_times) 88 | >>> plt.plot(range(25), s4_times) 89 | >>> plt.xlabel('Sequence Length') 90 | >>> plt.ylabel('Runtime (s)') 91 | ``` 92 | 93 | We clearly have a problem here, and that is that the runtime for multiple sequence alignment using full dynamic programming algorithms grows exponentially with the number of sequences to be aligned. If $n$ is our sequence length, and $s$ is the number of sequences, that means that runtime is proportional to $n^s$. In pairwise alignment, $s$ is always equal to 2, so the problem is more manageable. However, for the general case of $s$ sequences, we really can't even consider Smith-Waterman or Needleman-Wunsch for more than just a few sequences. The pattern in the plots above should illustrate why. 94 | 95 | As we explored with database searching, we need to figure out how to align fewer sequences. This is where *progressive alignment* comes in. 96 | 97 | ## Progressive alignment 98 | 99 | In progressive alignment, the problem of exponential growth of runtime and space is managed by selectively aligning pairs of sequences, and aligning alignments of sequences. What we typically do is identify a pair of closely related sequences, and align those. Then, we identify the next most closely related sequence to that initial pair, and align that sequence to the alignment. This concept of aligning a sequence to an alignment is new, and we'll come back to it in just a few minutes. The other concept of identifying the most closely related sequences, and then the next most closely related sequence, and so on should sound familiar. It effectively means that we're traversing a tree. And herein lies our problem: we need a tree to efficiently align multiple sequences, but we need an alignment to build a good tree. 100 | 101 | You probably have two burning questions in your mind right now: 102 | 103 | 1. 
How do we build a tree to guide the alignment process, if we need an alignment to build a good tree? 104 | 2. How do we align a sequence to an alignment, or an alignment to an alignment? 105 | 106 | We'll explore both of those throughout the rest of this notebook. First, let's cover the process of progressive multiple sequence alignment, just assuming for a moment that we know how to do both of those things. 107 | 108 | The process of progressive multiple sequence alignment could look like the following. First, we start with some sequences and a tree representing the relationship between those sequences. We'll call this our guide tree, because it's going to guide us through the process of multiple sequence alignment. In progressive multiple sequence alignment, we build a multiple sequence alignment for each internal node of the tree, where the alignment at a given internal node contains all of the sequences in the clade defined by that node. 109 | 110 | 111 | 112 | Starting from the root node, descend the bottom branch of the tree until you get to an internal node. If an alignment hasn't been constructed for that node yet, continue descending the tree until you get to a pair of nodes. In this case, we follow the two branches to the tips. We then align the sequences at that pair of tips (usually with Needleman-Wunsch, for multiple sequence alignment), and assign that alignment to the node connecting those tips. 113 | 114 | 115 | 116 | Next, we want to find what to align the resulting alignment to, so start from the root node and descend the top branch of the tree. When you get to the next node, determine if an alignment has already been created for that node. If not, our job is to build that alignment so we have something to align against. In this case, that means that we need to align `s1`, `s2`, and `s3`. We can achieve this by aligning `s1` and `s3` first, to get the alignment at the internal node connecting them. 
117 | 118 | 119 | 120 | We can next align the alignment of `s1` and `s3` with `s2`, to get the alignment at the internal node connecting those clades. 121 | 122 | 123 | 124 | And finally, we can compute the alignment at the root node of the tree, by aligning the alignment of `s1`, `s2`, and `s3` with the alignment of `s4` and `s5`. 125 | 126 | 127 | 128 | The alignment at the root node is our multiple sequence alignment. 129 | 130 | ### Building the guide tree 131 | 132 | Let's address the first of our outstanding questions. I mentioned above that *we need an alignment to build a good tree*. The key word here is *good*. We can build a very rough tree - one that we would never want to present as representing the actual relationships between the sequences in question - without first aligning the sequences. Remember that building a UPGMA tree requires only a distance matrix, so if we can find a non-alignment-dependent way to compute distances between the sequences, we can build a rough UPGMA tree from them. 133 | 134 | Let's compute distances between the sequences based on their *word* composition. We'll define a *word* here as `k` adjacent characters in the sequence. We can then define a function that will return all of the words in a sequence as follows. These words can be defined as being overlapping, or non-overlapping. We'll go with overlapping for this example, as the more words we have, the better our guide tree should be. 135 | 136 | ```python 137 | >>> from skbio import DNA 138 | >>> %psource DNA.iter_kmers 139 | ``` 140 | 141 | ```python 142 | >>> for e in DNA("ACCGGTGACCAGTTGACCAGTA").iter_kmers(3): 143 | ... print(e) 144 | ``` 145 | 146 | ```python 147 | >>> for e in DNA("ACCGGTGACCAGTTGACCAGTA").iter_kmers(7): 148 | ... print(e) 149 | ``` 150 | 151 | ```python 152 | >>> for e in DNA("ACCGGTGACCAGTTGACCAGTA").iter_kmers(3, overlap=False): 153 | ... 
print(e) 154 | ``` 155 | 156 | If we then have two sequences, we can compute the word counts for each and define a distance between the sequences as the fraction of words that are unique to either sequence. 157 | 158 | ```python 159 | >>> from iab.algorithms import kmer_distance 160 | >>> %psource kmer_distance 161 | ``` 162 | 163 | We can then use this as a distance function... 164 | 165 | ```python 166 | >>> s1 = DNA("ACCGGTGACCAGTTGACCAGT") 167 | >>> s2 = DNA("ATCGGTACCGGTAGAAGT") 168 | >>> s3 = DNA("GGTACCAAATAGAA") 169 | ... 170 | >>> print(s1.distance(s2, kmer_distance)) 171 | >>> print(s1.distance(s3, kmer_distance)) 172 | ``` 173 | 174 | If we wanted to override the default to create (for example) a 5-mer distance function, we could use ``functools.partial``. 175 | 176 | ```python 177 | >>> fivemer_distance = partial(kmer_distance, k=5) 178 | ... 179 | >>> s1 = DNA("ACCGGTGACCAGTTGACCAGT") 180 | >>> s2 = DNA("ATCGGTACCGGTAGAAGT") 181 | >>> s3 = DNA("GGTACCAAATAGAA") 182 | ... 183 | >>> print(s1.distance(s2, fivemer_distance)) 184 | >>> print(s1.distance(s3, fivemer_distance)) 185 | ``` 186 | 187 | We can now apply one of these functions to build a distance matrix for a set of sequences that we want to align. 188 | 189 | ```python 190 | >>> query_sequences = [DNA("ACCGGTGACCAGTTGACCAGT", {"id": "s1"}), 191 | ... DNA("ATCGGTACCGGTAGAAGT", {"id": "s2"}), 192 | ... DNA("GGTACCAAATAGAA", {"id": "s3"}), 193 | ... DNA("GGCACCAAACAGAA", {"id": "s4"}), 194 | ... DNA("GGCCCACTGAT", {"id": "s5"})] 195 | ``` 196 | 197 | ```python 198 | >>> from skbio import DistanceMatrix 199 | ... 200 | >>> guide_dm = DistanceMatrix.from_iterable(query_sequences, metric=kmer_distance, key='id') 201 | ``` 202 | 203 | scikit-bio also has some basic visualization functionality for these objects. For example, we can easily visualize this object as a heatmap. 
204 | 205 | ```python 206 | >>> fig = guide_dm.plot(cmap='Greens') 207 | ``` 208 | 209 | We can next use some functionality from SciPy to cluster the sequences with UPGMA, and print out a dendrogram. 210 | 211 | ```python 212 | >>> from scipy.cluster.hierarchy import average, dendrogram, to_tree 213 | ... 214 | >>> for q in query_sequences: 215 | ... print(q) 216 | ... 217 | >>> guide_lm = average(guide_dm.condensed_form()) 218 | >>> guide_d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right', 219 | ... link_color_func=lambda x: 'black') 220 | >>> guide_tree = to_tree(guide_lm) 221 | ``` 222 | 223 | ```python 224 | >>> from iab.algorithms import guide_tree_from_sequences 225 | >>> %psource guide_tree_from_sequences 226 | ``` 227 | 228 | ```python 229 | >>> t = guide_tree_from_sequences(query_sequences, display_tree=True) 230 | ``` 231 | 232 | We now have a guide tree, so we can move on to the next step of progressive alignment. 233 | 234 | ### Generalization of Needleman-Wunsch (with affine gap scoring) for progressive multiple sequence alignment 235 | 236 | Next, we'll address our second burning question: aligning alignments. As illustrated above, there are basically three different types of pairwise alignment we need to support for progressive multiple sequence alignment with Needleman-Wunsch. These are: 237 | 238 | 1. Alignment of a pair of sequences. 239 | 2. Alignment of a sequence and an alignment. 240 | 3. Alignment of a pair of alignments. 241 | 242 | Standard Needleman-Wunsch supports the first, and it is very easy to generalize it to support the latter two. The only change that is necessary is in how the alignment of two non-gap characters is scored. Recall that we previously scored an alignment of two characters by looking up the score of substitution from one to the other in a substitution matrix. 
To adapt this for aligning a sequence to an alignment, or for aligning an alignment to an alignment, we compute this substitution as the average score of aligning the pairs of characters. 243 | 244 | For example, if we want to align the alignment column from $aln1$: 245 | 246 | ``` 247 | A 248 | C 249 | ``` 250 | 251 | to the alignment column from $aln2$: 252 | 253 | ``` 254 | T 255 | G 256 | ``` 257 | 258 | we could compute the substitution score using the matrix $m$ as: 259 | 260 | $$ 261 | s = \frac{m[A][T] + m[A][G] + m[C][T] + m[C][G]}{aln1_{length} \times aln2_{length}} 262 | $$ 263 | 264 | The following code adapts our implementation of Needleman-Wunsch to support aligning a sequence to an alignment, or aligning an alignment to an alignment. 265 | 266 | ```python 267 | >>> from iab.algorithms import format_dynamic_programming_matrix, format_traceback_matrix 268 | >>> from skbio.alignment._pairwise import _compute_score_and_traceback_matrices 269 | ... 270 | >>> %psource _compute_score_and_traceback_matrices 271 | ``` 272 | 273 | ```python 274 | >>> from skbio.alignment._pairwise import _traceback 275 | >>> %psource _traceback 276 | ``` 277 | 278 | ```python 279 | >>> from skbio.alignment import global_pairwise_align_nucleotide 280 | >>> %psource global_pairwise_align_nucleotide 281 | ``` 282 | 283 | For the sake of the examples below, I'm going to override one of the ``global_pairwise_align_nucleotide`` defaults to penalize terminal gaps. This effectively tells the algorithm that we know we have a collection of sequences that are homologous from beginning to end. 
284 | 285 | ```python 286 | >>> global_pairwise_align_nucleotide = partial(global_pairwise_align_nucleotide, penalize_terminal_gaps=True) 287 | ``` 288 | 289 | For example, we can still use this code to align pairs of sequences (but note that we now need to pass those sequences in as a pair of one-item lists): 290 | 291 | ```python 292 | >>> aln1, _, _ = global_pairwise_align_nucleotide(query_sequences[0], query_sequences[1]) 293 | >>> print(aln1) 294 | ``` 295 | 296 | We can align that alignment to one of our other sequences. 297 | 298 | ```python 299 | >>> aln1, _, _ = global_pairwise_align_nucleotide(aln1, query_sequences[2]) 300 | >>> print(aln1) 301 | ``` 302 | 303 | Alternatively, we can align another pair of sequences: 304 | 305 | ```python 306 | >>> aln2, _, _ = global_pairwise_align_nucleotide(query_sequences[2], query_sequences[3]) 307 | >>> print(aln2) 308 | ``` 309 | 310 | And then align that alignment against our previous alignment: 311 | 312 | ```python 313 | >>> aln3, _, _ = global_pairwise_align_nucleotide(aln1, aln2) 314 | >>> print(aln3) 315 | ``` 316 | 317 | ### Putting it all together: progressive multiple sequence alignment 318 | 319 | We can now combine all of these steps to take a set of query sequences, build a guide tree, perform progressive multiple sequence alignment, and return the guide tree (as a SciPy linkage matrix) and the alignment. 
320 | 321 | ```python 322 | >>> from skbio import TreeNode 323 | >>> guide_tree = TreeNode.from_linkage_matrix(guide_lm, guide_dm.ids) 324 | ``` 325 | 326 | We can view the guide tree in [Newick format](http://scikit-bio.org/docs/latest/generated/skbio.io.newick.html) as follows: 327 | 328 | ```python 329 | >>> print(guide_tree) 330 | ``` 331 | 332 | ```python 333 | >>> from iab.algorithms import progressive_msa 334 | >>> %psource progressive_msa 335 | ``` 336 | 337 | ```python 338 | >>> msa = progressive_msa(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, guide_tree=guide_tree) 339 | >>> print(msa) 340 | ``` 341 | 342 | We can now build a (hopefully) improved tree from our multiple sequence alignment. First we'll look at our original distance matrix again, and then the distance matrix generated from the progressive multiple sequence alignment. 343 | 344 | ```python 345 | >>> fig = guide_dm.plot(cmap='Greens') 346 | ``` 347 | 348 | ```python 349 | >>> msa_dm = DistanceMatrix.from_iterable(msa, metric=kmer_distance) 350 | >>> fig = msa_dm.plot(cmap='Greens') 351 | ``` 352 | 353 | The UPGMA trees that result from these alignments are very different. First we'll look at the guide tree, and then the tree resulting from the progressive multiple sequence alignment. 354 | 355 | ```python 356 | >>> d = dendrogram(guide_lm, labels=guide_dm.ids, orientation='right', 357 | ... link_color_func=lambda x: 'black') 358 | ``` 359 | 360 | ```python 361 | >>> msa_lm = average(msa_dm.condensed_form()) 362 | >>> d = dendrogram(msa_lm, labels=msa_dm.ids, orientation='right', 363 | ... 
link_color_func=lambda x: 'black') 364 | ``` 365 | 366 | And we can wrap this all up in a single convenience function: 367 | 368 | ```python 369 | >>> from iab.algorithms import progressive_msa_and_tree 370 | >>> %psource progressive_msa_and_tree 371 | ``` 372 | 373 | ```python 374 | >>> msa = progressive_msa(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, guide_tree=guide_tree) 375 | ``` 376 | 377 | ```python 378 | >>> msa, tree = progressive_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, 379 | ... display_tree=True, display_aln=True) 380 | ``` 381 | 382 | ## Progressive alignment versus iterative alignment 383 | 384 | In an iterative alignment, the output tree from the above progressive alignment is used as a guide tree, and the full process repeated. This is performed to reduce errors that result from a low-quality guide tree. 385 | 386 | ```python 387 | >>> from iab.algorithms import iterative_msa_and_tree 388 | >>> %psource iterative_msa_and_tree 389 | ``` 390 | 391 | ```python 392 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=1, display_aln=True, display_tree=True) 393 | ``` 394 | 395 | ```python 396 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=2, display_aln=True, display_tree=True) 397 | ``` 398 | 399 | ```python 400 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=3, display_aln=True, display_tree=True) 401 | ``` 402 | 403 | ```python 404 | >>> msa, tree = iterative_msa_and_tree(query_sequences, pairwise_aligner=global_pairwise_align_nucleotide, num_iterations=5, display_aln=True, display_tree=True) 405 | ``` 406 | 407 | Some references that I used in assembling these notes include [1](http://statweb.stanford.edu/~nzhang/345_web/sequence_slides3.pdf), 
[2](http://math.mit.edu/classes/18.417/Slides/alignment.pdf), [3](http://www.sciencedirect.com/science/article/pii/0378111988903307), [4](http://bioinformatics.oxfordjournals.org/content/23/21/2947.full), and [5](http://nar.oxfordjournals.org/content/32/5/1792.full). 408 | -------------------------------------------------------------------------------- /book/getting-started/biological-information.md: -------------------------------------------------------------------------------- 1 | # Biological Information 2 | 3 | Biological systems and computer systems are analogous in ways that may not be immediately apparent. Before we dive into using computers to study biology, let's briefly explore a relationship between the two: information processing is one of the most fundamental functions of both. 4 | 5 | ## Central Dogma of Molecular Biology 6 | 7 | The Central Dogma of Molecular Biology describes information flow in biological systems. It begins with DNA, a relatively long-lived information storage molecule, from which information typically flows in two directions: into new DNA molecules, during the process of replication, or into messenger RNA (mRNA), during the process of transcription. mRNA is a relatively short-lived molecule that transfers information that is used to synthesize protein molecules by the ribosome. Proteins are often thought of as the building blocks of life. They serve a variety of purposes, ranging from molecular machines such as transmembrane ion transporters, to structural molecules like myosin, a major component of muscle fibers. There are some uncommon circumstances where information flows differently, for example some viruses can reverse transcribe RNA to DNA, but proteins seem to be a terminal output of this information flow. Once a protein has been created, we are aware of no process that can work backwards to re-create the RNA or DNA that encoded it. 
8 | 9 | We'll revisit these ideas at the end of this chapter, but first let's establish some concepts that will help us to understand and even quantify information. These ideas have their roots in Boolean algebra and Information Theory. Bear with me while I introduce some concepts that may be new to you, and may initially seem unrelated. 10 | 11 |
12 | 13 |
Figure 1: The central dogma of molecular biology represents information flow in biological systems. Blue pathways are generally observed in cellular life. Red pathways are observed in special cases such as RNA viruses. (Figure attribution: Narayanese at English Wikipedia [Public domain], via Wikimedia Commons.)
14 |
15 |

16 | 17 | ## Binary and decimal numerical systems 18 | 19 | Humans most frequently use a _base 10_ or decimal numerical system for representing numbers. _Base 10_ means that there are ten available digits including zero. These are the digits 0, 1, 2, 3, 4, 5, 6, 7, 8, and 9. We represent numbers larger than 9 using multiple places: the _ones_ place, the _tens_ place, the _hundreds_ place, and so on. These are the powers of 10: the ones place is $10^{0}$, the tens place is $10^{1}$, the hundreds place is $10^{2}$, and so on. When we write a decimal number with multiple places, such as 42, what we're representing is a four in the tens place plus a two in the ones place, or $4 \times 10^{1} + 2 \times 10^{0} = 42$. 20 | 21 | You've probably heard that computers use a _base 2_ or binary numerical system to represent numbers. The _base_ again describes the number of available digits, so in a base 2 or binary system, there are two digits, 0 and 1. These are defined as the binary digits. As in the decimal system, numbers larger than 1 are represented using multiple places. The places in a binary number are again based on powers, but this time they are the powers of 2. Instead of a ones place, a tens place, and a hundreds place, the first three places in a binary number are the ones place ($2^0$), the _twos_ place ($2^1$), and the _fours_ place ($2^2$). Thus the interpretation of the binary number `011` is $0 \times 2^2 + 1 \times 2^1 + 1 \times 2^0 = 3$. 22 | 23 | When working with numbers that may be other than base 10, by convention numbers would be written as $(n)_b$, where $n$ is the number, and $b$ is the base of the number. For example, $(11)_{10}$ represents the decimal number 11, because the base is 10. $(11)_2$ represents the decimal number 3: because the base is 2, we know that this is a binary number. 24 | 25 | Here are some binary numbers and formulas for translating them to their decimal equivalents. 
26 | 27 | * $(0)_2$ is the decimal number 0 ($0 \times 2^0$) 28 | * $(1)_2$ is the decimal number 1 ($1 \times 2^0$) 29 | * $(01)_2$ is also the decimal number 1 ($0 \times 2^1 + 1 \times 2^0$) 30 | * $(11)_2$ is the decimal number 3 ($1 \times 2^1 + 1 \times 2^0$) 31 | * $(110)_2$ is the decimal number 6 ($1 \times 2^2 + 1 \times 2^1 + 0 \times 2^0$) 32 | * $(111)_2$ is the decimal number 7 ($1 \times 2^2 + 1 \times 2^1 + 1 \times 2^0$) 33 | 34 | A single **bi**nary digi**t** (a zero or one) is referred to as a _bit_, and bits can be used to encode a lot more than just numbers. 35 | 36 | ## Encoding messages in bits 37 | 38 | The messages encoded by bits can be nearly anything, provided that the sender of the message and the recipient of the message have agreed on a coding scheme which describes how a message can be encoded in bits or decoded from bits. The number of messages that can be sent using bits is a simple function of the number of places that are used. For example, if the only messages I want to transmit to you are "yes" and "no", I could achieve that by transmitting a single bit of information. You and I could agree that "yes" will be represented by the bit 1, and "no" will be represented by the bit 0. 39 | 40 | Internally, computers send and receive messages that are encoded using electrical currents. To reduce errors in message transmission, the electrical currents are interpreted only as being off or on, such that a message may be transmitted as off-on-on. These two states, off and on, are often interpreted by computers as binary numbers, where zero is synonymous with off (no current) and one is synonymous with on (current). So our message of off-on-on could be read as the binary number 011, or the decimal number 3. 41 | 42 | To illustrate a useful system that operates on the transmission of one bit of information, I'll describe a photosensor for an outdoor spotlight. 
In this example the photosensor is the sender of the message and the spotlight is the receiver of the message. The transmission of a zero from the photosensor to the spotlight could mean that it is currently light outside, and the transmission of a one could mean that it is currently dark outside. (The meanings of zero and one could be reversed: all that matters is that the sender and the receiver know what each value means.) The photosensor can monitor the available light, and send a message to the spotlight once per minute. If it is currently light outside, the photosensor will send a zero to the spotlight and the spotlight will turn off or remain off. If it is currently dark outside, the photosensor will send a one to the spotlight, and the spotlight will turn on or remain on. The photosensor is functioning as an on/off switch for the spotlight, transmitting one bit of information every minute. 43 | 44 | There are a couple of important things to consider in this example. First, the meanings of "currently light outside" and "currently dark outside" are embodied in the photosensor. It must make a decision on whether it is light or dark on its own, because it is only transmitting one bit of information (zero equals light and one equals dark). The message it sends isn't complex enough to describe how light or dark it currently is outside - it's effectively only flipping a switch on or off. 45 | 46 | To enable the transmission of more complex messages more bits can be used. One bit allows us to transmit two messages: 0 and 1, which in our photosensor example are interpreted as _off_ and _on_, respectively. If our message is based on two bits we can transmit four messages, 00, 01, 10, or 11. A real-world example of this could be a light switch with four states: off, low brightness, medium brightness, and high brightness. If our message is based on three bits, we can transmit eight messages, 000, 001, 010, 011, 100, 101, 110, or 111. There is a pattern emerging here. 
If `n` is the number of bits that you have available to send a message, the number of distinct messages that you can send is $2^n$. To generalize this formula further, if the number of available digits in the system is `b`, and the number of places you can use in your message is `n`, then the number of messages that can be sent is $b^n$. 47 | 48 | In computer systems, the bit is the most fundamental unit of information. The next largest unit is the byte, which is composed of eight bits. How many messages can be encoded in one byte? 49 | 50 | ## Protein sequences are encoded in a base 4 system 51 | 52 | The building blocks of DNA are four chemical compounds called adenine, cytosine, guanine, and thymine. We often represent these compounds with the abbreviations A, C, G and T, respectively. One of the primary roles of DNA in biological organisms is to encode the primary structure, or amino acid sequence, of proteins. As with computer systems, this information is represented based on discrete states, but in biological systems there are four states rather than two. Each position or place in an exon of a protein-coding DNA sequence can contain one of these compounds, and the linear order of the compounds can encode a message. 53 | 54 | When first translated, proteins are composed of simpler units, the amino acids, and most organisms use 20 different amino acids to build proteins. Because there are four DNA bases (A, C, G, and T) and twenty amino acids, we need more than one base to transmit the message of what amino acid comes next in a protein from DNA to the ribosome. How many DNA bases we need depends on how many messages we want to be able to send, which in this case is 20 (for the twenty amino acids). So, how many DNA bases are needed to encode the 20 canonical amino acids? 55 | 56 | As mentioned above, we can determine the number of messages we can send in a given numerical system with a given number of places using the formula $b^n$. 
For messages encoded in DNA, $b$ is four, so with one place (or one DNA base) we can send four messages. Since four is less than twenty, we'll need longer messages to encode the twenty amino acids. If our message were composed of two bases, we could send $4^2=16$ messages - that's still less than twenty, so we'll need more bases. If our message were composed of three bases, we could send $4^3=64$ messages. This is more than twenty, which means that we can encode all of the amino acids (with some messages to spare) in three bases. It's important to note that the number of places we can use must be a whole number - "2.5 bases of DNA" is not a meaningful quantity. 57 | 58 | Amino acids are in fact encoded by three nucleotide bases, and the three base messages are referred to as _codons_. The mapping of codons to amino acids is referred to as the _genetic code_. Each codon represents exactly one amino acid, with the exception of some, the _stop codons_, which indicate the end of a message. Because there are 64 codons but only twenty-one messages that need to be transmitted (the twenty amino acids and the "stop" signal), some amino acids and the stop signal are represented by more than one codon. This is referred to as the redundancy of the genetic code. 59 | 60 |

61 | 62 |
Figure 2: The vertebrate RNA genetic code. The corresponding DNA genetic code is identical, except that Us are replaced with Ts. (Figure attribution: NIH [Public domain], via Wikimedia Commons.)
63 |
64 |

65 | 66 | The scikit-bio Python library that was mentioned in the previous chapter has functionality for translating DNA sequences into protein sequences. We can apply that as follows: 67 | 68 | ```python 69 | >>> import skbio 70 | >>> # the following sequence is NCBI reference sequence NM_005368.3 71 | ... protein = skbio.DNA( 72 | ... "AAACCCCAGCTGTTGGGGCCAGGACACCCAGTGAGCCCATACTTGCTCTTTTTGTCTTCTTCAGACTGCGCCATGGG" 73 | ... "GCTCAGCGACGGGGAATGGCAGTTGGTGCTGAACGTCTGGGGGAAGGTGGAGGCTGACATCCCAGGCCATGGGCAGG" 74 | ... "AAGTCCTCATCAGGCTCTTTAAGGGTCACCCAGAGACTCTGGAGAAGTTTGACAAGTTCAAGCACCTGAAGTCAGAG" 75 | ... "GACGAGATGAAGGCGTCTGAGGACTTAAAGAAGCATGGTGCCACCGTGCTCACCGCCCTGGGTGGCATCCTTAAGAA" 76 | ... "GAAGGGGCATCATGAGGCAGAGATTAAGCCCCTGGCACAGTCGCATGCCACCAAGCACAAGATCCCCGTGAAGTACC" 77 | ... "TGGAGTTCATCTCGGAATGCATCATCCAGGTTCTGCAGAGCAAGCATCCCGGGGACTTTGGTGCTGATGCCCAGGGG" 78 | ... "GCCATGAACAAGGCCCTGGAGCTGTTCCGGAAGGACATGGCCTCCAACTACAAGGAGCTGGGCTTCCAGGGCTAGGC" 79 | ... "CCCTGCCGCTCCCACCCCCACCCATCTGGGCCCCGGGTTCAAGAGAGAGCGGGGTCTGATCTCGTGTAGCCATATAG" 80 | ... "AGTTTGCTTCTGAGTGTCTGCTTTGTTTAGTAGAGGTGGGCAGGAGGAGCTGAGGGGCTGGGGCTGGGGTGTTGAAG" 81 | ... "TTGGCTTTGCATGCCCAGCGATGCGCCTCCCTGTGGGATGTCATCACCCTGGGAACCGGGAGTGGCCCTTGGCTCAC" 82 | ... "TGTGTTCTGCATGGTTTGGATCTGAATTAATTGTCCTTTCTTCTAAATCCCAACCGAACTTCTTCCAACCTCCAAAC" 83 | ... "TGGCTGTAACCCCAAATCCAAGCCATTAACTACACCTGACAGTAGCAATTGTCTGATTAATCACTGGCCCCTTGAAG" 84 | ... "ACAGCAGAATGTCCCTTTGCAATGAGGAGGAGATCTGGGCTGGGCGGGCCAGCTGGGGAAGCATTTGACTATCTGGA" 85 | ... "ACTTGTGTGTGCCTCCTCAGGTATGGCAGTGACTCACCTGGTTTTAATAAAACAACCTGCAACATCTCA" 86 | ... ).translate(stop='require') 87 | ``` 88 | 89 | The above step translated a DNA sequence to a protein sequence, but it didn't show the result. 
To view a nicely formatted summary of the result, we can view a representation of the variable as follows: 90 | 91 | ```python 92 | >>> protein 93 | Protein 94 | --------------------------------------------------------------------- 95 | Stats: 96 | length: 178 97 | has gaps: False 98 | has degenerates: False 99 | has non-degenerates: True 100 | has stops: False 101 | --------------------------------------------------------------------- 102 | 0 KPQLLGPGHP VSPYLLFLSS SDCAMGLSDG EWQLVLNVWG KVEADIPGHG QEVLIRLFKG 103 | 60 HPETLEKFDK FKHLKSEDEM KASEDLKKHG ATVLTALGGI LKKKGHHEAE IKPLAQSHAT 104 | 120 KHKIPVKYLE FISECIIQVL QSKHPGDFGA DAQGAMNKAL ELFRKDMASN YKELGFQG 105 | ``` 106 | 107 | If we just want the protein sequence itself, we can call `print` on the variable. How might you determine what this sequence is? 108 | 109 | ```python 110 | >>> print(protein) 111 | ``` 112 | 113 | Genomes contain messages other than protein sequences, so in reality the messages encoded by DNA in our genomes are more complex than a base 4 numerical system. For example, the structure that a chromosomal region adopts can impact whether genes in that region are expressed or not, which can have profound phenotypic impacts. This is a higher-level message that is encoded in our genomes. So, while in some ways we can relate the information contained in our genomes to the way information is stored in computers, our genomes are not just programs that are executed. Even the simplest cellular organisms are far more complex than the most complex machines of humankind! "An airplane is nothing if you compare it to a pelican," observed Herman Dune. 114 | 115 | Also ignored in this discussion is that additional characters are sometimes used to represent ambiguity in our knowledge of a DNA sequence, or to concisely represent more than one sequence. 
The [IUPAC nucleic acid notation](https://en.wikipedia.org/wiki/Nucleic_acid_notation) is what we've been using in this chapter, where A, C, G, and T represent adenine, cytosine, guanine, and thymine, respectively. Other characters are also defined in this notation. For example, N is defined to mean either A, C, G, or T, and is thus commonly encountered in readouts of DNA sequences at positions where the base couldn't be determined. These _degenerate_ characters couldn't be represented in the base 4 numerical system we've been discussing here. They also don't exist in nature - we just use them to talk about DNA sequences. 116 | 117 | ## Quantifying information 118 | 119 | Information is a quantifiable concept, an idea that has its roots in Boolean algebra and in Claude Shannon's work on Information Theory. The most basic unit of information is the _binary digit_, or _bit_, which has two possible states. Depending on the domain, the symbols representing these two states might be `0` and `1`, `yes` and `no`, `+` and `-`, `true` and `false`, or `on` and `off`. When you answer a "yes/no" question in a conversation, or a "true/false" question on an exam, you're providing one bit of information. 120 | 121 | _Information_ is technically defined as a sequence of symbols that can be interpreted as a message. To put these terms in the context of our examples above, our _message_ is a decimal number, our _symbols_ are 0 and 1, and the sequence is the ordered collection of symbols, such as `011`. The number of places (let's call that $p$), and the number of symbols (let's call that $n_{symbols}$) define the number of different messages ($n_{messages}$) that can be encoded as: $n_{messages} = n_{symbols}^p$. 
122 | 123 | Let's apply this formula to determine how many messages can be sent with one byte of information: 124 | 125 | ```python 126 | >>> n_symbols = 2 # we'll use the two available binary numbers, 0 and 1 127 | >>> p = 8 # because there are 8 places, or bits, in a byte 128 | ... 129 | >>> print(n_symbols**p) 130 | ``` 131 | 132 | Since bases in a DNA sequence are represented with four characters, each position in a sequence contains two bits of information. We know this because we could represent all four bases using two places in a binary number. For example, 00 could represent A, 01 could represent C, 10 could represent G, and 11 could represent T. (These assignments of binary numbers to DNA bases are arbitrary.) In other words, if we have two symbols and two places, we can send four messages ($2^2=4$), so one base of DNA represents 2 bits of information. A DNA sequence that is 100 bases long would therefore contain 200 bits of information. 133 | 134 | More generally, if we send a message using a numerical system with $s$ symbols, and our message is $p$ places long, the number of bits that are sent would be $n$ in the following equation: $s^p = 2^n$. We could solve for $n$ as: $n = \log_{2}(s^p) = p \log_{2} s$. 135 | 136 | ## The genetic code 137 | 138 | As mentioned above, the genetic code describes the mapping of codons to amino acids. This mapping is embodied in an organism's [transfer RNA (tRNA)](http://pdb101.rcsb.org/motm/15) molecules. As illustrated in Figure 3, one end of the folded tRNA contains the "anticodon loop", which is the complementary sequence to the mRNA's codon. On the other end of the tRNA is the acceptor stem, which contains the amino acid attachment site. Through interaction with an [aminoacyl tRNA synthetase](https://pdb101.rcsb.org/motm/16), the amino acid corresponding to the anticodon is covalently linked to the acceptor stem. 
During translation, a tRNA's anticodon pairs with a codon in a messenger RNA (mRNA) inside the ribosome and thereby provides the next amino acid needed for protein synthesis. 139 | 140 |

141 | 142 |
Figure 3: The secondary and tertiary structure of a transfer RNA (tRNA) molecule. (Figure attribution: This image was obtained from OpenStax Microbiology, a free microbiology text book, and is licensed under CC-BY. OpenStax Microbiology can be accessed for free at https://openstax.org/books/microbiology/pages/1-introduction.)
143 |
144 |

145 | 146 | Figure 3 also illustrates two different views of the structure of a tRNA molecule. The secondary ($2^{\circ}$) structure is presented on the left, and the tertiary ($3^{\circ}$) structure is presented on the right. For a nucleic acid molecule, like tRNA, the secondary structure refers to the base pairing interactions in a folded molecule. The tertiary structure refers to the three-dimensional structure that the molecule takes inside of an organism. The primary ($1^{\circ}$) structure, which isn't illustrated here, refers to the linear sequence of nucleotides in the tRNA molecule. The primary structure of a phenylalanine tRNA (i.e., tRNAPhe) from yeast in [FASTA format](https://en.wikipedia.org/wiki/FASTA_format), for example, is as follows. 147 | 148 | ``` 149 | >4TNA:A|PDBID|CHAIN|SEQUENCE 150 | GCGGAUUUAGCUCAGUUGGGAGAGCGCCAGACUGAAGAUCUGGAGGUCCUGUGUUCGAUCCACAGAAUUCGCACCA 151 | ``` 152 | 153 | Each level of structure contains some information about the other levels. For example, if we were to examine the primary structure of tRNAPhe, we might find that there are stretches of certain bases that could form stable base pairing interactions with each other. That could help us to make a prediction about the secondary structure of tRNAPhe. We could use that information, along with knowledge about the physics of nucleic acid molecules to make predictions about how the molecule would fold inside of a cell (i.e., its tertiary structure). If we knew the tertiary structure of a molecule with similar primary structure to tRNAPhe, that information would also be very helpful in predicting the tertiary structure of tRNAPhe because we expect nucleic acids or proteins with similar primary structures to also have similar secondary and tertiary structures, though this isn't always the case. 154 | 155 | The primary and secondary structures of a molecule alone don't currently allow us to make perfect predictions about the tertiary structure that a molecule will adopt. 
Personally, I think additional information is needed so this type of prediction won't ever be entirely reliable. But this is a classic problem in bioinformatics. 156 | 157 | ## Summary 158 | 159 | In this section we explored different numerical systems, and discussed relationships between how computers and organisms represent information. In the next section we'll dive into the fundamentals of bioinformatics. 160 | -------------------------------------------------------------------------------- /book/getting-started/images/central-dogma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/central-dogma.png -------------------------------------------------------------------------------- /book/getting-started/images/genetic-code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/genetic-code.png -------------------------------------------------------------------------------- /book/getting-started/images/greg-in-telluride.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/greg-in-telluride.png -------------------------------------------------------------------------------- /book/getting-started/images/trna.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/getting-started/images/trna.png 
-------------------------------------------------------------------------------- /book/getting-started/index.md: -------------------------------------------------------------------------------- 1 | # Getting started 2 | -------------------------------------------------------------------------------- /book/getting-started/index.yaml: -------------------------------------------------------------------------------- 1 | contents: 2 | - reading-iab 3 | - biological-information 4 | -------------------------------------------------------------------------------- /book/getting-started/reading-iab.md: -------------------------------------------------------------------------------- 1 | # Reading An Introduction to Applied Bioinformatics 2 | 3 | **Bioinformatics, as I see it, is the application of the tools of computer science (things like programming languages, algorithms, and databases) to address biological problems (for example, inferring the evolutionary relationship between a group of organisms based on fragments of their genomes, or understanding if or how the community of microorganisms that live in my gut changes if I modify my diet).** Bioinformatics is a rapidly growing field, largely in response to the vast increase in the quantity of data that biologists now grapple with. Students from varied disciplines (e.g., biology, computer science, statistics, and biochemistry) and stages of their educational careers (undergraduate, graduate, or postdoctoral) are becoming interested in bioinformatics. 4 | 5 | *An **I**ntroduction to **A**pplied **B**ioinformatics*, or **IAB**, is a free, open access bioinformatics text available at http://readIAB.org. 
**It introduces readers to the core concepts of bioinformatics in the context of their implementation and application to real-world problems and data.** IAB makes extensive use of the [scikit-bio](http://www.scikit-bio.org) Python package, which provides production-ready implementations of core bioinformatics algorithms and data structures. As readers are learning a concept, for example, pairwise sequence alignment, they are presented with its scikit-bio implementation directly in the text. scikit-bio code is well annotated (adhering to the [pep8](https://www.python.org/dev/peps/pep-0008/) and [numpydoc](https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt) conventions), so readers can use it to assist with their understanding of the concept. Readers of IAB also therefore learn the concepts in the context of tools they can use to develop their own bioinformatics software and pipelines, enabling them to rapidly get started on their own projects. While some theory is discussed, the focus of IAB is on what readers need to know to be effective, practicing bioinformaticians. 6 | 7 | IAB is **completely open access**, with all software being BSD-licensed, and all text being licensed under Creative Commons Attribution Only (i.e., CC BY-NC-SA 4.0). All development and publication is coordinated under [public revision control](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics). 8 | 9 | [My](alias://fedd13) goal for IAB is for it to make bioinformatics as accessible as possible to students from varied backgrounds, and to get more and diverse people into this hugely exciting field. I'm very interested in hearing from readers and instructors who are using IAB, so get in touch if you have corrections, suggestions for how to improve the content, or any other thoughts or comments on the text. 
In the spirit of openness, I'd prefer to be contacted via the [IAB issue tracker](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/issues/). I'll respond to direct e-mail as well, but I'm often backlogged on e-mail (just ask my students), so e-mail responses are likely to be slower. 10 | 11 | I hope you find IAB useful, and that you enjoy reading it! 12 | 13 | ## Who should read IAB? 14 | 15 | IAB is written for scientists, software developers, and students interested in understanding and applying bioinformatics methods, and ultimately in developing their own bioinformatics analysis pipelines or software. 16 | 17 | IAB was initially developed for an undergraduate course cross-listed in computer science and biology with no pre-requisites. It therefore assumes little background in biology or computer science, however some basic background is very helpful. For example, an understanding of the roles of and relationship between DNA and protein in a cell, and the ability to read and follow well-annotated python code, are both helpful (but not necessary) to get started. 18 | 19 | In the *Getting started with [Biology](alias://cf88ac) and [Computer Science](alias://6ad7e1)* sections below I provide some suggestions for other texts that will help you to get started. 20 | 21 | ## How to read IAB 22 | 23 | IAB can be read interactively as a series of [Jupyter Notebooks](https://jupyter.org) or read statically. Due to popular demand, a print version may ultimately be available for a fee, but the full and most recent version of IAB will always be available for free on the [project website](http://readIAB.org). The recommended way to read IAB is interactively as this allows readers to execute code directly in the text. For example, when learning pairwise alignment, users can align sequences provided in IAB (or their own sequences) and modify parameters (or even the algorithm itself) to see how changes affect the resulting alignments. 
24 | 25 | IAB is constantly being updated. As I teach with it, I will often update text or add new chapters in an effort to keep up with advances in the field. The project website contains the most up-to-date recommendations on how to read IAB or teach with IAB, including strategies for dealing with changing content. (For example, if you're teaching with IAB, you can fork the IAB repository and only pull updates into your fork when you're ready for them. If _forking repositories_ and _pulling updates_ are terms that don't mean anything to you right now, you can safely ignore this!) 26 | 27 | IAB is split into four different sections: *Getting started*, *Fundamentals*, *Applications*, and *Wrapping up*. You should start reading IAB by working through the *Getting started* and *Fundamentals* chapters in order. You should then read the *Applications* chapters and *Wrapping up* in any order, based on your own interest. 28 | 29 | ## Using Jupyter Notebooks to read IAB interactively 30 | 31 | IAB can be read interactively as a series of Jupyter Notebooks. The main source for information about Jupyter Notebooks is the [Jupyter website](https://jupyter.org). You can find information there on how to use Jupyter Notebooks as well as setting up and running a Jupyter Notebook server (for example, if you'd like to make one available to your students). 32 | 33 | Most of the code that is used in IAB comes from [scikit-bio](http://scikit-bio.org) package, or other Python scientific computing tools. You can access these in the same way that you would in a Python script. For example: 34 | 35 | ```python 36 | >>> import skbio 37 | >>> from IPython.core import page 38 | >>> page.page = print 39 | ``` 40 | 41 | We can then access functions, variables, and classes from these modules. 42 | 43 | ```python 44 | >>> print(skbio.title) 45 | >>> print(skbio.art) 46 | ``` 47 | 48 | We'll inspect a lot of source code in IAB as we explore bioinformatics algorithms. 
If you're ever interested in seeing the source code for some functionality that we're using, you can do that using Jupyter's ``psource`` magic. 49 | 50 | ```python 51 | >>> from skbio.alignment import TabularMSA 52 | >>> %psource TabularMSA.conservation 53 | ``` 54 | 55 | The documentation for scikit-bio is also very extensive. You can view the documentation for the `TabularMSA` object, for example, [here](http://scikit-bio.org/docs/latest/generated/skbio.alignment.TabularMSA.html). These documents will be invaluable for learning how to use the objects. 56 | 57 | ## Reading list 58 | 59 | ### Getting started with Biology 60 | 61 | If you're new to biology, these are some books and resources that will help you get started. 62 | 63 | * [The Processes of Life](http://amzn.to/1P0dc2E) by Lawrence Hunter. *An introduction to biology for computer scientists.* 64 | 65 | * The [NIH Bookshelf](http://www.ncbi.nlm.nih.gov/books/) A lot of free biology texts, some obviously better than others. 66 | 67 | ### Getting started with Computer Science and programming 68 | 69 | If you're new to Computer Science and programming, these are some books and resources that will help you get started. 70 | 71 | * [Software Carpentry](http://www.software-carpentry.org) *Online resources for learning scientific computing skills, and regular in-person workshops all over the world. Taking a Software Carpentry workshop **will** pay off for biology students interested in a career in research.* 72 | 73 | * [Practical Computing for Biologists](http://amzn.to/1Ukx5S6) by Steven Haddock and Casey Dunn. *A great introduction to many computational skills that are required of modern biologists. I **highly** recommend this book to all Biology undergraduate and graduate students.* 74 | 75 | * [Practical Programming: An Introduction to Computer Science Using Python](http://amzn.to/1P0dmqM) by Jennifer Campbell, Paul Gries, Jason Montojo, Greg Wilson. 
*An introduction to the python programming language and basic computer science. This is a great first programming book for people interested in bioinformatics or scientific computing in general.* 76 | 77 | * [The Pragmatic Programmer](http://amzn.to/1P0dl6i) by Andrew Hunt. *A more advanced book on becoming a better programmer. This book is excellent, and I highly recommend it for anyone developing bioinformatics software. You should know how to program and have done some software development before jumping into this.* 78 | 79 | ### Philosophy of biology and popular science books 80 | 81 | These are some books that I've enjoyed, that have also helped me think about biological systems. These are generally written for a more popular audience, so should be accessible to any readers of *An Introduction to Applied Bioinformatics*. 82 | 83 | * [The Selfish Gene](http://amzn.to/1UkyQ1R) by Richard Dawkins. 84 | 85 | * [Ever Since Darwin](http://amzn.to/1Ukzdt7) by Stephen Jay Gould. *This is the first book in a series of collections of short essays.* 86 | 87 | * [The Demon Haunted World](http://amzn.to/1UkyIzi) by Carl Sagan. 88 | 89 | * [Sex and Death](http://amzn.to/1UkySXg) by Kim Sterelny. 90 | 91 | * [Gödel, Escher, Bach](http://amzn.to/1UkzxYL) by Douglas Hofstadter. 92 | 93 | ## Need help? 94 | 95 | If you're having issues getting *An Introduction to Applied Bioinformatics* running on your computer, or you have corrections or suggestions on the content, you should get in touch through the [IAB issue tracker](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/issues). This will generally be much faster than e-mailing the author directly, as there are multiple people who monitor the issue tracker. It also helps us manage our technical support load if we can consolidate all requests and responses in one place. 
96 | 97 | ## Contributing and Code of Conduct 98 | 99 | If you're interested in contributing content or features to IAB, you should start by reviewing the project's [Code of Conduct](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/blob/master/CODE-OF-CONDUCT.md) and [Contributing Guide](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/blob/master/CONTRIBUTING.md). 100 | 101 | ## Acknowledgements 102 | 103 | *An Introduction to Applied Bioinformatics* was funded in part by the [Alfred P. Sloan Foundation](http://www.sloan.org). Initial prototyping was funded by [Arizona's Technology and Research Initiative Fund](http://nau.edu/Research/Funding/Technology-Research-Initiative-Fund/). The style of the project was inspired by [Bayesian Methods for Hackers](http://camdavidsonpilon.github.io/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/). 104 | 105 | See the repository's [contributors page](https://github.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/graphs/contributors) for information on who has contributed to the project.
106 | -------------------------------------------------------------------------------- /book/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/book/images/logo.png -------------------------------------------------------------------------------- /book/index.md: -------------------------------------------------------------------------------- 1 | 2 | # An Introduction To Applied Bioinformatics 3 | -------------------------------------------------------------------------------- /book/index.yaml: -------------------------------------------------------------------------------- 1 | contents: 2 | - getting-started 3 | - fundamentals 4 | - applications 5 | - exercises 6 | - back-matter 7 | -------------------------------------------------------------------------------- /custom.css: -------------------------------------------------------------------------------- 1 | /* Custom styling to help align things, and correct section numbers */ 2 | 3 | h4, h5, h6 { 4 | font-size: 120% !important; 5 | } 6 | 7 | .iab-edit { 8 | float: right; 9 | font-size: 14px; 10 | } 11 | 12 | .anchor-link { 13 | display: none !important; 14 | } 15 | 16 | /* from SO: https://stackoverflow.com/a/4098296/579416 */ 17 | .cell ol { 18 | counter-reset: item; 19 | } 20 | .cell li { 21 | display: block; 22 | } 23 | .cell li:before { 24 | content: counters(item, ".") " "; 25 | counter-increment: item ; 26 | } 27 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - etetoolkit 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.6 7 | - pip 8 | - notebook 9 | - scikit-bio >= 0.5.5, < 0.6.0 10 | - networkx = 2.3.0 11 | - ete3 12 | - ete_toolchain 13 | - pip: 14 | - 
https://github.com/caporaso-lab/An-Introduction-To-Applied-Bioinformatics/archive/master.zip 15 | -------------------------------------------------------------------------------- /iab/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ----------------------------------------------------------------------------- 4 | # This work is licensed under the Creative Commons 5 | # Attribution-NonCommercial-ShareAlike 4.0 International License. To view a 6 | # copy of this license, visit 7 | # http://creativecommons.org/licenses/by-nc-sa/4.0/. 8 | # ----------------------------------------------------------------------------- 9 | 10 | import pkg_resources 11 | 12 | __version__ = pkg_resources.get_distribution( 13 | 'An-Introduction-To-Applied-Bioinformatics').version 14 | -------------------------------------------------------------------------------- /iab/format/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/applied-bioinformatics/An-Introduction-To-Applied-Bioinformatics/bd4ec8752709a18f5c81aaddc592965548bcd5c2/iab/format/__init__.py -------------------------------------------------------------------------------- /iab/format/dialog_box.py: -------------------------------------------------------------------------------- 1 | import markdown2 2 | from IPython.display import HTML 3 | 4 | 5 | def make_box(section_text='', header_background_color='none', 6 | header_text='', header_text_color='none', 7 | icon='', section_background_color='none', 8 | section_text_color='none', style=''): 9 | """Generic function to create dialog box. 10 | 11 | This is a generic function that displays a string as HTML that is 12 | displayed inline in an IPython notebook. The HTML is automatically 13 | displayed when the function is called.
14 | 15 | Parameters 16 | ---------- 17 | section_text : str 18 | The text that will be displayed in the main section of the dialog 19 | box 20 | header_background_color : str 21 | The color of the header background. 22 | header_text : str 23 | The text to be displayed in the header. 24 | header_text_color : str 25 | The color of the text in the header. 26 | icon : str 27 | The tag for the image or icon that will be displayed next to the 28 | header text. 29 | section_background_color : str 30 | The background color of the main text section. 31 | section_text_color : str 32 | The color of the main section text. 33 | style : str 34 | The name of the style used in the css class. This prevents the dialog 35 | box from overwriting other HTML objects and should therefore be 36 | unique. 37 | 38 | Returns 39 | ------- 40 | IPython.core.display.HTML 41 | IPython object that displays the HTML inline in the IPython notebook 42 | 43 | Examples 44 | -------- 45 | >>> from iab.format.dialog_box import make_box 46 | >>> make_box('foo',\ 47 | header_background_color='#000',\ 48 | header_text='Developer Note',\ 49 | header_text_color='#76EE00',\ 50 | icon="",\ 52 | section_background_color='#e5e5e5',\ 53 | section_text_color='#000',\ 54 | style='developer_note') 55 | 56 | 57 | 58 | """ 59 | return HTML(""" 60 | 61 | 63 | 96 | 97 | 98 |

99 | 100 |
101 |

%(icon)s %(header_text)s

102 |
103 | 104 |
105 | %(section_text)s 106 |
107 | 108 |
109 | """ % {"style": style, 110 | "header_background_color": header_background_color, 111 | "header_text_color": header_text_color, 112 | "section_background_color": section_background_color, 113 | "section_text_color": section_text_color, 114 | "icon": icon, 115 | "header_text": header_text, 116 | "section_text": markdown2.markdown(section_text)}) 117 | 118 | 119 | def link(section_text): 120 | return make_box(section_text=section_text, 121 | header_background_color='dodgerblue', 122 | header_text='Additional Resources', 123 | header_text_color='#fff', 124 | icon='', 125 | section_background_color='#e8f3ff', 126 | section_text_color='dodgerblue', 127 | style='link_box') 128 | 129 | 130 | def warning(section_text): 131 | return make_box(section_text=section_text, 132 | header_background_color='#FFCC00', 133 | header_text='Warning!', 134 | header_text_color='darkred', 135 | icon='', 136 | section_background_color='#FFF9E5', 137 | section_text_color='darkred', 138 | style="warning_box") 139 | 140 | 141 | def additional_info(section_text): 142 | return make_box(section_text=section_text, 143 | header_background_color='#590059', 144 | header_text='Additional Information', 145 | header_text_color='#fff', 146 | icon='', 147 | section_background_color='#eee5ee', 148 | section_text_color='#590059', 149 | style='additional_box') 150 | 151 | 152 | def developer_note(section_text): 153 | return make_box(section_text=section_text, 154 | header_background_color='#000', 155 | header_text='Developer Note', 156 | header_text_color='#76EE00', 157 | icon="""""", 159 | section_background_color='#e5e5e5', 160 | section_text_color='#000', 161 | style='developer_note') 162 | -------------------------------------------------------------------------------- /licenses/runipy.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Paul Butler 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this 10 | list of conditions and the following disclaimer in the documentation and/or 11 | other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 16 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 17 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 18 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 19 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 21 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 22 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | -------------------------------------------------------------------------------- /licenses/scikit-bio.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013--, scikit-bio development team. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 
9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, this 11 | list of conditions and the following disclaimer in the documentation and/or 12 | other materials provided with the distribution. 13 | 14 | * Neither the name of the {organization} nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /paper.bib: -------------------------------------------------------------------------------- 1 | 2 | % Generated by Paperpile. Check out http://paperpile.com for more information. 3 | % BibTeX export options can be customized via Settings -> BibTeX. 4 | 5 | @BOOK{Dunn2010-ik, 6 | title = "Practical Computing for Biologists", 7 | author = "Dunn, Casey and Haddock, Steven HD", 8 | abstract = "Increasingly, scientists find themselves facing exponentially 9 | larger data sets and analyses without suitable tools to deal 10 | with them. 
Many biologists end up using spreadsheet programs for 11 | most of their data-processing tasks and spend hours clicking 12 | around or copying and pasting, and then repeating the process 13 | for other data files. Practical Computing for Biologists shows 14 | you how to use many freely available computing tools to work 15 | more powerfully and effectively. The book was born out of the 16 | authors' own experience in developing tools for their research 17 | and helping other biologists with their computational problems. 18 | Although many of the techniques are relevant to molecular 19 | bioinformatics, the motivation for the book is much broader, 20 | focusing on topics and techniques that are applicable to a range 21 | of scientific endeavors. Twenty-two chapters organized into six 22 | parts address these topics and more: Searching with regular 23 | expressions The Unix command line Python programming and 24 | debugging Creating and editing graphics Databases Performing 25 | analyses on remote servers Working with electronics While most 26 | of the concepts and examples apply to any operating system, the 27 | main narrative focuses on Mac OS X. Where there are differences 28 | for Windows and Linux users, parallel instructions are provided 29 | in the margin and in an appendix. The book is designed to be 30 | used as a self-guided resource for researchers, a companion book 31 | in a course, or as a primary textbook. 
Practical Computing for 32 | Biologists will free you from the most frustrating and 33 | time-consuming aspects of data processing so you can focus on 34 | the pleasures of scientific inquiry.", 35 | publisher = "Sinauer Associates, Inc.", 36 | edition = "First edition", 37 | month = nov, 38 | year = 2010, 39 | url = "http://practicalcomputing.org/" 40 | } 41 | 42 | @ARTICLE{Wilson2016-kh, 43 | title = "Software Carpentry: lessons learned", 44 | author = "Wilson, Greg", 45 | abstract = "Since its start in 1998, Software Carpentry has evolved from a 46 | week-long training course at the US national laboratories into a 47 | worldwide volunteer effort to improve researchers' computing 48 | skills. This paper explains what we have learned along the way, 49 | the challenges we now face, and our plans for the future.", 50 | journal = "F1000 Research", 51 | volume = 3, 52 | month = jan, 53 | year = 2016, 54 | doi = "10.12688/F1000RESEARCH.3-62.V2" 55 | } 56 | 57 | @ARTICLE{Searls2014-ac, 58 | title = "A new online computational biology curriculum", 59 | author = "Searls, David B", 60 | abstract = "A recent proliferation of Massive Open Online Courses (MOOCs) and 61 | other web-based educational resources has greatly increased the 62 | potential for effective self-study in many fields. This article 63 | introduces a catalog of several hundred free video courses of 64 | potential interest to those wishing to expand their knowledge of 65 | bioinformatics and computational biology. 
The courses are 66 | organized into eleven subject areas modeled on university 67 | departments and are accompanied by commentary and career advice.", 68 | journal = "PLoS Computational Biology", 69 | volume = 10, 70 | number = 6, 71 | pages = "e1003662", 72 | month = jun, 73 | year = 2014, 74 | doi = "10.1371/journal.pcbi.1003662" 75 | } 76 | 77 | @BOOK{Felsenstein2003-wm, 78 | title = "Inferring Phylogenies", 79 | author = "Felsenstein, Joseph", 80 | abstract = "Phylogenies (evolutionary trees) are basic to thinking about and 81 | analyzing differences between species. Statistical, 82 | computational, and algorithmic work on them has been ongoing for 83 | four decades, with great advances in understanding. Yet no book 84 | has summarized this work until now. Inferring Phylogenies 85 | explains clearly the assumptions and logic of making inferences 86 | about phylogenies, and using them to make inferences about 87 | evolutionary processes. It is an essential text and reference 88 | for anyone who wants to understand how phylogenies are 89 | reconstructed and how they are used. As phylogenies are inferred 90 | with various kinds of data, this book concentrates on some of 91 | the central ones: discretely coded characters, molecular 92 | sequences, gene frequencies, and quantitative traits. Also 93 | covered are restriction sites, RAPDs, and microsatellites. 
94 | Inferring Phylogenies is intended for graduate-level courses, 95 | assuming some knowledge of statistics, mathematics (calculus and 96 | fundamental matrix algebra), molecular sequences, and 97 | quantitative genetics.", 98 | publisher = "Sinauer Associates", 99 | edition = "Second edition", 100 | month = sep, 101 | year = 2003 102 | } 103 | 104 | @BOOK{Durbin1998-ru, 105 | title = "Biological Sequence Analysis: Probabilistic Models of Proteins 106 | and Nucleic Acids", 107 | author = "Durbin, Richard and Eddy, Sean R and Krogh, Anders and 108 | Mitchison, Graeme", 109 | abstract = "Probablistic models are becoming increasingly important in 110 | analyzing the huge amount of data being produced by large-scale 111 | DNA-sequencing efforts such as the Human Genome Project. For 112 | example, hidden Markov models are used for analyzing biological 113 | sequences, linguistic-grammar-based probabilistic models for 114 | identifying RNA secondary structure, and probabilistic 115 | evolutionary models for inferring phylogenies of sequences from 116 | different organisms. This book gives a unified, up-to-date and 117 | self-contained account, with a Bayesian slant, of such methods, 118 | and more generally to probabilistic methods of sequence 119 | analysis. 
Written by an interdisciplinary team of authors, it is 120 | accessible to molecular biologists, computer scientists, and 121 | mathematicians with no formal knowledge of the other fields, and 122 | at the same time presents the state of the art in this new and 123 | important field.", 124 | publisher = "Cambridge University Press", 125 | edition = "First edition", 126 | month = may, 127 | year = 1998, 128 | } 129 | 130 | @ARTICLE{Searls2012-ab, 131 | title = "An online bioinformatics curriculum", 132 | author = "Searls, David B", 133 | abstract = "Online learning initiatives over the past decade have become 134 | increasingly comprehensive in their selection of courses and 135 | sophisticated in their presentation, culminating in the recent 136 | announcement of a number of consortium and startup activities 137 | that promise to make a university education on the internet, free 138 | of charge, a real possibility. At this pivotal moment it is 139 | appropriate to explore the potential for obtaining comprehensive 140 | bioinformatics training with currently existing free video 141 | resources. This article presents such a bioinformatics curriculum 142 | in the form of a virtual course catalog, together with editorial 143 | commentary, and an assessment of strengths, weaknesses, and 144 | likely future directions for open online learning in this field.", 145 | journal = "PLoS Computational Biology", 146 | volume = 8, 147 | number = 9, 148 | pages = "e1002632", 149 | month = sep, 150 | year = 2012, 151 | doi = "10.1371/journal.pcbi.1002632" 152 | } 153 | -------------------------------------------------------------------------------- /paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'An Introduction to Applied Bioinformatics: a free, open, and interactive text.' 
3 | tags: 4 | - bioinformatics 5 | - python 6 | authors: 7 | - name: Evan Bolyen 8 | orcid: 0000-0002-5362-6782 9 | affiliation: 1 10 | - name: Jai Ram Rideout 11 | affiliation: 1 12 | - name: John Chase 13 | affiliation: 1 14 | - name: T. Anders Pitman 15 | affiliation: 1 16 | - name: Arron Shiffer 17 | affiliation: "1, 2" 18 | - name: Willow Mercurio 19 | affiliation: 1 20 | - name: Matthew R Dillon 21 | orcid: 0000-0002-7713-1952 22 | affiliation: 1 23 | - name: J Gregory Caporaso 24 | orcid: 0000-0002-8865-1670 25 | affiliation: "1, 2" 26 | affiliations: 27 | - name: Pathogen and Microbiome Institute, Northern Arizona University, Flagstaff, AZ, USA. 28 | index: 1 29 | - name: Department of Biological Sciences, Northern Arizona University, Flagstaff, AZ, USA. 30 | index: 2 31 | date: 17 July 2018 32 | bibliography: paper.bib 33 | --- 34 | 35 | # Summary 36 | 37 | _Statement of need_: Due to the increasing rate of biological data generation, bioinformatics is rapidly growing as a field and is now an essential part of scientific advances in human health and environmental sciences. Online and publicly accessible resources for learning bioinformatics exist (e.g., [Rosalind](http://rosalind.info), [@Searls2012-ab; @Searls2014-ac]), and there are excellent textbooks and courses in the area, some focused heavily on theory [@Felsenstein2003-wm; @Durbin1998-ru], and others geared toward learning specific skills such as Python programming or the Unix shell [@Dunn2010-ik; @Wilson2016-kh]. An Introduction to Applied Bioinformatics (IAB) is a free, online bioinformatics text that bridges the gap between theory and application by teaching fundamentals of bioinformatics in the context of their implementation, using an interactive framework based on highly relevant tools including Python 3, Jupyter Notebooks, and GitHub. 
38 | 39 | IAB is geared toward students who are completely new to bioinformatics, though an introductory course (or book) in each of Computer Science and Biology is a useful prerequisite. IAB readers begin on the [project website](http://readIAB.org). While it is possible to view the content statically from this page, we recommend that readers work interactively by installing IAB. Readers progress through chapters that introduce fundamental topics, such as sequence homology searching and multiple sequence alignment, and present their Python 3 implementation. Because the content is presented in Jupyter Notebooks, students can edit and execute the code, for example to explore how changing k-word size or an alignment gap penalty might impact the results of a database search. The Python code that readers interact with is intended for educational purposes, where the implementation is made as simple as possible, sometimes at the cost of computational efficiency. Chapters therefore also include examples of performing the same analyses with [scikit-bio](http://scikit-bio.org), a production-quality bioinformatics Python 3 library. This enables a rapid transition from learning theory, or how an algorithm works, to applying techniques in a real-world setting. 40 | 41 | IAB additionally contains Wikipedia-style “Edit” links in each section of the text. When one of these links is followed, the reader is taken to the GitHub online editor where they can submit a pull request to modify content or code. Readers are therefore introduced to GitHub through a user-friendly web interface, and can begin building their GitHub activity history (commonly reviewed by bioinformatics hiring managers). Finally, every time a change is proposed via GitHub, all of the executable content of IAB is automatically tested.
This continuous integration testing ensures that IAB example code remains functional as changes are introduced, solving an issue that plagues printed applied computational texts (for example because they describe an outdated software interface). 42 | 43 | IAB evolved from lecture materials developed by Dr. Caporaso for an introductory bioinformatics course targeted toward computer science and biology undergraduates (typically juniors or seniors) at Northern Arizona University. Since the early stages of its development, it has been used to teach at least ten courses and short (e.g., one day) bioinformatics workshops. As it became clear that the content and format was useful for teaching bioinformatics, Dr. Caporaso applied for and received grants from the Arizona Technology and Research Initiative and the Alfred P Sloan Foundation to further develop the resource. The content was originally written in Jupyter Notebooks, but as the project grew, it became difficult to maintain the notebooks and in particular to review submissions from others. The Jupyter Notebooks were transitioned to markdown files which are now the source for static HTML and Jupyter Notebook renderings of the content. 44 | 45 | The current version of IAB contains six chapters covering fundamental concepts and their applications. It is a dynamic resource that will be expanded, revised and updated over time. Its lifecycle is thus more similar to an active software project than a textbook: a practical approach to education in a rapidly changing field. 46 | 47 | # Acknowledgements 48 | An Introduction to Applied Bioinformatics is funded by a grant from the Alfred P Sloan Foundation to JGC. Initial prototyping was funded by a grant from the Arizona Technology and Research Initiative to JGC. 
EB, JC, and JGC had partial salary support from National Cancer Institute of the National Institutes of Health under the awards for the Partnership of Native American Cancer Prevention U54CA143924 (UACC) and U54CA143925 (NAU). We would like to thank the students in our courses, whose questions and feedback have shaped IAB. 49 | 50 | # Author Contributions 51 | JGC is the primary author of the IAB content. EB, JRR, and JC have developed components of the underlying framework. AP, AS, and WM have provided useful feedback on the content. 52 | 53 | # References 54 | -------------------------------------------------------------------------------- /runipynbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ----------------------------------------------------------------------------- 4 | # This work is licensed under the Creative Commons 5 | # Attribution-NonCommercial-ShareAlike 4.0 International License. To view a 6 | # copy of this license, visit 7 | # http://creativecommons.org/licenses/by-nc-sa/4.0/. 8 | # ----------------------------------------------------------------------------- 9 | 10 | import logging 11 | import os 12 | import sys 13 | 14 | from runipy.notebook_runner import NotebookRunner, NotebookError 15 | import nbformat 16 | 17 | # Taken and modified from 18 | # https://github.com/paulgb/runipy/blob/master/runipy/main.py 19 | # See licenses/ directory for runipy license. 20 | def main(): 21 | log_format = '%(asctime)s %(message)s' 22 | log_datefmt = '%m/%d/%Y %I:%M:%S %p' 23 | ignore_dirs = ['.git', '.ipynb_checkpoints'] 24 | 25 | logging.basicConfig(level=logging.DEBUG, format=log_format, 26 | datefmt=log_datefmt) 27 | 28 | if len(sys.argv) > 1: 29 | start_path = sys.argv[1] 30 | else: 31 | start_path = '.' 
32 | 33 | if not os.path.exists(start_path): 34 | logging.error("Directory or file '%s' does not exist.", start_path) 35 | sys.exit(1) 36 | 37 | if os.path.isfile(start_path): 38 | run_notebook(start_path) 39 | else: 40 | for root, dirs, files in os.walk(start_path): 41 | dirs.sort() 42 | for ignore_dir in ignore_dirs: 43 | if ignore_dir in dirs: 44 | dirs.remove(ignore_dir) 45 | 46 | for name in sorted(files): 47 | if name.endswith('.ipynb'): 48 | nbpath = os.path.normpath(os.path.join(root, name)) 49 | run_notebook(nbpath) 50 | 51 | 52 | def run_notebook(nbpath): 53 | logging.info("Reading notebook '%s'", nbpath) 54 | with open(nbpath) as nbfile: 55 | notebook = nbformat.read(nbfile, as_version=3) 56 | 57 | runner = NotebookRunner(notebook) 58 | 59 | try: 60 | runner.run_notebook() 61 | except NotebookError: 62 | logging.error("An error occurred while executing notebook '%s'. " 63 | "Exiting with nonzero exit status", nbpath) 64 | sys.exit(1) 65 | 66 | 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ----------------------------------------------------------------------------- 4 | # This work is licensed under the Creative Commons 5 | # Attribution-NonCommercial-ShareAlike 4.0 International License. To view a 6 | # copy of this license, visit 7 | # http://creativecommons.org/licenses/by-nc-sa/4.0/. 8 | # ----------------------------------------------------------------------------- 9 | 10 | __version__ = '0.1.4.dev0' 11 | 12 | from setuptools import find_packages, setup 13 | import sys 14 | 15 | # Check that Python version is 3, since build will 16 | # currently complete in Python 2, but the notebooks 17 | # won't work. 18 | python_version = sys.version_info.major 19 | if python_version != 3: 20 | sys.exit("IAB can only be used with Python 3. 
" 21 | "You are currently running Python %d." % python_version) 22 | 23 | classes = """ 24 | Development Status :: 1 - Planning 25 | Framework :: IPython 26 | Intended Audience :: Developers 27 | Intended Audience :: Education 28 | Intended Audience :: Science/Research 29 | Natural Language :: English 30 | Operating System :: MacOS :: MacOS X 31 | Operating System :: POSIX 32 | Operating System :: Unix 33 | Programming Language :: Python 34 | Programming Language :: Python :: 3.5 35 | Topic :: Scientific/Engineering 36 | Topic :: Scientific/Engineering :: Bio-Informatics 37 | """ 38 | classifiers = [s.strip() for s in classes.split('\n') if s] 39 | 40 | description = ("An Introduction To Applied Bioinformatics (IAB): " 41 | "Interactive lessions in bioinformatics.") 42 | 43 | setup(name='An-Introduction-To-Applied-Bioinformatics', 44 | version=__version__, 45 | license='CC BY-NC-SA 4.0', 46 | description=description, 47 | long_description=description, 48 | author='Greg Caporaso', 49 | author_email='gregcaporaso@gmail.com', 50 | maintainer='Greg Caporaso', 51 | maintainer_email='gregcaporaso@gmail.com', 52 | url='http://readIAB.org', 53 | packages=find_packages(), 54 | install_requires=['scikit-bio >= 0.5.5, < 0.6.0', 55 | 'jupyter', 'seaborn', 56 | 'qiime-default-reference >= 0.1.3, < 0.2.0', 57 | 'pandas', 58 | 'markdown2 >= 2.3.0', 59 | 'tabulate', 60 | 'networkx == 2.3.0', 61 | 'ete3', 62 | 'ipymd >= 0.1.2'], 63 | classifiers=classifiers) 64 | --------------------------------------------------------------------------------