├── .gitignore
├── .travis.yml
├── LICENSE.rst
├── NOTES.rst
├── README.rst
├── fetch_sample_data.sh
├── reading_sequence_files
    ├── README.rst
    ├── check_start_met.py
    ├── check_stops.py
    ├── count_fasta.py
    ├── count_fasta_adv.py
    ├── print_seq.py
    ├── record_lengths.py
    └── total_length.py
├── reading_writing_alignments
    ├── README.rst
    ├── count_gaps.py
    └── sort_gaps.py
├── tests
    ├── README.rst
    ├── test_consistency.py
    └── test_scripts.py
├── using_seqfeatures
    ├── README.rst
    ├── bases_in_genes.py
    ├── extract_cds.py
    ├── total_feature_lengths.py
    └── total_gene_lengths.py
└── writing_sequence_files
    ├── README.rst
    ├── convert_gb_to_fasta.py
    ├── cut_final_star.py
    ├── cut_star_dangerous.py
    ├── filter_wanted_id.py
    ├── filter_wanted_id_in_order.py
    ├── length_filter.py
    └── length_filter_naive.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | #Ignore sample files
 2 | *.gbk
 3 | *.fna
 4 | *.ffn
 5 | *.faa
 6 | *.fasta
 7 | *.sth
 8 | 
 9 | #Ignore backup files from some Unix editors,
10 | *~
11 | *.swp
12 | *.bak
13 | 
14 | #Ignore patches and any original files created by patch command
15 | *.diff
16 | *.patch
17 | *.orig
18 | *.rej
19 | 
20 | #Ignore these hidden files from Mac OS X
21 | .DS_Store
22 | 
23 | #Ignore hidden files from the Dolphin file manager
24 | .directory
25 |  
26 | #Ignore all compiled python files (e.g. from running the unit tests):
27 | *.pyc
28 | *.pyo
29 | 
30 | #Ignore all Jython class files (present if using Jython)
31 | *.class
32 | 
33 | #Ignore compressed archives of files
34 | *.zip
35 | *.tar.gz
36 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | #Special configuration file to run tests on Travis-CI via GitHub notifications
 2 | #See http://travis-ci.org/ for details
 3 | #
 4 | #Note when testing Python 3, the 'python' command will invoke Python 3
 5 | #and similarly for PyPy too.
 6 | 
 7 | language: python
 8 | python:
 9 |   - "2.6"
10 |   - "2.7"
11 |   - "3.3"
12 |   - "3.4"
13 |   - "pypy"
14 |   - "pypy3"
15 | 
16 | install:
17 |   - pip install biopython
18 |   - ./fetch_sample_data.sh
19 | 
20 | script:
21 |   - python tests/test_consistency.py
22 |   - python tests/test_scripts.py
23 | 


--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
 1 | =====================
 2 | Copyright and Licence
 3 | =====================
 4 | 
 5 | Copyright 2014-2015 by Peter Cock, The James Hutton Institute, Dundee, UK.
 6 | All rights reserved.
 7 | 
 8 | This work is licensed under a `Creative Commons Attribution-ShareAlike 4.0 International
 9 | License <http://creativecommons.org/licenses/by-sa/4.0/>`_ (CC-BY-SA 4.0).
10 | 
11 | .. image:: http://i.creativecommons.org/l/by-sa/4.0/88x31.png
12 | 
13 | Note this documentation links to and uses external and separately licenced sample data.
14 | 


--------------------------------------------------------------------------------
/NOTES.rst:
--------------------------------------------------------------------------------
 1 | As this material is aimed at Python beginners, we're avoiding a lot of
 2 | useful but not fundamental things, including:
 3 | 
 4 | * String formatting with the % operator
 5 | * Exceptions and try/except error handling
 6 | * The ``with`` statement for context management (e.g. closing file handles)
 7 | * The increment/decrement operators, use ``count = count + 1`` not ``count += 1``
 8 | * List comprehensions, generator expressions, generator functions (just use for loops)
 9 | 
10 | Also note that the examples should try to run under both Python 2.6, 2.7
11 | and 3.3 (or later) without changes. i.e. The same versions of Python which
12 | are supported by Biopython.
13 | 
14 | To this end, only simple print statements are used as ``print(some_string)``
15 | which will work on both Python 2 and 3, with or without using
16 | ``from __future__ import print_function``.
17 | 
18 | Additionally, basic automated testing is done on TravisCI via the special
19 | ``.travis.yml`` file, test results here:
20 | 
21 | .. image:: https://travis-ci.org/peterjc/biopython_workshop.png?branch=master
22 |    :alt: Current status of TravisCI build for master branch
23 |    :target: https://travis-ci.org/peterjc/biopython_workshop/builds
24 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | =========================
  2 | Introduction to Biopython
  3 | =========================
  4 | 
  5 | This is a basic introduction to Biopython, intended for a classroom based workshop.
  6 | It assumes you have been introduced to both working at the command line, and basic
  7 | Python - for example as covered in Martin Jones' free eBook
  8 | `Python for Biologists <http://pythonforbiologists.com/index.php/introduction-to-python-for-biologists/>`_.
  9 | 
 10 | The Biopython website http://www.biopython.org has more information including the 
 11 | `Biopython Tutorial & Cookbook <http://biopython.org/DIST/docs/tutorial/Tutorial.html>`_
 12 | (html, `PDF available <http://biopython.org/DIST/docs/tutorial/Tutorial.pdf>`_),
 13 | which is worth going through once you have mastered the basics of Python. That Tutorial & Cookbook
 14 | is also available as `Jupyter Notebooks <https://github.com/tiagoantao/biopython-notebook>`_,
 15 | as is `another short introductory tutorial <https://github.com/chris-rands/biopython-coronavirus>`_.
 16 | 
 17 | =================
 18 | Workshop Sections
 19 | =================
 20 | 
 21 | I've broken up the workshop into sections:
 22 | 
 23 | * `Reading sequence files <reading_sequence_files/README.rst>`_.
 24 | * `Writing sequence files <writing_sequence_files/README.rst>`_.
 25 | * `Working with sequence features <using_seqfeatures/README.rst>`_.
 26 | * `Reading and writing alignment files <reading_writing_alignments/README.rst>`_.
 27 | 
 28 | This material focuses on Biopython's `SeqIO <http://biopython.org/wiki/SeqIO>`_
 29 | and `AlignIO <http://biopython.org/wiki/AlignIO>`_ modules (these links
 30 | include an overview and tables of supported file formats), each of which
 31 | also has a whole chapter in the `Biopython Tutorial & Cookbook
 32 | <http://biopython.org/DIST/docs/tutorial/Tutorial.html>`_
 33 | (`PDF <http://biopython.org/DIST/docs/tutorial/Tutorial.pdf>`_)
 34 | which would be worth reading after this workshop to learn more.
 35 | 
 36 | ========
 37 | Notation
 38 | ========
 39 | 
 40 | Text blocks starting with ``$`` show something you would type and run at the
 41 | command line prompt, where the ``$`` itself represents the prompt. For example:
 42 | 
 43 | .. sourcecode:: console
 44 | 
 45 |     $ python -V
 46 |     Python 2.7.5
 47 | 
 48 | Depending how your system is configured, rather than just ``$`` you may see your
 49 | user name and the current working directory. Here you would only type ``python -V``
 50 | (python space minus capital V) to find out the default version of Python installed.
 51 | 
 52 | Lines starting ``>>>`` represent the interactive Python prompt, and something
 53 | you would type inside Python. For example:
 54 | 
 55 | .. sourcecode:: pycon
 56 | 
 57 |     $ python
 58 |     Python 2.7.3 (default, Nov  7 2012, 23:34:47) 
 59 |     [GCC 4.4.6 20120305 (Red Hat 4.4.6-4)] on linux2
 60 |     Type "help", "copyright", "credits" or "license" for more information.
 61 |     >>> 7 * 6
 62 |     42
 63 |     >>> quit()
 64 | 
 65 | Here you would only need to type ``7 * 6`` (and enter) into Python, the ``>>>``
 66 | is already there. To quit the interactive Python prompt use ``quit()`` (and enter).
 67 | This example would usually be shortened to just:
 68 | 
 69 | .. sourcecode:: pycon
 70 | 
 71 |     >>> 7 * 6
 72 |     42
 73 | 
 74 | These text blocks are also used for entire short Python scripts, which you can
 75 | copy and save as a plain text file with the extension ``.py`` to run them.
 76 | 
 77 | ================
 78 | Sample Solutions
 79 | ================
 80 | 
 81 | Each workshop section was written in a separate directory, and in addition
 82 | to the main text (named ``README.rst`` which is a plain text file with markup
 83 | to make it look pretty on GitHub), the folders contain sample solution
 84 | Python scripts (named as in the text).
 85 | 
 86 | ===========================
 87 | Prerequisites & Sample Data
 88 | ===========================
 89 | 
 90 | If you are reading this on GitHub.com, you can view, copy/paste or download
 91 | individual examples from your web browser.
 92 | 
 93 | To make a local copy of the entire workshop, you can use the ``git``
 94 | command line tool:
 95 | 
 96 | .. sourcecode:: console
 97 | 
 98 |     $ git clone https://github.com/peterjc/biopython_workshop.git
 99 | 
100 | Alternatively, depending on your firewall settings, use:
101 | 
102 | .. sourcecode:: console
103 | 
104 |     $ git clone git@github.com:peterjc/biopython_workshop.git
105 | 
106 | To learn more about ``git`` and software version control, I recommend attending a
107 | `Software Carpentry Workshop <http://software-carpentry.org/workshops/index.html>`_
108 | or similar course.
109 | 
110 | This should make a new sub-directory, ``biopython_workshop/`` which we will now
111 | change into:
112 | 
113 | .. sourcecode:: console
114 | 
115 |     $ cd biopython_workshop
116 | 
117 | Most of the examples use real biological data files. You should download them
118 | now using the `provided shell script <fetch_sample_data.sh>`_:
119 | 
120 | .. sourcecode:: console
121 | 
122 |     $ bash fetch_sample_data.sh
123 | 
124 | We assume you have Python and Biopython 1.63 or later installed and working.
125 | Biopython 1.63 supports Python 2.6, 2.7 and 3.3 (and should work on more recent
126 | versions). The examples here assume you are using Python 2.6 or 2.7, but in
127 | general should work with Python 3 with minimal changes. Check this works:
128 | 
129 | .. sourcecode:: console
130 | 
131 |     $ python -c "import Bio; print(Bio.__version__)"
132 |     1.63
133 | 
134 | =======
135 | History
136 | =======
137 | 
138 | This material was first used as part of a two-day course "Introduction to Python for
139 | Biologists" (Kathryn Crouch, Peter Cock and Tim Booth), part of a two-week course
140 | `Keystone Skills in Bioinformatics <http://environmentalomics.org/foundations/>`_,
141 | held in February 2014 at Centre for Ecology & Hydrology (CEH), Wallingford, UK.
142 | In a morning session lasting about 2.5 hours (plus coffee break), we covered all
143 | of `reading sequence files <reading_sequence_files/README.rst>`_ and
144 | `writing sequence files <writing_sequence_files/README.rst>`_ - and I quickly
145 | talked through `alignment files <reading_writing_alignments/README.rst>`_.
146 | 
147 | I presented much of it again later in February 2014 at the University of Dundee
148 | as part of the third year undergraduate course *BS32010 Applied Bioinformatics*
149 | run by Dr David Martin and Dr David Booth. In the two hour slot we covered all
150 | of `reading sequence files <reading_sequence_files/README.rst>`_ and most of
151 | `writing sequence files <writing_sequence_files/README.rst>`_.
152 | 
153 | I repeated this in March 2015 for the same third year undergraduate course,
154 | *BS32010 Applied Bioinformatics* at the University of Dundee. In a three hour
155 | slot we covered `reading sequence files <reading_sequence_files/README.rst>`_,
156 | most of `writing sequence files <writing_sequence_files/README.rst>`_ (up to
157 | editing sequences, but not filtering by identifier), and the start of
158 | `multiple-sequence alignments <reading_writing_alignments/README.rst>`_.
159 | 
160 | =====================
161 | Copyright and Licence
162 | =====================
163 | 
164 | Copyright 2014-2015 by Peter Cock, The James Hutton Institute, Dundee, UK.
165 | All rights reserved.
166 | 
167 | This work is licensed under a `Creative Commons Attribution-ShareAlike 4.0 International
168 | License <http://creativecommons.org/licenses/by-sa/4.0/>`_ (CC-BY-SA 4.0).
169 | 
170 | .. image:: http://i.creativecommons.org/l/by-sa/4.0/88x31.png
171 | 
172 | Note this documentation links to and uses external and separately licenced
173 | sample data files.
174 | 


--------------------------------------------------------------------------------
/fetch_sample_data.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Download the sample data files used by the workshop examples into the
 3 | # current directory, using wget if it is installed and curl otherwise.
 4 | 
 5 | # Set bash strict mode (fail on errors, undefined variables, and via pipes)
 6 | set -euo pipefail
 7 | 
 8 | if [ -x "$(command -v wget)" ]; then
 9 |   # e.g. Linux
10 |   echo "Downloading files using wget"
11 |   FETCH="wget"
12 | elif [ -x "$(command -v curl)" ]; then
13 |   # e.g. Mac OS X
14 |   echo "Downloading files using curl"
15 |   # -f makes curl exit non-zero on an HTTP error instead of saving the
16 |   # server's error page as if it were the data file (so strict mode can
17 |   # stop the script); -L follows redirects; -O keeps the remote file name.
18 |   FETCH="curl -f -L -O"
19 | else
20 |   echo "ERROR: Failed to find wget or curl" >&2
21 |   exit 1
22 | fi
23 | 
24 | echo "=============================================="
25 | echo "Fetching Escherichia coli K-12 files from NCBI"
26 | echo "=============================================="
27 | 
28 | # Note: These files are no longer being updated...
29 | # $FETCH is deliberately left unquoted so that the curl command and its
30 | # options are split into separate words.
31 | $FETCH "ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.gbk"
32 | $FETCH "ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.fna"
33 | $FETCH "ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.ffn"
34 | $FETCH "ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Bacteria/Escherichia_coli_K_12_substr__MG1655_uid57779/NC_000913.faa"
35 | 
36 | echo "=========================================================="
37 | echo "Fetching proteins from Potato Genome Sequencing Consortium"
38 | echo "=========================================================="
39 | 
40 | $FETCH "http://potato.plantbiology.msu.edu/data/PGSC_DM_v3.4_pep_representative.fasta.zip"
41 | # -o overwrites any previously extracted copy without prompting
42 | unzip -o PGSC_DM_v3.4_pep_representative.fasta.zip
43 | 
44 | echo "===================================="
45 | echo "Fetching PF08792 alignment from PFAM"
46 | echo "===================================="
47 | 
48 | # The URL is quoted because it contains "?", which the shell would otherwise
49 | # treat as a filename wildcard.
50 | if [ -x "$(command -v wget)" ]; then
51 |   # Note: Using -O to set the filename explicitly as default is format?format=stockholm
52 |   wget -O "PF08792_seed.sth" "http://pfam.sanger.ac.uk/family/PF08792/alignment/seed/format?format=stockholm"
53 | elif [ -x "$(command -v curl)" ]; then
54 |   # Note: Mac OS alternative needs -L due to link redirect, and -f so that
55 |   # an HTTP error is reported as a failure rather than saved to the file:
56 |   curl -f -o "PF08792_seed.sth" -L "http://pfam.sanger.ac.uk/family/PF08792/alignment/seed/format?format=stockholm"
57 | else
58 |   echo "ERROR: Failed to find wget or curl" >&2
59 |   exit 1
60 | fi
61 | 


--------------------------------------------------------------------------------
/reading_sequence_files/README.rst:
--------------------------------------------------------------------------------
  1 | ===================================
  2 | Reading Sequence Files in Biopython
  3 | ===================================
  4 | 
  5 | Dealing with assorted sequence file formats is one of the strengths of Biopython.
  6 | The primary module we'll be using is `Bio.SeqIO <http://biopython.org/wiki/SeqIO>`_,
  7 | which is short for sequence input/output (following the naming convention set by
  8 | `BioPerl's SeqIO module <http://bioperl.org/wiki/HOWTO:SeqIO>`_).
  9 | 
 10 | For these examples we're going to use files for the famous bacterium *Escherichia coli*
 11 | K12 (from the NCBI FTP server), and some potato genes from the PGSC - see the
 12 | sample data instructions in the `introduction <../README.rst>`_ for how to download
 13 | these files.
 14 | 
 15 | -------------
 16 | Built-in Help
 17 | -------------
 18 | 
 19 | Python code should be documented. You can (and should) write special comment strings
 20 | called ``docstrings`` at the start of your own modules, classes and functions which
 21 | are used by Python as the built-in help text. Let's look at some of the built-in
 22 | Biopython documentation.
 23 | 
 24 | We'll run the interactive Python prompt from within the command line terminal (but you
 25 | could use a Python GUI, or `IPython <http://ipython.org/>`_,  if you prefer - depending
 26 | on what you are used to working with).
 27 | 
 28 | Load Biopython's ``SeqIO`` module with the ``import`` command, and have a look at the built
 29 | in help:
 30 | 
 31 | .. sourcecode:: pycon
 32 | 
 33 |     $ python2.7
 34 |     Python 2.7.3 (default, Nov  7 2012, 23:34:47) 
 35 |     [GCC 4.4.6 20120305 (Red Hat 4.4.6-4)] on linux2
 36 |     Type "help", "copyright", "credits" or "license" for more information.
 37 |     >>> from Bio import SeqIO
 38 |     >>> help(SeqIO)
 39 | 
 40 | You'll see the `SeqIO help text <http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html>`_
 41 | built into Biopython - the latest version of which should also be online. Pressing
 42 | space will show the next page of help text, the up and down cursor arrows scroll,
 43 | and ``q`` will quit the help and return to the Python prompt.
 44 | 
 45 | Rather than showing the help for the entire ``SeqIO`` module, you can ask for the help
 46 | on a particular object or function. Let's start with ``SeqIO.parse`` - and from now on
 47 | the triple greater-than-sign prompt (``>>>``) will be used to indicate something you
 48 | would type into Python:
 49 | 
 50 | .. sourcecode::	pycon
 51 | 
 52 |     >>> help(SeqIO.parse)
 53 | 
 54 | This gives some examples, and we'll start with something very similar.
 55 | 
 56 | ----------------
 57 | Counting Records
 58 | ----------------
 59 | 
 60 | We'll start by looking at the protein sequence in the FASTA amino acid file,
 61 | ``NC_000913.faa``. First take a quick peek using some command line tools like
 62 | ``head`` to look at the start of the file:
 63 | 
 64 | .. sourcecode:: console
 65 | 
 66 |     $ head NC_000913.faa 
 67 |     >gi|16127995|ref|NP_414542.1| thr operon leader peptide [Escherichia coli str. K-12 substr. MG1655]
 68 |     MKRISTTITTTITITTGNGAG
 69 |     >gi|16127996|ref|NP_414543.1| fused aspartokinase I and homoserine dehydrogenase I [Escherichia coli str. K-12 substr. MG1655]
 70 |     MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERI
 71 |     FAELLTGLAAAQPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEA
 72 |     RGHNVTVIDPVEKLLAVGHYLESTVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYS
 73 |     AAVLAACLRADCCEIWTDVDGVYTCDPRQVPDARLLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPC
 74 |     LIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMKGMVGMAARVFAAMSRARISVVLIT
 75 |     QSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRTLRGISAKFFAAL
 76 |     ARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSW
 77 | 
 78 | We can use ``grep`` to count the number of proteins by using the regular
 79 | expression pattern ``^>``.  The caret is a special symbol meaning look at
 80 | the start of a line, so this means look for lines starting with a greater
 81 | than sign (which is how individual FASTA format sequences are marked):
 82 | 
 83 | .. sourcecode::	console
 84 | 
 85 |     $ grep -c "^>" NC_000913.faa 
 86 |     4141
 87 | 
 88 | Now let's count the records with Biopython using the ``SeqIO.parse`` function:
 89 | 
 90 | .. sourcecode::	pycon
 91 | 
 92 |     $ python
 93 |     Python 2.7.3 (default, Nov  7 2012, 23:34:47) 
 94 |     [GCC 4.4.6 20120305 (Red Hat 4.4.6-4)] on linux2
 95 |     Type "help", "copyright", "credits" or "license" for more information.
 96 |     >>> from Bio import SeqIO
 97 |     >>> filename = "NC_000913.faa"
 98 |     >>> count = 0
 99 |     >>> for record in SeqIO.parse(filename, "fasta"):
100 |     ...     count = count + 1
101 |     ...
102 |     >>> print("There were " + str(count) + " records in file " + filename)
103 |     There were 4141 records in file NC_000913.faa
104 | 
105 | Running more than a few commands like this at the Python prompt gets complicated,
106 | especially with indentation like this for loop. It is tough if you make a mistake
107 | and need to edit lines to rerun them (even with the up-arrow trick). It is also
108 | fiddly to copy and paste without the ``>>>`` prompt and ``...`` line continuation
109 | characters.
110 | 
111 | Instead, using your favourite editor (e.g. ``nano`` or ``gedit``) create a plain
112 | text file (in the same directory as the *E. coli* files) named ``count_fasta.py``:
113 | 
114 | .. sourcecode::	console
115 | 
116 |     $ nano count_fasta.py
117 | 
118 | Edit your new file ``count_fasta.py`` to contain the following:
119 | 
120 | .. sourcecode:: python
121 | 
122 |     from Bio import SeqIO
123 |     filename = "NC_000913.faa"
124 |     count = 0
125 |     for record in SeqIO.parse(filename, "fasta"):
126 |         count = count + 1
127 |     print("There were " + str(count) + " records in file " + filename)
128 | 
129 | This time it should be easy to copy & paste in one go. We can now run this:
130 | 
131 | .. sourcecode::	    console
132 | 
133 |     $ python count_fasta.py
134 |     There were 4141 records in file NC_000913.faa
135 | 
136 | **Exercise**: Modify this to count the number of records in the other FASTA files,
137 | both from *E. coli* K12 and the potato genome (``PGSC_DM_v3.4_pep_representative.fasta``).
138 | 
139 | **Advanced Exercise**: Using ``sys.argv`` get the filename as a command line argument,
140 | so that you can run it like this:
141 | 
142 | .. sourcecode::	console
143 | 
144 |     $ python count_fasta_adv.py NC_000913.ffn
145 |     There were 4321 records in file NC_000913.ffn
146 | 
147 | ----------------------
148 | Looking at the records
149 | ----------------------
150 | 
151 | In the above example, we used a for loop to count the records in a FASTA file,
152 | but didn't actually look at the information in the records. The ``SeqIO.parse``
153 | function was creating `SeqRecord objects <http://biopython.org/wiki/SeqRecord>`_.
154 | Biopython's ``SeqRecord`` objects are a container holding the sequence, and any
155 | annotation about it - most importantly the identifier.
156 | 
157 | For FASTA files, the record identifier is taken to be the first word on the ``>``
158 | line - anything after a space is *not* part of the identifier.
159 | 
160 | This simple example prints out the record identifiers and their lengths:
161 | 
162 | .. sourcecode:: python
163 | 
164 |     from Bio import SeqIO
165 |     filename = "NC_000913.faa"
166 |     for record in SeqIO.parse(filename, "fasta"):
167 |         print("Record " + record.id + ", length " + str(len(record.seq)))
168 | 
169 | Notice that given a ``SeqRecord`` object we access the identifier as ``record.id``
170 | and the sequence object as ``record.seq``. As a shortcut, ``len(record)`` gives
171 | the sequence length, ``len(record.seq)``.
172 | 
173 | If you save that as ``record_lengths.py`` and run it you'll get over four thousand
174 | lines of output:
175 | 
176 | .. sourcecode::	console
177 | 
178 |     $ python record_lengths.py
179 |     Record gi|16127995|ref|NP_414542.1|, length 21
180 |     Record gi|16127996|ref|NP_414543.1|, length 820
181 |     Record gi|16127997|ref|NP_414544.1|, length 310
182 |     Record gi|16127998|ref|NP_414545.1|, length 428
183 |     ...
184 |     Record gi|16132219|ref|NP_418819.1|, length 46
185 |     Record gi|16132220|ref|NP_418820.1|, length 228
186 |     
187 | The output shown here is truncated!
188 | 
189 | **Exercise**: Count how many sequences are less than 100 amino acids long.
190 | 
191 | **Exercise**: Create a modified script ``total_length.py`` based on the above examples
192 | which counts the number of records and calculates the total length of all the
193 | sequences (i.e. ``21 + 820 + 310 + 428 + ... + 46 + 228``), giving:
194 | 
195 | .. sourcecode::	console
196 | 
197 |     $ python total_length.py
198 |     4141 records, total length 1311442
199 | 
200 | **Advanced Exercise**: Plot a histogram of the sequence length distribution (tip - see the
201 | `Biopython Tutorial & Cookbook <http://biopython.org/DIST/docs/tutorial/Tutorial.html>`_).
202 | 
203 | -----------------------
204 | Looking at the sequence
205 | -----------------------
206 | 
207 | The record identifiers are very important, but more important still is the sequence
208 | itself. In the ``SeqRecord`` objects the identifiers are stored as standard Python
209 | strings (e.g. ``.id``). For the sequence, Biopython uses a string-like ``Seq`` object,
210 | accessed as ``.seq``.
211 | 
212 | In many ways the ``Seq`` objects act like Python strings, you can print them, take
213 | their length using the ``len(...)`` function, and slice them with square brackets
214 | to get a sub-sequence or a single letter.
215 | 
216 | **Exercise**: Using ``SeqIO.parse(...)`` in a for loop, for each record print out the
217 | identifier, the first 10 letters of each sequence, and the last 10 letters. e.g.:
218 | 
219 | .. sourcecode::	console
220 | 
221 |    $ python print_seq.py
222 |    gi|16127995|ref|NP_414542.1| MKRISTTITT...ITITTGNGAG
223 |    gi|16127996|ref|NP_414543.1| MRVLKFGGTS...LRTLSWKLGV
224 |    gi|16127997|ref|NP_414544.1| MVKVYAPASS...DTAGARVLEN
225 |    ...
226 |    gi|16132219|ref|NP_418819.1| MTKVRNCVLD...AVILTILTAT
227 |    gi|16132220|ref|NP_418820.1| MRITIILVAP...LHDIEKNITK
228 | 
229 | ---------------------------------------
230 | Checking proteins start with methionine
231 | ---------------------------------------
232 | 
233 | In the next example we'll check all the protein sequences start with a methionine
234 | (represented as the letter "M" in the standard IUPAC single letter amino acid code),
235 | and count how many records fail this. Let's create a script called ``check_start_met.py``:
236 | 
237 | .. sourcecode:: python
238 | 
239 |     from Bio import SeqIO
240 |     filename = "NC_000913.faa"
241 |     bad = 0
242 |     for record in SeqIO.parse(filename, "fasta"):
243 |         if not record.seq.startswith("M"):
244 |             bad = bad + 1
245 |             print(record.id + " starts " + record.seq[0])
246 |     print("Found " + str(bad) + " records in " + filename + " which did not start with M")
247 | 
248 | If you run that, you should find this *E. coli* protein set all had leading methionines:
249 | 
250 | .. sourcecode::	console
251 | 
252 |     $ python check_start_met.py
253 |     Found 0 records in NC_000913.faa which did not start with M
254 | 
255 | Good - no strange proteins. This genome has been completely sequenced and a lot of
256 | work has been done on the annotation, so it is a 'Gold Standard'. Now try this on
257 | the potato protein file ``PGSC_DM_v3.4_pep_representative.fasta``:
258 | 
259 | .. sourcecode::	console
260 | 
261 |     $ python check_start_met.py
262 |     PGSC0003DMP400032467 starts T
263 |     PGSC0003DMP400011427 starts Q
264 |     PGSC0003DMP400068739 starts E
265 |     ...
266 |     PGSC0003DMP400011481 starts Y
267 |     Found 208 records in PGSC_DM_v3.4_pep_representative.fasta which did not start with M
268 | 
269 | **Exercise**: Modify this script to print out the description of the problem records,
270 | not just the identifier. *Tip*: Try reading the documentation, e.g. Biopython's wiki page
271 | on the `SeqRecord <http://biopython.org/wiki/SeqRecord>`_.
272 | 
273 | **Discussion**: What did you notice about these record descriptions? Can you think of any
274 | reasons why there could be so many genes/proteins with a problem at the start?
275 | 
276 | ------------------------
277 | Checking stop characters
278 | ------------------------
279 | 
280 | In the standard one letter IUPAC amino acid codes for proteins, "*" is used for a
281 | stop codon. For many analysis tools having a "*" in the protein sequence can cause
282 | an error. There are two main reasons why you might see a "*" in a protein sequence.
283 | 
284 | First, it might be there from translation up to and including the closing stop codon
285 | for the gene. In this case, you might want to remove it.
286 | 
287 | Second, it could be there from a problematic/broken annotation where there is an
288 | in-frame stop codon. In this case, you might want to fix the annotation, remove
289 | the whole sequence, or perhaps cheat and replace the "*" with an "X" for an unknown
290 | amino acid.
291 | 
292 | We'll talk about writing out sequence files soon, but first let's check the example
293 | protein FASTA files for any "*" symbols in the sequence. For this you can use several
294 | of the standard Python string operations which also apply to ``Seq`` objects, e.g.:
295 | 
296 | .. sourcecode:: pycon
297 | 
298 |     >>> my_string = "MLNTCRVPLTDRKVKEKRAMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVFTAYESE*"
299 |     >>> my_string.startswith("M")
300 |     True
301 |     >>> my_string.endswith("*")
302 |     True
303 |     >>> len(my_string)
304 |     70
305 |     >>> my_string.count("M")
306 |     3
307 |     >>> my_string.count("*")
308 |     1
309 | 
310 | **Exercise**: Write a python script to check ``NC_000913.faa`` to count the number of
311 | sequences with a "*" in them (anywhere), and the number where the sequence ends with
312 | a "*". Then try it on ``PGSC_DM_v3.4_pep_representative.fasta`` as well. e.g.:
313 | 
314 | .. sourcecode::	console
315 | 
316 |     $ python check_stops.py
317 |     Checking NC_000913.faa for terminal stop codons
318 |     0 records with * in them
319 |     0 with * at the end
320 | 
321 | **Discussion**: What did you notice about the "*" stop characters in these FASTA files?
322 | What should we do to 'fix' the problems?
323 | 
324 | --------------
325 | Single Records
326 | --------------
327 | 
328 | One of the example FASTA files for *E. coli* K12 is a single long sequence
329 | for the entire (circular) genome, file ``NC_000913.fna``. We can still use a
330 | for loop and ``SeqIO.parse(...)`` but it can feel awkward. Instead, for the
331 | special case where the sequence file contains one and only one record, you
332 | can use ``SeqIO.read(...)``.
333 | 
334 | .. sourcecode:: pycon
335 | 
336 |     >>> from Bio import SeqIO
337 |     >>> record = SeqIO.read("NC_000913.fna", "fasta")
338 |     >>> print(record.id + " length " + str(len(record)))
339 |     gi|556503834|ref|NC_000913.3| length 4641652
340 | 
341 | **Exercise**: Try using ``SeqIO.read(...)`` on one of the protein files.
342 | What happens?
343 | 
344 | ----------------------
345 | Different File Formats
346 | ----------------------
347 | 
348 | So far we've only been using FASTA format files, which is why when we've called
349 | ``SeqIO.parse(...)`` or ``SeqIO.read(...)`` the second argument has been ``"fasta"``.
350 | The Biopython ``SeqIO`` module supports quite a few other important sequence file
351 | formats (see the table on the `SeqIO wiki page <http://biopython.org/wiki/SeqIO>`_).
352 | 
353 | If you work with finished genomes, you'll often see nicely annotated files in
354 | the EMBL or GenBank format. Let's try this with the *E. coli* K12 GenBank file,
355 | ``NC_000913.gbk``, based on the previous example:
356 | 
357 | .. sourcecode::	pycon
358 | 
359 |     >>> from Bio import SeqIO
360 |     >>> fasta_record = SeqIO.read("NC_000913.fna", "fasta")
361 |     >>> print(fasta_record.id + " length " + str(len(fasta_record)))
362 |     gi|556503834|ref|NC_000913.3| length 4641652
363 |     >>> genbank_record = SeqIO.read("NC_000913.gbk", "genbank")
364 |     >>> print(genbank_record.id + " length " + str(len(genbank_record)))
365 |     NC_000913.3 length 4641652
366 | 
367 | All we needed to change was the file format argument to the ``SeqIO.read(...)``
368 | function - and we could load a GenBank file instead. You'll notice the GenBank
369 | version was given a shorter identifier, and took longer to load. The reason is
370 | that there is a lot more information present - most importantly lots of features
371 | (where each gene is and so on). We'll return to this in a later section,
372 | `working with sequence features <../using_seqfeatures/README.rst>`_.
373 | 
374 | ===================================
375 | Writing Sequence Files in Biopython
376 | ===================================
377 | 
378 | We move on to `writing sequence files <../writing_sequence_files/README.rst>`_
379 | in the next section.
380 | 


--------------------------------------------------------------------------------
/reading_sequence_files/check_start_met.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | #filename = "NC_000913.faa"
 3 | filename = "PGSC_DM_v3.4_pep_representative.fasta"
 4 | bad = 0
 5 | for record in SeqIO.parse(filename, "fasta"):
 6 |     if not record.seq.startswith("M"):
 7 |         bad = bad + 1
 8 |         print(record.id + " starts " + record.seq[0])
 9 | print("Found " + str(bad) + " records in " + filename + " which did not start with M")
10 | 
11 | 


--------------------------------------------------------------------------------
/reading_sequence_files/check_stops.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | filename = "NC_000913.faa"
 3 | #filename = "PGSC_DM_v3.4_pep_representative.fasta"
 4 | contains_star = 0
 5 | ends_with_star = 0
 6 | print("Checking " + filename + " for terminal stop codons")
 7 | for record in SeqIO.parse(filename, "fasta"):
 8 |     if record.seq.count("*"):
 9 |         contains_star = contains_star + 1
10 |     if record.seq.endswith("*"):
11 |         ends_with_star = ends_with_star + 1
12 | print(str(contains_star) + " records with * in them")
13 | print(str(ends_with_star) + " with * at the end")
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/reading_sequence_files/count_fasta.py:
--------------------------------------------------------------------------------
1 | from Bio import SeqIO
filename = "NC_000913.faa"
# Consume the parser one record at a time, adding one to the count for each.
count = sum(1 for _ in SeqIO.parse(filename, "fasta"))
print("There were %i records in file %s" % (count, filename))
7 | 


--------------------------------------------------------------------------------
/reading_sequence_files/count_fasta_adv.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | from Bio import SeqIO
 4 | 
 5 | #Remember sys.argv[0] is the script itself
 6 | for filename in sys.argv[1:]:
 7 |     count = 0
 8 |     for record in SeqIO.parse(filename, "fasta"):
 9 |         count += 1 # this is shorthand for count = count + 1
10 |     print("There were " + str(count) + " records in file " + filename)
11 | 


--------------------------------------------------------------------------------
/reading_sequence_files/print_seq.py:
--------------------------------------------------------------------------------
1 | from Bio import SeqIO
filename = "NC_000913.faa"
for entry in SeqIO.parse(filename, "fasta"):
    sequence = entry.seq
    # Show only the two ends of each sequence: first 10 and last 10 letters.
    print("%s %s...%s" % (entry.id, sequence[:10], sequence[-10:]))
7 | 


--------------------------------------------------------------------------------
/reading_sequence_files/record_lengths.py:
--------------------------------------------------------------------------------
1 | from Bio import SeqIO
filename = "NC_000913.faa"
for entry in SeqIO.parse(filename, "fasta"):
    # One line of output per record: its identifier and sequence length.
    letters = len(entry.seq)
    print("Record %s, length %i" % (entry.id, letters))
5 | 


--------------------------------------------------------------------------------
/reading_sequence_files/total_length.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | filename = "NC_000913.faa"
 3 | count = 0
 4 | total = 0
 5 | for record in SeqIO.parse(filename, "fasta"):
 6 |     count = count + 1
 7 |     # Can use len(record) as shortcut for len(record.seq)
 8 |     total = total + len(record)
 9 | print(str(count) + " records, total length " +str(total) + " in file " + filename)
10 | 


--------------------------------------------------------------------------------
/reading_writing_alignments/README.rst:
--------------------------------------------------------------------------------
  1 | =========================================
  2 | Reading Multiple-sequence Alignment Files
  3 | =========================================
  4 | 
  5 | The previous sections looked at Biopython's ``SeqIO`` module for
  6 | sequence file input and output
  7 | (`reading <../reading_sequence_files/README.rst>`_ and
  8 | `writing <../writing_sequence_files/README.rst>`_ sequence files).
  9 | 
 10 | Now we come to the ``AlignIO`` module which as the name suggests
 11 | is for alignment input and output. Note that this is focused on
 12 | dealing with multiple sequence alignments of the kind typically
 13 | used in phylogenetics - a separate ``SearchIO`` module targets
 14 | pairwise alignments generated by search tools like BLAST.
 15 | 
 16 | These examples use a number of real example sequence alignment files,
 17 | see the sample data instructions in the `introduction <../README.rst>`_
 18 | for how to download them.
 19 | 
 20 | We're going to look at a small seed alignment for one of the PFAM
 21 | domains, the `A2L zinc ribbon domain (A2L_zn_ribbon; PF08792)
 22 | <http://pfam.sanger.ac.uk/family/PF08792>`_. This was picked
 23 | almost at random - it is small enough to see the entire alignment
 24 | on screen, and has some obvious gap-rich columns.
 25 | 
 26 | From the alignments tab on the Pfam webpage, you can download
 27 | the raw alignment in several different formats (Selex, Stockholm,
 28 | FASTA, and MSF). Biopython is able to work with FASTA (very simple)
 29 | and Stockholm format (richly annotated).
 30 | 
 31 | --------------------------
 32 | Loading a single Alignment
 33 | --------------------------
 34 | 
 35 | As in ``SeqIO``, under ``AlignIO`` we have both ``AlignIO.parse(...)``
 36 | for looping over multiple separate alignments, and ``AlignIO.read(...)``
 37 | for loading a file containing a single alignment.
 38 | 
 39 | Most of the time you will be working with alignment files which contain
 40 | a single alignment, so normally you will use ``AlignIO.read(..)``.
 41 | 
 42 | Here is an example loading the Pfam seed alignment for the `A2L zinc ribbon
 43 | domain (A2L_zn_ribbon; PF08792) <http://pfam.sanger.ac.uk/family/PF08792>`_:
 44 | 
 45 | .. sourcecode:: pycon
 46 | 
 47 |     >>> from Bio import AlignIO
 48 |     >>> alignment = AlignIO.read("PF08792_seed.sth", "stockholm")
 49 |     >>> print(alignment)
 50 |     SingleLetterAlphabet() alignment with 14 rows and 37 columns
 51 |     SIPVVCT---CGNDKDFY--KDDDIYICQLCNAETVK VF282_IIV6/150-181
 52 |     DIIENCKY--CGSFDIE---KVKDIYTCGDCTQTYTT Q9YW27_MSEPV/2-33
 53 |     SDNIKCKY--CNSFNII---KNKDIYSCCDCSNCYTT Q9EMK1_AMEPV/2-33
 54 |     AQDWRCDD--CNATLVYV--KKDAQRVCLECGKSTFF Q6XM16_9PHYC/83-115
 55 |     SKEWICEV--CNKELVYI--RKDAERVCPDCGLSHPY Q8QNH7_ESV1K/101-133
 56 |     NDDSKCIK--CGGPVLMQ--AARSLLNCQECGYSAAV Q4A276_EHV8U/148-180
 57 |     KSQNVCSVPDCDGEKILN--QNDGYMVCKKCGFSEPI YR429_MIMIV/213-247
 58 |     LKYKECKY--CHTDMVFN--TTQFGLQCPNCGCIQEL VF385_ASFB7/145-177
 59 |     RNLKSCSN--CKHNGLI---TEYNHEFCIFCQSVFQL Q6VZA9_CNPV/2-33
 60 |     MNLRMCGG--CRRNGLV---SDADYEFCLFCETVFPM Q6TVP3_ORFSA/1-32
 61 |     MNLRLCSG--CRHNGIV---SEQGYEYCIFCESVFQK VLTF3_VACCC/1-32
 62 |     MNLKMCSG--CSHNGIV---SEHGYEFCIFCESIFQS Q8V3K7_SWPV1/1-32
 63 |     NALRHCHG--CKHNGLV---LEQGYEFCIFCQAVFQH O11357_MCV1/5-36
 64 |     DQIYTCT---CGGQMELWVNSTQSDLVCNECGATQPY Y494R_PBCV1/148-181
 65 | 
 66 | Printing a Biopython alignment object will give you a display as above
 67 | (but truncated for larger alignments).
 68 | 
 69 | In many ways, the alignment acts like a list of ``SeqRecord``
 70 | objects (just like you would get from ``SeqIO``). The length
 71 | of the alignment is the number of rows for example, and you
 72 | can loop over the rows as individual ``SeqRecord`` objects:
 73 | 
 74 | .. sourcecode:: pycon
 75 | 
 76 |     >>> print(len(alignment))
 77 |     14
 79 |     >>> for record in alignment:
 80 |     ...     print(record.id + " has " + str(record.seq.count("-")) + " gaps")
 81 |     ... 
 82 |     VF282_IIV6/150-181 has 5 gaps
 83 |     Q9YW27_MSEPV/2-33 has 5 gaps
 84 |     Q9EMK1_AMEPV/2-33 has 5 gaps
 85 |     Q6XM16_9PHYC/83-115 has 4 gaps
 86 |     Q8QNH7_ESV1K/101-133 has 4 gaps
 87 |     Q4A276_EHV8U/148-180 has 4 gaps
 88 |     YR429_MIMIV/213-247 has 2 gaps
 89 |     VF385_ASFB7/145-177 has 4 gaps
 90 |     Q6VZA9_CNPV/2-33 has 5 gaps
 91 |     Q6TVP3_ORFSA/1-32 has 5 gaps
 92 |     VLTF3_VACCC/1-32 has 5 gaps
 93 |     Q8V3K7_SWPV1/1-32 has 5 gaps
 94 |     O11357_MCV1/5-36 has 5 gaps
 95 |     Y494R_PBCV1/148-181 has 3 gaps
 96 | 
 97 | **Exercise**: Write a python script called ``count_gaps.py`` which
 98 | reports the number of records, the total number of gaps, and the
 99 | mean (average) number of gaps per record:
100 | 
101 | .. sourcecode:: console
102 | 
103 |     $ python count_gaps.py
104 |     PF08792_seed.sth had 14 records,
105 |     Total gaps 61, average per record 4.35714285714
106 | 
107 | *Tip*: If you get zero as the average, and are using Python 2,
108 | add the following special import line to the start of your Python
109 | file. This will give natural division (as used in Python 3) rather
110 | than integer division (used by default in Python 2):
111 | 
112 | .. sourcecode:: python
113 | 
114 |     from __future__ import division
115 | 
116 | =========================================
117 | Writing Multiple-sequence Alignment Files
118 | =========================================
119 | 
120 | As you might guess from using ``SeqIO.convert(...)`` and
121 | ``SeqIO.write(...)``, there are matching ``AlignIO.convert()``
122 | and ``AlignIO.write(...)`` functions.
123 | 
124 | For example, this will convert the Stockholm formatted alignment
125 | into a relaxed PHYLIP format file:
126 | 
127 | .. sourcecode:: python
128 | 
129 |     from Bio import AlignIO
130 |     input_filename = "PF08792_seed.sth"
131 |     output_filename = "PF08792_seed_converted.phy"
132 |     AlignIO.convert(input_filename, "stockholm", output_filename, "phylip-relaxed")
133 | 
134 | **Exercise**: Modify this example to convert the Stockholm file
135 | into a FASTA alignment file.
136 | 
137 | This ``AlignIO.convert(...)``  example is equivalent to using
138 | ``AlignIO.read(...)`` and ``AlignIO.write(...)`` explicitly:
139 | 
140 | .. sourcecode:: python
141 | 
142 |     from Bio import AlignIO
143 |     input_filename = "PF08792_seed.sth"
144 |     output_filename = "PF08792_seed_converted.phy"
145 |     alignment = AlignIO.read(input_filename, "stockholm")
146 |     AlignIO.write(alignment, output_filename, "phylip-relaxed")
147 | 
148 | This form is most useful if you wish to modify the alignment in some way,
149 | which we will do next.
150 | 
151 | ----------------
152 | Sorting the rows
153 | ----------------
154 | 
155 | Downloading from Pfam gives you the option of picking the order
156 | the rows appear in - by default this is according to the *tree*
157 | order (clustering similar sequences together), but it can also
158 | be *alphabetical* (using the identifiers).
159 | 
160 | We downloaded the file using the tree order, but here is how you
161 | can sort the rows by identifier within Biopython:
162 | 
163 | .. sourcecode:: pycon
164 | 
165 |     >>> from Bio import AlignIO
166 |     >>> alignment = AlignIO.read("PF08792_seed.sth", "stockholm")
167 |     >>> alignment.sort()
168 |     >>> print(alignment)
169 |     SingleLetterAlphabet() alignment with 14 rows and 37 columns
170 |     NALRHCHG--CKHNGLV---LEQGYEFCIFCQAVFQH O11357_MCV1/5-36
171 |     NDDSKCIK--CGGPVLMQ--AARSLLNCQECGYSAAV Q4A276_EHV8U/148-180
172 |     MNLRMCGG--CRRNGLV---SDADYEFCLFCETVFPM Q6TVP3_ORFSA/1-32
173 |     RNLKSCSN--CKHNGLI---TEYNHEFCIFCQSVFQL Q6VZA9_CNPV/2-33
174 |     AQDWRCDD--CNATLVYV--KKDAQRVCLECGKSTFF Q6XM16_9PHYC/83-115
175 |     SKEWICEV--CNKELVYI--RKDAERVCPDCGLSHPY Q8QNH7_ESV1K/101-133
176 |     MNLKMCSG--CSHNGIV---SEHGYEFCIFCESIFQS Q8V3K7_SWPV1/1-32
177 |     SDNIKCKY--CNSFNII---KNKDIYSCCDCSNCYTT Q9EMK1_AMEPV/2-33
178 |     DIIENCKY--CGSFDIE---KVKDIYTCGDCTQTYTT Q9YW27_MSEPV/2-33
179 |     SIPVVCT---CGNDKDFY--KDDDIYICQLCNAETVK VF282_IIV6/150-181
180 |     LKYKECKY--CHTDMVFN--TTQFGLQCPNCGCIQEL VF385_ASFB7/145-177
181 |     MNLRLCSG--CRHNGIV---SEQGYEYCIFCESVFQK VLTF3_VACCC/1-32
182 |     DQIYTCT---CGGQMELWVNSTQSDLVCNECGATQPY Y494R_PBCV1/148-181
183 |     KSQNVCSVPDCDGEKILN--QNDGYMVCKKCGFSEPI YR429_MIMIV/213-247
184 | 
185 | **Exercise**: Write a Python script ``sort_alignment_by_id.py``
186 | which uses ``AlignIO.read(..)`` and ``AlignIO.write(..)``
187 | to convert ``PF08792_seed.sth`` into a sorted FASTA file.
188 | 
189 | By default the alignment's sort method uses the identifiers as
190 | the sort key, but much like how sorting a Python list works,
191 | you can override this.
192 | 
193 | **Advanced Exercise**: Define your own function taking a single
194 | argument (a ``SeqRecord``) which returns the number of gaps
195 | in the sequence. Use this to sort the alignment and print it
196 | to screen (or save it as a new file):
197 | 
198 | .. sourcecode:: python
199 | 
200 |     from Bio import AlignIO
201 | 
202 |     def count_gaps(record):
203 |         """Counts number of gaps in record's sequence."""
204 |         return 0  # Fill in code
205 | 
206 |     filename = "PF08792_seed.sth"
207 |     alignment = AlignIO.read(filename, "stockholm")
208 |     alignment.sort(key=count_gaps)
209 |     print(alignment)
210 | 
211 | Expected output:
212 | 
213 | .. sourcecode:: console
214 | 
215 |     $ python sort_gaps.py
216 |     SingleLetterAlphabet() alignment with 14 rows and 37 columns
217 |     KSQNVCSVPDCDGEKILN--QNDGYMVCKKCGFSEPI YR429_MIMIV/213-247
218 |     DQIYTCT---CGGQMELWVNSTQSDLVCNECGATQPY Y494R_PBCV1/148-181
219 |     AQDWRCDD--CNATLVYV--KKDAQRVCLECGKSTFF Q6XM16_9PHYC/83-115
220 |     SKEWICEV--CNKELVYI--RKDAERVCPDCGLSHPY Q8QNH7_ESV1K/101-133
221 |     NDDSKCIK--CGGPVLMQ--AARSLLNCQECGYSAAV Q4A276_EHV8U/148-180
222 |     LKYKECKY--CHTDMVFN--TTQFGLQCPNCGCIQEL VF385_ASFB7/145-177
223 |     SIPVVCT---CGNDKDFY--KDDDIYICQLCNAETVK VF282_IIV6/150-181
224 |     DIIENCKY--CGSFDIE---KVKDIYTCGDCTQTYTT Q9YW27_MSEPV/2-33
225 |     SDNIKCKY--CNSFNII---KNKDIYSCCDCSNCYTT Q9EMK1_AMEPV/2-33
226 |     RNLKSCSN--CKHNGLI---TEYNHEFCIFCQSVFQL Q6VZA9_CNPV/2-33
227 |     MNLRMCGG--CRRNGLV---SDADYEFCLFCETVFPM Q6TVP3_ORFSA/1-32
228 |     MNLRLCSG--CRHNGIV---SEQGYEYCIFCESVFQK VLTF3_VACCC/1-32
229 |     MNLKMCSG--CSHNGIV---SEHGYEFCIFCESIFQS Q8V3K7_SWPV1/1-32
230 |     NALRHCHG--CKHNGLV---LEQGYEFCIFCQAVFQH O11357_MCV1/5-36
231 | 


--------------------------------------------------------------------------------
/reading_writing_alignments/count_gaps.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | from Bio import AlignIO
 3 | 
 4 | filename = "PF08792_seed.sth"
 5 | alignment = AlignIO.read(filename, "stockholm")
 6 | gaps = 0
 7 | for record in alignment:
 8 |     gaps = gaps + record.seq.count("-")
 9 | count = len(alignment)  # number of records
10 | print(filename + " had " + str(count) + " records,")
11 | print("Total gaps " + str(gaps) + ", average per record " + str(gaps / count))
12 | 


--------------------------------------------------------------------------------
/reading_writing_alignments/sort_gaps.py:
--------------------------------------------------------------------------------
 1 | from Bio import AlignIO
 2 | 
 3 | def count_gaps(record):
 4 |     """Counts number of gaps in record's sequence."""
 5 |     return record.seq.count("-")
 6 | 
 7 | filename = "PF08792_seed.sth"
 8 | alignment = AlignIO.read(filename, "stockholm")
 9 | alignment.sort(key=count_gaps)
10 | print(alignment)
11 | 


--------------------------------------------------------------------------------
/tests/README.rst:
--------------------------------------------------------------------------------
1 | This folder (``tests``) contains a number of Python scripts used to
2 | test all the examples used in this workshop, and is not part of the
3 | material the workshop participants are expected to read.
4 | 


--------------------------------------------------------------------------------
/tests/test_consistency.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Check the sample python scripts match the embedded copies in README.rst files.
 3 | 
 4 | This is a workaround for the fact that (due to security concerns)
 5 | neither BitBucket nor GitHub support the reStructuredText include
 6 | directive, which would have allowed direct embedding of small
 7 | Python scripts into the documentation. See:
 8 | 
 9 | - https://bitbucket.org/site/master/issue/5411/restructuredtext-include-directive
10 | - https://github.com/github/markup/issues/172
11 | """
12 | 
13 | from __future__ import print_function
14 | import os
15 | import sys
16 | 
# Work out where the repository root is relative to the current directory,
# using the presence of this script file as the marker.
filename = os.path.basename(__file__)
in_tests_dir = os.path.isfile(filename)
in_repo_root = (not in_tests_dir) and os.path.isfile(os.path.join("tests", filename))
if in_tests_dir:
    # This script is in the current directory, so the root is one level up
    base_path = ".."
elif in_repo_root:
    # This script is under ./tests, so we are already in the root
    base_path = "."
else:
    sys.stderr.write("Should be in base folder or tests folder.\n")
    sys.exit(1)
27 | 
def load_and_indent(filename, indent=" "*4):
    """Load a text file as a string, adding the indent to each line.

    Every line read from the file is prefixed with ``indent``, including
    blank lines (which therefore become ``indent`` followed by a newline).
    An empty file gives an empty string.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the text has been read, rather than whenever it is garbage collected.
    """
    with open(filename) as handle:
        return "".join(indent + line for line in handle)
34 | 
# Tallies over every README checked: scripts both named and embedded (good),
# scripts named but not embedded (warning), and everything else (error).
good = 0
warn = 0
errors = 0
for dirpath, dirnames, filenames in os.walk(base_path):
    if "README.rst" not in filenames:
        continue
    if os.path.basename(dirpath) == "tests":
        # Skip the tests folder's own README. The folder name is compared
        # directly (rather than a "/tests/README.rst" suffix) so the skip
        # also works where the path separator is not "/".
        continue
    readme = os.path.join(dirpath, "README.rst")
    print("-" * 40)
    print("Checking %s" % readme)
    #Which script files might this contain?
    scripts = dict()
    for f in filenames:
        if f.endswith(".py"):
            scripts[f] = load_and_indent(os.path.join(dirpath, f))
    if not scripts:
        print("No local script files for this")
        continue
    #Now check the README.rst file contains them...
    print("Using: %s" % ", ".join(sorted(scripts)))
    with open(readme) as handle:
        text = handle.read()
    for filename, script in sorted(scripts.items()):
        # A script counts as "named" if the README quotes its filename as a
        # literal or shows it being run from the command line.
        filename_used = (("``%s``" % filename) in text) or (("$ python %s" % filename) in text)
        # A script counts as "embedded" if its indented text appears verbatim.
        script_embedded = script in text
        if filename_used and script_embedded:
            print(" - %s named and embedded" % filename)
            good += 1
        elif filename_used:
            print(" - %s named but not embedded (warning)" % filename)
            warn += 1
        elif script_embedded:
            print(" - %s not named, but embedded in text (ERROR)" % filename)
            errors += 1
        else:
            print(" - %s neither named nor embedded (ERROR)" % filename)
            errors += 1
print("=" * 40)
print("%i good, %i warnings, %i errors" % (good, warn, errors))
if errors:
    # Newline added so the message does not run into the shell prompt.
    sys.stderr.write("Consistency test failed\n")
    sys.exit(1)
78 | 
79 |             
80 | 


--------------------------------------------------------------------------------
/tests/test_scripts.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Check the sample python scripts run.
 3 | 
 4 | Useful to catch any Python 2 vs Python 3 syntax errors.
 5 | 
 6 | TODO: Check output against embedded examples in README.rst?
 7 | TODO: Handle any command line switches?
 8 | """
 9 | 
10 | from __future__ import print_function
11 | import os
12 | import sys
13 | import subprocess
14 | 
filename = os.path.basename(__file__)
if os.path.isfile(filename):
    # Started from inside the tests directory. The sample data files are
    # assumed to be in the repository root directory, so move up to it.
    os.chdir("..")
# Note this is a separate "if", not an "elif": it is evaluated after any
# chdir above, so it covers both starting locations.
if not os.path.isfile(os.path.join("tests", filename)):
    sys.stderr.write("Should be in base folder or tests folder.\n")
    sys.exit(1)
# By this point the current directory is the repository root directory
base_path = "."
27 | 
28 | 
def abbreviate(text):
    """Shorten long output text for display.

    - Text of 1000 characters or fewer is returned unchanged.
    - Longer text with more than 20 lines is cut down to its first ten and
      last ten lines, with a ``...`` line in between.
    - Longer text with 20 lines or fewer (i.e. a few very long lines) is cut
      down to its first and last 100 characters, with ``...`` on a line of
      its own in between.
    """
    if len(text) <= 1000:
        return text
    lines = text.split("\n")
    if len(lines) > 20:
        return "\n".join(lines[:10] + ["..."] + lines[-10:])
    # Not elegant... but must slice the text itself here: slicing the list
    # of lines and adding a string to it raises a TypeError.
    return text[:100] + "\n...\n" + text[-100:]
38 | 
39 | 
def check(script):
    """Run a script in a child Python process and record the outcome.

    Increments exactly one of the module-level counters:

    - ``errors`` if the child exits with a non-zero return code; the return
      code and anything the child wrote to stderr are reported on stderr.
    - ``warn`` if the child exits cleanly but wrote to stderr, which is
      passed through to stderr.
    - ``good`` otherwise, in which case the (abbreviated) stdout is shown.
    """
    global good, warn, errors
    # Use the interpreter running this test, so the scripts are checked under
    # the same Python version; fall back to "python" on the PATH if the
    # interpreter's own path is not known.
    child = subprocess.Popen([sys.executable or "python", script],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             )
    stdout, stderr = child.communicate()
    if child.returncode:
        errors += 1
        sys.stderr.write("Return code %i from %s\n" % (child.returncode, script))
        if stderr:
            # Show the child's error output (e.g. its traceback), otherwise
            # there is nothing to say why the script failed.
            sys.stderr.write(stderr)
    elif stderr:
        warn += 1
        sys.stderr.write(stderr)
    else:
        good += 1
        sys.stdout.write(abbreviate(stdout))
59 | 
60 | 
# Counters updated by check() for every script that is run.
good = 0
warn = 0
errors = 0
for dirpath, dirnames, filenames in os.walk(base_path):
    if "README.rst" not in filenames:
        continue
    if os.path.basename(dirpath) == "tests":
        # Skip the tests folder so the test scripts do not run themselves.
        # The folder name is compared directly (rather than a
        # "/tests/README.rst" suffix) so the skip also works where the path
        # separator is not "/".
        continue
    scripts = [f for f in filenames if f.endswith(".py")]
    if not scripts:
        continue
    print("-" * 40)
    print("Checking %s (%i scripts)" % (dirpath, len(scripts)))
    print("-" * 40)
    for f in scripts:
        script = os.path.join(dirpath, f)
        print("Checking %s" % script)
        check(script)
print("=" * 40)
print("%i good, %i warnings, %i errors" % (good, warn, errors))
if errors:
    sys.stderr.write("Test failed\n")
    sys.exit(1)
85 | 
86 | 


--------------------------------------------------------------------------------
/using_seqfeatures/README.rst:
--------------------------------------------------------------------------------
  1 | ==============================
  2 | Working with Sequence Features
  3 | ==============================
  4 | 
  5 | This picks up from the end of the section on `reading sequence files
  6 | <../reading_sequence_files/README.rst>`_, but looks at the feature
  7 | annotation included in some file formats like EMBL or GenBank.
  8 | 
  9 | Most of the time GenBank files contain a single record for a single
 10 | chromosome or plasmid, so we'll generally use the ``SeqIO.read(...)``
 11 | function. Remember the second argument is the file format, so if we
 12 | start from the code to read in a FASTA file:
 13 | 
 14 | .. sourcecode:: pycon
 15 | 
 16 |     >>> from Bio import SeqIO
 17 |     >>> record = SeqIO.read("NC_000913.fna", "fasta")
 18 |     >>> print(record.id)
 19 |     gi|556503834|ref|NC_000913.3|
 20 |     >>> print(len(record))
 21 |     4641652
 22 |     >>> print(len(record.features))
 23 |     0
 24 | 
 25 | Now switch the filename and the format:
 26 | 
 27 | .. sourcecode:: pycon
 28 | 
 29 |     >>> from Bio import SeqIO
 30 |     >>> record = SeqIO.read("NC_000913.gbk", "genbank")
 31 |     >>> print(record.id)
 32 |     NC_000913.3
 33 |     >>> print(len(record))
 34 |     4641652
 35 |     >>> print(len(record.features))
 36 |     23086
 37 | 
 38 | So what is this new ``.features`` thing? It is a Python list, containing
 39 | a Biopython ``SeqFeature`` object for each feature in the GenBank file.
 40 | For instance,
 41 | 
 42 | .. sourcecode:: pycon
 43 | 
 44 |     >>> my_gene = record.features[3]
 45 |     >>> print(my_gene)
 46 |     type: gene
 47 |     location: [336:2799](+)
 48 |     qualifiers: 
 49 |         Key: db_xref, Value: ['EcoGene:EG10998', 'GeneID:945803']
 50 |         Key: gene, Value: ['thrA']
 51 |         Key: gene_synonym, Value: ['ECK0002; Hs; JW0001; thrA1; thrA2; thrD']
 52 |         Key: locus_tag, Value: ['b0002']
 53 | 
 54 | Doing a print like this tries to give a human readable display. There
 55 | are three key properties, ``.type`` which is a string like ``gene``
 56 | or ``CDS``, ``.location`` which describes where on the genome this
 57 | feature is, and ``.qualifiers`` which is a Python dictionary full of
 58 | all the annotation for the feature (things like gene identifiers).
 59 | 
 60 | This is what this gene looks like in the raw GenBank file::
 61 | 
 62 |      gene            337..2799
 63 |                      /gene="thrA"
 64 |                      /locus_tag="b0002"
 65 |                      /gene_synonym="ECK0002; Hs; JW0001; thrA1; thrA2; thrD"
 66 |                      /db_xref="EcoGene:EG10998"
 67 |                      /db_xref="GeneID:945803"
 68 | 
 69 | Hopefully it is fairly clear how this maps to the ``SeqFeature`` structure.
 70 | The `Biopython Tutorial & Cookbook <http://biopython.org/DIST/docs/tutorial/Tutorial.html>`_
 71 | (`PDF <http://biopython.org/DIST/docs/tutorial/Tutorial.pdf>`_) goes into
 72 | more detail about this.
 73 | 
 74 | -----------------
 75 | Feature Locations
 76 | -----------------
 77 | 
 78 | We're going to focus on using the location information for different feature
 79 | types. Continuing with the same example:
 80 | 
 81 | .. sourcecode:: pycon
 82 | 
 83 |     >>> from Bio import SeqIO
 84 |     >>> record = SeqIO.read("NC_000913.gbk", "genbank")
 85 |     >>> my_gene = record.features[3]
 86 |     >>> print(my_gene.qualifiers["locus_tag"])
 87 |     ['b0002']
 88 |     >>> print(my_gene.location)
 89 |     [336:2799](+)
 90 |     >>> print(my_gene.location.start)
 91 |     336
 92 |     >>> print(my_gene.location.end)
 93 |     2799
 94 |     >>> print(my_gene.location.strand)
 95 |     1
 96 | 
 97 | Recall in the GenBank file this simple location was ``337..2799``, yet
 98 | in Biopython this has become a start value of 336 and 2799 as the end.
 99 | The reason for this is to match how Python counting works, in particular
100 | how Python string slicing works. In order to pull out this sequence from the full
101 | genome we need to use slice values of 336 and 2799:
102 | 
103 | .. sourcecode:: pycon
104 | 
105 |     >>> gene_seq = record.seq[336:2799]
106 |     >>> len(gene_seq)
107 |     2463
108 |     >>> print(gene_seq)
109 |     ...
110 | 
111 | This was a very simple location on the forward strand, if it had been on
112 | the reverse strand you'd need to take the reverse-complement. Also if the
113 | location had been a more complicated compound location like a *join* (used
114 | for eukaryotic genes where the CDS is made up of several exons), then the
115 | location would have sub-parts to consider.
116 | 
117 | All these complications are taken care of for you via the ``.extract(...)``
118 | method which takes the full length parent record's sequence as an argument:
119 | 
120 | .. sourcecode:: pycon
121 | 
122 |     >>> gene_seq = my_gene.extract(record.seq)
123 |     >>> len(gene_seq)
124 |     2463
125 |     >>> print(gene_seq)
126 |     ...
127 | 
128 | **Exercise**: Finish the following script by setting an appropriate
129 | feature name like the locus tag or GI number (use the ``.qualifiers``
130 | or ``.dbxrefs`` information) to extract all the coding sequences from
131 | the GenBank file:
132 | 
133 | .. sourcecode:: python
134 | 
135 |     from Bio import SeqIO
136 |     record = SeqIO.read("NC_000913.gbk", "genbank")
137 |     output_handle = open("NC_000913_cds.fasta", "w")
138 |     count = 0
139 |     for feature in record.features:
140 |         if feature.type == "CDS":
141 |             count = count + 1
142 |             feature_name = "..." # Use feature.qualifiers or feature.dbxrefs here
143 |             feature_seq = feature.extract(record.seq)
144 |             # Simple FASTA output without line wrapping:
145 |             output_handle.write(">" + feature_name + "\n" + str(feature_seq) + "\n")
146 |     output_handle.close()
147 |     print(str(count) + " CDS sequences extracted")
148 | 
149 | .. sourcecode:: console
150 | 
151 |     $ python extract_cds.py 
152 |     4321 CDS sequences extracted
153 | 
154 | Check your sequences using the NCBI provided FASTA file ``NC_000913.ffn``.
155 | 
156 | **Advanced exercise**: Can you recreate the NCBI naming scheme as used
157 | in ``NC_000913.ffn``?
158 | 
159 | **Advanced exercise**: Using the Biopython documentation, can you create
160 | a new ``SeqRecord`` object and then use ``SeqIO.write(...)`` which will
161 | produce line-wrapped FASTA output.
162 | 
163 | ---------------
164 | Feature Lengths
165 | ---------------
166 | 
167 | The length of Biopython's ``SeqFeature`` objects (and the location objects)
168 | is defined as the length of the sequence region they describe (i.e. how
169 | many bases are included; or for protein annotation how many amino acids).
170 | 
171 | .. sourcecode:: pycon
172 | 
173 |     >>> len(my_gene)
174 |     2463
175 | 
176 | Remember when we checked the length of ``my_gene.extract(record.seq)``
177 | that also gave 2463.
178 | 
179 | This example loops over all the features looking for gene records, and
180 | calculates their total length:
181 | 
182 | .. sourcecode:: python
183 | 
184 |     from Bio import SeqIO
185 |     record = SeqIO.read("NC_000913.gbk", "genbank")
186 |     total = 0
187 |     for feature in record.features:
188 |         if feature.type == "gene":
189 |             total = total + len(feature)
190 |     print("Total length of all genes is " + str(total))
191 | 
192 | .. sourcecode:: console
193 | 
194 |     $ python total_gene_lengths.py
195 |     Total length of genome is 4641652
196 |     Total length of all genes is 4137243
197 | 
198 | **Exercise**: Give a separate count for each feature type. Use a dictionary
199 | where the keys are the feature type (e.g. "gene" and "CDS") and the values
200 | are the count for that type.
201 | 
202 | **Discussion**: What proportion of the genome is annotated as gene coding?
203 | What assumptions does this estimate of 89% make:
204 | 
205 | .. sourcecode:: pycon
206 | 
207 |     >>> 4137243 * 100.0 / 4641652
208 |     89.13298541122859
209 | 
210 | **Exercise**: Extend the previous script to also count the number of
211 | features of each type, and report this and the average length of that
212 | feature type. e.g.
213 | 
214 | .. sourcecode:: console
215 | 
216 |     $ python total_feature_lengths.py
217 |     Total length of genome is 4641652
218 |     misc_feature
219 |      - total number: 13686
220 |      - total length: 6136082
221 |      - average length: 448.347362268
222 |     mobile_element
223 |      - total number: 49
224 |      - total length: 50131
225 |      - average length: 1023.08163265
226 |     ...
227 | 
228 | **Discussion**: What proportion of the genome is annotated with *misc_feature*?
229 | Does this simple calculation give a meaningful answer?
230 | 
231 | .. sourcecode:: pycon
232 | 
233 |     >>> 6136082 * 100.0 / 4641652
234 |     132.19608018869144
235 | 
236 | This is an alternative approach, using some more advanced bits of Python like
237 | the set datatype, and the concept of iterating over the bases within a feature:
238 | 
239 | .. sourcecode:: pycon
240 | 
241 |     >>> from Bio import SeqIO
242 |     >>> record = SeqIO.read("NC_000913.gbk", "genbank")
243 |     >>> bases = set()
244 |     >>> for feature in record.features:
245 |     ...     if feature.type == "misc_feature":
246 |     ...         bases.update(feature.location)
247 |     ... 
248 |     >>> print(len(bases) * 100.0 / len(record))
249 |     80.69355479471533
250 | 
251 | **Exercise**: Without worrying too much about how it works, modify this example
252 | to count the number of bases in the *gene* features.
253 | 
254 | .. sourcecode:: console
255 | 
256 |     $ python bases_in_genes.py 
257 |     88.9494085295
258 | 
259 | **Discussion**: Compare this calculation (88.95%) to one earlier (89.13%).
260 | Which is a better estimate of the proportion of the genome which encodes genes?
261 | When might these methods give very different answers? Any virologists in the group?
262 | How should this be defined given that any single base may be in more than one gene?
263 | 
264 | ------------------------
265 | Translating CDS features
266 | ------------------------
267 | 
268 | When dealing with GenBank files and trying to get the protein sequence of the
269 | genes, you'll need to look at the CDS features (coding sequences) - not the
270 | gene features (although for simple cases they'll have the same location).
271 | 
272 | Sometimes, as in the *E. coli* example, you will find the translation is
273 | provided in the qualifiers:
274 | 
275 |     >>> from Bio import SeqIO
276 |     >>> record = SeqIO.read("NC_000913.gbk", "genbank")
277 |     >>> my_cds = record.features[4]
278 |     >>> print(my_cds.qualifiers["locus_tag"])
279 |     ['b0002']
280 |     >>> print(my_cds.qualifiers["translation"])
281 |     ['MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNI...KLGV']
282 | 
283 | This has been truncated for display here - the whole protein sequence is
284 | present. However, many times the annotation will not include the amino acid
285 | translation - but we can get it by translating the nucleotide sequence.
286 | 
287 |     >>> cds_seq = my_cds.extract(record.seq)
288 |     >>> protein_seq = cds_seq.translate(table=11)
289 |     >>> len(protein_seq)
290 |     821
291 |     >>> print(protein_seq)
292 |     MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNI...KLGV*
293 | 
294 | Notice that because this is a bacterium, we used the NCBI translation table 11,
295 | rather than the default (suitable for humans etc).
296 | 
297 | **Advanced Exercise**: Using this information, and the CDS extraction script
298 | from earlier, translate all the CDS features into a FASTA file.
299 | 
300 | Check your sequences using the NCBI provided FASTA file ``NC_000913.faa``.
301 | 


--------------------------------------------------------------------------------
/using_seqfeatures/bases_in_genes.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | record = SeqIO.read("NC_000913.gbk", "genbank")
 3 | bases = set() # Python's built in set datatype
 4 | for feature in record.features:
 5 |     if feature.type == "gene":
 6 |         # This adds all the possible base coordinates
 7 |         # within the feature location to the set. Try
 8 |         # print(list(feature.location)) on a gene...
 9 |         bases.update(feature.location)
10 | # The Python set doesn't store duplicates, so len(bases)
11 | # is the number of unique bases in at least one gene.
12 | print(len(bases) * 100.0 / len(record))
13 | 


--------------------------------------------------------------------------------
/using_seqfeatures/extract_cds.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | record = SeqIO.read("NC_000913.gbk", "genbank")
 3 | output_handle = open("NC_000913_cds.fasta", "w")
 4 | count = 0
 5 | for feature in record.features:
 6 |     if feature.type == "CDS":
 7 |         count = count + 1
 8 |         feature_name = feature.qualifiers["locus_tag"][0]
 9 |         feature_seq = feature.extract(record.seq)
10 |         # Simple FASTA output without line wrapping:
11 |         output_handle.write(">" + feature_name + "\n" + str(feature_seq) + "\n")
12 | output_handle.close()
13 | print(str(count) + " CDS sequences extracted")
14 | 


--------------------------------------------------------------------------------
/using_seqfeatures/total_feature_lengths.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | # (needed under Python 2 for sensible division)
 3 | 
 4 | from Bio import SeqIO
 5 | record = SeqIO.read("NC_000913.gbk", "genbank")
 6 | print("Total length of genome is " + str(len(record)))
 7 | totals = dict()
 8 | counts = dict()
 9 | for feature in record.features:
10 |     if feature.type in totals:
11 |         totals[feature.type] = totals[feature.type] + len(feature)
12 |         counts[feature.type] = counts[feature.type] + 1
13 |     else:
14 |         #First time to see this feature type
15 |         totals[feature.type] = 1
16 |         counts[feature.type] = 1
17 | for f_type in totals:
18 |     print(f_type)
19 |     print(" - total number: " + str(counts[f_type]))
20 |     print(" - total length: " + str(totals[f_type]))
21 |     ave_len = totals[f_type] / counts[f_type]
22 |     print(" - average length: " + str(ave_len))
23 | 


--------------------------------------------------------------------------------
/using_seqfeatures/total_gene_lengths.py:
--------------------------------------------------------------------------------
# Print the genome length, then the summed length of all "gene" features.
from Bio import SeqIO

genome = SeqIO.read("NC_000913.gbk", "genbank")
genome_length = len(genome)
print("Total length of genome is " + str(genome_length))

# len(feature) is the number of bases the feature's location describes.
gene_lengths = [len(f) for f in genome.features if f.type == "gene"]
total = sum(gene_lengths)
print("Total length of all genes is " + str(total))
9 | 


--------------------------------------------------------------------------------
/writing_sequence_files/README.rst:
--------------------------------------------------------------------------------
  1 | ====================================
  2 | Writing Sequences Files in Biopython
  3 | ====================================
  4 | 
  5 | The `previous section <../reading_sequence_files/README.rst>`_ talked
  6 | about reading sequence files in Biopython using the ``SeqIO.parse(...)``
  7 | function. Now we'll focus on writing sequence files using the sister
  8 | function ``SeqIO.write(...)``.
  9 | 
 10 | The more gently paced `Biopython Tutorial and Cookbook
 11 | <http://biopython.org/DIST/docs/tutorial/Tutorial.html>`_
 12 | (`PDF <http://biopython.org/DIST/docs/tutorial/Tutorial.pdf>`_)
 13 | first covers creating your own records (``SeqRecord`` objects) and
 14 | then how to write them out. We're going to skip that here, and work
 15 | with ready-made ``SeqRecord`` objects loaded with ``SeqIO.parse(...)``.
 16 | 
 17 | Let's start with something really simple...
 18 | 
 19 | --------------------------
 20 | Converting a sequence file
 21 | --------------------------
 22 | 
 23 | Recall we looked at the *E. coli* K12 chromosome as a FASTA file
 24 | ``NC_000913.fna`` and as a GenBank file ``NC_000913.gbk``. Suppose
 25 | we only had the GenBank file, and wanted to turn it into a FASTA file?
 26 | 
 27 | Biopython's ``SeqIO`` module can read and write lots of sequence file
 28 | formats, and has a handy helper function to convert a file:
 29 | 
 30 | .. sourcecode:: pycon
 31 | 
 32 |     >>> from Bio import SeqIO
 33 |     >>> help(SeqIO.convert)
 34 | 
 35 | Here's a very simple script which uses this function:
 36 | 
 37 | .. sourcecode:: python
 38 | 
 39 |     from Bio import SeqIO
 40 |     input_filename = "NC_000913.gbk"
 41 |     output_filename = "NC_000913_converted.fasta"
 42 |     count = SeqIO.convert(input_filename, "gb", output_filename, "fasta")
 43 |     print(str(count) + " records converted")
 44 | 
 45 | Save this as ``convert_gb_to_fasta.py`` and run it:
 46 | 
 47 | .. sourcecode:: console
 48 | 
 49 |     $ python convert_gb_to_fasta.py
 50 |     1 records converted
 51 | 
 52 | Notice that the ``SeqIO.convert(...)`` function returns the number of
 53 | sequences it converted - here only one. Also have a look at the output file:
 54 | 
 55 | .. sourcecode:: console
 56 | 
 57 |     $ head NC_000913_converted.fasta 
 58 |     >NC_000913.3 Escherichia coli str. K-12 substr. MG1655, complete genome.
 59 |     AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
 60 |     TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
 61 |     TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
 62 |     ACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
 63 |     AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG
 64 |     CTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
 65 |     ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC
 66 |     AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTG
 67 |     GCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAA
 68 | 
 69 | **Warning**: The output will over-write any pre-existing file of the same name.
 70 | 
 71 | **Advanced Exercise**: Modify this to add command line parsing to take
 72 | the input and output filenames as arguments.
 73 | 
 74 | The ``SeqIO.convert(...)`` function is effectively a shortcut combining
 75 | ``SeqIO.parse(...)`` for input and ``SeqIO.write(...)`` for output. Here's how
 76 | you'd do this explicitly:
 77 | 
 78 | .. sourcecode:: python
 79 | 
 80 |     from Bio import SeqIO
 81 |     input_filename = "NC_000913.gbk"
 82 |     output_filename = "NC_000913_converted.fasta"
 83 |     records_iterator = SeqIO.parse(input_filename, "gb")
 84 |     count = SeqIO.write(records_iterator, output_filename, "fasta")    
 85 |     print(str(count) + " records converted")
 86 | 
 87 | Previously we'd always used the results from ``SeqIO.parse(...)`` in a for
 88 | loop - but here the for loop happens inside the ``SeqIO.write(...)`` function.
 89 | 
 90 | **Exercise**: Check this does the same as the ``SeqIO.convert(...)`` version above.
 91 | 
 92 | The ``SeqIO.write(...)`` function is happy to be given multiple records
 93 | like this, or simply as a list of ``SeqRecord`` objects. You can also give
 94 | it just one record:
 95 | 
 96 | .. sourcecode:: python
 97 | 
 98 |     from Bio import SeqIO
 99 |     input_filename = "NC_000913.gbk"
100 |     output_filename = "NC_000913_converted.fasta"
101 |     record = SeqIO.read(input_filename, "gb")
102 |     SeqIO.write(record, output_filename, "fasta")
103 | 
104 | We'll be doing this in the next example, where we call ``SeqIO.write(...)``
105 | several times in order to build up a multi-record output file.
106 | 
107 | -------------------------
108 | Filtering a sequence file
109 | -------------------------
110 | 
111 | Suppose we wanted to filter a FASTA file by length, for example
112 | exclude protein sequences less than 100 amino acids long.
113 | 
114 | The `Biopython Tutorial and Cookbook
115 | <http://biopython.org/DIST/docs/tutorial/Tutorial.html>`_
116 | (`PDF <http://biopython.org/DIST/docs/tutorial/Tutorial.pdf>`_)
117 | has filtering  examples combining ``SeqIO.write(...)`` with more
118 | advanced Python features like generator expressions and so on.
119 | These are all worth learning about later, but in this workshop
120 | we will stick with the simpler for-loop.
121 | 
122 | You might try something like this:
123 | 
124 | .. sourcecode:: python
125 | 
126 |     from Bio import SeqIO
127 |     input_filename = "NC_000913.faa"
128 |     output_filename = "NC_000913_long_only.faa"
129 |     count = 0
130 |     total = 0
131 |     for record in SeqIO.parse(input_filename, "fasta"):
132 |         total = total + 1
133 |         if 100 <= len(record):
134 |             count = count + 1
135 |             SeqIO.write(record, output_filename, "fasta")
136 |     print(str(count) + " records selected out of " + str(total))
137 | 
138 | Save this as ``length_filter_naive.py``, and run it, and check it worked.
139 | 
140 | .. sourcecode:: console
141 | 
142 |     $ python length_filter_naive.py
143 |     3719 records selected out of 4141
144 | 
145 | **Discussion:** What goes wrong and why? Have a look at the output file...
146 | 
147 | .. sourcecode:: console
148 | 
149 |     $ grep -c "^>" NC_000913_long_only.faa
150 |     1
151 |     $ cat NC_000913_long_only.faa 
152 |     >gi|16132220|ref|NP_418820.1| predicted methyltransferase [Escherichia coli str. K-12 substr. MG1655]
153 |     MRITIILVAPARAENIGAAARAMKTMGFSDLRIVDSQAHLEPATRWVAHGSGDIIDNIKV
154 |     FPTLAESLHDVDFTVATTARSRAKYHYYATPVELVPLLEEKSSWMSHAALVFGREDSGLT
155 |     NEELALADVLTGVPMVADYPSLNLGQAVMVYCYQLATLIQQPAKSDATADQHQLQALRER
156 |     AMTLLTTLAVADDIKLVDWLQQRLGLLEQRDTAMLHRLLHDIEKNITK
157 | 
158 | The problem is that our output file only contains *one* sequence, actually
159 | the last long sequence in the FASTA file. Why? What happened is each time
160 | round the loop when we called ``SeqIO.write(...)`` to save one record, it
161 | overwrote the existing data.
162 | 
163 | The simplest solution is to open and close the file explicitly, using a *file handle*.
164 | The ``SeqIO`` functions are happy to work with either filenames (strings) or
165 | file handles, and this is a case where the more low-level handle is useful.
166 | 
167 | Here's a working version of the script, save this as ``length_filter.py``:
168 | 
169 | .. sourcecode:: python
170 | 
171 |     from Bio import SeqIO
172 |     input_filename = "NC_000913.faa"
173 |     output_filename = "NC_000913_long_only.faa"
174 |     count = 0
175 |     total = 0
176 |     output_handle = open(output_filename, "w")
177 |     for record in SeqIO.parse(input_filename, "fasta"):
178 |         total = total + 1
179 |         if 100 <= len(record):
180 |             count = count + 1
181 |             SeqIO.write(record, output_handle, "fasta")
182 |     output_handle.close()
183 |     print(str(count) + " records selected out of " + str(total))
184 | 
185 | This time we get the expected output - and it is much faster (needlessly
186 | creating and replacing several thousand small files is slow):
187 | 
188 | .. sourcecode:: console
189 | 
190 |     $ python length_filter.py
191 |     3719 records selected out of 4141
192 |     $ grep -c "^>" NC_000913_long_only.faa 
193 |     3719
194 | 
195 | Yay!
196 | 
197 | 
198 | -----------------
199 | Editing sequences
200 | -----------------
201 | 
202 | One of the examples in the `previous section <../reading_sequence_files/README.rst>`_
203 | looked at the potato protein sequences, and that they all had a terminal "*"
204 | character (stop codon). Python strings, Biopython ``Seq`` and ``SeqRecord`` objects
205 | can all be *sliced* to extract a sub-sequence or partial record. In this case,
206 | we want to take everything up to but excluding the final letter:
207 | 
208 | .. sourcecode:: pycon
209 | 
210 |     >>> my_seq = "MTAIVIGAKILGIIYSSPQLRKCNSATQNDHSDLQISFWKDHLRQCTTNS*"
211 |     >>> cut_seq = my_seq[:-1] # remove last letter
212 |     >>> print(cut_seq)
213 |     MTAIVIGAKILGIIYSSPQLRKCNSATQNDHSDLQISFWKDHLRQCTTNS
214 | 
215 | Consider the following example (which I'm calling ``cut_star_dangerous.py``):
216 | 
217 | .. sourcecode:: python
218 | 
219 |     from Bio import SeqIO
220 |     input_filename = "PGSC_DM_v3.4_pep_representative.fasta"
221 |     output_filename = "PGSC_DM_v3.4_pep_rep_no_stars.fasta"
222 |     output_handle = open(output_filename, "w")
223 |     for record in SeqIO.parse(input_filename, "fasta"):
224 |         cut_record = record[:-1]  # remove last letter
225 |         SeqIO.write(cut_record, output_handle, "fasta")
226 |     output_handle.close()
227 | 
228 | This should work fine on this potato file... but what might go wrong if you
229 | used it on another protein file? What happens if (some of) the input records
230 | don't end with a "*"?
231 | 
232 | **Exercise**: Modify this example to only remove the last letter if it is a "*"
233 | (and save the original record unchanged if it does not end with "*"). The sample
234 | solution is called ``cut_final_star.py`` instead.
235 | 
236 | 
237 | ------------------------
238 | Filtering by record name
239 | ------------------------
240 | 
241 | A very common task is pulling out particular sequences from a large sequence
242 | file. Membership testing with Python lists (or sets) is one neat way to do
243 | this. Recap:
244 | 
245 | .. sourcecode:: pycon
246 | 
247 |     >>> wanted_ids = ["PGSC0003DMP400019313", "PGSC0003DMP400020381", "PGSC0003DMP400020972"]
248 |     >>> "PGSC0003DMP400067339" in wanted_ids
249 |     False
250 |     >>> "PGSC0003DMP400020972" in wanted_ids
251 |     True
252 | 
253 | **Exercise**: Guided by the ``length_filter.py`` script, write a new script
254 | starting as follows which writes out the potato proteins on this list:
255 | 
256 | .. sourcecode:: python
257 | 
258 |     from Bio import SeqIO
259 |     wanted_ids = ["PGSC0003DMP400019313", "PGSC0003DMP400020381", "PGSC0003DMP400020972"]
260 |     input_filename = "PGSC_DM_v3.4_pep_representative.fasta"
261 |     output_filename = "wanted_potato_proteins.fasta"
262 |     count = 0
263 |     total = 0
264 |     output_handle = open(output_filename, "w")
265 |     # ...
266 |     # Your code here
267 |     # ...
268 |     output_handle.close()
269 |     print(str(count) + " records selected out of " + str(total))
270 | 
271 | The sample solution is called ``filter_wanted_id.py``, and the output should be:
272 | 
273 | .. sourcecode:: console
274 | 
275 |     $ python filter_wanted_id.py
276 |     3 records selected out of 39031
277 | 
278 | **Advanced Exercise**: Modify this to read the list of wanted identifiers from
279 | a plain text input file (one identifier per line).
280 | 
281 | **Advanced Exercise**: What is the advantage of using a Python set instead of
282 | a Python list for the wanted identifiers?
283 | 
284 | **Discussion**: What happens if a wanted identifier is not in the input file?
285 | What happens if an identifier appears twice? What order is the output file?
286 | 
287 | ------------------------
288 | Selecting by record name
289 | ------------------------
290 | 
291 | In the previous example, we used ``SeqIO.parse(...)`` to loop over the input
292 | FASTA file. This means the output order will be dictated by the input sequence
293 | file's order. What if you want the records in the specified order (regardless
294 | of the order in the FASTA file)?
295 | 
296 | In this situation, you can't make a single for loop over the FASTA file. For
297 | a tiny file you could load everything into memory (e.g. as a Python dictionary),
298 | but that won't work on larger files. Instead, we can use Biopython's
299 | ``SeqIO.index(...)`` function which lets us treat a sequence file like a
300 | Python dictionary:
301 | 
302 | .. sourcecode:: pycon
303 | 
304 |     >>> from Bio import SeqIO
305 |     >>> filename = "PGSC_DM_v3.4_pep_representative.fasta"
306 |     >>> fasta_index = SeqIO.index(filename, "fasta")
307 |     >>> print(str(len(fasta_index)) + " records in " + filename)
308 |     >>> "PGSC0003DMP400019313" in fasta_index
309 |     True
310 |     >>> record = fasta_index["PGSC0003DMP400019313"]
311 |     >>> print(record)
312 |     ID: PGSC0003DMP400019313
313 |     Name: PGSC0003DMP400019313
314 |     Description: PGSC0003DMP400019313 PGSC0003DMT400028369 Protein
315 |     Number of features: 0
316 |     Seq('MSKSLYLSLFFLSFVVALFGILPNVKGNILDDICPGSFFPPLCFQMLRNDPSVS...LK*', SingleLetterAlphabet())
317 | 
318 | **Exercise**: Write a new version of your ``count_fasta.py`` script using
319 | ``SeqIO.index(...)`` instead of ``SeqIO.parse(...)`` and a for loop.
320 | Which is faster?
321 | 
322 | **Exercise**: Complete the following script by using ``SeqIO.index(...)``
323 | to make a FASTA file with records of interest *in the given order*:
324 | 
325 | .. sourcecode:: python
326 | 
327 |     from Bio import SeqIO
328 |     wanted_ids = ["PGSC0003DMP400019313", "PGSC0003DMP400020381", "PGSC0003DMP400020972"]
329 |     input_filename = "PGSC_DM_v3.4_pep_representative.fasta"
330 |     output_filename = "wanted_potato_proteins_in_order.fasta"
331 |     fasta_index = SeqIO.index(input_filename, "fasta")
332 |     count = 0
333 |     total = # Your code here, get total from fasta_index
334 |     output_handle = open(output_filename, "w")
335 |     for identifier in wanted_ids:
336 |         # ...
337 |         # Your code here, get the record for the identifier, and write it out
338 |         # ...
339 |     output_handle.close()
340 |     print(str(count) + " records selected out of " + str(total))
341 | 
342 | I called this script ``filter_wanted_id_in_order.py`` and the output should be:
343 | 
344 | .. sourcecode:: console
345 | 
346 |     $ python filter_wanted_id_in_order.py
347 |     3 records selected out of 39031
348 | 
349 | 
350 | Now compare the output files from the two approaches:
351 | 
352 | .. sourcecode:: console
353 | 
354 |     $ grep "^>" wanted_potato_proteins.fasta
355 |     >PGSC0003DMP400020381 PGSC0003DMT400029984 Protein
356 |     >PGSC0003DMP400020972 PGSC0003DMT400030871 Protein
357 |     >PGSC0003DMP400019313 PGSC0003DMT400028369 Protein
358 |     $ grep "^>" wanted_potato_proteins_in_order.fasta 
359 |     >PGSC0003DMP400019313 PGSC0003DMT400028369 Protein
360 |     >PGSC0003DMP400020381 PGSC0003DMT400029984 Protein
361 |     >PGSC0003DMP400020972 PGSC0003DMT400030871 Protein
362 | 
363 | The second file has the order specified in the Python list.
364 | 


--------------------------------------------------------------------------------
/writing_sequence_files/convert_gb_to_fasta.py:
--------------------------------------------------------------------------------
# Convert the E. coli GenBank file into FASTA format in a single call.
from Bio import SeqIO

genbank_path = "NC_000913.gbk"
fasta_path = "NC_000913_converted.fasta"

# SeqIO.convert(...) returns how many records it wrote to the output file.
converted = SeqIO.convert(genbank_path, "gb", fasta_path, "fasta")
summary = str(converted) + " records converted"
print(summary)
6 | 


--------------------------------------------------------------------------------
/writing_sequence_files/cut_final_star.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | input_filename = "PGSC_DM_v3.4_pep_representative.fasta"
 3 | output_filename = "PGSC_DM_v3.4_pep_rep_no_stars.fasta"
 4 | output_handle = open(output_filename, "w")
 5 | for record in SeqIO.parse(input_filename, "fasta"):
 6 |     if record.seq.endswith("*"):
 7 |         record = record[:-1] # remove last letter (the star)
 8 |     SeqIO.write(record,output_handle, "fasta")
 9 | output_handle.close()
10 | 


--------------------------------------------------------------------------------
/writing_sequence_files/cut_star_dangerous.py:
--------------------------------------------------------------------------------
# Copy a protein FASTA file with the last letter of EVERY record removed.
#
# WARNING: this is the deliberately "dangerous" teaching example. It does
# not check that the final letter really is a "*", so on input without
# trailing stop symbols it silently drops a genuine amino acid.
from Bio import SeqIO

input_filename = "PGSC_DM_v3.4_pep_representative.fasta"
output_filename = "PGSC_DM_v3.4_pep_rep_no_stars.fasta"
# The with-statement closes the output file even if parsing or writing
# fails part way through.
with open(output_filename, "w") as output_handle:
    for record in SeqIO.parse(input_filename, "fasta"):
        cut_record = record[:-1]  # remove last letter
        SeqIO.write(cut_record, output_handle, "fasta")
9 | 


--------------------------------------------------------------------------------
/writing_sequence_files/filter_wanted_id.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | wanted_ids = ["PGSC0003DMP400019313", "PGSC0003DMP400020381", "PGSC0003DMP400020972"]
 3 | input_filename = "PGSC_DM_v3.4_pep_representative.fasta"
 4 | output_filename = "wanted_potato_proteins.fasta"
 5 | count = 0
 6 | total = 0
 7 | output_handle = open(output_filename, "w")
 8 | for record in SeqIO.parse(input_filename, "fasta"):
 9 |     total = total + 1
10 |     if record.id in wanted_ids:
11 |         count = count + 1
12 |         SeqIO.write(record, output_handle, "fasta")
13 | output_handle.close()
14 | print(str(count) + " records selected out of " + str(total))
15 | 


--------------------------------------------------------------------------------
/writing_sequence_files/filter_wanted_id_in_order.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | wanted_ids = ["PGSC0003DMP400019313", "PGSC0003DMP400020381", "PGSC0003DMP400020972"]
 3 | input_filename = "PGSC_DM_v3.4_pep_representative.fasta"
 4 | output_filename = "wanted_potato_proteins_in_order.fasta"
 5 | fasta_index = SeqIO.index(input_filename, "fasta")
 6 | count = 0
 7 | total = len(fasta_index)
 8 | output_handle = open(output_filename, "w")
 9 | for identifier in wanted_ids:
10 |     record = fasta_index[identifier]
11 |     SeqIO.write(record, output_handle, "fasta")
12 |     count = count + 1
13 | output_handle.close()
14 | print(str(count) + " records selected out of " + str(total))
15 | 


--------------------------------------------------------------------------------
/writing_sequence_files/length_filter.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | input_filename = "NC_000913.faa"
 3 | output_filename = "NC_000913_long_only.faa"
 4 | count = 0
 5 | total = 0
 6 | output_handle = open(output_filename, "w")
 7 | for record in SeqIO.parse(input_filename, "fasta"):
 8 |     total = total + 1
 9 |     if 100 <= len(record):
10 |         count = count + 1
11 |         SeqIO.write(record, output_handle, "fasta")
12 | output_handle.close()
13 | print(str(count) + " records selected out of " + str(total))
14 | 


--------------------------------------------------------------------------------
/writing_sequence_files/length_filter_naive.py:
--------------------------------------------------------------------------------
 1 | from Bio import SeqIO
 2 | input_filename = "NC_000913.faa"
 3 | output_filename = "NC_000913_long_only.faa"
 4 | count = 0
 5 | total = 0
 6 | for record in SeqIO.parse(input_filename, "fasta"):
 7 |     total = total + 1
 8 |     if 100 <= len(record):
 9 |         count = count + 1
10 |         SeqIO.write(record, output_filename, "fasta")
11 | print(str(count) + " records selected out of " + str(total))
12 | 


--------------------------------------------------------------------------------