├── acgt_dither
│   ├── Swanson_et_al_2012_fig1a.pdf
│   ├── Swanson_et_al_2012_fig1a.png
│   ├── README.rst
│   ├── Swanson_et_al_2012_fig1.txt
│   ├── dither.py
│   └── dither_rgb.py
├── assembly_comparison
│   ├── images
│   │   ├── TY2482_vs_NC_018658.png
│   │   ├── H112180280_vs_NC_018658.png
│   │   └── TY2482_20110610_vs_NC_018658.png
│   ├── fasta_trim_n.py
│   ├── dedup_assembly.py
│   ├── README.rst
│   └── order_assembly.py
├── galaxy_workflows
│   ├── rxlr_venn_workflow
│   │   ├── Phyca11_example_output.png
│   │   ├── README.rst
│   │   └── repository_dependencies.xml
│   ├── README.rst
│   └── secreted_protein_workflow
│       ├── repository_dependencies.xml
│       └── README.rst
├── fetch_viruses
│   ├── README.txt
│   ├── fetch_viruses.py
│   └── merge_viruses.py
├── align
│   └── align_back_trans.py
├── .gitignore
├── snakemake
│   ├── demo.smk
│   └── snakemake_progress_bar_demo.py
├── README.rst
├── LICENSE.rst
├── blast
│   ├── README.rst
│   ├── wwwblast2loc.py
│   ├── blast_wrap.py
│   ├── blast_most_matched.py
│   └── blast_sync.py
├── seq_manipulation
│   ├── pick_N_random_seqs.py
│   ├── shred_contigs.py
│   ├── seqio_index_db.py
│   ├── rename_locustags.py
│   └── insert_gaps_for_ena.py
├── annotation_comparison
│   ├── mauve_orthologues_to_genbank.py
│   ├── annotation_patch.py
│   └── annotation_diff.py
├── sambam
│   ├── profile
│   │   └── bench_iter.py
│   ├── sam_depair.py
│   ├── samtools_auto.py
│   ├── bgzf_add_eof.py
│   ├── sam_drop_qname.py
│   ├── bgzf_check_eof.py
│   ├── fastq_to_sam.py
│   ├── sam_drop_long_cigar.py
│   ├── sam_strip_tags.py
│   ├── sam_restore_seq.py
│   └── sam_to_sspace_tab.py
├── ena_fetch
│   ├── get_ENA_project_submissions.py
│   ├── get_ENA_project_meta.py
│   └── get_ENA_project_fastq.py
├── primer_selection
│   ├── iupac_isPcr.py
│   ├── species_dedup_gbk.py
│   ├── primer_selection.py
│   ├── isPcr_tally.py
│   └── plot_isprc.py
├── blooming_reads
│   ├── interlace_fastq.py
│   └── stack_coverage_plot.py
├── .pre-commit-config.yaml
└── hmmer
    └── hmmer_table2tabular.py
/acgt_dither/Swanson_et_al_2012_fig1a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/acgt_dither/Swanson_et_al_2012_fig1a.pdf
--------------------------------------------------------------------------------
/acgt_dither/Swanson_et_al_2012_fig1a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/acgt_dither/Swanson_et_al_2012_fig1a.png
--------------------------------------------------------------------------------
/assembly_comparison/images/TY2482_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/TY2482_vs_NC_018658.png
--------------------------------------------------------------------------------
/assembly_comparison/images/H112180280_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/H112180280_vs_NC_018658.png
--------------------------------------------------------------------------------
/assembly_comparison/images/TY2482_20110610_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/TY2482_20110610_vs_NC_018658.png
--------------------------------------------------------------------------------
/galaxy_workflows/rxlr_venn_workflow/Phyca11_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/galaxy_workflows/rxlr_venn_workflow/Phyca11_example_output.png
--------------------------------------------------------------------------------
/fetch_viruses/README.txt:
--------------------------------------------------------------------------------
1 | A couple of scripts used to build BLAST databases of complete viral genomes,
2 | and their genes/proteins.
3 |
4 | This worked great back in 2009, but now that the viral sequences have grown
5 | at least ten-fold, hammering NCBI Entrez like this is not ideal...
6 |
--------------------------------------------------------------------------------
/galaxy_workflows/README.rst:
--------------------------------------------------------------------------------
1 | As of 17 September 2013, my Galaxy workflow development has moved from here:
2 |
3 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/
4 |
5 | To here, along with the associated Galaxy tools:
6 |
7 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/
8 |
--------------------------------------------------------------------------------
/acgt_dither/README.rst:
--------------------------------------------------------------------------------
1 | Python scripts to render photos using bases A, C, G, and T for pixels.
2 |
3 | Takes as input a PNG photo (JPEG should work if the right dependencies
4 | are installed), and a FASTA sequence file, and uses them to produce a
5 | PDF output image using ReportLab.
6 |
7 | The motivation and example images are described on this blog post:
8 | http://blastedbio.blogspot.co.uk/2013/08/pixelated-potato-posters-in-python.html
9 |
--------------------------------------------------------------------------------
/galaxy_workflows/secreted_protein_workflow/repository_dependencies.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/align/align_back_trans.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 |
4 | sys.exit(
5 |     """Python script for 'back-translating' a protein alignment.
6 |
7 | This script was originally available from here:
8 | https://github.com/peterjc/picobio/tree/master/align
9 |
10 | It is now available from here instead, with an optional Galaxy wrapper:
11 | https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
12 |
13 | The Galaxy tool is available from the Galaxy Tool Shed here:
14 | http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
15 | """
16 | )
17 |
--------------------------------------------------------------------------------
/galaxy_workflows/rxlr_venn_workflow/README.rst:
--------------------------------------------------------------------------------
1 | This package is a Galaxy workflow for comparing three RXLR prediction
2 | methods with a Venn Diagram, and it creates a FASTA file of any proteins
3 | passing all three methods.
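Outside of Galaxy, the final "passing all three methods" step boils down to
intersecting three sets of identifiers and then extracting those records from
the protein FASTA file. A minimal sketch of that idea in Python with Biopython
(the filenames here are made up for illustration; this is not part of the
workflow itself)::

    from Bio import SeqIO

    id_sets = []
    for name in ["method1_ids.txt", "method2_ids.txt", "method3_ids.txt"]:
        with open(name) as handle:
            id_sets.append({line.strip() for line in handle if line.strip()})
    wanted = set.intersection(*id_sets)
    records = (r for r in SeqIO.parse("proteins.fasta", "fasta") if r.id in wanted)
    count = SeqIO.write(records, "passing_all_three.fasta", "fasta")
    print("Saved %i proteins predicted by all three methods" % count)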
4 | 5 | As of 17 September 2013, development has moved from here: 6 | 7 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/rxlr_venn_workflow 8 | 9 | To here, along with the associated Galaxy tools: 10 | 11 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/rxlr_venn_workflow 12 | 13 | This workflow is available to download and/or install from the main 14 | Galaxy Tool Shed: 15 | 16 | * http://toolshed.g2.bx.psu.edu/view/peterjc/rxlr_venn_workflow 17 | -------------------------------------------------------------------------------- /galaxy_workflows/rxlr_venn_workflow/repository_dependencies.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #Ignore backup files from some Unix editors, 2 | *~ 3 | *.swp 4 | *.bak 5 | 6 | #Ignore any tar-balls 7 | *.tar.gz 8 | 9 | #Ignore patches and any original files created by patch command 10 | *.diff 11 | *.patch 12 | *.orig 13 | *.rej 14 | 15 | #Ignore these hidden files from Mac OS X 16 | .DS_Store 17 | 18 | #Ignore hidden files from Dolphin window manager 19 | .directory 20 | 21 | #Ignore all compiled python files (e.g. from running the unit tests): 22 | *.pyc 23 | *.pyo 24 | 25 | #Ignore all Jython class files (present if using Jython) 26 | *.class 27 | 28 | #Ignore any NCBI BLAST database files 29 | *.nhr 30 | *.nin 31 | *.nsq 32 | 33 | #Ignore any PDF or graphics output files 34 | *.pdf 35 | *.png 36 | 37 | -------------------------------------------------------------------------------- /acgt_dither/Swanson_et_al_2012_fig1.txt: -------------------------------------------------------------------------------- 1 | Figure 1. Transmission electron micrographs of phage virions showing their isometric heads and long non-contractile tails. 2 | show more 3 | 4 | Panel A shows multiple SpaA1 virions and panel B shows a single Bce A1 (B) virions. All scale bars represent 100 nm. 5 | 6 | doi:10.1371/journal.pone.0040683.g001 7 | 8 | Accession numbers HE614281 and gi|399498862|ref|NC_018277.1| (SpaA1) 9 | and HE614282 and gi|397174303|emb|HE614282.1| (BceA1) 10 | 11 | From: 12 | 13 | Swanson MM, Reavy B, Makarova KS, Cock PJ, Hopkins DW, et al. (2012) 14 | Novel Bacteriophages Containing a Genome of Another Bacteriophage within Their Genomes. 15 | PLoS ONE 7(7): e40683. doi:10.1371/journal.pone.0040683 16 | http://dx.doi.org/10.1371/journal.pone.0040683 17 | -------------------------------------------------------------------------------- /snakemake/demo.smk: -------------------------------------------------------------------------------- 1 | # Example usage at the command line: 2 | # 3 | # $ rm -rf *.md5; snakemake -q -s demo.smk -p $(for f in *.fna; do echo $f.md5; done); ls *.md5 4 | # 5 | # Here using a little bash loop to generate a listing of all the 6 | # desired MD5 files based on the FASTA files present. 7 | # 8 | # Example usage from Python via the API (using same logic for targets): 9 | # 10 | # $ rm -rf *.md5; ./snakemake_progress_bar_demo.py ; ls *.md5 11 | # 12 | # The rule will sleep for between 1 and 10s, and then compute the MD5. 13 | # However, 1 time in 20 it will fail instead. 
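# (For reference only: the same target list can be built in Python rather
# than with the bash loop above -- this is roughly what
# snakemake_progress_bar_demo.py does before calling the API:
#
#     from pathlib import Path
#     targets = [str(f) + ".md5" for f in Path(".").glob("*.fna")]
# )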
14 |
15 | rule fasta_checksum:
16 |     input:
17 |         "{genome}.fna"
18 |     output:
19 |         "{genome}.fna.md5"
20 |     shell:
21 |         #'X=$((1 + $RANDOM % 10)); if [ "$X" == "1" ]; then sleep 5; exit 1; else sleep $X; md5sum {input} > {output}; fi'
22 |         'sleep $((1 + $RANDOM % 10)); if [ "$(($RANDOM % 20))" == "0" ]; then exit 1; else md5sum {input} > {output}; fi'
23 |
--------------------------------------------------------------------------------
/galaxy_workflows/secreted_protein_workflow/README.rst:
--------------------------------------------------------------------------------
1 | This package is a Galaxy workflow for the identification of candidate
2 | secreted proteins from a given protein FASTA file.
3 |
4 | It runs SignalP v3.0 (Bendtsen et al. 2004) and selects only proteins with a
5 | strong predicted signal peptide, and then runs TMHMM v2.0 (Krogh et al. 2001)
6 | on those, and selects only proteins without a predicted trans-membrane helix.
7 | This workflow was used in Kikuchi et al. (2011), and is a simplification of
8 | the candidate effector protocol described in Jones et al. (2009).
9 |
10 | As of 17 September 2013, development has moved from here:
11 |
12 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/secreted_protein_workflow
13 |
14 | To here, along with the associated Galaxy tools:
15 |
16 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/secreted_protein_workflow
17 |
18 | This workflow is available to download and/or install from the main
19 | Galaxy Tool Shed:
20 |
21 | * http://toolshed.g2.bx.psu.edu/view/peterjc/secreted_protein_workflow
22 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://img.shields.io/github/license/peterjc/picobio.svg?label=License
2 |    :alt: MIT License
3 |    :target: https://github.com/peterjc/picobio/blob/master/LICENSE.rst
4 | .. image:: https://results.pre-commit.ci/badge/github/peterjc/picobio/master.svg
5 |    :target: https://results.pre-commit.ci/latest/github/peterjc/picobio/master
6 |    :alt: pre-commit.ci status
7 | .. image:: https://img.shields.io/badge/Code%20style-black-000000.svg
8 |    :alt: Code style: black
9 |    :target: https://github.com/python/black
10 |
11 | This is the README file for the picobio repository,
12 | https://github.com/peterjc/picobio
13 |
14 | This is a small general hold-all for miscellaneous bioinformatics scripts etc.,
15 | mostly in Python, written by Peter Cock.
16 |
17 | The name "picobio" is a play on "pico" meaning small (10^-12), and the
18 | Japanese phonetics of my name (ピーター starting "pi" in the Latin alphabet,
19 | and コック starting "ko", giving "piko"), "bio" from Bioinformatics.
20 |
21 | Unless otherwise stated, the scripts in this repository are released under the
22 | MIT License.
23 |
--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright 2011-2024, The James Hutton Institute, UK.
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /blast/README.rst: -------------------------------------------------------------------------------- 1 | Misc. BLAST scripts. 2 | 3 | Auto-caching of Databases 4 | ========================= 5 | 6 | Files ``blast_sync.py`` and ``blast_wrap.py`` are used to 7 | pre-cache our central BLAST databases onto a cluster node's 8 | local hard drive (using ``rsync``). 9 | 10 | This works by adding wrapper scripts like ``$HOME/bin/blastp``:: 11 | 12 | $ more ~/bin/blastp 13 | #!/bin/bash 14 | #This bash script pretends to be an NCBI BLAST command line tool 15 | #acting as a proxy via a Python wrapper script to cache databases. 16 | #echo $@ 17 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 18 | $DIR/ncbi_blast/blast_wrap.py $DIR/ncbi_blast/blastp "$@" 19 | 20 | This runs ``$HOME/ncbi_blast/blast_wrap.py`` which checks if a sync 21 | is required via ``$HOME/ncbi_blast/blast_sync.py'', and then runs 22 | the real NCBI BLAST+ binary named ``$HOME/bin/ncbi_blast/blastp``. 23 | 24 | 25 | Converting wwwblast BLAST DB list to Galaxy loc files 26 | ===================================================== 27 | 28 | We used to run a ``wwwblast`` server with a collection of 29 | local BLAST databases, but transitioned to using BLAST+ via 30 | Galaxy - see https://github.com/peterjc/galaxy_blast 31 | 32 | The script ``wwwblast2loc.py`` was used during our transition 33 | period to generate the Galaxy location files ``blastdb.loc`` 34 | and ``blastdb_p.loc`` from the ```wwwblast`` listing defined 35 | in ``blast.rc`` and ``blast.html``. 36 | -------------------------------------------------------------------------------- /seq_manipulation/pick_N_random_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import random 3 | import sys 4 | 5 | from Bio import SeqIO 6 | from Bio.SeqIO.FastaIO import SimpleFastaParser 7 | 8 | """Extract N randomly selected sequences from a FASTA file. 9 | 10 | Takes three arguments: input FASTA filename, number of 11 | sequences to pick out, and output FASTA filename. e.g. 12 | 13 | $ python pick_N_random_seqs.py input.fasta 1000 output.fasta 14 | 15 | If the input FASTA file has less than the requested count, 16 | this will fail with an error. 
17 | """ 18 | 19 | input_fasta, count, output_fasta = sys.argv[1:] 20 | count = int(count) 21 | 22 | with open(input_fasta) as handle: 23 | # Using as faster than SeqIO.parse(...) 24 | ids = [title.split(None, 1)[0] for title, seq in SimpleFastaParser(handle)] 25 | print("Input FASTA file %s has %i sequences" % (input_fasta, len(ids))) 26 | assert len(set(ids)) == len(ids), "You have duplicate identifiers" 27 | 28 | # seqs = SeqIO.index(input_fasta, "fasta") 29 | # print("Input FASTA file %s has %i sequences" 30 | # % (input_fasta, len(seqs))) 31 | # assert count <= len(seqs) 32 | # picked = set(random.sample(list(seqs), count)) 33 | # assert len(picked) == count 34 | # del seqs 35 | 36 | picked = set(random.sample(ids, count)) 37 | 38 | # This will preserve the input order, and do line wrapping 39 | wanted = (r for r in SeqIO.parse(input_fasta, "fasta") if r.id in picked) 40 | saved = SeqIO.write(wanted, output_fasta, "fasta") 41 | assert saved == count 42 | 43 | print( 44 | "Saved %i randomly selected records from %s into %s" 45 | % (count, input_fasta, output_fasta) 46 | ) 47 | -------------------------------------------------------------------------------- /annotation_comparison/mauve_orthologues_to_genbank.py: -------------------------------------------------------------------------------- 1 | # Use case: 2 | # - Have multiple annotated GenBank files 3 | # - Aligned with Mauve, and orthologue file exported 4 | # 5 | # Want to copy the sister genome's gene identifiers 6 | # into a reference GenBank file (as gene aliases, notes, 7 | # etc) so they can be viewed/searched for easily. 8 | from __future__ import print_function 9 | 10 | from Bio import SeqIO 11 | 12 | mauve_orthologues_file = "mauve_orthologues.txt" 13 | reference_genbank_file = "reference.gbk" 14 | reference_number_in_mauve = 0 15 | output_genbank_file = "reference_with_aliases.gbk" 16 | 17 | # Might be more than one contig 18 | reference_records = list(SeqIO.parse(reference_genbank_file, "genbank")) 19 | cds_dict = {} 20 | for r in reference_records: 21 | for f in r.features: 22 | if f.type == "CDS": 23 | name = f.qualifiers["gene"][0] 24 | key = "%i:%s:%i-%i" % ( 25 | reference_number_in_mauve, 26 | name, 27 | f.location.start + 1, 28 | f.location.end, 29 | ) 30 | cds_dict[key] = f 31 | # print(list(cds_dict.keys())) 32 | 33 | for line in open(mauve_orthologues_file, "rU"): 34 | parts = sorted(line.strip().split("\t")) 35 | key = None 36 | # print(parts) 37 | for x in parts: 38 | # if x.startswith("%i|" % reference_number_in_mauve): 39 | if x in cds_dict: 40 | print("Using: %r" % parts) 41 | name = x.split(":")[1] 42 | names = [y.split(":")[1] for y in parts if y != x] 43 | cds_dict[x].qualifiers["name"] = [",".join([name] + names)] 44 | 45 | SeqIO.write(reference_records, output_genbank_file, "genbank") 46 | print("Wrote to %s" % output_genbank_file) 47 | -------------------------------------------------------------------------------- /acgt_dither/dither.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | from Bio import SeqIO 5 | from PIL import Image 6 | from reportlab.graphics import renderPDF 7 | from reportlab.graphics.shapes import Drawing 8 | from reportlab.graphics.shapes import String 9 | from reportlab.lib import colors 10 | from reportlab.lib.units import cm 11 | from reportlab.pdfgen import canvas 12 | 13 | png_file = "Swanson_et_al_2012_fig1a.png" 14 | pdf_file = "Swanson_et_al_2012_fig1a.pdf" 15 | 
main_caption = "Swanson et al (2012) Figure 1" 16 | 17 | # Load sequence 18 | seq = SeqIO.read("SpaA1.fasta", "fasta").seq 19 | shape = (239, 176) 20 | scale = 0.125 * cm # per bp 21 | 22 | # Original is 1274 x 937 pixels, try about 20% 23 | pixels = np.product(shape) 24 | im = Image.open(png_file).resize(shape) 25 | # im.show() 26 | data = im.getdata() 27 | assert len(data) == pixels, len(data) 28 | assert shape == im.getbbox()[2:] 29 | data = np.array(data).reshape(shape, order="F") 30 | assert shape == data.shape 31 | pixels = np.product(shape) 32 | print("Have %i base pairs, and %i pixels" % (len(seq), pixels)) 33 | 34 | assert pixels <= len(seq) 35 | assert 0 <= data.min() <= data.max() <= 255 36 | 37 | # Open PDF 38 | width, height = page_size = [x * scale for x in shape] 39 | c = canvas.Canvas(pdf_file, page_size) 40 | c.setTitle(main_caption) 41 | d = Drawing(*page_size) 42 | base = 0 43 | for row in range(shape[1]): 44 | for col in range(shape[0]): 45 | color = colors.CMYKColor(black=(255 - data[col, row]) / 255.0) 46 | # From top left? 47 | s = String( 48 | (col + 0.5) * scale, 49 | (shape[1] - row) * scale, 50 | seq[base], 51 | fillColor=color, 52 | fontSize=4, 53 | textAnchor="middle", 54 | ) 55 | d.add(s) 56 | base += 1 57 | renderPDF.draw(d, c, 0, 0) 58 | c.showPage() 59 | c.save() 60 | -------------------------------------------------------------------------------- /sambam/profile/bench_iter.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import time 5 | 6 | to_profile = [] 7 | 8 | try: 9 | from Bio.Sequencing.SamBam import BamIterator 10 | 11 | def peter_iter(bam_filename, out_filename): 12 | """Peter's pure Python BAM iterator.""" 13 | h = open(bam_filename, "rb") 14 | out_h = open(out_filename, "w") 15 | count = 0 16 | mapped = 0 17 | for read in BamIterator(h): 18 | count += 1 19 | if read.is_mapped: 20 | mapped += 1 21 | out_h.write("%s\t%s\n" % (read.rname, read.pos)) 22 | h.close() 23 | out_h.close() 24 | return mapped, count 25 | 26 | to_profile.append(peter_iter) 27 | except ImportError: 28 | pass 29 | 30 | try: 31 | from pysam import Samfile 32 | 33 | def pysam_iter(bam_filename, out_filename): 34 | """PySam's Samfile as BAM iterator.""" 35 | out_h = open(out_filename, "w") 36 | count = 0 37 | mapped = 0 38 | for read in Samfile(bam_filename, "rb"): 39 | count += 1 40 | if not read.is_unmapped: 41 | mapped += 1 42 | out_h.write("%s\t%s\n" % (read.rname, read.pos)) 43 | out_h.close() 44 | return mapped, count 45 | 46 | to_profile.append(pysam_iter) 47 | except ImportError: 48 | pass 49 | 50 | print("Will profile %i functions:" % len(to_profile)) 51 | for p in to_profile: 52 | print(p.__doc__) 53 | print 54 | for f in os.listdir("."): 55 | if f.endswith(".bam"): 56 | print("Using %s" % f) 57 | for p in to_profile: 58 | print("Profiling %s" % p.__doc__) 59 | start = time.time() 60 | mapped, count = p(f, "/dev/null") 61 | taken = time.time() - start 62 | print("%s - %0.1fs giving %i/%i mapped" % (p.__doc__, taken, mapped, count)) 63 | -------------------------------------------------------------------------------- /sambam/sam_depair.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | usage = """Python script to remove paired information in SAM reads. 
6 | 7 | The intended usage is where you wish to treat "orphaned" paired 8 | reads as single reads, meaning removing any /1 or /2 suffix in 9 | the FASTQ file and likewise clearing the paired bits in the SAM 10 | FLAG. 11 | 12 | This script is designed to be used as part of a Unix pipeline. It 13 | takes no command line arguments. It reads SAM format data from stdin, 14 | and writes SAM format data to stdout. 15 | 16 | The only change made to the FLAG field, clearing the following bits: 17 | * 0x1 template having multiple segments in sequencing 18 | * 0x8 next segment in the template unmapped 19 | * 0x20 next segment mapped to reverse strand 20 | * 0x40 the first segment in the template 21 | * 0x80 the last segment in the template 22 | 23 | Example: 24 | 25 | $ ./sam_depair.py < original.sam > as_singles.sam 26 | 27 | Simple usage with BAM files with conversion to/from SAM via samtools: 28 | 29 | $ samtools view -h original.bam | ./sam_depair.py | samtools view -S -b - > as_singles.bam 30 | 31 | Copyright Peter Cock 2014. All rights reserved. See: 32 | https://github.com/peterjc/picobio 33 | """ 34 | 35 | if len(sys.argv) != 1: 36 | sys.stderr.write("ERROR: Bad arguments.\n\n") 37 | sys.stderr.write("Expects SAM on stdin, and writes SAM to stdout.\n") 38 | sys.exit(1) 39 | 40 | count = 0 41 | tweaked = 0 42 | mask = 0x1 | 0x8 | 0x20 | 0x40 | 0x80 43 | flip_mask = ~mask 44 | for line in sys.stdin: 45 | if line[0] != "@": 46 | # Should be a read 47 | count += 1 48 | qname, flag, rest = line.split("\t", 2) 49 | flag = int(flag) 50 | if flag & mask: 51 | # Want to clear those bits... 52 | flag = flag & flip_mask 53 | tweaked += 1 54 | line = "%s\t%i\t%s" % (qname, flag, rest) 55 | sys.stdout.write(line) 56 | sys.stderr.write("Tweaked %i out of %i reads\n" % (tweaked, count)) 57 | -------------------------------------------------------------------------------- /ena_fetch/get_ENA_project_submissions.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import urllib 5 | 6 | project = "ERP000297" 7 | 8 | submissions_url = ( 9 | "http://www.ebi.ac.uk/ena/data/view/reports/sra/submitted_files/internal/%s" 10 | % project 11 | ) 12 | submissions_file = "%s_submissions.tsv" % project 13 | 14 | 15 | def download_in_one(url, filename): 16 | print("Fetching %s" % url) 17 | n = urllib.urlopen(url) 18 | data = n.read() 19 | n.close() 20 | 21 | h = open(filename, "w") 22 | h.write(data) 23 | h.close() 24 | print("Saved as %s" % filename) 25 | 26 | 27 | print 28 | if not os.path.isfile(submissions_file): 29 | download_in_one(submissions_url, submissions_file) 30 | 31 | 32 | def process_submissions(project, submissions_filename): 33 | h = open(submissions_filename) 34 | line = h.readline() 35 | assert ( 36 | line 37 | == "Study\tSample\tExperiment\tRun\tOrganism\tInstrument Platform\tInstrument Model\tLibrary Name\tLibrary Layout\tLibrary Source\tLibrary Selection\tRun Read Count\tRun Base Count\tFile Name\tFile Size\tmd5\tFtp\n" 38 | ), repr(line) 39 | for line in h: 40 | parts = line.rstrip("\n").split("\t") 41 | assert parts[0] == project 42 | url = parts[16] 43 | assert url.startswith("ftp://ftp.sra.ebi.ac.uk/vol1/ERA") 44 | filename = url[len("ftp://ftp.sra.ebi.ac.uk/") :] 45 | if os.path.isfile(filename): 46 | print("Already have %s" % filename) 47 | continue 48 | if filename.endswith(".srf"): 49 | print("Skipping %s" % filename) 50 | continue 51 | # Make directory... 
52 | d = os.path.split(filename)[0] 53 | if not os.path.isdir(d): 54 | print("Making directory %s" % d) 55 | os.makedirs(d) 56 | # Download file... 57 | rc = os.system("wget -O %s %s" % (filename, url)) 58 | assert not rc, rc 59 | # Now check the md5... 60 | print(filename) 61 | h.close() 62 | 63 | 64 | process_submissions(project, submissions_file) 65 | -------------------------------------------------------------------------------- /primer_selection/iupac_isPcr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Generalise Jim Kent's isPcr to support IUPAC ambiguities by brute force. 3 | 4 | As of v33 at least, ambiguous bases are rejected in the primers. So, this 5 | script generalises the input file to record all the non-ambiguous 6 | interpretations of the primer. Running isPcr will take several times longer, 7 | and the output will probably need to be deduplicated. 8 | 9 | The input and output are simple three-column TSV files with the name of 10 | each primer pair, the forward primer sequence, and the reverse primer 11 | sequence. 12 | """ 13 | 14 | import itertools 15 | import sys 16 | 17 | from Bio.Data.IUPACData import ambiguous_dna_values 18 | 19 | expand_iupac = { 20 | # Treat I (inosine) like N 21 | "I": list(ambiguous_dna_values["N"].upper()), 22 | "i": list(ambiguous_dna_values["N"].lower()), 23 | } 24 | for base, expanded in ambiguous_dna_values.items(): 25 | expand_iupac[base.upper()] = list(expanded.upper()) 26 | expand_iupac[base.lower()] = list(expanded.lower()) 27 | 28 | 29 | def expand_iupac_bases(seq): 30 | """All possible unabmiguous sequences described with IUPAC ambiguities. 31 | 32 | e.g. 33 | 34 | >>> list(expand_iupac_bases("DAY")) 35 | ['AAC', 'AAT', 'GAC', 'GAT', 'TAC', 'TAT'] 36 | """ 37 | try: 38 | for alt in itertools.product(*[expand_iupac[base] for base in seq]): 39 | yield "".join(alt) 40 | except KeyError as err: 41 | sys.exit(f"ERROR - Problem with primer sequence {seq}, {err}") 42 | 43 | 44 | before = after = 0 45 | for line in sys.stdin: 46 | if line.startswith("#") or not line.strip(): 47 | continue 48 | try: 49 | idn, fwd, rev = line.strip("\n").split("\t")[:3] 50 | except ValueError: 51 | t = line.count("\t") 52 | sys.exit(f"ERROR: Only {t} tabs in line: {line}") 53 | before += 1 54 | for fwd2 in expand_iupac_bases(fwd): 55 | for rev2 in expand_iupac_bases(rev): 56 | sys.stdout.write(f"{idn}\t{fwd2}\t{rev2}\n") 57 | after += 1 58 | sys.stderr.write(f"Generalised {before} primer pairs into unabmiguous {after} pairs\n") 59 | -------------------------------------------------------------------------------- /sambam/samtools_auto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Dirty hack to allow mixing of samtools commands between versions. 3 | 4 | It can be a downside that the samtools command line API is a single 5 | binary which offers multiple (often independent) commands. 6 | 7 | Right now, samtools 1.1 still lacks some functionality from 0.1.19, 8 | for example "samtools index", "samtools depad" and "samtools rmdup" 9 | are not yet fully functional. e.g. 10 | 11 | - https://github.com/samtools/samtools/issues/199 12 | - https://github.com/samtools/samtools/issues/291 13 | 14 | Conversely, the "samtools bam2fq" from samtools 0.1.19 has had 15 | several issues fixed. 16 | 17 | This wrapper allows me to call "samtools" and route this to the 18 | appropriate binary. 
In this case: 19 | 20 | - ``samtools`` (alone) will call samtools 1.1 21 | - ``samtools bam2fq [...]`` will call samtools 1.1 22 | - ``samtools depad [...]`` will call samtools 0.1.19 23 | - ``samtools rmdup [...]`` will call samtools 0.1.19 24 | - etc 25 | 26 | Install this by putting the Python script (or a symlink to it) on 27 | your ``$PATH`` as ``samtools``, for example under ``~/bin/``:: 28 | 29 | $ cd ~/bin 30 | $ ln -s samtools_auto.py samtools 31 | 32 | Also install binaries for samtools 0.1.19 and 1.1 and set their 33 | paths below (variables ``samtools_old`` and ``samtools_new``). 34 | """ 35 | 36 | import os 37 | import sys 38 | 39 | samtools_old = "/mnt/galaxy/bin/samtools_0.1.19" 40 | samtools_new = "/mnt/galaxy/bin/samtools_1.1" 41 | 42 | 43 | def pick_binary(): 44 | """Return new samtools unless known to be using a broken command. 45 | 46 | i.e. Avoid samtools commands with known regressions! 47 | """ 48 | if len(sys.argv) == 1: 49 | return samtools_new 50 | elif sys.argv[1] in ["index", "depad", "rmdup"]: 51 | return samtools_old 52 | else: 53 | return samtools_new 54 | 55 | 56 | # argv[0] is this python script 57 | # Turn the argv list into a string, escaping as needed 58 | 59 | 60 | def wrap(text): 61 | if " " in text and not text[0] == '"' and not text[-1] == '"': 62 | return '"%s"' % text 63 | else: 64 | return text 65 | 66 | 67 | cmd = pick_binary() + " " + " ".join(wrap(arg) for arg in sys.argv[1:]) 68 | 69 | err = os.system(cmd) 70 | if 0 < err < 128: 71 | sys.exit(err) 72 | elif err: 73 | # Returning 512 gives 0 (odd) 74 | sys.exit(1) 75 | -------------------------------------------------------------------------------- /blooming_reads/interlace_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple FASTQ interlacer. 3 | 4 | Checks read identifiers agree, or end with /1 and /2 respectively. 5 | """ 6 | 7 | import gzip 8 | import sys 9 | 10 | try: 11 | from Bio.SeqIO.QualityIO import FastqGeneralIterator 12 | except ImportError: 13 | sys.exit("Missing FastqGeneralIterator from Biopython") 14 | 15 | if len(sys.argv) != 3: 16 | sys.exit("Requires two arguments, a pair of FASTQ filenames") 17 | fastq1 = sys.argv[1] 18 | fastq2 = sys.argv[2] 19 | 20 | sys.stderr.write("Interlacing %s and %s\n" % (fastq1, fastq2)) 21 | if fastq1.endswith(".gz"): 22 | sys.stderr.write("Decompressing %s\n" % fastq1) 23 | handle1 = gzip.open(fastq1) 24 | else: 25 | handle1 = open(fastq1) 26 | if fastq2.endswith(".gz"): 27 | sys.stderr.write("Decompressing %s\n" % fastq2) 28 | handle2 = gzip.open(fastq2) 29 | else: 30 | handle2 = open(fastq2) 31 | sys.stderr.write("Interlacing paired FASTQ files to stdout...\n") 32 | out_handle = sys.stdout 33 | 34 | iter1 = FastqGeneralIterator(handle1) 35 | iter2 = FastqGeneralIterator(handle2) 36 | 37 | for title1, seq1, qual1 in iter1: 38 | try: 39 | title2, seq2, qual2 = iter2.next() 40 | except StopIteration: 41 | sys.exit("More records in %s than %s, e.g. 
%s" % (fastq1, fastq2, title1)) 42 | id1, descr1 = title1.split(None, 1) 43 | id2, descr2 = title2.split(None, 1) 44 | if id1 == id2: 45 | # Add the /1 and /2, preserve any description after the ID 46 | if descr1: 47 | descr1 = " " + descr1 48 | if descr2: 49 | descr2 = " " + descr2 50 | out_handle.write( 51 | "@%s/1%s\n%s\n+\n%s\n@%s/2%s\n%s\n+\n%s\n" 52 | % (id1, descr1, seq1, qual1, id2, descr2, seq2, qual2) 53 | ) 54 | elif id1.endswith("/1") and id2.endswith("/2") and id1[:-2] == id2[:-2]: 55 | out_handle.write( 56 | "@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n" 57 | % (title1, seq1, qual1, title2, seq2, qual2) 58 | ) 59 | else: 60 | sys.exit("Mismatched records %r vs %r" % (title1, title2)) 61 | 62 | # Check at end of file two 63 | try: 64 | title2, seq2, qual2 = iter2.next() 65 | sys.exit("More records in %s than %s, e.g. %s" % (fastq2, fastq1, title2)) 66 | except StopIteration: 67 | pass 68 | 69 | handle1.close() 70 | handle2.close() 71 | sys.stderr.write("Interlacing paired FASTQ files done.\n") 72 | -------------------------------------------------------------------------------- /sambam/bgzf_add_eof.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to add missing EOF marker to BAM or BGZF files. 3 | 4 | BAM files are compressed using BGZF, Blocked GNU Zip Format, which 5 | is a variant of GZIP. Modern BAM files include a special empty 6 | block at the end of the file (EOF) as a marker to help spot when 7 | a dataset has been truncated. This is just a 28 byte BGZF block, 8 | which when decompressed is empty. 9 | 10 | Some early tools output valid BAM files without this optional 11 | (but recommended) EOF marker. 12 | 13 | This script will add the EOF marker is not already present. 14 | 15 | WARNING: If your BAM or BGZF file is truly truncated, this will 16 | not magically fix it. It may hide or obscure the true problem. 17 | 18 | WARNING: To avoid excessive data writing, this script modifies 19 | the BAM or BGZF file in situ! 20 | 21 | Usage with one or more BAM or BGZF files: 22 | 23 | $ ./bam_add_eof.py example1.bam example2.bam ... 
exampleN.bam 24 | 25 | See also: http://samtools.sourceforge.net/ 26 | 27 | v0.0.0 - Original script 28 | v0.0.1 - Use append mode to add EOF block 29 | v0.0.2 - removed internal function sys_exit 30 | """ 31 | 32 | import os 33 | import sys 34 | 35 | 36 | def fix_bam(filename): 37 | header = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" 38 | eof = ( 39 | "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC" 40 | "\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" 41 | ) 42 | if not os.path.isfile(filename): 43 | sys.exit("Missing file %s" % filename) 44 | size = os.path.getsize(filename) 45 | h = open(filename, "rb") # read only for now 46 | # Check it looks like a BGZF file 47 | # (could still be GZIP'd, in which case the extra block is harmless) 48 | data = h.read(len(header)) 49 | if data != header: 50 | sys.exit("File %s is not a BAM file" % filename) 51 | # Check if it has the EOF already 52 | h.seek(size - 28) 53 | data = h.read(28) 54 | h.close() 55 | if data == eof: 56 | sys.stderr.write("EOF already present in %s\n" % filename) 57 | else: 58 | sys.stderr.write("Adding EOF block to %s\n" % filename) 59 | h = open(filename, "ab") 60 | h.write(eof) 61 | h.close() 62 | 63 | 64 | if len(sys.argv) == 1: 65 | sys.exit("Takes one or more BGZF/BAM filenames as arguments (edits in place)") 66 | for bam_filename in sys.argv[1:]: 67 | fix_bam(bam_filename) 68 | -------------------------------------------------------------------------------- /sambam/sam_drop_qname.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to drop read name (QNAME) from SAM/BAM files. 3 | 4 | This script is designed to be used as part of a Unix pipeline. It reads 5 | SAM format data from stdin, and writes SAM format data to stdout. 6 | 7 | The only change made to the SAM reads is in the QNAME field. For 8 | single-fragment reads, QNAME is dropped (set to * for missing). 9 | For multi-fragment reads (e.g. paired end reads), a QNAME is 10 | required to cross reference the parts. Here short automatic names 11 | are substituted instead. 12 | 13 | The optional argument prefix is added to the start of any generated 14 | read name (allowing you to avoid read name clashes). 15 | 16 | Simple usage with SAM files: 17 | 18 | $ ./sam_drop_names [prefix] < original.sam > dropped_names.sam 19 | 20 | Simple usage with BAM files with conversion to/from SAM via samtools: 21 | 22 | $ samtools view -h original.bam | ./sam_drop_names.py [prefix] | samtools view -S -b - > dropped_names.bam 23 | 24 | If your SAM/BAM files lack @SQ headers, you may need to give 25 | samtools the reference FASTA file as well. 26 | 27 | Copyright Peter Cock 2012. All rights reserved. See: 28 | https://github.com/peterjc/picobio 29 | http://blastedbio.blogspot.co.uk/2012/03/bam-verus-cram-07.html 30 | """ 31 | 32 | import sys 33 | 34 | if len(sys.argv) == 1: 35 | prefix = "" 36 | elif len(sys.argv) == 2: 37 | prefix = sys.argv[1] 38 | else: 39 | sys.stderr.write("Error, expect one optional parameter only (read name prefix)") 40 | sys.exit(1) 41 | 42 | count = 0 43 | mapping = {} 44 | # TODO - Automatically remove mapping entries once all parts of the read 45 | # have been found? They would typically be near each other in the file... 46 | # otherwise memory will be a problem with big paired end datasets. 
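# (One untested idea for the TODO above, assuming simple paired-end data where
# each QNAME occurs exactly twice: pop the entry once the mate has been seen,
# e.g.
#
#     if qname in mapping:
#         new_id = mapping.pop(qname)  # second segment, entry no longer needed
#     else:
#         count += 1
#         mapping[qname] = count
#         new_id = count
#
# which would keep the dict small whenever mates are adjacent in the file.)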
47 | for line in sys.stdin: 48 | if line[0] != "@": 49 | # Should be a read 50 | qname, flag, rest = line.split("\t", 2) 51 | if int(flag) & 0x1: 52 | # Multi-fragment read 53 | try: 54 | qname = prefix + str(mapping[qname]) 55 | except KeyError: 56 | count += 1 57 | mapping[qname] = count 58 | qname = prefix + str(count) 59 | else: 60 | # Single fragment read 61 | qname = "*" 62 | line = "\t".join([qname, flag, rest]) 63 | sys.stdout.write(line) 64 | sys.stderr.write("Modified %i multi-fragment reads\n" % count) 65 | -------------------------------------------------------------------------------- /assembly_comparison/fasta_trim_n.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script for trimming N bases from ends of sequences.""" 3 | 4 | import sys 5 | from optparse import OptionParser 6 | 7 | usage = """Basic usage: ./fasta_trim_n.py < input.fasta > output.fasta 8 | 9 | For more details, run with -h for the help. 10 | """ 11 | 12 | try: 13 | from Bio import SeqIO 14 | except ImportError: 15 | sys.exit("This script requires Biopython") 16 | 17 | parser = OptionParser(usage=usage) 18 | parser.add_option( 19 | "-i", 20 | "--input", 21 | dest="input_filename", 22 | help="Input sequence file (default is stdin)", 23 | default=None, 24 | metavar="FILE", 25 | ) 26 | parser.add_option( 27 | "-o", 28 | "--output", 29 | dest="output_filename", 30 | help="Output sequence file (fefault is stdout)", 31 | default=None, 32 | metavar="FILE", 33 | ) 34 | parser.add_option( 35 | "-f", 36 | "--format", 37 | dest="sequence_format", 38 | help='Sequence format (as named in Biopython SeqIO, default "fasta")', 39 | default="fasta", 40 | ) 41 | parser.add_option( 42 | "-c", 43 | "--chars", 44 | dest="characters", 45 | help='Characters to trim (default "Nn" covering upper and lower case)', 46 | default="Nn", 47 | metavar="FILE", 48 | ) 49 | (options, args) = parser.parse_args() 50 | 51 | chars = options.characters 52 | format = options.sequence_format.lower() 53 | 54 | sys.stderr.write( 55 | "Removing %s characters from start/end of %s format file...\n" % (chars, format) 56 | ) 57 | 58 | if options.input_filename: 59 | input_handle = open(options.input_filename) 60 | else: 61 | input_handle = sys.stdin 62 | 63 | if options.output_filename: 64 | output_handle = open(options.output_filename, "w") 65 | else: 66 | output_handle = sys.stdout 67 | 68 | chars = options.characters 69 | format = options.sequence_format.lower() 70 | 71 | 72 | def strip_seq(records): 73 | for record in records: 74 | # FASTQ etc will be a problem, must trim quality too! 75 | # old_len = len(record.seq) 76 | record.seq = record.seq.strip(chars) 77 | # TODO Minimum length! 
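        # (One untested way to tackle that TODO: add a hypothetical --min-length
        #  option and skip records which end up too short after trimming, e.g.
        #      if len(record.seq) < min_len:
        #          continue
        #  rather than yielding empty or near-empty records.)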
78 | # new_len = len(record.seq) 79 | # if new_len < old_len: 80 | # sys.stderr.write("Trimmed %s from %i to %i\n" % (record.id, old_len, new_len)) 81 | yield record 82 | 83 | 84 | # Do the work, 85 | count = SeqIO.write(strip_seq(SeqIO.parse(input_handle, format)), output_handle, format) 86 | 87 | if options.input_filename: 88 | input_handle.close() 89 | if options.output_filename: 90 | output_handle.close() 91 | 92 | sys.stderr.write("Saved %i records\n" % count) 93 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit run --all-files 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v5.0.0 5 | hooks: 6 | - id: check-added-large-files 7 | - id: check-case-conflict 8 | - id: check-executables-have-shebangs 9 | - id: check-json 10 | - id: check-merge-conflict 11 | - id: check-shebang-scripts-are-executable 12 | - id: check-symlinks 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: destroyed-symlinks 16 | - id: end-of-file-fixer 17 | files: \.(py|sh|rst|yml|yaml)$ 18 | - id: mixed-line-ending 19 | - id: trailing-whitespace 20 | files: \.(py|sh|rst|yml|yaml)$ 21 | - repo: local 22 | hooks: 23 | - id: no-tabs 24 | name: No tabs 25 | description: Reject any files containing a tab 26 | entry: '\t' 27 | language: pygrep 28 | files: \.(py|sh|rst|yml|yaml)$ 29 | - repo: https://github.com/astral-sh/ruff-pre-commit 30 | rev: v0.6.9 31 | hooks: 32 | # Run the Ruff linter (flake8 alternative): 33 | - id: ruff 34 | args: [ 35 | '--fix', 36 | '--exit-non-zero-on-fix', 37 | '--extend-select=BLE,C4,D,I,ISC', 38 | '--extend-ignore=D100,D103,D203,D213', 39 | '--config=lint.isort.force-single-line=true', 40 | '--config=lint.isort.order-by-type=false', 41 | '--config=lint.pyupgrade.keep-runtime-typing=true' 42 | ] 43 | # Run the Ruff formatter (black alternative): 44 | - id: ruff-format 45 | args: [ 46 | '--config=format.docstring-code-format=true' 47 | ] 48 | - repo: https://github.com/rstcheck/rstcheck 49 | rev: v6.2.4 50 | hooks: 51 | - id: rstcheck 52 | args: [ 53 | --report-level=warning, 54 | --ignore-roles=ref, 55 | "--ignore-directives=automodule,toctree", 56 | --ignore-substitutions=version 57 | ] 58 | - repo: https://github.com/PyCQA/doc8 59 | rev: 'v1.1.2' 60 | hooks: 61 | - id: doc8 62 | additional_dependencies: [pygments] 63 | args: [--quiet,--ignore=D001] 64 | - repo: https://github.com/codespell-project/codespell 65 | rev: v2.3.0 66 | hooks: 67 | - id: codespell 68 | files: \.(py|sh|rst|yml|yaml)$ 69 | args: ['-L', 'nin,mis'] 70 | ci: 71 | # Settings for the https://pre-commit.ci/ continuous integration service 72 | autofix_prs: true 73 | # Default message is more verbose 74 | autoupdate_commit_msg: '[pre-commit.ci] autoupdate' 75 | # Default is weekly 76 | autoupdate_schedule: monthly 77 | -------------------------------------------------------------------------------- /ena_fetch/get_ENA_project_meta.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import urllib 5 | 6 | project = "ERP000297" 7 | strain_file = "%s_strain.tsv" % project # output file 8 | 9 | fastq_url = ( 10 | "http://www.ebi.ac.uk/ena/data/view/reports/sra/fastq_files/internal/%s" % project 11 | ) 12 | fastq_file = "%s_fastq.tsv" % project 13 | 14 | 15 | def download_in_one(url, filename): 16 | print("Fetching %s" % url) 17 | n = 
urllib.urlopen(url) 18 | data = n.read() 19 | n.close() 20 | 21 | h = open(filename, "w") 22 | h.write(data) 23 | h.close() 24 | print("Saved as %s" % filename) 25 | 26 | 27 | if not os.path.isfile(fastq_file): 28 | download_in_one(fastq_url, fastq_file) 29 | 30 | 31 | def get_strain(meta_xml_filename): 32 | h = open(meta_xml_filename) 33 | while True: 34 | line = h.readline() 35 | if not line: 36 | break 37 | if "strain" in line.lower(): 38 | strain = h.readline().strip() 39 | assert strain.lower().startswith(""), strain 40 | assert strain.lower().endswith(""), strain 41 | h.close() 42 | return strain[7:-8] 43 | h.close() 44 | return None 45 | 46 | 47 | def process_meta(project, fastq_filename, strain_file): 48 | h = open(fastq_filename) 49 | out = open(strain_file, "w") 50 | line = h.readline() 51 | assert ( 52 | line 53 | == "Study\tSample\tExperiment\tRun\tOrganism\tInstrument Platform\tInstrument Model\tLibrary Name\tLibrary Layout\tLibrary Source\tLibrary Selection\tRun Read Count\tRun Base Count\tFile Name\tFile Size\tmd5\tFtp\n" 54 | ), repr(line) 55 | out.write(line[:-1] + "\tStrain\n") 56 | for line in h: 57 | parts = line.rstrip("\n").split("\t") 58 | assert parts[0] == project 59 | url = parts[16] 60 | assert url.startswith("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR"), url 61 | 62 | sample = parts[1] 63 | assert sample.startswith("ERS") 64 | url = "http://www.ebi.ac.uk/ena/data/view/%s&display=xml" % sample 65 | url = "http://www.ebi.ac.uk/ena/data/view/%s&display=xml&download" % sample 66 | filename = "xml/%s.xml" % sample 67 | 68 | # Download file... 69 | if not os.path.isfile(filename): 70 | print(url) 71 | rc = os.system("wget -O %s '%s'" % (filename, url)) 72 | assert not rc, rc 73 | 74 | strain = get_strain(filename) 75 | if not strain: 76 | strain = "" 77 | print(filename, strain) 78 | out.write(line[:-1] + "\t" + strain + "\n") 79 | h.close() 80 | out.close() 81 | 82 | 83 | process_meta(project, fastq_file, strain_file) 84 | -------------------------------------------------------------------------------- /sambam/bgzf_check_eof.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to check if BGZF (e.g. BAM) files have EOF marker. 3 | 4 | BAM files are compressed using BGZF, Blocked GNU Zip Format, which 5 | is a variant of GZIP. Modern BAM files include a special empty 6 | block at the end of the file (EOF) as a marker to help spot when 7 | a dataset has been truncated. This is just a 28 byte BGZF block, 8 | which when decompressed is empty. 9 | 10 | Some early tools output valid BAM files without this optional 11 | (but recommended) EOF marker. 12 | 13 | Usage with one or more BAM or BGZF files: 14 | 15 | $ ./bgzf_check_eof.py example1.bam example2.bam ... exampleN.bam 16 | 17 | The filenames are checked in the order given, if any are invalid 18 | the tool exits with a non-zero error level and a message to stderr. 19 | If all the files are valid, it returns with a zero error level. 
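For example (hypothetical file names), the exit status can drive a simple
shell check before archiving a batch of BAM files:

    $ ./bgzf_check_eof.py run1/*.bam && echo "All BAM files have EOF markers"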
20 | 21 | Return codes: 22 | * 0 - No errors found 23 | * 1 - Invalid arguments 24 | * 2 - File not found 25 | * 3 - File is zero bytes (and thus not valid BGZF or BAM) 26 | * 4 - File missing BGZF header 27 | * 5 - File looks like BGZF, but missing BGZF EOF marker 28 | 29 | See also: http://samtools.sourceforge.net/ 30 | 31 | v0.0.0 - Original script 32 | v0.0.1 - Dropped internal function sys_exit 33 | """ 34 | 35 | import os 36 | import sys 37 | 38 | 39 | def check_bam(filename): 40 | header = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" 41 | eof = ( 42 | "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC" 43 | "\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" 44 | ) 45 | if not os.path.isfile(filename): 46 | sys.stderr.write("Missing file %s\n" % filename) 47 | sys.exit(2) 48 | size = os.path.getsize(filename) 49 | if not size: 50 | sys.stderr.write("Empty file (zero bytes) %s\n" % filename) 51 | sys.exit(3) 52 | h = open(filename, "rb") 53 | # Check it looks like a BGZF file 54 | # (could still be GZIP'd, in which case the extra block is harmless) 55 | data = h.read(len(header)) 56 | if data != header: 57 | sys.stderr.write("File %s is not a BGZF/BAM file\n" % filename) 58 | sys.exit(4) 59 | # Check if it has the EOF already 60 | h.seek(size - 28) 61 | data = h.read(28) 62 | h.close() 63 | if data == eof: 64 | sys.stderr.write("Good, BGZF EOF already present in %s\n" % filename) 65 | else: 66 | sys.stderr.write("Missing EOF marker in BGZF/BAM file %s\n" % filename) 67 | sys.exit(5) 68 | 69 | 70 | if len(sys.argv) == 1: 71 | sys.stderr.write( 72 | "Takes one or more BGZF/BAM filenames as arguments (edits in place)" 73 | ) 74 | sys.exit(1) 75 | for bam_filename in sys.argv[1:]: 76 | check_bam(bam_filename) 77 | -------------------------------------------------------------------------------- /blooming_reads/stack_coverage_plot.py: -------------------------------------------------------------------------------- 1 | """Script to produce stacked coverage plot with matplotlib.""" 2 | 3 | from __future__ import print_function 4 | 5 | import sys 6 | 7 | import numpy as np 8 | from matplotlib import pyplot as plt 9 | 10 | 11 | def load(filename): 12 | h = open(filename) 13 | line = h.readline() 14 | assert line.startswith(">") 15 | while line and line[0] == ">": 16 | name = line[1:].split(None, 1)[0] 17 | values = [] 18 | while line: 19 | line = h.readline() 20 | if not line or line[0] == ">": 21 | break 22 | values.append([float(v) for v in line.rstrip("\n").split("\t")]) 23 | yield name, np.array(values, np.float) 24 | h.close() 25 | 26 | 27 | def make_colors(start, end, steps): 28 | delta = (end - start) / float(steps - 1) 29 | return ["#%02x%02x%02x" % tuple(start + i * delta) for i in range(steps)] 30 | 31 | 32 | def stack(data, filename, colors=None): 33 | total = len(data) 34 | max_value = 0 35 | for names, values in data: 36 | max_value = max(max_value, values.sum(axis=0).max()) 37 | plt.ylim([0, max_value]) 38 | 39 | fig = plt.figure(figsize=(12, 2 * total)) 40 | if not colors: 41 | # Assumes all the examples have same number of colors: 42 | if data[0][1].shape[0] == 3: 43 | colors = ["#CC6666", "#1DACD6", "#6E5160"] 44 | elif data[0][1].shape[0] == 5: 45 | colors = ["#CDCDC1", "#8B8B83", "#FF6A6A", "#F0E68C", "#CDC673"] 46 | else: 47 | colors = make_colors( 48 | np.array([0xCC, 0x66, 0x66]), 49 | # np.array([0x6E, 0x51, 0x60]), 50 | np.array([0x90, 0x41, 0x50]), 51 | # np.array([0x20, 0xF0, 0x60]), 52 | data[0][1].shape[0], 53 | ) 54 | print(colors) 55 
| for i, (name, values) in enumerate(data): 56 | x = range(values.shape[1]) 57 | print(i, name, values.shape, "coverage:") 58 | print("\t".join("%0.1f" % v for v in values.sum(axis=1))) 59 | y_stack = np.cumsum(values, axis=0) 60 | ax1 = fig.add_subplot(total, 1, i + 1) 61 | ax1.set_autoscaley_on(False) 62 | ax1.set_ylim([0, max_value]) 63 | ax1.set_title(name.split(None, 1)[0], fontsize="xx-small") 64 | ax1.fill_between(x, 0, y_stack[0, :], facecolor=colors[0], alpha=0.7) 65 | for i in range(0, values.shape[0] - 1): 66 | ax1.fill_between( 67 | x, y_stack[i, :], y_stack[i + 1, :], facecolor=colors[i + 1], alpha=0.7 68 | ) 69 | # fig.tight_layout() 70 | plt.show() 71 | plt.savefig(filename) 72 | 73 | 74 | for filename in sys.argv[1:]: 75 | if not filename.endswith(".cov"): 76 | continue 77 | print("-" * 60) 78 | print(filename) 79 | print("-" * 60) 80 | data = list(load(filename)) 81 | stack(data, filename + ".png") 82 | print("Done") 83 | -------------------------------------------------------------------------------- /sambam/fastq_to_sam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to turn FASTQ into unaliged SAM/BAM files. 3 | 4 | This script is designed to be used as part of a Unix pipeline. It 5 | works with Python 2 and Python 3, e.g. 6 | 7 | $ python fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam 8 | Done, 532 pairs 9 | 10 | Or: 11 | 12 | $ python3 fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam 13 | Done, 532 pairs 14 | 15 | As long as the Python script is marked as executable you can do: 16 | 17 | $ ./fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam 18 | Done, 532 pairs 19 | 20 | Simple usage with BAM files with conversion from SAM via samtools: 21 | 22 | $ ./fastq_to_sam.py R1.fastq R2.fastq | samtools view -S -b - > unmapped.bam 23 | [samopen] no @SQ lines in the header. 24 | Done, 532 pairs 25 | 26 | Note that no @SQ lines are expected in SAM/BAM files with only unaligned reads. 27 | 28 | WARNING: This assumes your FASTQ files use the Sanger quality encoding. 29 | 30 | Todo: 31 | ---- 32 | - Test cases 33 | - Galaxy wrapper? 34 | - Proper command line API 35 | - Support for gzipped FASTQ (detected via filename?) 36 | - Support for interlaced FASTQ 37 | - Support for setting read groups 38 | - Support for multiple FASTQ input pairs (and read groups) 39 | 40 | Copyright Peter Cock 2015. All rights reserved. 
See: 41 | https://github.com/peterjc/picobio 42 | 43 | """ 44 | 45 | import sys 46 | 47 | if "-v" in sys.argv or "--version" in sys.argv: 48 | print("This is fastq_to_sam.py version 0.0.1") 49 | sys.exit(0) 50 | 51 | # TODO - proper API, allow interleaved FASTQ, read group, etc 52 | if len(sys.argv) != 3: 53 | sys.stderr.write("Expects two arguments, a pair of FASTQ filenames\n") 54 | sys.exit(1) 55 | 56 | try: 57 | from Bio._py3k import zip 58 | from Bio.SeqIO.QualityIO import FastqGeneralIterator 59 | except ImportError: 60 | sys.exit("ERROR: This requires Biopython.\n") 61 | sys.exit(1) 62 | 63 | fastq1 = FastqGeneralIterator(open(sys.argv[1])) 64 | fastq2 = FastqGeneralIterator(open(sys.argv[2])) 65 | 66 | # Paired, unmapped, mate unmapped, either first or second in pair: 67 | flag1 = "77" 68 | flag2 = "141" 69 | rname = "*" 70 | pos = "0" 71 | mapq = "0" 72 | cigar = "*" 73 | rnext = "*" 74 | pnext = "0" 75 | tlen = "0" 76 | 77 | pairs = 0 78 | for (t1, s1, q1), (t2, s2, q2) in zip(fastq1, fastq2): 79 | id1 = t1.split(None, 1)[0] 80 | id2 = t2.split(None, 1)[0] 81 | if id1 == id2: 82 | # Good, should we check the description follows Illumina naming? 83 | qname = id1 84 | else: 85 | assert id1.endswith("/1"), t1 86 | assert id2.endswith("/2"), t2 87 | qname = id1[:-2] 88 | 89 | print( 90 | "\t".join([qname, flag1, rname, pos, mapq, cigar, rnext, pnext, tlen, s1, q1]) 91 | ) 92 | print( 93 | "\t".join([qname, flag1, rname, pos, mapq, cigar, rnext, pnext, tlen, s2, q2]) 94 | ) 95 | pairs += 1 96 | sys.stderr.write("Done, %i pairs\n" % pairs) 97 | -------------------------------------------------------------------------------- /sambam/sam_drop_long_cigar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | usage = """Python script to remove SAM reads with long CIGAR strings. 6 | 7 | The BAM format (currently) uses an unsigned 16bit integer for the 8 | number of CIGAR operations in a read, and therefore BAM files can 9 | only hold reads with up to 65535 CIGAR operators. SAM does not have 10 | this limit, but the samtools implementation (reasonably) also has 11 | the same 16bit limit. See also: 12 | https://github.com/samtools/samtools/pull/39 13 | 14 | This script is designed to be used as part of a Unix pipeline. It 15 | takes no command line arguments. It reads SAM format data from stdin, 16 | and writes SAM format data to stdout. 17 | 18 | The only change made to the SAM reads is to drop records with over 19 | 65535 CIGAR operators. These are logged to stderr. 20 | 21 | $ ./sam_drop_long_cigar.py < original.sam > no_long_cigar.sam 22 | 23 | Simple usage with BAM files with conversion to/from SAM via samtools: 24 | 25 | $ samtools view -h original.bam | ./sam_drop_long_cigar.py | samtools view -S -b - > no_long_cigar.bam 26 | 27 | Copyright Peter Cock 2014. All rights reserved. 
27 | Copyright Peter Cock 2014. All rights reserved. See:
28 | https://github.com/peterjc/picobio
29 | """
30 |
31 | if len(sys.argv) != 1:
32 |     sys.stderr.write("ERROR: Bad arguments.\n\n")
33 |     sys.stderr.write("Expects SAM on stdin, and writes SAM to stdout.\n")
34 |     sys.exit(1)
35 |
36 | # def decode_cigar(cigar):
37 | #     """Returns a list of 2-tuples, integer count and operator char."""
38 | #     count = ""
39 | #     answer = []
40 | #     for letter in cigar:
41 | #         if letter.isdigit():
42 | #             count += letter  # string addition
43 | #         elif letter in "MIDNSHP=X":
44 | #             answer.append((int(count), letter))
45 | #             count = ""
46 | #         else:
47 | #             raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
48 | #     return answer
49 | #
50 | # assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14,'S'),(15,'M'),(1,'P'),(1,'D'),(3,'P'),(54,'M'),(1,'D'),(34,'M'),(5,'S')]
51 |
52 |
53 | def cigar_length(cigar):
54 |     """Return number of cigar operators (integer)."""
55 |     answer = 0
56 |     for letter in cigar:
57 |         if letter.isdigit():
58 |             pass
59 |         elif letter in "MIDNSHP=X":
60 |             answer += 1
61 |         else:
62 |             raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
63 |     return answer
64 |
65 |
66 | count = 0
67 | longs = 0
68 | for line in sys.stdin:
69 |     if line[0] != "@":
70 |         # Should be a read
71 |         count += 1
72 |         qname, flag, rname, pos, mapq, cigar, rest = line.split("\t", 6)
73 |         if cigar != "*":
74 |             len_cigar = cigar_length(cigar)
75 |             if len_cigar > 65535:
76 |                 longs += 1
77 |                 sys.stderr.write(
78 |                     "Dropping read %s with %i CIGAR operators\n" % (qname, len_cigar)
79 |                 )
80 |                 continue
81 |     sys.stdout.write(line)
82 | sys.stderr.write("Dropped %i out of %i reads\n" % (longs, count))
83 |
--------------------------------------------------------------------------------
/snakemake/snakemake_progress_bar_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Demonstration of calling a snakemake workflow with a progress bar.
3 |
4 | Written and tested using snakemake 8.20.6 under macOS.
5 |
6 | Currently the snakemake API doesn't have any obvious way
7 | to get callbacks or an iterator approach to running a
8 | workflow which would allow direct updates to a progress
9 | bar. Improvements to their logging system may allow this?
10 |
11 | Instead, this demonstrates running snakemake in a subprocess,
12 | and monitoring the creation of the expected output files
13 | as a proxy to update a progress bar. This works, but would
14 | put some additional load on the file system.
15 |
16 | This uses the rich library's progress bar, but the same idea
17 | would work with another library like tqdm. We must explicitly
18 | update the progress bar whenever a new output file is found.
19 | """
20 |
21 | from multiprocessing import Process
22 | from pathlib import Path
23 |
24 | from rich.progress import Progress  # or use tqdm, or ...
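# For context: demo.smk (in this folder, not shown here) is expected to turn each
# *.fna input into a matching *.fna.md5 checksum file, which is what the progress
# bar below watches for. A minimal Snakefile of that shape might look like the
# following sketch (an illustration only, not necessarily the actual demo.smk):
#
#     rule md5:
#         input:
#             "{name}.fna"
#         output:
#             "{name}.fna.md5"
#         shell:
#             "md5sum {input} > {output}"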
25 |
26 | from snakemake.api import DAGSettings
27 | from snakemake.api import ResourceSettings
28 | from snakemake.api import SnakemakeApi
29 | from snakemake.settings.types import OutputSettings
30 | from snakemake.settings.types import Quietness
31 |
32 | inputs = Path(".").glob("*.fna")
33 | targets = [str(_) + ".md5" for _ in inputs]
34 |
35 |
36 | def black_box(output_files):
37 |     """Black-box function which generates known files (here snakemake)."""
38 |     snakefile = Path("demo.smk")
39 |     with SnakemakeApi(OutputSettings(quiet={Quietness.ALL})) as snakemake_api:
40 |         workflow_api = snakemake_api.workflow(
41 |             snakefile=snakefile,
42 |             resource_settings=ResourceSettings(),
43 |             # config_settings=ConfigSettings(config=config_args),
44 |             # workdir=workdir,
45 |         )
46 |         dag_api = workflow_api.dag(
47 |             dag_settings=DAGSettings(targets=output_files),
48 |         )
49 |         dag_api.unlock()
50 |         dag_api.execute_workflow()
51 |
52 |
53 | def with_progress_bar(function, output_files, interval=0.5):
54 |     """Run given function via subprocess with a progress bar.
55 |
56 |     The function must accept a single argument, the given file list.
57 |     The appearance of those files on disk is used to update the progress
58 |     bar. This runs the function in a process via multiprocessing, and
59 |     returns the process exit code (should be zero for success).
60 |     """
61 |     pending = [Path(_) for _ in output_files]  # the argument, not the module level targets
62 |     p = Process(target=function, args=(output_files,))
63 |     p.start()
64 |     with Progress() as progress:
65 |         task = progress.add_task("Snakemake...", total=len(pending))
66 |         while pending:
67 |             p.join(interval)
68 |             for t in pending[:]:
69 |                 if t.is_file():
70 |                     print(f"Done: {t}")
71 |                     pending.remove(t)
72 |                     progress.update(task, advance=1)
73 |             if p.exitcode is not None:
74 |                 # Should be finished, but was it success or failure?
75 |                 pending = []  # to break the loop
76 |     p.join()  # Should be immediate as should have finished
77 |     assert not p.is_alive()
78 |     print(f"Snakemake return code {p.exitcode}")
79 |     return p.exitcode
80 |
81 |
82 | if __name__ == "__main__":
83 |     # black_box(targets)
84 |     with_progress_bar(black_box, targets)
85 |
--------------------------------------------------------------------------------
/seq_manipulation/shred_contigs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script for shredding contigs into fake reads.
3 |
4 | e.g. for input into Newbler.
5 | """
6 |
7 | import os
8 | import sys
9 | from optparse import OptionParser
10 |
11 | from Bio import SeqIO
12 |
13 | usage = """Basic usage: python shred_contigs.py assembly.fasta -o shredded.fasta
14 |
15 | Multiple input FASTA files are accepted, see -h for more details.
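As a worked example (not script output): with the default settings (-m 2000,
-l 1000, -s 500) a 2,500 bp contig becomes four overlapping fake reads covering
bases 1-1000, 501-1500, 1001-2000 and 1501-2500, while any contig of 2,000 bp
or less is passed through unchanged.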
16 |
17 | Using Roche 454 Newbler, non-SFF input reads are limited to 1999 bp, thus
18 | you might wish to use something like this on an Illumina assembly:
19 |
20 | $ python shred_contigs.py other_assembly.fasta -o shredded.fasta -m 1999 -l 1999 -s 500
21 | """
22 |
23 | parser = OptionParser(usage=usage)
24 | parser.add_option(
25 |     "-m",
26 |     "--max-contig-len",
27 |     dest="max_contig",
28 |     type="int",
29 |     help="Max contig length to reuse as is (default 2000)",
30 |     default=2000,
31 | )
32 | parser.add_option(
33 |     "-l",
34 |     "--shred-length",
35 |     dest="shred_length",
36 |     type="int",
37 |     help="Length of fake reads to generate (default 1000 bp)",
38 |     default=1000,
39 | )
40 | parser.add_option(
41 |     "-s",
42 |     "--shred-step",
43 |     dest="shred_step",
44 |     type="int",
45 |     help="Offset between fake reads (default 500 bp)",
46 |     default=500,
47 | )
48 | parser.add_option(
49 |     "-o",
50 |     "--output",
51 |     dest="output_filename",
52 |     help="FASTA output filename for fake reads (required)",
53 |     default=None,
54 |     metavar="FILE",
55 | )
56 | (options, args) = parser.parse_args()
57 | if not args:
58 |     sys.exit("Requires at least one input FASTA filename\n\n" + usage)
59 |
60 | max_contig = int(options.max_contig)
61 | shred_length = int(options.shred_length)
62 | shred_step = int(options.shred_step)
63 | output_fasta = options.output_filename
64 |
65 | if shred_step < 1:
66 |     sys.exit("Shred step should be positive")
67 | if shred_length < shred_step:
68 |     sys.exit("Shred step should be less than shred length")
69 |
70 | print("Accepting contigs up to length %i as they are (option -m)" % max_contig)
71 | print(
72 |     "Shredding longer contigs into reads of %i bp (option -l), step %i (option -s)"
73 |     % (shred_length, shred_step)
74 | )
75 |
76 | for assembly_fasta in args:
77 |     if not os.path.isfile(assembly_fasta):
78 |         sys.exit("Assembly FASTA file not found: %r" % assembly_fasta)
79 |
80 |
81 | def shred(input_filename):
82 |     global as_is, shredded
83 |     for record in SeqIO.parse(input_filename, "fasta"):
84 |         length = len(record)
85 |         if length <= max_contig:
86 |             as_is += 1
87 |             yield record
88 |         else:
89 |             # Shred it!
90 |             shredded += 1
91 |             for i, start in enumerate(range(0, length - shred_step, shred_step)):
92 |                 fragment = record[start : start + shred_length]
93 |                 fragment.id = "%s_fragment%i" % (record.id, i + 1)
94 |                 yield fragment
95 |
96 |
97 | count = 0
98 | as_is = 0
99 | shredded = 0
100 | with open(output_fasta, "w") as output_handle:
101 |     for assembly_fasta in args:
102 |         count += SeqIO.write(shred(assembly_fasta), output_handle, "fasta")
103 | print("Processed %i FASTA files, containing %i contigs" % (len(args), as_is + shredded))
104 | print(
105 |     "Accepted %i short contigs, shredded %i long contigs, giving %i reads"
106 |     % (as_is, shredded, count)
107 | )
108 |
--------------------------------------------------------------------------------
/blast/wwwblast2loc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Short Python script to parse the blastwww setup files to extract a list of
3 | # nucleotide and protein BLAST databases (with descriptions) and write them
4 | # out as location files for use in Galaxy.
5 | #
6 | # Copyright 2010, Peter Cock.
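#
# For reference (illustrative values, not taken from a real server): the blast.rc
# file parsed below lists the databases each BLAST program may search, one
# program per line, e.g.
#
#     blastn  nt est_human
#     blastp  nr swissprot
#
# while each line of a Galaxy *.loc file is typically tab separated, giving an
# identifier, a display name, and the database path.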
7 | #
8 | # v001 - First version
9 | # v002 - Use print as function
10 | from __future__ import print_function
11 |
12 | import os
13 |
14 | # This gives us the list of databases and their type (nt vs aa):
15 | blastrc = "/var/www/html/blast/blast.rc"
16 | # This gives us a sensible order and their descriptions:
17 | blastwww = "/var/www/html/blast/blast.html"
18 |
19 | # BLAST DB path,
20 | blastpath = "/data/blastdb"
21 |
22 | # Output files
23 | blast_nt = "blastdb.loc"
24 | blast_aa = "blastdb_p.loc"
25 |
26 |
27 | def load_blast_db_list(filename):
28 |     nt = set()
29 |     aa = set()
30 |     handle = open(filename)
31 |     for line in handle:
32 |         if line.startswith("#") or not line.strip():
33 |             continue
34 |         elif line.startswith("NumCpuToUse"):
35 |             continue
36 |         elif line.startswith(("blastn ", "tblastn", "tblastx ")):
37 |             nt.update(line.rstrip().split()[1:])
38 |         elif line.startswith(("blastp ", "blastx")):
39 |             aa.update(line.rstrip().split()[1:])
40 |         else:
41 |             raise ValueError(line)
42 |     handle.close()
43 |     return nt, aa
44 |
45 |
46 | nt, aa = load_blast_db_list(blastrc)
47 | # print(nt)
48 | # print(aa)
49 |
50 |
51 | def load_blast_db_descr(html_filename, nt, aa):
52 |     nt_list = []
53 |     aa_list = []
54 |     handle = open(html_filename)
55 |     for line in handle:
56 |         line = line.strip()
57 |         if not line.startswith("