├── acgt_dither
│   ├── Swanson_et_al_2012_fig1a.pdf
│   ├── Swanson_et_al_2012_fig1a.png
│   ├── README.rst
│   ├── Swanson_et_al_2012_fig1.txt
│   ├── dither.py
│   └── dither_rgb.py
├── assembly_comparison
│   ├── images
│   │   ├── TY2482_vs_NC_018658.png
│   │   ├── H112180280_vs_NC_018658.png
│   │   └── TY2482_20110610_vs_NC_018658.png
│   ├── fasta_trim_n.py
│   ├── dedup_assembly.py
│   ├── README.rst
│   └── order_assembly.py
├── galaxy_workflows
│   ├── rxlr_venn_workflow
│   │   ├── Phyca11_example_output.png
│   │   ├── README.rst
│   │   └── repository_dependencies.xml
│   ├── README.rst
│   └── secreted_protein_workflow
│       ├── repository_dependencies.xml
│       └── README.rst
├── fetch_viruses
│   ├── README.txt
│   ├── fetch_viruses.py
│   └── merge_viruses.py
├── align
│   └── align_back_trans.py
├── .gitignore
├── snakemake
│   ├── demo.smk
│   └── snakemake_progress_bar_demo.py
├── README.rst
├── LICENSE.rst
├── blast
│   ├── README.rst
│   ├── wwwblast2loc.py
│   ├── blast_wrap.py
│   ├── blast_most_matched.py
│   └── blast_sync.py
├── seq_manipulation
│   ├── pick_N_random_seqs.py
│   ├── shred_contigs.py
│   ├── seqio_index_db.py
│   ├── rename_locustags.py
│   └── insert_gaps_for_ena.py
├── annotation_comparison
│   ├── mauve_orthologues_to_genbank.py
│   ├── annotation_patch.py
│   └── annotation_diff.py
├── sambam
│   ├── profile
│   │   └── bench_iter.py
│   ├── sam_depair.py
│   ├── samtools_auto.py
│   ├── bgzf_add_eof.py
│   ├── sam_drop_qname.py
│   ├── bgzf_check_eof.py
│   ├── fastq_to_sam.py
│   ├── sam_drop_long_cigar.py
│   ├── sam_strip_tags.py
│   ├── sam_restore_seq.py
│   └── sam_to_sspace_tab.py
├── ena_fetch
│   ├── get_ENA_project_submissions.py
│   ├── get_ENA_project_meta.py
│   └── get_ENA_project_fastq.py
├── primer_selection
│   ├── iupac_isPcr.py
│   ├── species_dedup_gbk.py
│   ├── primer_selection.py
│   ├── isPcr_tally.py
│   └── plot_isprc.py
├── blooming_reads
│   ├── interlace_fastq.py
│   └── stack_coverage_plot.py
├── .pre-commit-config.yaml
└── hmmer
    └── hmmer_table2tabular.py
/acgt_dither/Swanson_et_al_2012_fig1a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/acgt_dither/Swanson_et_al_2012_fig1a.pdf
--------------------------------------------------------------------------------
/acgt_dither/Swanson_et_al_2012_fig1a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/acgt_dither/Swanson_et_al_2012_fig1a.png
--------------------------------------------------------------------------------
/assembly_comparison/images/TY2482_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/TY2482_vs_NC_018658.png
--------------------------------------------------------------------------------
/assembly_comparison/images/H112180280_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/H112180280_vs_NC_018658.png
--------------------------------------------------------------------------------
/assembly_comparison/images/TY2482_20110610_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/TY2482_20110610_vs_NC_018658.png
--------------------------------------------------------------------------------
/galaxy_workflows/rxlr_venn_workflow/Phyca11_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/galaxy_workflows/rxlr_venn_workflow/Phyca11_example_output.png
--------------------------------------------------------------------------------
/fetch_viruses/README.txt:
--------------------------------------------------------------------------------
1 | A couple of scripts used to build BLAST databases of complete viral genomes,
2 | and their genes/proteins.
3 |
4 | This worked great back in 2009, but now that the viral sequences have grown
5 | at least ten-fold, hammering NCBI Entrez like this is not ideal...
6 |
--------------------------------------------------------------------------------
/galaxy_workflows/README.rst:
--------------------------------------------------------------------------------
1 | As of 17 September 2013, my Galaxy workflow development has moved from here:
2 |
3 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/
4 |
5 | To here, along with the associated Galaxy tools:
6 |
7 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/
8 |
--------------------------------------------------------------------------------
/acgt_dither/README.rst:
--------------------------------------------------------------------------------
1 | Python scripts to render photos using bases A, C, G, and T for pixels.
2 |
3 | Takes as input a PNG photo (JPEG should work if the right dependencies
4 | are installed), and a FASTA sequence file, and uses them to produce a
5 | PDF output image using ReportLab.
6 |
7 | The motivation and example images are described on this blog post:
8 | http://blastedbio.blogspot.co.uk/2013/08/pixelated-potato-posters-in-python.html
9 |
--------------------------------------------------------------------------------
/galaxy_workflows/secreted_protein_workflow/repository_dependencies.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/align/align_back_trans.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 |
4 | sys.exit(
5 |     """Python script for 'back-translating' a protein alignment.
6 |
7 | This script was originally available from here:
8 | https://github.com/peterjc/picobio/tree/master/align
9 |
10 | It is now available from here instead, with an optional Galaxy wrapper:
11 | https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
12 |
13 | The Galaxy tool is available from the Galaxy Tool Shed here:
14 | http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
15 | """
16 | )
17 |
--------------------------------------------------------------------------------
/galaxy_workflows/rxlr_venn_workflow/README.rst:
--------------------------------------------------------------------------------
1 | This package is a Galaxy workflow for comparing three RXLR prediction
2 | methods with a Venn Diagram, and it creates a FASTA file of any proteins
3 | passing all three methods.
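Outside of Galaxy, the final "passing all three methods" step boils down to
intersecting three sets of identifiers and then extracting those records from
the protein FASTA file. A minimal sketch of that idea in Python with Biopython
(the filenames here are made up for illustration; this is not part of the
workflow itself)::

    from Bio import SeqIO

    id_sets = []
    for name in ["method1_ids.txt", "method2_ids.txt", "method3_ids.txt"]:
        with open(name) as handle:
            id_sets.append({line.strip() for line in handle if line.strip()})
    wanted = set.intersection(*id_sets)
    records = (r for r in SeqIO.parse("proteins.fasta", "fasta") if r.id in wanted)
    count = SeqIO.write(records, "passing_all_three.fasta", "fasta")
    print("Saved %i proteins predicted by all three methods" % count)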
4 | 5 | As of 17 September 2013, development has moved from here: 6 | 7 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/rxlr_venn_workflow 8 | 9 | To here, along with the associated Galaxy tools: 10 | 11 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/rxlr_venn_workflow 12 | 13 | This workflow is available to download and/or install from the main 14 | Galaxy Tool Shed: 15 | 16 | * http://toolshed.g2.bx.psu.edu/view/peterjc/rxlr_venn_workflow 17 | -------------------------------------------------------------------------------- /galaxy_workflows/rxlr_venn_workflow/repository_dependencies.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #Ignore backup files from some Unix editors, 2 | *~ 3 | *.swp 4 | *.bak 5 | 6 | #Ignore any tar-balls 7 | *.tar.gz 8 | 9 | #Ignore patches and any original files created by patch command 10 | *.diff 11 | *.patch 12 | *.orig 13 | *.rej 14 | 15 | #Ignore these hidden files from Mac OS X 16 | .DS_Store 17 | 18 | #Ignore hidden files from Dolphin window manager 19 | .directory 20 | 21 | #Ignore all compiled python files (e.g. from running the unit tests): 22 | *.pyc 23 | *.pyo 24 | 25 | #Ignore all Jython class files (present if using Jython) 26 | *.class 27 | 28 | #Ignore any NCBI BLAST database files 29 | *.nhr 30 | *.nin 31 | *.nsq 32 | 33 | #Ignore any PDF or graphics output files 34 | *.pdf 35 | *.png 36 | 37 | -------------------------------------------------------------------------------- /acgt_dither/Swanson_et_al_2012_fig1.txt: -------------------------------------------------------------------------------- 1 | Figure 1. Transmission electron micrographs of phage virions showing their isometric heads and long non-contractile tails. 2 | show more 3 | 4 | Panel A shows multiple SpaA1 virions and panel B shows a single Bce A1 (B) virions. All scale bars represent 100 nm. 5 | 6 | doi:10.1371/journal.pone.0040683.g001 7 | 8 | Accession numbers HE614281 and gi|399498862|ref|NC_018277.1| (SpaA1) 9 | and HE614282 and gi|397174303|emb|HE614282.1| (BceA1) 10 | 11 | From: 12 | 13 | Swanson MM, Reavy B, Makarova KS, Cock PJ, Hopkins DW, et al. (2012) 14 | Novel Bacteriophages Containing a Genome of Another Bacteriophage within Their Genomes. 15 | PLoS ONE 7(7): e40683. doi:10.1371/journal.pone.0040683 16 | http://dx.doi.org/10.1371/journal.pone.0040683 17 | -------------------------------------------------------------------------------- /snakemake/demo.smk: -------------------------------------------------------------------------------- 1 | # Example usage at the command line: 2 | # 3 | # $ rm -rf *.md5; snakemake -q -s demo.smk -p $(for f in *.fna; do echo $f.md5; done); ls *.md5 4 | # 5 | # Here using a little bash loop to generate a listing of all the 6 | # desired MD5 files based on the FASTA files present. 7 | # 8 | # Example usage from Python via the API (using same logic for targets): 9 | # 10 | # $ rm -rf *.md5; ./snakemake_progress_bar_demo.py ; ls *.md5 11 | # 12 | # The rule will sleep for between 1 and 10s, and then compute the MD5. 13 | # However, 1 time in 20 it will fail instead. 
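# (For reference only: the same target list can be built in Python rather
# than with the bash loop above -- this is roughly what
# snakemake_progress_bar_demo.py does before calling the API:
#
#     from pathlib import Path
#     targets = [str(f) + ".md5" for f in Path(".").glob("*.fna")]
# )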
14 |
15 | rule fasta_checksum:
16 |     input:
17 |         "{genome}.fna"
18 |     output:
19 |         "{genome}.fna.md5"
20 |     shell:
21 |         #'X=$((1 + $RANDOM % 10)); if [ "$X" == "1" ]; then sleep 5; exit 1; else sleep $X; md5sum {input} > {output}; fi'
22 |         'sleep $((1 + $RANDOM % 10)); if [ "$(($RANDOM % 20))" == "0" ]; then exit 1; else md5sum {input} > {output}; fi'
23 |
--------------------------------------------------------------------------------
/galaxy_workflows/secreted_protein_workflow/README.rst:
--------------------------------------------------------------------------------
1 | This package is a Galaxy workflow for the identification of candidate
2 | secreted proteins from a given protein FASTA file.
3 |
4 | It runs SignalP v3.0 (Bendtsen et al. 2004) and selects only proteins with a
5 | strong predicted signal peptide, and then runs TMHMM v2.0 (Krogh et al. 2001)
6 | on those, and selects only proteins without a predicted trans-membrane helix.
7 | This workflow was used in Kikuchi et al. (2011), and is a simplification of
8 | the candidate effector protocol described in Jones et al. (2009).
9 |
10 | As of 17 September 2013, development has moved from here:
11 |
12 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/secreted_protein_workflow
13 |
14 | To here, along with the associated Galaxy tools:
15 |
16 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/secreted_protein_workflow
17 |
18 | This workflow is available to download and/or install from the main
19 | Galaxy Tool Shed:
20 |
21 | * http://toolshed.g2.bx.psu.edu/view/peterjc/secreted_protein_workflow
22 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://img.shields.io/github/license/peterjc/picobio.svg?label=License
2 |    :alt: MIT License
3 |    :target: https://github.com/peterjc/picobio/blob/master/LICENSE.rst
4 | .. image:: https://results.pre-commit.ci/badge/github/peterjc/picobio/master.svg
5 |    :target: https://results.pre-commit.ci/latest/github/peterjc/picobio/master
6 |    :alt: pre-commit.ci status
7 | .. image:: https://img.shields.io/badge/Code%20style-black-000000.svg
8 |    :alt: Code style: black
9 |    :target: https://github.com/python/black
10 |
11 | This is the README file for the picobio repository,
12 | https://github.com/peterjc/picobio
13 |
14 | This is a small general hold-all for miscellaneous bioinformatics scripts etc.,
15 | mostly in Python, written by Peter Cock.
16 |
17 | The name "picobio" is a play on "pico" meaning small (10^-12), and the
18 | Japanese phonetics of my name (ピーター starting "pi" in the Latin alphabet,
19 | and コック starting "ko", giving "piko"), "bio" from Bioinformatics.
20 |
21 | Unless otherwise stated, the scripts in this repository are released under the
22 | MIT License.
23 |
--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright 2011-2024, The James Hutton Institute, UK.
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /blast/README.rst: -------------------------------------------------------------------------------- 1 | Misc. BLAST scripts. 2 | 3 | Auto-caching of Databases 4 | ========================= 5 | 6 | Files ``blast_sync.py`` and ``blast_wrap.py`` are used to 7 | pre-cache our central BLAST databases onto a cluster node's 8 | local hard drive (using ``rsync``). 9 | 10 | This works by adding wrapper scripts like ``$HOME/bin/blastp``:: 11 | 12 | $ more ~/bin/blastp 13 | #!/bin/bash 14 | #This bash script pretends to be an NCBI BLAST command line tool 15 | #acting as a proxy via a Python wrapper script to cache databases. 16 | #echo $@ 17 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 18 | $DIR/ncbi_blast/blast_wrap.py $DIR/ncbi_blast/blastp "$@" 19 | 20 | This runs ``$HOME/ncbi_blast/blast_wrap.py`` which checks if a sync 21 | is required via ``$HOME/ncbi_blast/blast_sync.py'', and then runs 22 | the real NCBI BLAST+ binary named ``$HOME/bin/ncbi_blast/blastp``. 23 | 24 | 25 | Converting wwwblast BLAST DB list to Galaxy loc files 26 | ===================================================== 27 | 28 | We used to run a ``wwwblast`` server with a collection of 29 | local BLAST databases, but transitioned to using BLAST+ via 30 | Galaxy - see https://github.com/peterjc/galaxy_blast 31 | 32 | The script ``wwwblast2loc.py`` was used during our transition 33 | period to generate the Galaxy location files ``blastdb.loc`` 34 | and ``blastdb_p.loc`` from the ```wwwblast`` listing defined 35 | in ``blast.rc`` and ``blast.html``. 36 | -------------------------------------------------------------------------------- /seq_manipulation/pick_N_random_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import random 3 | import sys 4 | 5 | from Bio import SeqIO 6 | from Bio.SeqIO.FastaIO import SimpleFastaParser 7 | 8 | """Extract N randomly selected sequences from a FASTA file. 9 | 10 | Takes three arguments: input FASTA filename, number of 11 | sequences to pick out, and output FASTA filename. e.g. 12 | 13 | $ python pick_N_random_seqs.py input.fasta 1000 output.fasta 14 | 15 | If the input FASTA file has less than the requested count, 16 | this will fail with an error. 
17 | """ 18 | 19 | input_fasta, count, output_fasta = sys.argv[1:] 20 | count = int(count) 21 | 22 | with open(input_fasta) as handle: 23 | # Using as faster than SeqIO.parse(...) 24 | ids = [title.split(None, 1)[0] for title, seq in SimpleFastaParser(handle)] 25 | print("Input FASTA file %s has %i sequences" % (input_fasta, len(ids))) 26 | assert len(set(ids)) == len(ids), "You have duplicate identifiers" 27 | 28 | # seqs = SeqIO.index(input_fasta, "fasta") 29 | # print("Input FASTA file %s has %i sequences" 30 | # % (input_fasta, len(seqs))) 31 | # assert count <= len(seqs) 32 | # picked = set(random.sample(list(seqs), count)) 33 | # assert len(picked) == count 34 | # del seqs 35 | 36 | picked = set(random.sample(ids, count)) 37 | 38 | # This will preserve the input order, and do line wrapping 39 | wanted = (r for r in SeqIO.parse(input_fasta, "fasta") if r.id in picked) 40 | saved = SeqIO.write(wanted, output_fasta, "fasta") 41 | assert saved == count 42 | 43 | print( 44 | "Saved %i randomly selected records from %s into %s" 45 | % (count, input_fasta, output_fasta) 46 | ) 47 | -------------------------------------------------------------------------------- /annotation_comparison/mauve_orthologues_to_genbank.py: -------------------------------------------------------------------------------- 1 | # Use case: 2 | # - Have multiple annotated GenBank files 3 | # - Aligned with Mauve, and orthologue file exported 4 | # 5 | # Want to copy the sister genome's gene identifiers 6 | # into a reference GenBank file (as gene aliases, notes, 7 | # etc) so they can be viewed/searched for easily. 8 | from __future__ import print_function 9 | 10 | from Bio import SeqIO 11 | 12 | mauve_orthologues_file = "mauve_orthologues.txt" 13 | reference_genbank_file = "reference.gbk" 14 | reference_number_in_mauve = 0 15 | output_genbank_file = "reference_with_aliases.gbk" 16 | 17 | # Might be more than one contig 18 | reference_records = list(SeqIO.parse(reference_genbank_file, "genbank")) 19 | cds_dict = {} 20 | for r in reference_records: 21 | for f in r.features: 22 | if f.type == "CDS": 23 | name = f.qualifiers["gene"][0] 24 | key = "%i:%s:%i-%i" % ( 25 | reference_number_in_mauve, 26 | name, 27 | f.location.start + 1, 28 | f.location.end, 29 | ) 30 | cds_dict[key] = f 31 | # print(list(cds_dict.keys())) 32 | 33 | for line in open(mauve_orthologues_file, "rU"): 34 | parts = sorted(line.strip().split("\t")) 35 | key = None 36 | # print(parts) 37 | for x in parts: 38 | # if x.startswith("%i|" % reference_number_in_mauve): 39 | if x in cds_dict: 40 | print("Using: %r" % parts) 41 | name = x.split(":")[1] 42 | names = [y.split(":")[1] for y in parts if y != x] 43 | cds_dict[x].qualifiers["name"] = [",".join([name] + names)] 44 | 45 | SeqIO.write(reference_records, output_genbank_file, "genbank") 46 | print("Wrote to %s" % output_genbank_file) 47 | -------------------------------------------------------------------------------- /acgt_dither/dither.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | from Bio import SeqIO 5 | from PIL import Image 6 | from reportlab.graphics import renderPDF 7 | from reportlab.graphics.shapes import Drawing 8 | from reportlab.graphics.shapes import String 9 | from reportlab.lib import colors 10 | from reportlab.lib.units import cm 11 | from reportlab.pdfgen import canvas 12 | 13 | png_file = "Swanson_et_al_2012_fig1a.png" 14 | pdf_file = "Swanson_et_al_2012_fig1a.pdf" 15 | 
main_caption = "Swanson et al (2012) Figure 1" 16 | 17 | # Load sequence 18 | seq = SeqIO.read("SpaA1.fasta", "fasta").seq 19 | shape = (239, 176) 20 | scale = 0.125 * cm # per bp 21 | 22 | # Original is 1274 x 937 pixels, try about 20% 23 | pixels = np.product(shape) 24 | im = Image.open(png_file).resize(shape) 25 | # im.show() 26 | data = im.getdata() 27 | assert len(data) == pixels, len(data) 28 | assert shape == im.getbbox()[2:] 29 | data = np.array(data).reshape(shape, order="F") 30 | assert shape == data.shape 31 | pixels = np.product(shape) 32 | print("Have %i base pairs, and %i pixels" % (len(seq), pixels)) 33 | 34 | assert pixels <= len(seq) 35 | assert 0 <= data.min() <= data.max() <= 255 36 | 37 | # Open PDF 38 | width, height = page_size = [x * scale for x in shape] 39 | c = canvas.Canvas(pdf_file, page_size) 40 | c.setTitle(main_caption) 41 | d = Drawing(*page_size) 42 | base = 0 43 | for row in range(shape[1]): 44 | for col in range(shape[0]): 45 | color = colors.CMYKColor(black=(255 - data[col, row]) / 255.0) 46 | # From top left? 47 | s = String( 48 | (col + 0.5) * scale, 49 | (shape[1] - row) * scale, 50 | seq[base], 51 | fillColor=color, 52 | fontSize=4, 53 | textAnchor="middle", 54 | ) 55 | d.add(s) 56 | base += 1 57 | renderPDF.draw(d, c, 0, 0) 58 | c.showPage() 59 | c.save() 60 | -------------------------------------------------------------------------------- /sambam/profile/bench_iter.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import time 5 | 6 | to_profile = [] 7 | 8 | try: 9 | from Bio.Sequencing.SamBam import BamIterator 10 | 11 | def peter_iter(bam_filename, out_filename): 12 | """Peter's pure Python BAM iterator.""" 13 | h = open(bam_filename, "rb") 14 | out_h = open(out_filename, "w") 15 | count = 0 16 | mapped = 0 17 | for read in BamIterator(h): 18 | count += 1 19 | if read.is_mapped: 20 | mapped += 1 21 | out_h.write("%s\t%s\n" % (read.rname, read.pos)) 22 | h.close() 23 | out_h.close() 24 | return mapped, count 25 | 26 | to_profile.append(peter_iter) 27 | except ImportError: 28 | pass 29 | 30 | try: 31 | from pysam import Samfile 32 | 33 | def pysam_iter(bam_filename, out_filename): 34 | """PySam's Samfile as BAM iterator.""" 35 | out_h = open(out_filename, "w") 36 | count = 0 37 | mapped = 0 38 | for read in Samfile(bam_filename, "rb"): 39 | count += 1 40 | if not read.is_unmapped: 41 | mapped += 1 42 | out_h.write("%s\t%s\n" % (read.rname, read.pos)) 43 | out_h.close() 44 | return mapped, count 45 | 46 | to_profile.append(pysam_iter) 47 | except ImportError: 48 | pass 49 | 50 | print("Will profile %i functions:" % len(to_profile)) 51 | for p in to_profile: 52 | print(p.__doc__) 53 | print 54 | for f in os.listdir("."): 55 | if f.endswith(".bam"): 56 | print("Using %s" % f) 57 | for p in to_profile: 58 | print("Profiling %s" % p.__doc__) 59 | start = time.time() 60 | mapped, count = p(f, "/dev/null") 61 | taken = time.time() - start 62 | print("%s - %0.1fs giving %i/%i mapped" % (p.__doc__, taken, mapped, count)) 63 | -------------------------------------------------------------------------------- /sambam/sam_depair.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | usage = """Python script to remove paired information in SAM reads. 
6 | 7 | The intended usage is where you wish to treat "orphaned" paired 8 | reads as single reads, meaning removing any /1 or /2 suffix in 9 | the FASTQ file and likewise clearing the paired bits in the SAM 10 | FLAG. 11 | 12 | This script is designed to be used as part of a Unix pipeline. It 13 | takes no command line arguments. It reads SAM format data from stdin, 14 | and writes SAM format data to stdout. 15 | 16 | The only change made to the FLAG field, clearing the following bits: 17 | * 0x1 template having multiple segments in sequencing 18 | * 0x8 next segment in the template unmapped 19 | * 0x20 next segment mapped to reverse strand 20 | * 0x40 the first segment in the template 21 | * 0x80 the last segment in the template 22 | 23 | Example: 24 | 25 | $ ./sam_depair.py < original.sam > as_singles.sam 26 | 27 | Simple usage with BAM files with conversion to/from SAM via samtools: 28 | 29 | $ samtools view -h original.bam | ./sam_depair.py | samtools view -S -b - > as_singles.bam 30 | 31 | Copyright Peter Cock 2014. All rights reserved. See: 32 | https://github.com/peterjc/picobio 33 | """ 34 | 35 | if len(sys.argv) != 1: 36 | sys.stderr.write("ERROR: Bad arguments.\n\n") 37 | sys.stderr.write("Expects SAM on stdin, and writes SAM to stdout.\n") 38 | sys.exit(1) 39 | 40 | count = 0 41 | tweaked = 0 42 | mask = 0x1 | 0x8 | 0x20 | 0x40 | 0x80 43 | flip_mask = ~mask 44 | for line in sys.stdin: 45 | if line[0] != "@": 46 | # Should be a read 47 | count += 1 48 | qname, flag, rest = line.split("\t", 2) 49 | flag = int(flag) 50 | if flag & mask: 51 | # Want to clear those bits... 52 | flag = flag & flip_mask 53 | tweaked += 1 54 | line = "%s\t%i\t%s" % (qname, flag, rest) 55 | sys.stdout.write(line) 56 | sys.stderr.write("Tweaked %i out of %i reads\n" % (tweaked, count)) 57 | -------------------------------------------------------------------------------- /ena_fetch/get_ENA_project_submissions.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import urllib 5 | 6 | project = "ERP000297" 7 | 8 | submissions_url = ( 9 | "http://www.ebi.ac.uk/ena/data/view/reports/sra/submitted_files/internal/%s" 10 | % project 11 | ) 12 | submissions_file = "%s_submissions.tsv" % project 13 | 14 | 15 | def download_in_one(url, filename): 16 | print("Fetching %s" % url) 17 | n = urllib.urlopen(url) 18 | data = n.read() 19 | n.close() 20 | 21 | h = open(filename, "w") 22 | h.write(data) 23 | h.close() 24 | print("Saved as %s" % filename) 25 | 26 | 27 | print 28 | if not os.path.isfile(submissions_file): 29 | download_in_one(submissions_url, submissions_file) 30 | 31 | 32 | def process_submissions(project, submissions_filename): 33 | h = open(submissions_filename) 34 | line = h.readline() 35 | assert ( 36 | line 37 | == "Study\tSample\tExperiment\tRun\tOrganism\tInstrument Platform\tInstrument Model\tLibrary Name\tLibrary Layout\tLibrary Source\tLibrary Selection\tRun Read Count\tRun Base Count\tFile Name\tFile Size\tmd5\tFtp\n" 38 | ), repr(line) 39 | for line in h: 40 | parts = line.rstrip("\n").split("\t") 41 | assert parts[0] == project 42 | url = parts[16] 43 | assert url.startswith("ftp://ftp.sra.ebi.ac.uk/vol1/ERA") 44 | filename = url[len("ftp://ftp.sra.ebi.ac.uk/") :] 45 | if os.path.isfile(filename): 46 | print("Already have %s" % filename) 47 | continue 48 | if filename.endswith(".srf"): 49 | print("Skipping %s" % filename) 50 | continue 51 | # Make directory... 
52 | d = os.path.split(filename)[0] 53 | if not os.path.isdir(d): 54 | print("Making directory %s" % d) 55 | os.makedirs(d) 56 | # Download file... 57 | rc = os.system("wget -O %s %s" % (filename, url)) 58 | assert not rc, rc 59 | # Now check the md5... 60 | print(filename) 61 | h.close() 62 | 63 | 64 | process_submissions(project, submissions_file) 65 | -------------------------------------------------------------------------------- /primer_selection/iupac_isPcr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Generalise Jim Kent's isPcr to support IUPAC ambiguities by brute force. 3 | 4 | As of v33 at least, ambiguous bases are rejected in the primers. So, this 5 | script generalises the input file to record all the non-ambiguous 6 | interpretations of the primer. Running isPcr will take several times longer, 7 | and the output will probably need to be deduplicated. 8 | 9 | The input and output are simple three-column TSV files with the name of 10 | each primer pair, the forward primer sequence, and the reverse primer 11 | sequence. 12 | """ 13 | 14 | import itertools 15 | import sys 16 | 17 | from Bio.Data.IUPACData import ambiguous_dna_values 18 | 19 | expand_iupac = { 20 | # Treat I (inosine) like N 21 | "I": list(ambiguous_dna_values["N"].upper()), 22 | "i": list(ambiguous_dna_values["N"].lower()), 23 | } 24 | for base, expanded in ambiguous_dna_values.items(): 25 | expand_iupac[base.upper()] = list(expanded.upper()) 26 | expand_iupac[base.lower()] = list(expanded.lower()) 27 | 28 | 29 | def expand_iupac_bases(seq): 30 | """All possible unabmiguous sequences described with IUPAC ambiguities. 31 | 32 | e.g. 33 | 34 | >>> list(expand_iupac_bases("DAY")) 35 | ['AAC', 'AAT', 'GAC', 'GAT', 'TAC', 'TAT'] 36 | """ 37 | try: 38 | for alt in itertools.product(*[expand_iupac[base] for base in seq]): 39 | yield "".join(alt) 40 | except KeyError as err: 41 | sys.exit(f"ERROR - Problem with primer sequence {seq}, {err}") 42 | 43 | 44 | before = after = 0 45 | for line in sys.stdin: 46 | if line.startswith("#") or not line.strip(): 47 | continue 48 | try: 49 | idn, fwd, rev = line.strip("\n").split("\t")[:3] 50 | except ValueError: 51 | t = line.count("\t") 52 | sys.exit(f"ERROR: Only {t} tabs in line: {line}") 53 | before += 1 54 | for fwd2 in expand_iupac_bases(fwd): 55 | for rev2 in expand_iupac_bases(rev): 56 | sys.stdout.write(f"{idn}\t{fwd2}\t{rev2}\n") 57 | after += 1 58 | sys.stderr.write(f"Generalised {before} primer pairs into unabmiguous {after} pairs\n") 59 | -------------------------------------------------------------------------------- /sambam/samtools_auto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Dirty hack to allow mixing of samtools commands between versions. 3 | 4 | It can be a downside that the samtools command line API is a single 5 | binary which offers multiple (often independent) commands. 6 | 7 | Right now, samtools 1.1 still lacks some functionality from 0.1.19, 8 | for example "samtools index", "samtools depad" and "samtools rmdup" 9 | are not yet fully functional. e.g. 10 | 11 | - https://github.com/samtools/samtools/issues/199 12 | - https://github.com/samtools/samtools/issues/291 13 | 14 | Conversely, the "samtools bam2fq" from samtools 0.1.19 has had 15 | several issues fixed. 16 | 17 | This wrapper allows me to call "samtools" and route this to the 18 | appropriate binary. 
In this case: 19 | 20 | - ``samtools`` (alone) will call samtools 1.1 21 | - ``samtools bam2fq [...]`` will call samtools 1.1 22 | - ``samtools depad [...]`` will call samtools 0.1.19 23 | - ``samtools rmdup [...]`` will call samtools 0.1.19 24 | - etc 25 | 26 | Install this by putting the Python script (or a symlink to it) on 27 | your ``$PATH`` as ``samtools``, for example under ``~/bin/``:: 28 | 29 | $ cd ~/bin 30 | $ ln -s samtools_auto.py samtools 31 | 32 | Also install binaries for samtools 0.1.19 and 1.1 and set their 33 | paths below (variables ``samtools_old`` and ``samtools_new``). 34 | """ 35 | 36 | import os 37 | import sys 38 | 39 | samtools_old = "/mnt/galaxy/bin/samtools_0.1.19" 40 | samtools_new = "/mnt/galaxy/bin/samtools_1.1" 41 | 42 | 43 | def pick_binary(): 44 | """Return new samtools unless known to be using a broken command. 45 | 46 | i.e. Avoid samtools commands with known regressions! 47 | """ 48 | if len(sys.argv) == 1: 49 | return samtools_new 50 | elif sys.argv[1] in ["index", "depad", "rmdup"]: 51 | return samtools_old 52 | else: 53 | return samtools_new 54 | 55 | 56 | # argv[0] is this python script 57 | # Turn the argv list into a string, escaping as needed 58 | 59 | 60 | def wrap(text): 61 | if " " in text and not text[0] == '"' and not text[-1] == '"': 62 | return '"%s"' % text 63 | else: 64 | return text 65 | 66 | 67 | cmd = pick_binary() + " " + " ".join(wrap(arg) for arg in sys.argv[1:]) 68 | 69 | err = os.system(cmd) 70 | if 0 < err < 128: 71 | sys.exit(err) 72 | elif err: 73 | # Returning 512 gives 0 (odd) 74 | sys.exit(1) 75 | -------------------------------------------------------------------------------- /blooming_reads/interlace_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Simple FASTQ interlacer. 3 | 4 | Checks read identifiers agree, or end with /1 and /2 respectively. 5 | """ 6 | 7 | import gzip 8 | import sys 9 | 10 | try: 11 | from Bio.SeqIO.QualityIO import FastqGeneralIterator 12 | except ImportError: 13 | sys.exit("Missing FastqGeneralIterator from Biopython") 14 | 15 | if len(sys.argv) != 3: 16 | sys.exit("Requires two arguments, a pair of FASTQ filenames") 17 | fastq1 = sys.argv[1] 18 | fastq2 = sys.argv[2] 19 | 20 | sys.stderr.write("Interlacing %s and %s\n" % (fastq1, fastq2)) 21 | if fastq1.endswith(".gz"): 22 | sys.stderr.write("Decompressing %s\n" % fastq1) 23 | handle1 = gzip.open(fastq1) 24 | else: 25 | handle1 = open(fastq1) 26 | if fastq2.endswith(".gz"): 27 | sys.stderr.write("Decompressing %s\n" % fastq2) 28 | handle2 = gzip.open(fastq2) 29 | else: 30 | handle2 = open(fastq2) 31 | sys.stderr.write("Interlacing paired FASTQ files to stdout...\n") 32 | out_handle = sys.stdout 33 | 34 | iter1 = FastqGeneralIterator(handle1) 35 | iter2 = FastqGeneralIterator(handle2) 36 | 37 | for title1, seq1, qual1 in iter1: 38 | try: 39 | title2, seq2, qual2 = iter2.next() 40 | except StopIteration: 41 | sys.exit("More records in %s than %s, e.g. 
%s" % (fastq1, fastq2, title1)) 42 | id1, descr1 = title1.split(None, 1) 43 | id2, descr2 = title2.split(None, 1) 44 | if id1 == id2: 45 | # Add the /1 and /2, preserve any description after the ID 46 | if descr1: 47 | descr1 = " " + descr1 48 | if descr2: 49 | descr2 = " " + descr2 50 | out_handle.write( 51 | "@%s/1%s\n%s\n+\n%s\n@%s/2%s\n%s\n+\n%s\n" 52 | % (id1, descr1, seq1, qual1, id2, descr2, seq2, qual2) 53 | ) 54 | elif id1.endswith("/1") and id2.endswith("/2") and id1[:-2] == id2[:-2]: 55 | out_handle.write( 56 | "@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n" 57 | % (title1, seq1, qual1, title2, seq2, qual2) 58 | ) 59 | else: 60 | sys.exit("Mismatched records %r vs %r" % (title1, title2)) 61 | 62 | # Check at end of file two 63 | try: 64 | title2, seq2, qual2 = iter2.next() 65 | sys.exit("More records in %s than %s, e.g. %s" % (fastq2, fastq1, title2)) 66 | except StopIteration: 67 | pass 68 | 69 | handle1.close() 70 | handle2.close() 71 | sys.stderr.write("Interlacing paired FASTQ files done.\n") 72 | -------------------------------------------------------------------------------- /sambam/bgzf_add_eof.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to add missing EOF marker to BAM or BGZF files. 3 | 4 | BAM files are compressed using BGZF, Blocked GNU Zip Format, which 5 | is a variant of GZIP. Modern BAM files include a special empty 6 | block at the end of the file (EOF) as a marker to help spot when 7 | a dataset has been truncated. This is just a 28 byte BGZF block, 8 | which when decompressed is empty. 9 | 10 | Some early tools output valid BAM files without this optional 11 | (but recommended) EOF marker. 12 | 13 | This script will add the EOF marker is not already present. 14 | 15 | WARNING: If your BAM or BGZF file is truly truncated, this will 16 | not magically fix it. It may hide or obscure the true problem. 17 | 18 | WARNING: To avoid excessive data writing, this script modifies 19 | the BAM or BGZF file in situ! 20 | 21 | Usage with one or more BAM or BGZF files: 22 | 23 | $ ./bam_add_eof.py example1.bam example2.bam ... 
exampleN.bam 24 | 25 | See also: http://samtools.sourceforge.net/ 26 | 27 | v0.0.0 - Original script 28 | v0.0.1 - Use append mode to add EOF block 29 | v0.0.2 - removed internal function sys_exit 30 | """ 31 | 32 | import os 33 | import sys 34 | 35 | 36 | def fix_bam(filename): 37 | header = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" 38 | eof = ( 39 | "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC" 40 | "\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" 41 | ) 42 | if not os.path.isfile(filename): 43 | sys.exit("Missing file %s" % filename) 44 | size = os.path.getsize(filename) 45 | h = open(filename, "rb") # read only for now 46 | # Check it looks like a BGZF file 47 | # (could still be GZIP'd, in which case the extra block is harmless) 48 | data = h.read(len(header)) 49 | if data != header: 50 | sys.exit("File %s is not a BAM file" % filename) 51 | # Check if it has the EOF already 52 | h.seek(size - 28) 53 | data = h.read(28) 54 | h.close() 55 | if data == eof: 56 | sys.stderr.write("EOF already present in %s\n" % filename) 57 | else: 58 | sys.stderr.write("Adding EOF block to %s\n" % filename) 59 | h = open(filename, "ab") 60 | h.write(eof) 61 | h.close() 62 | 63 | 64 | if len(sys.argv) == 1: 65 | sys.exit("Takes one or more BGZF/BAM filenames as arguments (edits in place)") 66 | for bam_filename in sys.argv[1:]: 67 | fix_bam(bam_filename) 68 | -------------------------------------------------------------------------------- /sambam/sam_drop_qname.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to drop read name (QNAME) from SAM/BAM files. 3 | 4 | This script is designed to be used as part of a Unix pipeline. It reads 5 | SAM format data from stdin, and writes SAM format data to stdout. 6 | 7 | The only change made to the SAM reads is in the QNAME field. For 8 | single-fragment reads, QNAME is dropped (set to * for missing). 9 | For multi-fragment reads (e.g. paired end reads), a QNAME is 10 | required to cross reference the parts. Here short automatic names 11 | are substituted instead. 12 | 13 | The optional argument prefix is added to the start of any generated 14 | read name (allowing you to avoid read name clashes). 15 | 16 | Simple usage with SAM files: 17 | 18 | $ ./sam_drop_names [prefix] < original.sam > dropped_names.sam 19 | 20 | Simple usage with BAM files with conversion to/from SAM via samtools: 21 | 22 | $ samtools view -h original.bam | ./sam_drop_names.py [prefix] | samtools view -S -b - > dropped_names.bam 23 | 24 | If your SAM/BAM files lack @SQ headers, you may need to give 25 | samtools the reference FASTA file as well. 26 | 27 | Copyright Peter Cock 2012. All rights reserved. See: 28 | https://github.com/peterjc/picobio 29 | http://blastedbio.blogspot.co.uk/2012/03/bam-verus-cram-07.html 30 | """ 31 | 32 | import sys 33 | 34 | if len(sys.argv) == 1: 35 | prefix = "" 36 | elif len(sys.argv) == 2: 37 | prefix = sys.argv[1] 38 | else: 39 | sys.stderr.write("Error, expect one optional parameter only (read name prefix)") 40 | sys.exit(1) 41 | 42 | count = 0 43 | mapping = {} 44 | # TODO - Automatically remove mapping entries once all parts of the read 45 | # have been found? They would typically be near each other in the file... 46 | # otherwise memory will be a problem with big paired end datasets. 
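# (One untested idea for the TODO above, assuming simple paired-end data where
# each QNAME occurs exactly twice: pop the entry once the mate has been seen,
# e.g.
#
#     if qname in mapping:
#         new_id = mapping.pop(qname)  # second segment, entry no longer needed
#     else:
#         count += 1
#         mapping[qname] = count
#         new_id = count
#
# which would keep the dict small whenever mates are adjacent in the file.)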
47 | for line in sys.stdin: 48 | if line[0] != "@": 49 | # Should be a read 50 | qname, flag, rest = line.split("\t", 2) 51 | if int(flag) & 0x1: 52 | # Multi-fragment read 53 | try: 54 | qname = prefix + str(mapping[qname]) 55 | except KeyError: 56 | count += 1 57 | mapping[qname] = count 58 | qname = prefix + str(count) 59 | else: 60 | # Single fragment read 61 | qname = "*" 62 | line = "\t".join([qname, flag, rest]) 63 | sys.stdout.write(line) 64 | sys.stderr.write("Modified %i multi-fragment reads\n" % count) 65 | -------------------------------------------------------------------------------- /assembly_comparison/fasta_trim_n.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script for trimming N bases from ends of sequences.""" 3 | 4 | import sys 5 | from optparse import OptionParser 6 | 7 | usage = """Basic usage: ./fasta_trim_n.py < input.fasta > output.fasta 8 | 9 | For more details, run with -h for the help. 10 | """ 11 | 12 | try: 13 | from Bio import SeqIO 14 | except ImportError: 15 | sys.exit("This script requires Biopython") 16 | 17 | parser = OptionParser(usage=usage) 18 | parser.add_option( 19 | "-i", 20 | "--input", 21 | dest="input_filename", 22 | help="Input sequence file (default is stdin)", 23 | default=None, 24 | metavar="FILE", 25 | ) 26 | parser.add_option( 27 | "-o", 28 | "--output", 29 | dest="output_filename", 30 | help="Output sequence file (fefault is stdout)", 31 | default=None, 32 | metavar="FILE", 33 | ) 34 | parser.add_option( 35 | "-f", 36 | "--format", 37 | dest="sequence_format", 38 | help='Sequence format (as named in Biopython SeqIO, default "fasta")', 39 | default="fasta", 40 | ) 41 | parser.add_option( 42 | "-c", 43 | "--chars", 44 | dest="characters", 45 | help='Characters to trim (default "Nn" covering upper and lower case)', 46 | default="Nn", 47 | metavar="FILE", 48 | ) 49 | (options, args) = parser.parse_args() 50 | 51 | chars = options.characters 52 | format = options.sequence_format.lower() 53 | 54 | sys.stderr.write( 55 | "Removing %s characters from start/end of %s format file...\n" % (chars, format) 56 | ) 57 | 58 | if options.input_filename: 59 | input_handle = open(options.input_filename) 60 | else: 61 | input_handle = sys.stdin 62 | 63 | if options.output_filename: 64 | output_handle = open(options.output_filename, "w") 65 | else: 66 | output_handle = sys.stdout 67 | 68 | chars = options.characters 69 | format = options.sequence_format.lower() 70 | 71 | 72 | def strip_seq(records): 73 | for record in records: 74 | # FASTQ etc will be a problem, must trim quality too! 75 | # old_len = len(record.seq) 76 | record.seq = record.seq.strip(chars) 77 | # TODO Minimum length! 
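        # (One untested way to tackle that TODO: add a hypothetical --min-length
        #  option and skip records which end up too short after trimming, e.g.
        #      if len(record.seq) < min_len:
        #          continue
        #  rather than yielding empty or near-empty records.)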
78 | # new_len = len(record.seq) 79 | # if new_len < old_len: 80 | # sys.stderr.write("Trimmed %s from %i to %i\n" % (record.id, old_len, new_len)) 81 | yield record 82 | 83 | 84 | # Do the work, 85 | count = SeqIO.write(strip_seq(SeqIO.parse(input_handle, format)), output_handle, format) 86 | 87 | if options.input_filename: 88 | input_handle.close() 89 | if options.output_filename: 90 | output_handle.close() 91 | 92 | sys.stderr.write("Saved %i records\n" % count) 93 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # pre-commit run --all-files 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v5.0.0 5 | hooks: 6 | - id: check-added-large-files 7 | - id: check-case-conflict 8 | - id: check-executables-have-shebangs 9 | - id: check-json 10 | - id: check-merge-conflict 11 | - id: check-shebang-scripts-are-executable 12 | - id: check-symlinks 13 | - id: check-yaml 14 | - id: debug-statements 15 | - id: destroyed-symlinks 16 | - id: end-of-file-fixer 17 | files: \.(py|sh|rst|yml|yaml)$ 18 | - id: mixed-line-ending 19 | - id: trailing-whitespace 20 | files: \.(py|sh|rst|yml|yaml)$ 21 | - repo: local 22 | hooks: 23 | - id: no-tabs 24 | name: No tabs 25 | description: Reject any files containing a tab 26 | entry: '\t' 27 | language: pygrep 28 | files: \.(py|sh|rst|yml|yaml)$ 29 | - repo: https://github.com/astral-sh/ruff-pre-commit 30 | rev: v0.6.9 31 | hooks: 32 | # Run the Ruff linter (flake8 alternative): 33 | - id: ruff 34 | args: [ 35 | '--fix', 36 | '--exit-non-zero-on-fix', 37 | '--extend-select=BLE,C4,D,I,ISC', 38 | '--extend-ignore=D100,D103,D203,D213', 39 | '--config=lint.isort.force-single-line=true', 40 | '--config=lint.isort.order-by-type=false', 41 | '--config=lint.pyupgrade.keep-runtime-typing=true' 42 | ] 43 | # Run the Ruff formatter (black alternative): 44 | - id: ruff-format 45 | args: [ 46 | '--config=format.docstring-code-format=true' 47 | ] 48 | - repo: https://github.com/rstcheck/rstcheck 49 | rev: v6.2.4 50 | hooks: 51 | - id: rstcheck 52 | args: [ 53 | --report-level=warning, 54 | --ignore-roles=ref, 55 | "--ignore-directives=automodule,toctree", 56 | --ignore-substitutions=version 57 | ] 58 | - repo: https://github.com/PyCQA/doc8 59 | rev: 'v1.1.2' 60 | hooks: 61 | - id: doc8 62 | additional_dependencies: [pygments] 63 | args: [--quiet,--ignore=D001] 64 | - repo: https://github.com/codespell-project/codespell 65 | rev: v2.3.0 66 | hooks: 67 | - id: codespell 68 | files: \.(py|sh|rst|yml|yaml)$ 69 | args: ['-L', 'nin,mis'] 70 | ci: 71 | # Settings for the https://pre-commit.ci/ continuous integration service 72 | autofix_prs: true 73 | # Default message is more verbose 74 | autoupdate_commit_msg: '[pre-commit.ci] autoupdate' 75 | # Default is weekly 76 | autoupdate_schedule: monthly 77 | -------------------------------------------------------------------------------- /ena_fetch/get_ENA_project_meta.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import urllib 5 | 6 | project = "ERP000297" 7 | strain_file = "%s_strain.tsv" % project # output file 8 | 9 | fastq_url = ( 10 | "http://www.ebi.ac.uk/ena/data/view/reports/sra/fastq_files/internal/%s" % project 11 | ) 12 | fastq_file = "%s_fastq.tsv" % project 13 | 14 | 15 | def download_in_one(url, filename): 16 | print("Fetching %s" % url) 17 | n = 
urllib.urlopen(url) 18 | data = n.read() 19 | n.close() 20 | 21 | h = open(filename, "w") 22 | h.write(data) 23 | h.close() 24 | print("Saved as %s" % filename) 25 | 26 | 27 | if not os.path.isfile(fastq_file): 28 | download_in_one(fastq_url, fastq_file) 29 | 30 | 31 | def get_strain(meta_xml_filename): 32 | h = open(meta_xml_filename) 33 | while True: 34 | line = h.readline() 35 | if not line: 36 | break 37 | if "strain" in line.lower(): 38 | strain = h.readline().strip() 39 | assert strain.lower().startswith(""), strain 40 | assert strain.lower().endswith(""), strain 41 | h.close() 42 | return strain[7:-8] 43 | h.close() 44 | return None 45 | 46 | 47 | def process_meta(project, fastq_filename, strain_file): 48 | h = open(fastq_filename) 49 | out = open(strain_file, "w") 50 | line = h.readline() 51 | assert ( 52 | line 53 | == "Study\tSample\tExperiment\tRun\tOrganism\tInstrument Platform\tInstrument Model\tLibrary Name\tLibrary Layout\tLibrary Source\tLibrary Selection\tRun Read Count\tRun Base Count\tFile Name\tFile Size\tmd5\tFtp\n" 54 | ), repr(line) 55 | out.write(line[:-1] + "\tStrain\n") 56 | for line in h: 57 | parts = line.rstrip("\n").split("\t") 58 | assert parts[0] == project 59 | url = parts[16] 60 | assert url.startswith("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR"), url 61 | 62 | sample = parts[1] 63 | assert sample.startswith("ERS") 64 | url = "http://www.ebi.ac.uk/ena/data/view/%s&display=xml" % sample 65 | url = "http://www.ebi.ac.uk/ena/data/view/%s&display=xml&download" % sample 66 | filename = "xml/%s.xml" % sample 67 | 68 | # Download file... 69 | if not os.path.isfile(filename): 70 | print(url) 71 | rc = os.system("wget -O %s '%s'" % (filename, url)) 72 | assert not rc, rc 73 | 74 | strain = get_strain(filename) 75 | if not strain: 76 | strain = "" 77 | print(filename, strain) 78 | out.write(line[:-1] + "\t" + strain + "\n") 79 | h.close() 80 | out.close() 81 | 82 | 83 | process_meta(project, fastq_file, strain_file) 84 | -------------------------------------------------------------------------------- /sambam/bgzf_check_eof.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to check if BGZF (e.g. BAM) files have EOF marker. 3 | 4 | BAM files are compressed using BGZF, Blocked GNU Zip Format, which 5 | is a variant of GZIP. Modern BAM files include a special empty 6 | block at the end of the file (EOF) as a marker to help spot when 7 | a dataset has been truncated. This is just a 28 byte BGZF block, 8 | which when decompressed is empty. 9 | 10 | Some early tools output valid BAM files without this optional 11 | (but recommended) EOF marker. 12 | 13 | Usage with one or more BAM or BGZF files: 14 | 15 | $ ./bgzf_check_eof.py example1.bam example2.bam ... exampleN.bam 16 | 17 | The filenames are checked in the order given, if any are invalid 18 | the tool exits with a non-zero error level and a message to stderr. 19 | If all the files are valid, it returns with a zero error level. 
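For example (hypothetical file names), the exit status can drive a simple
shell check before archiving a batch of BAM files:

    $ ./bgzf_check_eof.py run1/*.bam && echo "All BAM files have EOF markers"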
20 | 21 | Return codes: 22 | * 0 - No errors found 23 | * 1 - Invalid arguments 24 | * 2 - File not found 25 | * 3 - File is zero bytes (and thus not valid BGZF or BAM) 26 | * 4 - File missing BGZF header 27 | * 5 - File looks like BGZF, but missing BGZF EOF marker 28 | 29 | See also: http://samtools.sourceforge.net/ 30 | 31 | v0.0.0 - Original script 32 | v0.0.1 - Dropped internal function sys_exit 33 | """ 34 | 35 | import os 36 | import sys 37 | 38 | 39 | def check_bam(filename): 40 | header = "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00" 41 | eof = ( 42 | "\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC" 43 | "\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00" 44 | ) 45 | if not os.path.isfile(filename): 46 | sys.stderr.write("Missing file %s\n" % filename) 47 | sys.exit(2) 48 | size = os.path.getsize(filename) 49 | if not size: 50 | sys.stderr.write("Empty file (zero bytes) %s\n" % filename) 51 | sys.exit(3) 52 | h = open(filename, "rb") 53 | # Check it looks like a BGZF file 54 | # (could still be GZIP'd, in which case the extra block is harmless) 55 | data = h.read(len(header)) 56 | if data != header: 57 | sys.stderr.write("File %s is not a BGZF/BAM file\n" % filename) 58 | sys.exit(4) 59 | # Check if it has the EOF already 60 | h.seek(size - 28) 61 | data = h.read(28) 62 | h.close() 63 | if data == eof: 64 | sys.stderr.write("Good, BGZF EOF already present in %s\n" % filename) 65 | else: 66 | sys.stderr.write("Missing EOF marker in BGZF/BAM file %s\n" % filename) 67 | sys.exit(5) 68 | 69 | 70 | if len(sys.argv) == 1: 71 | sys.stderr.write( 72 | "Takes one or more BGZF/BAM filenames as arguments (edits in place)" 73 | ) 74 | sys.exit(1) 75 | for bam_filename in sys.argv[1:]: 76 | check_bam(bam_filename) 77 | -------------------------------------------------------------------------------- /blooming_reads/stack_coverage_plot.py: -------------------------------------------------------------------------------- 1 | """Script to produce stacked coverage plot with matplotlib.""" 2 | 3 | from __future__ import print_function 4 | 5 | import sys 6 | 7 | import numpy as np 8 | from matplotlib import pyplot as plt 9 | 10 | 11 | def load(filename): 12 | h = open(filename) 13 | line = h.readline() 14 | assert line.startswith(">") 15 | while line and line[0] == ">": 16 | name = line[1:].split(None, 1)[0] 17 | values = [] 18 | while line: 19 | line = h.readline() 20 | if not line or line[0] == ">": 21 | break 22 | values.append([float(v) for v in line.rstrip("\n").split("\t")]) 23 | yield name, np.array(values, np.float) 24 | h.close() 25 | 26 | 27 | def make_colors(start, end, steps): 28 | delta = (end - start) / float(steps - 1) 29 | return ["#%02x%02x%02x" % tuple(start + i * delta) for i in range(steps)] 30 | 31 | 32 | def stack(data, filename, colors=None): 33 | total = len(data) 34 | max_value = 0 35 | for names, values in data: 36 | max_value = max(max_value, values.sum(axis=0).max()) 37 | plt.ylim([0, max_value]) 38 | 39 | fig = plt.figure(figsize=(12, 2 * total)) 40 | if not colors: 41 | # Assumes all the examples have same number of colors: 42 | if data[0][1].shape[0] == 3: 43 | colors = ["#CC6666", "#1DACD6", "#6E5160"] 44 | elif data[0][1].shape[0] == 5: 45 | colors = ["#CDCDC1", "#8B8B83", "#FF6A6A", "#F0E68C", "#CDC673"] 46 | else: 47 | colors = make_colors( 48 | np.array([0xCC, 0x66, 0x66]), 49 | # np.array([0x6E, 0x51, 0x60]), 50 | np.array([0x90, 0x41, 0x50]), 51 | # np.array([0x20, 0xF0, 0x60]), 52 | data[0][1].shape[0], 53 | ) 54 | print(colors) 55 
| for i, (name, values) in enumerate(data): 56 | x = range(values.shape[1]) 57 | print(i, name, values.shape, "coverage:") 58 | print("\t".join("%0.1f" % v for v in values.sum(axis=1))) 59 | y_stack = np.cumsum(values, axis=0) 60 | ax1 = fig.add_subplot(total, 1, i + 1) 61 | ax1.set_autoscaley_on(False) 62 | ax1.set_ylim([0, max_value]) 63 | ax1.set_title(name.split(None, 1)[0], fontsize="xx-small") 64 | ax1.fill_between(x, 0, y_stack[0, :], facecolor=colors[0], alpha=0.7) 65 | for i in range(0, values.shape[0] - 1): 66 | ax1.fill_between( 67 | x, y_stack[i, :], y_stack[i + 1, :], facecolor=colors[i + 1], alpha=0.7 68 | ) 69 | # fig.tight_layout() 70 | plt.show() 71 | plt.savefig(filename) 72 | 73 | 74 | for filename in sys.argv[1:]: 75 | if not filename.endswith(".cov"): 76 | continue 77 | print("-" * 60) 78 | print(filename) 79 | print("-" * 60) 80 | data = list(load(filename)) 81 | stack(data, filename + ".png") 82 | print("Done") 83 | -------------------------------------------------------------------------------- /sambam/fastq_to_sam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Python script to turn FASTQ into unaliged SAM/BAM files. 3 | 4 | This script is designed to be used as part of a Unix pipeline. It 5 | works with Python 2 and Python 3, e.g. 6 | 7 | $ python fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam 8 | Done, 532 pairs 9 | 10 | Or: 11 | 12 | $ python3 fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam 13 | Done, 532 pairs 14 | 15 | As long as the Python script is marked as executable you can do: 16 | 17 | $ ./fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam 18 | Done, 532 pairs 19 | 20 | Simple usage with BAM files with conversion from SAM via samtools: 21 | 22 | $ ./fastq_to_sam.py R1.fastq R2.fastq | samtools view -S -b - > unmapped.bam 23 | [samopen] no @SQ lines in the header. 24 | Done, 532 pairs 25 | 26 | Note that no @SQ lines are expected in SAM/BAM files with only unaligned reads. 27 | 28 | WARNING: This assumes your FASTQ files use the Sanger quality encoding. 29 | 30 | Todo: 31 | ---- 32 | - Test cases 33 | - Galaxy wrapper? 34 | - Proper command line API 35 | - Support for gzipped FASTQ (detected via filename?) 36 | - Support for interlaced FASTQ 37 | - Support for setting read groups 38 | - Support for multiple FASTQ input pairs (and read groups) 39 | 40 | Copyright Peter Cock 2015. All rights reserved. 
See: 41 | https://github.com/peterjc/picobio 42 | 43 | """ 44 | 45 | import sys 46 | 47 | if "-v" in sys.argv or "--version" in sys.argv: 48 | print("This is fastq_to_sam.py version 0.0.1") 49 | sys.exit(0) 50 | 51 | # TODO - proper API, allow interleaved FASTQ, read group, etc 52 | if len(sys.argv) != 3: 53 | sys.stderr.write("Expects two arguments, a pair of FASTQ filenames\n") 54 | sys.exit(1) 55 | 56 | try: 57 | from Bio._py3k import zip 58 | from Bio.SeqIO.QualityIO import FastqGeneralIterator 59 | except ImportError: 60 | sys.exit("ERROR: This requires Biopython.\n") 61 | sys.exit(1) 62 | 63 | fastq1 = FastqGeneralIterator(open(sys.argv[1])) 64 | fastq2 = FastqGeneralIterator(open(sys.argv[2])) 65 | 66 | # Paired, unmapped, mate unmapped, either first or second in pair: 67 | flag1 = "77" 68 | flag2 = "141" 69 | rname = "*" 70 | pos = "0" 71 | mapq = "0" 72 | cigar = "*" 73 | rnext = "*" 74 | pnext = "0" 75 | tlen = "0" 76 | 77 | pairs = 0 78 | for (t1, s1, q1), (t2, s2, q2) in zip(fastq1, fastq2): 79 | id1 = t1.split(None, 1)[0] 80 | id2 = t2.split(None, 1)[0] 81 | if id1 == id2: 82 | # Good, should we check the description follows Illumina naming? 83 | qname = id1 84 | else: 85 | assert id1.endswith("/1"), t1 86 | assert id2.endswith("/2"), t2 87 | qname = id1[:-2] 88 | 89 | print( 90 | "\t".join([qname, flag1, rname, pos, mapq, cigar, rnext, pnext, tlen, s1, q1]) 91 | ) 92 | print( 93 | "\t".join([qname, flag1, rname, pos, mapq, cigar, rnext, pnext, tlen, s2, q2]) 94 | ) 95 | pairs += 1 96 | sys.stderr.write("Done, %i pairs\n" % pairs) 97 | -------------------------------------------------------------------------------- /sambam/sam_drop_long_cigar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | usage = """Python script to remove SAM reads with long CIGAR strings. 6 | 7 | The BAM format (currently) uses an unsigned 16bit integer for the 8 | number of CIGAR operations in a read, and therefore BAM files can 9 | only hold reads with up to 65535 CIGAR operators. SAM does not have 10 | this limit, but the samtools implementation (reasonably) also has 11 | the same 16bit limit. See also: 12 | https://github.com/samtools/samtools/pull/39 13 | 14 | This script is designed to be used as part of a Unix pipeline. It 15 | takes no command line arguments. It reads SAM format data from stdin, 16 | and writes SAM format data to stdout. 17 | 18 | The only change made to the SAM reads is to drop records with over 19 | 65535 CIGAR operators. These are logged to stderr. 20 | 21 | $ ./sam_drop_long_cigar.py < original.sam > no_long_cigar.sam 22 | 23 | Simple usage with BAM files with conversion to/from SAM via samtools: 24 | 25 | $ samtools view -h original.bam | ./sam_drop_long_cigar.py | samtools view -S -b - > no_long_cigar.bam 26 | 27 | Copyright Peter Cock 2014. All rights reserved. 
27 | Copyright Peter Cock 2014. All rights reserved. See:
28 | https://github.com/peterjc/picobio
29 | """
30 |
31 | if len(sys.argv) != 1:
32 |     sys.stderr.write("ERROR: Bad arguments.\n\n")
33 |     sys.stderr.write("Expects SAM on stdin, and writes SAM to stdout.\n")
34 |     sys.exit(1)
35 |
36 | # def decode_cigar(cigar):
37 | #     """Returns a list of 2-tuples, integer count and operator char."""
38 | #     count = ""
39 | #     answer = []
40 | #     for letter in cigar:
41 | #         if letter.isdigit():
42 | #             count += letter  # string addition
43 | #         elif letter in "MIDNSHP=X":
44 | #             answer.append((int(count), letter))
45 | #             count = ""
46 | #         else:
47 | #             raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
48 | #     return answer
49 | #
50 | # assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14,'S'),(15,'M'),(1,'P'),(1,'D'),(3,'P'),(54,'M'),(1,'D'),(34,'M'),(5,'S')]
51 |
52 |
53 | def cigar_length(cigar):
54 |     """Return number of cigar operators (integer)."""
55 |     answer = 0
56 |     for letter in cigar:
57 |         if letter.isdigit():
58 |             pass
59 |         elif letter in "MIDNSHP=X":
60 |             answer += 1
61 |         else:
62 |             raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
63 |     return answer
64 |
65 |
66 | count = 0
67 | longs = 0
68 | for line in sys.stdin:
69 |     if line[0] != "@":
70 |         # Should be a read
71 |         count += 1
72 |         qname, flag, rname, pos, mapq, cigar, rest = line.split("\t", 6)
73 |         if cigar != "*":
74 |             len_cigar = cigar_length(cigar)
75 |             if len_cigar > 65535:
76 |                 longs += 1
77 |                 sys.stderr.write(
78 |                     "Dropping read %s with %i CIGAR operators\n" % (qname, len_cigar)
79 |                 )
80 |                 continue
81 |     sys.stdout.write(line)
82 | sys.stderr.write("Dropped %i out of %i reads\n" % (longs, count))
83 |
--------------------------------------------------------------------------------
/snakemake/snakemake_progress_bar_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Demonstration of calling a snakemake workflow with a progress bar.
3 |
4 | Written and tested using snakemake 8.20.6 under macOS.
5 |
6 | Currently the snakemake API doesn't have any obvious way
7 | to get callbacks or an iterator approach to running a
8 | workflow which would allow direct updates to a progress
9 | bar. Improvements to their logging system may allow this?
10 |
11 | Instead, this demonstrates running snakemake in a subprocess,
12 | and monitoring the creation of the expected output files
13 | as a proxy to update a progress bar. This works, but would
14 | put some additional load on the file system.
15 |
16 | This uses the rich library's progress bar, but the same idea
17 | would work with another library like tqdm. We must explicitly
18 | update the progress bar whenever a new output file is found.
19 | """
20 |
21 | from multiprocessing import Process
22 | from pathlib import Path
23 |
24 | from rich.progress import Progress  # or use tqdm, or ...
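# For context: demo.smk (in this folder, not shown here) is expected to turn each
# *.fna input into a matching *.fna.md5 checksum file, which is what the progress
# bar below watches for. A minimal Snakefile of that shape might look like the
# following sketch (an illustration only, not necessarily the actual demo.smk):
#
#     rule md5:
#         input:
#             "{name}.fna"
#         output:
#             "{name}.fna.md5"
#         shell:
#             "md5sum {input} > {output}"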
25 |
26 | from snakemake.api import DAGSettings
27 | from snakemake.api import ResourceSettings
28 | from snakemake.api import SnakemakeApi
29 | from snakemake.settings.types import OutputSettings
30 | from snakemake.settings.types import Quietness
31 |
32 | inputs = Path(".").glob("*.fna")
33 | targets = [str(_) + ".md5" for _ in inputs]
34 |
35 |
36 | def black_box(output_files):
37 |     """Black-box function which generates known files (here snakemake)."""
38 |     snakefile = Path("demo.smk")
39 |     with SnakemakeApi(OutputSettings(quiet={Quietness.ALL})) as snakemake_api:
40 |         workflow_api = snakemake_api.workflow(
41 |             snakefile=snakefile,
42 |             resource_settings=ResourceSettings(),
43 |             # config_settings=ConfigSettings(config=config_args),
44 |             # workdir=workdir,
45 |         )
46 |         dag_api = workflow_api.dag(
47 |             dag_settings=DAGSettings(targets=output_files),
48 |         )
49 |         dag_api.unlock()
50 |         dag_api.execute_workflow()
51 |
52 |
53 | def with_progress_bar(function, output_files, interval=0.5):
54 |     """Run given function via subprocess with a progress bar.
55 |
56 |     The function must accept a single argument, the given file list.
57 |     The appearance of those files on disk is used to update the progress
58 |     bar. This runs the function in a process via multiprocessing, and
59 |     returns the process exit code (should be zero for success).
60 |     """
61 |     pending = [Path(_) for _ in output_files]  # the argument, not the module level targets
62 |     p = Process(target=function, args=(output_files,))
63 |     p.start()
64 |     with Progress() as progress:
65 |         task = progress.add_task("Snakemake...", total=len(pending))
66 |         while pending:
67 |             p.join(interval)
68 |             for t in pending[:]:
69 |                 if t.is_file():
70 |                     print(f"Done: {t}")
71 |                     pending.remove(t)
72 |                     progress.update(task, advance=1)
73 |             if p.exitcode is not None:
74 |                 # Should be finished, but was it success or failure?
75 |                 pending = []  # to break the loop
76 |     p.join()  # Should be immediate as should have finished
77 |     assert not p.is_alive()
78 |     print(f"Snakemake return code {p.exitcode}")
79 |     return p.exitcode
80 |
81 |
82 | if __name__ == "__main__":
83 |     # black_box(targets)
84 |     with_progress_bar(black_box, targets)
85 |
--------------------------------------------------------------------------------
/seq_manipulation/shred_contigs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script for shredding contigs into fake reads.
3 |
4 | e.g. for input into Newbler.
5 | """
6 |
7 | import os
8 | import sys
9 | from optparse import OptionParser
10 |
11 | from Bio import SeqIO
12 |
13 | usage = """Basic usage: python shred_contigs.py assembly.fasta -o shredded.fasta
14 |
15 | Multiple input FASTA files are accepted, see -h for more details.
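As a worked example (not script output): with the default settings (-m 2000,
-l 1000, -s 500) a 2,500 bp contig becomes four overlapping fake reads covering
bases 1-1000, 501-1500, 1001-2000 and 1501-2500, while any contig of 2,000 bp
or less is passed through unchanged.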
16 |
17 | Using Roche 454 Newbler, non-SFF input reads are limited to 1999 bp, thus
18 | you might wish to use something like this on an Illumina assembly:
19 |
20 | $ python shred_contigs.py other_assembly.fasta -o shredded.fasta -m 1999 -l 1999 -s 500
21 | """
22 |
23 | parser = OptionParser(usage=usage)
24 | parser.add_option(
25 |     "-m",
26 |     "--max-contig-len",
27 |     dest="max_contig",
28 |     type="int",
29 |     help="Max contig length to reuse as is (default 2000)",
30 |     default=2000,
31 | )
32 | parser.add_option(
33 |     "-l",
34 |     "--shred-length",
35 |     dest="shred_length",
36 |     type="int",
37 |     help="Length of fake reads to generate (default 1000 bp)",
38 |     default=1000,
39 | )
40 | parser.add_option(
41 |     "-s",
42 |     "--shred-step",
43 |     dest="shred_step",
44 |     type="int",
45 |     help="Offset between fake reads (default 500 bp)",
46 |     default=500,
47 | )
48 | parser.add_option(
49 |     "-o",
50 |     "--output",
51 |     dest="output_filename",
52 |     help="FASTA output filename for fake reads (required)",
53 |     default=None,
54 |     metavar="FILE",
55 | )
56 | (options, args) = parser.parse_args()
57 | if not args:
58 |     sys.exit("Requires at least one input FASTA filename\n\n" + usage)
59 |
60 | max_contig = int(options.max_contig)
61 | shred_length = int(options.shred_length)
62 | shred_step = int(options.shred_step)
63 | output_fasta = options.output_filename
64 |
65 | if shred_step < 1:
66 |     sys.exit("Shred step should be positive")
67 | if shred_length < shred_step:
68 |     sys.exit("Shred step should be less than shred length")
69 |
70 | print("Accepting contigs up to length %i as they are (option -m)" % max_contig)
71 | print(
72 |     "Shredding longer contigs into reads of %i bp (option -l), step %i (option -s)"
73 |     % (shred_length, shred_step)
74 | )
75 |
76 | for assembly_fasta in args:
77 |     if not os.path.isfile(assembly_fasta):
78 |         sys.exit("Assembly FASTA file not found: %r" % assembly_fasta)
79 |
80 |
81 | def shred(input_filename):
82 |     global as_is, shredded
83 |     for record in SeqIO.parse(input_filename, "fasta"):
84 |         length = len(record)
85 |         if length <= max_contig:
86 |             as_is += 1
87 |             yield record
88 |         else:
89 |             # Shred it!
90 |             shredded += 1
91 |             for i, start in enumerate(range(0, length - shred_step, shred_step)):
92 |                 fragment = record[start : start + shred_length]
93 |                 fragment.id = "%s_fragment%i" % (record.id, i + 1)
94 |                 yield fragment
95 |
96 |
97 | count = 0
98 | as_is = 0
99 | shredded = 0
100 | with open(output_fasta, "w") as output_handle:
101 |     for assembly_fasta in args:
102 |         count += SeqIO.write(shred(assembly_fasta), output_handle, "fasta")
103 | print("Processed %i FASTA files, containing %i contigs" % (len(args), as_is + shredded))
104 | print(
105 |     "Accepted %i short contigs, shredded %i long contigs, giving %i reads"
106 |     % (as_is, shredded, count)
107 | )
108 |
--------------------------------------------------------------------------------
/blast/wwwblast2loc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Short Python script to parse the blastwww setup files to extract a list of
3 | # nucleotide and protein BLAST databases (with descriptions) and write them
4 | # out as location files for use in Galaxy.
5 | #
6 | # Copyright 2010, Peter Cock.
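#
# For reference (illustrative values, not taken from a real server): the blast.rc
# file parsed below lists the databases each BLAST program may search, one
# program per line, e.g.
#
#     blastn  nt est_human
#     blastp  nr swissprot
#
# while each line of a Galaxy *.loc file is typically tab separated, giving an
# identifier, a display name, and the database path.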
7 | #
8 | # v001 - First version
9 | # v002 - Use print as function
10 | from __future__ import print_function
11 |
12 | import os
13 |
14 | # This gives us the list of databases and their type (nt vs aa):
15 | blastrc = "/var/www/html/blast/blast.rc"
16 | # This gives us a sensible order and their descriptions:
17 | blastwww = "/var/www/html/blast/blast.html"
18 |
19 | # BLAST DB path,
20 | blastpath = "/data/blastdb"
21 |
22 | # Output files
23 | blast_nt = "blastdb.loc"
24 | blast_aa = "blastdb_p.loc"
25 |
26 |
27 | def load_blast_db_list(filename):
28 |     nt = set()
29 |     aa = set()
30 |     handle = open(filename)
31 |     for line in handle:
32 |         if line.startswith("#") or not line.strip():
33 |             continue
34 |         elif line.startswith("NumCpuToUse"):
35 |             continue
36 |         elif line.startswith(("blastn ", "tblastn", "tblastx ")):
37 |             nt.update(line.rstrip().split()[1:])
38 |         elif line.startswith(("blastp ", "blastx")):
39 |             aa.update(line.rstrip().split()[1:])
40 |         else:
41 |             raise ValueError(line)
42 |     handle.close()
43 |     return nt, aa
44 |
45 |
46 | nt, aa = load_blast_db_list(blastrc)
47 | # print(nt)
48 | # print(aa)
49 |
50 |
51 | def load_blast_db_descr(html_filename, nt, aa):
52 |     nt_list = []
53 |     aa_list = []
54 |     handle = open(html_filename)
55 |     for line in handle:
56 |         line = line.strip()
57 |         if not line.startswith("