├── acgt_dither
│   ├── Swanson_et_al_2012_fig1a.pdf
│   ├── Swanson_et_al_2012_fig1a.png
│   ├── README.rst
│   ├── Swanson_et_al_2012_fig1.txt
│   ├── dither.py
│   └── dither_rgb.py
├── assembly_comparison
│   ├── images
│   │   ├── TY2482_vs_NC_018658.png
│   │   ├── H112180280_vs_NC_018658.png
│   │   └── TY2482_20110610_vs_NC_018658.png
│   ├── fasta_trim_n.py
│   ├── dedup_assembly.py
│   ├── README.rst
│   └── order_assembly.py
├── galaxy_workflows
│   ├── rxlr_venn_workflow
│   │   ├── Phyca11_example_output.png
│   │   ├── README.rst
│   │   └── repository_dependencies.xml
│   ├── README.rst
│   └── secreted_protein_workflow
│       ├── repository_dependencies.xml
│       └── README.rst
├── fetch_viruses
│   ├── README.txt
│   ├── fetch_viruses.py
│   └── merge_viruses.py
├── align
│   └── align_back_trans.py
├── .gitignore
├── snakemake
│   ├── demo.smk
│   └── snakemake_progress_bar_demo.py
├── README.rst
├── LICENSE.rst
├── blast
│   ├── README.rst
│   ├── wwwblast2loc.py
│   ├── blast_wrap.py
│   ├── blast_most_matched.py
│   └── blast_sync.py
├── seq_manipulation
│   ├── pick_N_random_seqs.py
│   ├── shred_contigs.py
│   ├── seqio_index_db.py
│   ├── rename_locustags.py
│   └── insert_gaps_for_ena.py
├── annotation_comparison
│   ├── mauve_orthologues_to_genbank.py
│   ├── annotation_patch.py
│   └── annotation_diff.py
├── sambam
│   ├── profile
│   │   └── bench_iter.py
│   ├── sam_depair.py
│   ├── samtools_auto.py
│   ├── bgzf_add_eof.py
│   ├── sam_drop_qname.py
│   ├── bgzf_check_eof.py
│   ├── fastq_to_sam.py
│   ├── sam_drop_long_cigar.py
│   ├── sam_strip_tags.py
│   ├── sam_restore_seq.py
│   └── sam_to_sspace_tab.py
├── ena_fetch
│   ├── get_ENA_project_submissions.py
│   ├── get_ENA_project_meta.py
│   └── get_ENA_project_fastq.py
├── primer_selection
│   ├── iupac_isPcr.py
│   ├── species_dedup_gbk.py
│   ├── primer_selection.py
│   ├── isPcr_tally.py
│   └── plot_isprc.py
├── blooming_reads
│   ├── interlace_fastq.py
│   └── stack_coverage_plot.py
├── .pre-commit-config.yaml
└── hmmer
    └── hmmer_table2tabular.py
/acgt_dither/Swanson_et_al_2012_fig1a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/acgt_dither/Swanson_et_al_2012_fig1a.pdf
--------------------------------------------------------------------------------
/acgt_dither/Swanson_et_al_2012_fig1a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/acgt_dither/Swanson_et_al_2012_fig1a.png
--------------------------------------------------------------------------------
/assembly_comparison/images/TY2482_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/TY2482_vs_NC_018658.png
--------------------------------------------------------------------------------
/assembly_comparison/images/H112180280_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/H112180280_vs_NC_018658.png
--------------------------------------------------------------------------------
/assembly_comparison/images/TY2482_20110610_vs_NC_018658.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/assembly_comparison/images/TY2482_20110610_vs_NC_018658.png
--------------------------------------------------------------------------------
/galaxy_workflows/rxlr_venn_workflow/Phyca11_example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peterjc/picobio/HEAD/galaxy_workflows/rxlr_venn_workflow/Phyca11_example_output.png
--------------------------------------------------------------------------------
/fetch_viruses/README.txt:
--------------------------------------------------------------------------------
1 | A couple of scripts used to build BLAST databases of complete viral genomes,
2 | and their genes/proteins.
3 |
4 | This worked great back in 2009, but now that the viral sequences have grown
5 | at least ten-fold, hammering NCBI Entrez like this is no longer ideal...
6 |
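7 | For reference, a merged FASTA file can be turned into BLAST databases
8 | with the standard BLAST+ makeblastdb tool, e.g. (the input and output
9 | names here are just placeholders, not ones used by these scripts):
10 | 
11 |     makeblastdb -in viruses.fasta -dbtype nucl -out viral_genomes
12 |     makeblastdb -in proteins.fasta -dbtype prot -out viral_proteins
13 | 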
--------------------------------------------------------------------------------
/galaxy_workflows/README.rst:
--------------------------------------------------------------------------------
1 | As of 17 September 2013, my Galaxy workflow development has moved from here:
2 |
3 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/
4 |
5 | To here, along with the associated Galaxy tools:
6 |
7 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/
8 |
--------------------------------------------------------------------------------
/acgt_dither/README.rst:
--------------------------------------------------------------------------------
1 | Python scripts to render photos using bases A, C, G, and T for pixels.
2 |
3 | Takes as input a PNG photo (JPEG should work if the right dependencies
4 | are installed), and a FASTA sequence file, and uses them to produce a
5 | PDF output image using ReportLab.
6 |
7 | The motivation and example images are described on this blog post:
8 | http://blastedbio.blogspot.co.uk/2013/08/pixelated-potato-posters-in-python.html
9 |
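10 | Typical usage is simply to run the script, noting that the input image,
11 | FASTA file (e.g. ``SpaA1.fasta``) and output PDF filenames are currently
12 | hard-coded near the top of ``dither.py``, so edit those first::
13 | 
14 |     $ python dither.py
15 | 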
--------------------------------------------------------------------------------
/galaxy_workflows/secreted_protein_workflow/repository_dependencies.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/align/align_back_trans.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 |
4 | sys.exit(
5 | """Python script for 'back-translating' a protein alignment.
6 |
7 | This script was originally available from here:
8 | https://github.com/peterjc/picobio/tree/master/align
9 |
10 | It is now available from here instead, with an optional Galaxy wrapper:
11 | https://github.com/peterjc/pico_galaxy/tree/master/tools/align_back_trans
12 |
13 | The Galaxy tool is available from the Galaxy Tool Shed here:
14 | http://toolshed.g2.bx.psu.edu/view/peterjc/align_back_trans
15 | """
16 | )
17 |
--------------------------------------------------------------------------------
/galaxy_workflows/rxlr_venn_workflow/README.rst:
--------------------------------------------------------------------------------
1 | This package is a Galaxy workflow for comparing three RXLR prediction
2 | methods with a Venn diagram, and creating a FASTA file of any proteins
3 | passing all three methods.
4 |
5 | As of 17 September 2013, development has moved from here:
6 |
7 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/rxlr_venn_workflow
8 |
9 | To here, along with the associated Galaxy tools:
10 |
11 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/rxlr_venn_workflow
12 |
13 | This workflow is available to download and/or install from the main
14 | Galaxy Tool Shed:
15 |
16 | * http://toolshed.g2.bx.psu.edu/view/peterjc/rxlr_venn_workflow
17 |
--------------------------------------------------------------------------------
/galaxy_workflows/rxlr_venn_workflow/repository_dependencies.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #Ignore backup files from some Unix editors
2 | *~
3 | *.swp
4 | *.bak
5 |
6 | #Ignore any tar-balls
7 | *.tar.gz
8 |
9 | #Ignore patches and any original files created by patch command
10 | *.diff
11 | *.patch
12 | *.orig
13 | *.rej
14 |
15 | #Ignore these hidden files from Mac OS X
16 | .DS_Store
17 |
19 | #Ignore hidden files from the Dolphin file manager
19 | .directory
20 |
21 | #Ignore all compiled python files (e.g. from running the unit tests):
22 | *.pyc
23 | *.pyo
24 |
25 | #Ignore all Jython class files (present if using Jython)
26 | *.class
27 |
28 | #Ignore any NCBI BLAST database files
29 | *.nhr
30 | *.nin
31 | *.nsq
32 |
33 | #Ignore any PDF or graphics output files
34 | *.pdf
35 | *.png
36 |
37 |
--------------------------------------------------------------------------------
/acgt_dither/Swanson_et_al_2012_fig1.txt:
--------------------------------------------------------------------------------
1 | Figure 1. Transmission electron micrographs of phage virions showing their isometric heads and long non-contractile tails.
2 |
3 |
4 | Panel A shows multiple SpaA1 virions and panel B shows a single BceA1 virion. All scale bars represent 100 nm.
5 |
6 | doi:10.1371/journal.pone.0040683.g001
7 |
8 | Accession numbers HE614281 and gi|399498862|ref|NC_018277.1| (SpaA1)
9 | and HE614282 and gi|397174303|emb|HE614282.1| (BceA1)
10 |
11 | From:
12 |
13 | Swanson MM, Reavy B, Makarova KS, Cock PJ, Hopkins DW, et al. (2012)
14 | Novel Bacteriophages Containing a Genome of Another Bacteriophage within Their Genomes.
15 | PLoS ONE 7(7): e40683. doi:10.1371/journal.pone.0040683
16 | http://dx.doi.org/10.1371/journal.pone.0040683
17 |
--------------------------------------------------------------------------------
/snakemake/demo.smk:
--------------------------------------------------------------------------------
1 | # Example usage at the command line:
2 | #
3 | # $ rm -rf *.md5; snakemake -q -s demo.smk -p $(for f in *.fna; do echo $f.md5; done); ls *.md5
4 | #
5 | # Here using a little bash loop to generate a listing of all the
6 | # desired MD5 files based on the FASTA files present.
7 | #
8 | # Example usage from Python via the API (using same logic for targets):
9 | #
10 | # $ rm -rf *.md5; ./snakemake_progress_bar_demo.py ; ls *.md5
11 | #
12 | # The rule will sleep for between 1 and 10s, and then compute the MD5.
13 | # However, 1 time in 20 it will fail instead.
14 |
15 | rule fasta_checksum:
16 | input:
17 | "{genome}.fna"
18 | output:
19 | "{genome}.fna.md5"
20 | shell:
21 | #'X=$((1 + $RANDOM % 10)); if [ "$X" == "1" ]; then sleep 5; exit 1; else sleep $X; md5sum {input} > {output}; fi'
22 | 'sleep $((1 + $RANDOM % 10)); if [ "$(($RANDOM % 20))" == "0" ]; then exit 1; else md5sum {input} > {output}; fi'
23 |
--------------------------------------------------------------------------------
/galaxy_workflows/secreted_protein_workflow/README.rst:
--------------------------------------------------------------------------------
1 | This package is a Galaxy workflow for the identification of candidate
2 | secreted proteins from a given protein FASTA file.
3 |
4 | It runs SignalP v3.0 (Bendtsen et al. 2004) and selects only proteins with a
5 | strong predicted signal peptide, and then runs TMHMM v2.0 (Krogh et al. 2001)
6 | on those, and selects only proteins without a predicted trans-membrane helix.
7 | This workflow was used in Kikuchi et al. (2011), and is a simplification of
8 | the candidate effector protocol described in Jones et al. (2009).
9 |
10 | As of 17 September 2013, development has moved from here:
11 |
12 | * https://github.com/peterjc/picobio/tree/master/galaxy_workflows/secreted_protein_workflow
13 |
14 | To here, along with the associated Galaxy tools:
15 |
16 | * https://github.com/peterjc/pico_galaxy/tree/master/workflows/secreted_protein_workflow
17 |
18 | This workflow is available to download and/or install from the main
19 | Galaxy Tool Shed:
20 |
21 | * http://toolshed.g2.bx.psu.edu/view/peterjc/secreted_protein_workflow
22 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | .. image:: https://img.shields.io/github/license/peterjc/picobio.svg?label=License
2 | :alt: MIT License
3 | :target: https://github.com/peterjc/picobio/blob/master/LICENSE.rst
4 | .. image:: https://results.pre-commit.ci/badge/github/peterjc/picobio/master.svg
5 | :target: https://results.pre-commit.ci/latest/github/peterjc/picobio/master
6 | :alt: pre-commit.ci status
7 | .. image:: https://img.shields.io/badge/Code%20style-black-000000.svg
8 | :alt: Code style: black
9 | :target: https://github.com/python/black
10 |
11 | This is the README file for the picobio repository,
12 | https://github.com/peterjc/picobio
13 |
14 | This is a small, general hold-all for miscellaneous bioinformatics scripts,
15 | mostly in Python, written by Peter Cock.
16 |
17 | The name "picobio" is a play on "pico" meaning small (10^-12), and the
18 | Japanese phonetics of my name (ピーター starting "pi" in the Latin alphabet,
19 | and コック starting "ko", giving "piko"), plus "bio" from bioinformatics.
20 |
21 | Unless otherwise stated, the scripts in this repository are released under the
22 | MIT License.
23 |
--------------------------------------------------------------------------------
/LICENSE.rst:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright 2011-2024, The James Hutton Institute, UK.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/blast/README.rst:
--------------------------------------------------------------------------------
1 | Misc. BLAST scripts.
2 |
3 | Auto-caching of Databases
4 | =========================
5 |
6 | Files ``blast_sync.py`` and ``blast_wrap.py`` are used to
7 | pre-cache our central BLAST databases onto a cluster node's
8 | local hard drive (using ``rsync``).
9 |
10 | This works by adding wrapper scripts like ``$HOME/bin/blastp``::
11 |
12 | $ more ~/bin/blastp
13 | #!/bin/bash
14 | #This bash script pretends to be an NCBI BLAST command line tool
15 | #acting as a proxy via a Python wrapper script to cache databases.
16 | #echo $@
17 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
18 | $DIR/ncbi_blast/blast_wrap.py $DIR/ncbi_blast/blastp "$@"
19 |
20 | This runs ``$HOME/bin/ncbi_blast/blast_wrap.py`` which checks if a sync
21 | is required via ``$HOME/bin/ncbi_blast/blast_sync.py``, and then runs
22 | the real NCBI BLAST+ binary named ``$HOME/bin/ncbi_blast/blastp``.
23 |
24 |
25 | Converting wwwblast BLAST DB list to Galaxy loc files
26 | =====================================================
27 |
28 | We used to run a ``wwwblast`` server with a collection of
29 | local BLAST databases, but transitioned to using BLAST+ via
30 | Galaxy - see https://github.com/peterjc/galaxy_blast
31 |
32 | The script ``wwwblast2loc.py`` was used during our transition
33 | period to generate the Galaxy location files ``blastdb.loc``
34 | and ``blastdb_p.loc`` from the ``wwwblast`` listing defined
35 | in ``blast.rc`` and ``blast.html``.
36 |
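37 | For reference, Galaxy's BLAST ``*.loc`` files are plain three-column,
38 | tab-separated tables: a unique ID, a display caption, and the database
39 | path without the ``.phr``/``.nin`` etc extensions. A made-up example
40 | ``blastdb_p.loc`` entry (tab separators shown as spaces here)::
41 | 
42 |     nr_2013_01    NCBI nr (Jan 2013)    /data/blast/nr/nr
43 | 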
--------------------------------------------------------------------------------
/seq_manipulation/pick_N_random_seqs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import random
3 | import sys
4 |
5 | from Bio import SeqIO
6 | from Bio.SeqIO.FastaIO import SimpleFastaParser
7 |
8 | """Extract N randomly selected sequences from a FASTA file.
9 |
10 | Takes three arguments: input FASTA filename, number of
11 | sequences to pick out, and output FASTA filename. e.g.
12 |
13 | $ python pick_N_random_seqs.py input.fasta 1000 output.fasta
14 |
15 | If the input FASTA file has fewer sequences than the requested count,
16 | this will fail with an error.
17 | """
18 |
19 | input_fasta, count, output_fasta = sys.argv[1:]
20 | count = int(count)
21 |
22 | with open(input_fasta) as handle:
23 |     # Using SimpleFastaParser as it is faster than SeqIO.parse(...)
24 | ids = [title.split(None, 1)[0] for title, seq in SimpleFastaParser(handle)]
25 | print("Input FASTA file %s has %i sequences" % (input_fasta, len(ids)))
26 | assert len(set(ids)) == len(ids), "You have duplicate identifiers"
27 |
28 | # seqs = SeqIO.index(input_fasta, "fasta")
29 | # print("Input FASTA file %s has %i sequences"
30 | # % (input_fasta, len(seqs)))
31 | # assert count <= len(seqs)
32 | # picked = set(random.sample(list(seqs), count))
33 | # assert len(picked) == count
34 | # del seqs
35 |
36 | picked = set(random.sample(ids, count))
37 |
38 | # This will preserve the input order, and do line wrapping
39 | wanted = (r for r in SeqIO.parse(input_fasta, "fasta") if r.id in picked)
40 | saved = SeqIO.write(wanted, output_fasta, "fasta")
41 | assert saved == count
42 |
43 | print(
44 | "Saved %i randomly selected records from %s into %s"
45 | % (count, input_fasta, output_fasta)
46 | )
47 |
--------------------------------------------------------------------------------
/annotation_comparison/mauve_orthologues_to_genbank.py:
--------------------------------------------------------------------------------
1 | # Use case:
2 | # - Have multiple annotated GenBank files
3 | # - Aligned with Mauve, and orthologue file exported
4 | #
5 | # Want to copy the sister genome's gene identifiers
6 | # into a reference GenBank file (as gene aliases, notes,
7 | # etc) so they can be viewed/searched for easily.
8 | from __future__ import print_function
9 |
10 | from Bio import SeqIO
11 |
12 | mauve_orthologues_file = "mauve_orthologues.txt"
13 | reference_genbank_file = "reference.gbk"
14 | reference_number_in_mauve = 0
15 | output_genbank_file = "reference_with_aliases.gbk"
16 |
17 | # Might be more than one contig
18 | reference_records = list(SeqIO.parse(reference_genbank_file, "genbank"))
19 | cds_dict = {}
20 | for r in reference_records:
21 | for f in r.features:
22 | if f.type == "CDS":
23 | name = f.qualifiers["gene"][0]
24 | key = "%i:%s:%i-%i" % (
25 | reference_number_in_mauve,
26 | name,
27 | f.location.start + 1,
28 | f.location.end,
29 | )
30 | cds_dict[key] = f
31 | # print(list(cds_dict.keys()))
32 |
33 | for line in open(mauve_orthologues_file):
34 | parts = sorted(line.strip().split("\t"))
35 | key = None
36 | # print(parts)
37 | for x in parts:
38 | # if x.startswith("%i|" % reference_number_in_mauve):
39 | if x in cds_dict:
40 | print("Using: %r" % parts)
41 | name = x.split(":")[1]
42 | names = [y.split(":")[1] for y in parts if y != x]
43 | cds_dict[x].qualifiers["name"] = [",".join([name] + names)]
44 |
45 | SeqIO.write(reference_records, output_genbank_file, "genbank")
46 | print("Wrote to %s" % output_genbank_file)
47 |
--------------------------------------------------------------------------------
/acgt_dither/dither.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import numpy as np
4 | from Bio import SeqIO
5 | from PIL import Image
6 | from reportlab.graphics import renderPDF
7 | from reportlab.graphics.shapes import Drawing
8 | from reportlab.graphics.shapes import String
9 | from reportlab.lib import colors
10 | from reportlab.lib.units import cm
11 | from reportlab.pdfgen import canvas
12 |
13 | png_file = "Swanson_et_al_2012_fig1a.png"
14 | pdf_file = "Swanson_et_al_2012_fig1a.pdf"
15 | main_caption = "Swanson et al (2012) Figure 1"
16 |
17 | # Load sequence
18 | seq = SeqIO.read("SpaA1.fasta", "fasta").seq
19 | shape = (239, 176)
20 | scale = 0.125 * cm # per bp
21 |
22 | # Original is 1274 x 937 pixels, try about 20%
23 | pixels = np.prod(shape)
24 | im = Image.open(png_file).resize(shape)
25 | # im.show()
26 | data = im.getdata()
27 | assert len(data) == pixels, len(data)
28 | assert shape == im.getbbox()[2:]
29 | data = np.array(data).reshape(shape, order="F")
30 | assert shape == data.shape
31 | pixels = np.prod(shape)
32 | print("Have %i base pairs, and %i pixels" % (len(seq), pixels))
33 |
34 | assert pixels <= len(seq)
35 | assert 0 <= data.min() <= data.max() <= 255
36 |
37 | # Open PDF
38 | width, height = page_size = [x * scale for x in shape]
39 | c = canvas.Canvas(pdf_file, page_size)
40 | c.setTitle(main_caption)
41 | d = Drawing(*page_size)
42 | base = 0
43 | for row in range(shape[1]):
44 | for col in range(shape[0]):
45 | color = colors.CMYKColor(black=(255 - data[col, row]) / 255.0)
46 | # From top left?
47 | s = String(
48 | (col + 0.5) * scale,
49 | (shape[1] - row) * scale,
50 | seq[base],
51 | fillColor=color,
52 | fontSize=4,
53 | textAnchor="middle",
54 | )
55 | d.add(s)
56 | base += 1
57 | renderPDF.draw(d, c, 0, 0)
58 | c.showPage()
59 | c.save()
60 |
--------------------------------------------------------------------------------
/sambam/profile/bench_iter.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import os
4 | import time
5 |
6 | to_profile = []
7 |
8 | try:
9 | from Bio.Sequencing.SamBam import BamIterator
10 |
11 | def peter_iter(bam_filename, out_filename):
12 | """Peter's pure Python BAM iterator."""
13 | h = open(bam_filename, "rb")
14 | out_h = open(out_filename, "w")
15 | count = 0
16 | mapped = 0
17 | for read in BamIterator(h):
18 | count += 1
19 | if read.is_mapped:
20 | mapped += 1
21 | out_h.write("%s\t%s\n" % (read.rname, read.pos))
22 | h.close()
23 | out_h.close()
24 | return mapped, count
25 |
26 | to_profile.append(peter_iter)
27 | except ImportError:
28 | pass
29 |
30 | try:
31 | from pysam import Samfile
32 |
33 | def pysam_iter(bam_filename, out_filename):
34 | """PySam's Samfile as BAM iterator."""
35 | out_h = open(out_filename, "w")
36 | count = 0
37 | mapped = 0
38 | for read in Samfile(bam_filename, "rb"):
39 | count += 1
40 | if not read.is_unmapped:
41 | mapped += 1
42 | out_h.write("%s\t%s\n" % (read.rname, read.pos))
43 | out_h.close()
44 | return mapped, count
45 |
46 | to_profile.append(pysam_iter)
47 | except ImportError:
48 | pass
49 |
50 | print("Will profile %i functions:" % len(to_profile))
51 | for p in to_profile:
52 | print(p.__doc__)
53 | print()
54 | for f in os.listdir("."):
55 | if f.endswith(".bam"):
56 | print("Using %s" % f)
57 | for p in to_profile:
58 | print("Profiling %s" % p.__doc__)
59 | start = time.time()
60 | mapped, count = p(f, "/dev/null")
61 | taken = time.time() - start
62 | print("%s - %0.1fs giving %i/%i mapped" % (p.__doc__, taken, mapped, count))
63 |
--------------------------------------------------------------------------------
/sambam/sam_depair.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 |
5 | usage = """Python script to remove paired information in SAM reads.
6 |
7 | The intended usage is where you wish to treat "orphaned" paired
8 | reads as single reads, meaning removing any /1 or /2 suffix in
9 | the FASTQ file and likewise clearing the paired bits in the SAM
10 | FLAG.
11 |
12 | This script is designed to be used as part of a Unix pipeline. It
13 | takes no command line arguments. It reads SAM format data from stdin,
14 | and writes SAM format data to stdout.
15 |
16 | The only change made is to the FLAG field, clearing the following bits:
17 | * 0x1 template having multiple segments in sequencing
18 | * 0x8 next segment in the template unmapped
19 | * 0x20 next segment mapped to reverse strand
20 | * 0x40 the first segment in the template
21 | * 0x80 the last segment in the template
22 |
23 | Example:
24 |
25 | $ ./sam_depair.py < original.sam > as_singles.sam
26 |
27 | Simple usage with BAM files with conversion to/from SAM via samtools:
28 |
29 | $ samtools view -h original.bam | ./sam_depair.py | samtools view -S -b - > as_singles.bam
30 |
31 | Copyright Peter Cock 2014. All rights reserved. See:
32 | https://github.com/peterjc/picobio
33 | """
34 |
35 | if len(sys.argv) != 1:
36 | sys.stderr.write("ERROR: Bad arguments.\n\n")
37 | sys.stderr.write("Expects SAM on stdin, and writes SAM to stdout.\n")
38 | sys.exit(1)
39 |
40 | count = 0
41 | tweaked = 0
42 | mask = 0x1 | 0x8 | 0x20 | 0x40 | 0x80
43 | flip_mask = ~mask
44 | for line in sys.stdin:
45 | if line[0] != "@":
46 | # Should be a read
47 | count += 1
48 | qname, flag, rest = line.split("\t", 2)
49 | flag = int(flag)
50 | if flag & mask:
51 | # Want to clear those bits...
52 | flag = flag & flip_mask
53 | tweaked += 1
54 | line = "%s\t%i\t%s" % (qname, flag, rest)
55 | sys.stdout.write(line)
56 | sys.stderr.write("Tweaked %i out of %i reads\n" % (tweaked, count))
57 |
--------------------------------------------------------------------------------
/ena_fetch/get_ENA_project_submissions.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import os
4 | import urllib
5 |
6 | project = "ERP000297"
7 |
8 | submissions_url = (
9 | "http://www.ebi.ac.uk/ena/data/view/reports/sra/submitted_files/internal/%s"
10 | % project
11 | )
12 | submissions_file = "%s_submissions.tsv" % project
13 |
14 |
15 | def download_in_one(url, filename):
16 | print("Fetching %s" % url)
17 | n = urllib.urlopen(url)
18 | data = n.read()
19 | n.close()
20 |
21 | h = open(filename, "w")
22 | h.write(data)
23 | h.close()
24 | print("Saved as %s" % filename)
25 |
26 |
27 | print()
28 | if not os.path.isfile(submissions_file):
29 | download_in_one(submissions_url, submissions_file)
30 |
31 |
32 | def process_submissions(project, submissions_filename):
33 | h = open(submissions_filename)
34 | line = h.readline()
35 | assert (
36 | line
37 | == "Study\tSample\tExperiment\tRun\tOrganism\tInstrument Platform\tInstrument Model\tLibrary Name\tLibrary Layout\tLibrary Source\tLibrary Selection\tRun Read Count\tRun Base Count\tFile Name\tFile Size\tmd5\tFtp\n"
38 | ), repr(line)
39 | for line in h:
40 | parts = line.rstrip("\n").split("\t")
41 | assert parts[0] == project
42 | url = parts[16]
43 | assert url.startswith("ftp://ftp.sra.ebi.ac.uk/vol1/ERA")
44 | filename = url[len("ftp://ftp.sra.ebi.ac.uk/") :]
45 | if os.path.isfile(filename):
46 | print("Already have %s" % filename)
47 | continue
48 | if filename.endswith(".srf"):
49 | print("Skipping %s" % filename)
50 | continue
51 | # Make directory...
52 | d = os.path.split(filename)[0]
53 | if not os.path.isdir(d):
54 | print("Making directory %s" % d)
55 | os.makedirs(d)
56 | # Download file...
57 | rc = os.system("wget -O %s %s" % (filename, url))
58 | assert not rc, rc
59 | # Now check the md5...
60 | print(filename)
61 | h.close()
62 |
63 |
64 | process_submissions(project, submissions_file)
65 |
--------------------------------------------------------------------------------
/primer_selection/iupac_isPcr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Generalise Jim Kent's isPcr to support IUPAC ambiguities by brute force.
3 |
4 | As of v33 at least, ambiguous bases are rejected in the primers. So, this
5 | script generalises the input file to record all the non-ambiguous
6 | interpretations of the primer. Running isPcr will take several times longer,
7 | and the output will probably need to be deduplicated.
8 |
9 | The input and output are simple three-column TSV files with the name of
10 | each primer pair, the forward primer sequence, and the reverse primer
11 | sequence.
12 | """
13 |
14 | import itertools
15 | import sys
16 |
17 | from Bio.Data.IUPACData import ambiguous_dna_values
18 |
19 | expand_iupac = {
20 | # Treat I (inosine) like N
21 | "I": list(ambiguous_dna_values["N"].upper()),
22 | "i": list(ambiguous_dna_values["N"].lower()),
23 | }
24 | for base, expanded in ambiguous_dna_values.items():
25 | expand_iupac[base.upper()] = list(expanded.upper())
26 | expand_iupac[base.lower()] = list(expanded.lower())
27 |
28 |
29 | def expand_iupac_bases(seq):
30 |     """All possible unambiguous sequences described with IUPAC ambiguities.
31 |
32 | e.g.
33 |
34 | >>> list(expand_iupac_bases("DAY"))
35 | ['AAC', 'AAT', 'GAC', 'GAT', 'TAC', 'TAT']
36 | """
37 | try:
38 | for alt in itertools.product(*[expand_iupac[base] for base in seq]):
39 | yield "".join(alt)
40 | except KeyError as err:
41 | sys.exit(f"ERROR - Problem with primer sequence {seq}, {err}")
42 |
43 |
44 | before = after = 0
45 | for line in sys.stdin:
46 | if line.startswith("#") or not line.strip():
47 | continue
48 | try:
49 | idn, fwd, rev = line.strip("\n").split("\t")[:3]
50 | except ValueError:
51 | t = line.count("\t")
52 | sys.exit(f"ERROR: Only {t} tabs in line: {line}")
53 | before += 1
54 | for fwd2 in expand_iupac_bases(fwd):
55 | for rev2 in expand_iupac_bases(rev):
56 | sys.stdout.write(f"{idn}\t{fwd2}\t{rev2}\n")
57 | after += 1
58 | sys.stderr.write(f"Generalised {before} primer pairs into {after} unambiguous pairs\n")
59 |
--------------------------------------------------------------------------------
/sambam/samtools_auto.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Dirty hack to allow mixing of samtools commands between versions.
3 |
4 | It can be a downside that the samtools command line API is a single
5 | binary which offers multiple (often independent) commands.
6 |
7 | Right now, samtools 1.1 still lacks some functionality from 0.1.19,
8 | for example "samtools index", "samtools depad" and "samtools rmdup"
9 | are not yet fully functional. e.g.
10 |
11 | - https://github.com/samtools/samtools/issues/199
12 | - https://github.com/samtools/samtools/issues/291
13 |
14 | Conversely, the "samtools bam2fq" from samtools 0.1.19 has had
15 | several issues fixed.
16 |
17 | This wrapper allows me to call "samtools" and route this to the
18 | appropriate binary. In this case:
19 |
20 | - ``samtools`` (alone) will call samtools 1.1
21 | - ``samtools bam2fq [...]`` will call samtools 1.1
22 | - ``samtools depad [...]`` will call samtools 0.1.19
23 | - ``samtools rmdup [...]`` will call samtools 0.1.19
24 | - etc
25 |
26 | Install this by putting the Python script (or a symlink to it) on
27 | your ``$PATH`` as ``samtools``, for example under ``~/bin/``::
28 |
29 | $ cd ~/bin
30 | $ ln -s samtools_auto.py samtools
31 |
32 | Also install binaries for samtools 0.1.19 and 1.1 and set their
33 | paths below (variables ``samtools_old`` and ``samtools_new``).
34 | """
35 |
36 | import os
37 | import sys
38 |
39 | samtools_old = "/mnt/galaxy/bin/samtools_0.1.19"
40 | samtools_new = "/mnt/galaxy/bin/samtools_1.1"
41 |
42 |
43 | def pick_binary():
44 | """Return new samtools unless known to be using a broken command.
45 |
46 | i.e. Avoid samtools commands with known regressions!
47 | """
48 | if len(sys.argv) == 1:
49 | return samtools_new
50 | elif sys.argv[1] in ["index", "depad", "rmdup"]:
51 | return samtools_old
52 | else:
53 | return samtools_new
54 |
55 |
56 | # argv[0] is this python script
57 | # Turn the argv list into a string, escaping as needed
58 |
59 |
60 | def wrap(text):
61 | if " " in text and not text[0] == '"' and not text[-1] == '"':
62 | return '"%s"' % text
63 | else:
64 | return text
65 |
66 |
67 | cmd = pick_binary() + " " + " ".join(wrap(arg) for arg in sys.argv[1:])
68 |
69 | err = os.system(cmd)
70 | if 0 < err < 128:
71 | sys.exit(err)
72 | elif err:
73 | # Returning 512 gives 0 (odd)
74 | sys.exit(1)
75 |
--------------------------------------------------------------------------------
/blooming_reads/interlace_fastq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Simple FASTQ interlacer: takes two FASTQ filenames, writes to stdout.
3 |
4 | Checks read identifiers agree, or end with /1 and /2 respectively.
5 | """
6 |
7 | import gzip
8 | import sys
9 |
10 | try:
11 | from Bio.SeqIO.QualityIO import FastqGeneralIterator
12 | except ImportError:
13 | sys.exit("Missing FastqGeneralIterator from Biopython")
14 |
15 | if len(sys.argv) != 3:
16 | sys.exit("Requires two arguments, a pair of FASTQ filenames")
17 | fastq1 = sys.argv[1]
18 | fastq2 = sys.argv[2]
19 |
20 | sys.stderr.write("Interlacing %s and %s\n" % (fastq1, fastq2))
21 | if fastq1.endswith(".gz"):
22 | sys.stderr.write("Decompressing %s\n" % fastq1)
23 | handle1 = gzip.open(fastq1)
24 | else:
25 | handle1 = open(fastq1)
26 | if fastq2.endswith(".gz"):
27 | sys.stderr.write("Decompressing %s\n" % fastq2)
28 | handle2 = gzip.open(fastq2)
29 | else:
30 | handle2 = open(fastq2)
31 | sys.stderr.write("Interlacing paired FASTQ files to stdout...\n")
32 | out_handle = sys.stdout
33 |
34 | iter1 = FastqGeneralIterator(handle1)
35 | iter2 = FastqGeneralIterator(handle2)
36 |
37 | for title1, seq1, qual1 in iter1:
38 | try:
39 |         title2, seq2, qual2 = next(iter2)
40 | except StopIteration:
41 | sys.exit("More records in %s than %s, e.g. %s" % (fastq1, fastq2, title1))
42 |     id1, descr1 = (title1.split(None, 1) + [""])[:2]
43 |     id2, descr2 = (title2.split(None, 1) + [""])[:2]
44 | if id1 == id2:
45 | # Add the /1 and /2, preserve any description after the ID
46 | if descr1:
47 | descr1 = " " + descr1
48 | if descr2:
49 | descr2 = " " + descr2
50 | out_handle.write(
51 | "@%s/1%s\n%s\n+\n%s\n@%s/2%s\n%s\n+\n%s\n"
52 | % (id1, descr1, seq1, qual1, id2, descr2, seq2, qual2)
53 | )
54 | elif id1.endswith("/1") and id2.endswith("/2") and id1[:-2] == id2[:-2]:
55 | out_handle.write(
56 | "@%s\n%s\n+\n%s\n@%s\n%s\n+\n%s\n"
57 | % (title1, seq1, qual1, title2, seq2, qual2)
58 | )
59 | else:
60 | sys.exit("Mismatched records %r vs %r" % (title1, title2))
61 |
62 | # Check at end of file two
63 | try:
64 |     title2, seq2, qual2 = next(iter2)
65 | sys.exit("More records in %s than %s, e.g. %s" % (fastq2, fastq1, title2))
66 | except StopIteration:
67 | pass
68 |
69 | handle1.close()
70 | handle2.close()
71 | sys.stderr.write("Interlacing paired FASTQ files done.\n")
72 |
--------------------------------------------------------------------------------
/sambam/bgzf_add_eof.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script to add missing EOF marker to BAM or BGZF files.
3 |
4 | BAM files are compressed using BGZF, Blocked GNU Zip Format, which
5 | is a variant of GZIP. Modern BAM files include a special empty
6 | block at the end of the file (EOF) as a marker to help spot when
7 | a dataset has been truncated. This is just a 28 byte BGZF block,
8 | which when decompressed is empty.
9 |
10 | Some early tools output valid BAM files without this optional
11 | (but recommended) EOF marker.
12 |
13 | This script will add the EOF marker if not already present.
14 |
15 | WARNING: If your BAM or BGZF file is truly truncated, this will
16 | not magically fix it. It may hide or obscure the true problem.
17 |
18 | WARNING: To avoid excessive data writing, this script modifies
19 | the BAM or BGZF file in situ!
20 |
21 | Usage with one or more BAM or BGZF files:
22 |
23 | $ ./bgzf_add_eof.py example1.bam example2.bam ... exampleN.bam
24 |
25 | See also: http://samtools.sourceforge.net/
26 |
27 | v0.0.0 - Original script
28 | v0.0.1 - Use append mode to add EOF block
29 | v0.0.2 - removed internal function sys_exit
30 | """
31 |
32 | import os
33 | import sys
34 |
35 |
36 | def fix_bam(filename):
37 |     header = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00"
38 |     eof = (
39 |         b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC"
40 |         b"\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"
41 | )
42 | if not os.path.isfile(filename):
43 | sys.exit("Missing file %s" % filename)
44 | size = os.path.getsize(filename)
45 | h = open(filename, "rb") # read only for now
46 | # Check it looks like a BGZF file
47 | # (could still be GZIP'd, in which case the extra block is harmless)
48 | data = h.read(len(header))
49 | if data != header:
50 | sys.exit("File %s is not a BAM file" % filename)
51 | # Check if it has the EOF already
52 | h.seek(size - 28)
53 | data = h.read(28)
54 | h.close()
55 | if data == eof:
56 | sys.stderr.write("EOF already present in %s\n" % filename)
57 | else:
58 | sys.stderr.write("Adding EOF block to %s\n" % filename)
59 | h = open(filename, "ab")
60 | h.write(eof)
61 | h.close()
62 |
63 |
64 | if len(sys.argv) == 1:
65 | sys.exit("Takes one or more BGZF/BAM filenames as arguments (edits in place)")
66 | for bam_filename in sys.argv[1:]:
67 | fix_bam(bam_filename)
68 |
--------------------------------------------------------------------------------
/sambam/sam_drop_qname.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script to drop read name (QNAME) from SAM/BAM files.
3 |
4 | This script is designed to be used as part of a Unix pipeline. It reads
5 | SAM format data from stdin, and writes SAM format data to stdout.
6 |
7 | The only change made to the SAM reads is in the QNAME field. For
8 | single-fragment reads, QNAME is dropped (set to * for missing).
9 | For multi-fragment reads (e.g. paired end reads), a QNAME is
10 | required to cross reference the parts. Here short automatic names
11 | are substituted instead.
12 |
13 | The optional argument prefix is added to the start of any generated
14 | read name (allowing you to avoid read name clashes).
15 |
16 | Simple usage with SAM files:
17 |
18 | $ ./sam_drop_qname.py [prefix] < original.sam > dropped_names.sam
19 |
20 | Simple usage with BAM files with conversion to/from SAM via samtools:
21 |
22 | $ samtools view -h original.bam | ./sam_drop_qname.py [prefix] | samtools view -S -b - > dropped_names.bam
23 |
24 | If your SAM/BAM files lack @SQ headers, you may need to give
25 | samtools the reference FASTA file as well.
26 |
27 | Copyright Peter Cock 2012. All rights reserved. See:
28 | https://github.com/peterjc/picobio
29 | http://blastedbio.blogspot.co.uk/2012/03/bam-verus-cram-07.html
30 | """
31 |
32 | import sys
33 |
34 | if len(sys.argv) == 1:
35 | prefix = ""
36 | elif len(sys.argv) == 2:
37 | prefix = sys.argv[1]
38 | else:
39 | sys.stderr.write("Error, expect one optional parameter only (read name prefix)")
40 | sys.exit(1)
41 |
42 | count = 0
43 | mapping = {}
44 | # TODO - Automatically remove mapping entries once all parts of the read
45 | # have been found? They would typically be near each other in the file...
46 | # otherwise memory will be a problem with big paired end datasets.
47 | for line in sys.stdin:
48 | if line[0] != "@":
49 | # Should be a read
50 | qname, flag, rest = line.split("\t", 2)
51 | if int(flag) & 0x1:
52 | # Multi-fragment read
53 | try:
54 | qname = prefix + str(mapping[qname])
55 | except KeyError:
56 | count += 1
57 | mapping[qname] = count
58 | qname = prefix + str(count)
59 | else:
60 | # Single fragment read
61 | qname = "*"
62 | line = "\t".join([qname, flag, rest])
63 | sys.stdout.write(line)
64 | sys.stderr.write("Modified %i multi-fragment reads\n" % count)
65 |
--------------------------------------------------------------------------------
/assembly_comparison/fasta_trim_n.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script for trimming N bases from ends of sequences."""
3 |
4 | import sys
5 | from optparse import OptionParser
6 |
7 | usage = """Basic usage: ./fasta_trim_n.py < input.fasta > output.fasta
8 |
9 | For more details, run with -h for the help.
10 | """
11 |
12 | try:
13 | from Bio import SeqIO
14 | except ImportError:
15 | sys.exit("This script requires Biopython")
16 |
17 | parser = OptionParser(usage=usage)
18 | parser.add_option(
19 | "-i",
20 | "--input",
21 | dest="input_filename",
22 | help="Input sequence file (default is stdin)",
23 | default=None,
24 | metavar="FILE",
25 | )
26 | parser.add_option(
27 | "-o",
28 | "--output",
29 | dest="output_filename",
30 |     help="Output sequence file (default is stdout)",
31 | default=None,
32 | metavar="FILE",
33 | )
34 | parser.add_option(
35 | "-f",
36 | "--format",
37 | dest="sequence_format",
38 | help='Sequence format (as named in Biopython SeqIO, default "fasta")',
39 | default="fasta",
40 | )
41 | parser.add_option(
42 | "-c",
43 | "--chars",
44 | dest="characters",
45 | help='Characters to trim (default "Nn" covering upper and lower case)',
46 | default="Nn",
47 |     metavar="CHARS",
48 | )
49 | (options, args) = parser.parse_args()
50 |
51 | chars = options.characters
52 | format = options.sequence_format.lower()
53 |
54 | sys.stderr.write(
55 | "Removing %s characters from start/end of %s format file...\n" % (chars, format)
56 | )
57 |
58 | if options.input_filename:
59 | input_handle = open(options.input_filename)
60 | else:
61 | input_handle = sys.stdin
62 |
63 | if options.output_filename:
64 | output_handle = open(options.output_filename, "w")
65 | else:
66 | output_handle = sys.stdout
67 |
68 | chars = options.characters
69 | format = options.sequence_format.lower()
70 |
71 |
72 | def strip_seq(records):
73 | for record in records:
74 | # FASTQ etc will be a problem, must trim quality too!
75 | # old_len = len(record.seq)
76 | record.seq = record.seq.strip(chars)
77 | # TODO Minimum length!
78 | # new_len = len(record.seq)
79 | # if new_len < old_len:
80 | # sys.stderr.write("Trimmed %s from %i to %i\n" % (record.id, old_len, new_len))
81 | yield record
82 |
83 |
84 | # Do the work,
85 | count = SeqIO.write(strip_seq(SeqIO.parse(input_handle, format)), output_handle, format)
86 |
87 | if options.input_filename:
88 | input_handle.close()
89 | if options.output_filename:
90 | output_handle.close()
91 |
92 | sys.stderr.write("Saved %i records\n" % count)
93 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # pre-commit run --all-files
2 | repos:
3 | - repo: https://github.com/pre-commit/pre-commit-hooks
4 | rev: v5.0.0
5 | hooks:
6 | - id: check-added-large-files
7 | - id: check-case-conflict
8 | - id: check-executables-have-shebangs
9 | - id: check-json
10 | - id: check-merge-conflict
11 | - id: check-shebang-scripts-are-executable
12 | - id: check-symlinks
13 | - id: check-yaml
14 | - id: debug-statements
15 | - id: destroyed-symlinks
16 | - id: end-of-file-fixer
17 | files: \.(py|sh|rst|yml|yaml)$
18 | - id: mixed-line-ending
19 | - id: trailing-whitespace
20 | files: \.(py|sh|rst|yml|yaml)$
21 | - repo: local
22 | hooks:
23 | - id: no-tabs
24 | name: No tabs
25 | description: Reject any files containing a tab
26 | entry: '\t'
27 | language: pygrep
28 | files: \.(py|sh|rst|yml|yaml)$
29 | - repo: https://github.com/astral-sh/ruff-pre-commit
30 | rev: v0.6.9
31 | hooks:
32 | # Run the Ruff linter (flake8 alternative):
33 | - id: ruff
34 | args: [
35 | '--fix',
36 | '--exit-non-zero-on-fix',
37 | '--extend-select=BLE,C4,D,I,ISC',
38 | '--extend-ignore=D100,D103,D203,D213',
39 | '--config=lint.isort.force-single-line=true',
40 | '--config=lint.isort.order-by-type=false',
41 | '--config=lint.pyupgrade.keep-runtime-typing=true'
42 | ]
43 | # Run the Ruff formatter (black alternative):
44 | - id: ruff-format
45 | args: [
46 | '--config=format.docstring-code-format=true'
47 | ]
48 | - repo: https://github.com/rstcheck/rstcheck
49 | rev: v6.2.4
50 | hooks:
51 | - id: rstcheck
52 | args: [
53 | --report-level=warning,
54 | --ignore-roles=ref,
55 | "--ignore-directives=automodule,toctree",
56 | --ignore-substitutions=version
57 | ]
58 | - repo: https://github.com/PyCQA/doc8
59 | rev: 'v1.1.2'
60 | hooks:
61 | - id: doc8
62 | additional_dependencies: [pygments]
63 | args: [--quiet,--ignore=D001]
64 | - repo: https://github.com/codespell-project/codespell
65 | rev: v2.3.0
66 | hooks:
67 | - id: codespell
68 | files: \.(py|sh|rst|yml|yaml)$
69 | args: ['-L', 'nin,mis']
70 | ci:
71 | # Settings for the https://pre-commit.ci/ continuous integration service
72 | autofix_prs: true
73 | # Default message is more verbose
74 | autoupdate_commit_msg: '[pre-commit.ci] autoupdate'
75 | # Default is weekly
76 | autoupdate_schedule: monthly
77 |
--------------------------------------------------------------------------------
/ena_fetch/get_ENA_project_meta.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import os
4 | import urllib
5 |
6 | project = "ERP000297"
7 | strain_file = "%s_strain.tsv" % project # output file
8 |
9 | fastq_url = (
10 | "http://www.ebi.ac.uk/ena/data/view/reports/sra/fastq_files/internal/%s" % project
11 | )
12 | fastq_file = "%s_fastq.tsv" % project
13 |
14 |
15 | def download_in_one(url, filename):
16 | print("Fetching %s" % url)
17 | n = urllib.urlopen(url)
18 | data = n.read()
19 | n.close()
20 |
21 | h = open(filename, "w")
22 | h.write(data)
23 | h.close()
24 | print("Saved as %s" % filename)
25 |
26 |
27 | if not os.path.isfile(fastq_file):
28 | download_in_one(fastq_url, fastq_file)
29 |
30 |
31 | def get_strain(meta_xml_filename):
32 | h = open(meta_xml_filename)
33 | while True:
34 | line = h.readline()
35 | if not line:
36 | break
37 | if "strain" in line.lower():
38 | strain = h.readline().strip()
39 |             assert strain.lower().startswith("<value>"), strain
40 |             assert strain.lower().endswith("</value>"), strain
41 | h.close()
42 | return strain[7:-8]
43 | h.close()
44 | return None
45 |
46 |
47 | def process_meta(project, fastq_filename, strain_file):
48 | h = open(fastq_filename)
49 | out = open(strain_file, "w")
50 | line = h.readline()
51 | assert (
52 | line
53 | == "Study\tSample\tExperiment\tRun\tOrganism\tInstrument Platform\tInstrument Model\tLibrary Name\tLibrary Layout\tLibrary Source\tLibrary Selection\tRun Read Count\tRun Base Count\tFile Name\tFile Size\tmd5\tFtp\n"
54 | ), repr(line)
55 | out.write(line[:-1] + "\tStrain\n")
56 | for line in h:
57 | parts = line.rstrip("\n").split("\t")
58 | assert parts[0] == project
59 | url = parts[16]
60 | assert url.startswith("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR"), url
61 |
62 | sample = parts[1]
63 | assert sample.startswith("ERS")
64 | url = "http://www.ebi.ac.uk/ena/data/view/%s&display=xml" % sample
65 | url = "http://www.ebi.ac.uk/ena/data/view/%s&display=xml&download" % sample
66 | filename = "xml/%s.xml" % sample
67 |
68 | # Download file...
69 | if not os.path.isfile(filename):
70 | print(url)
71 | rc = os.system("wget -O %s '%s'" % (filename, url))
72 | assert not rc, rc
73 |
74 | strain = get_strain(filename)
75 | if not strain:
76 | strain = ""
77 | print(filename, strain)
78 | out.write(line[:-1] + "\t" + strain + "\n")
79 | h.close()
80 | out.close()
81 |
82 |
83 | process_meta(project, fastq_file, strain_file)
84 |
--------------------------------------------------------------------------------
/sambam/bgzf_check_eof.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script to check if BGZF (e.g. BAM) files have EOF marker.
3 |
4 | BAM files are compressed using BGZF, Blocked GNU Zip Format, which
5 | is a variant of GZIP. Modern BAM files include a special empty
6 | block at the end of the file (EOF) as a marker to help spot when
7 | a dataset has been truncated. This is just a 28 byte BGZF block,
8 | which when decompressed is empty.
9 |
10 | Some early tools output valid BAM files without this optional
11 | (but recommended) EOF marker.
12 |
13 | Usage with one or more BAM or BGZF files:
14 |
15 | $ ./bgzf_check_eof.py example1.bam example2.bam ... exampleN.bam
16 |
17 | The filenames are checked in the order given, if any are invalid
18 | the tool exits with a non-zero error level and a message to stderr.
19 | If all the files are valid, it returns with a zero error level.
20 |
21 | Return codes:
22 | * 0 - No errors found
23 | * 1 - Invalid arguments
24 | * 2 - File not found
25 | * 3 - File is zero bytes (and thus not valid BGZF or BAM)
26 | * 4 - File missing BGZF header
27 | * 5 - File looks like BGZF, but missing BGZF EOF marker
28 |
29 | See also: http://samtools.sourceforge.net/
30 |
31 | v0.0.0 - Original script
32 | v0.0.1 - Dropped internal function sys_exit
33 | """
34 |
35 | import os
36 | import sys
37 |
38 |
39 | def check_bam(filename):
40 |     header = b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42\x43\x02\x00"
41 |     eof = (
42 |         b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC"
43 |         b"\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00"
44 | )
45 | if not os.path.isfile(filename):
46 | sys.stderr.write("Missing file %s\n" % filename)
47 | sys.exit(2)
48 | size = os.path.getsize(filename)
49 | if not size:
50 | sys.stderr.write("Empty file (zero bytes) %s\n" % filename)
51 | sys.exit(3)
52 | h = open(filename, "rb")
53 | # Check it looks like a BGZF file
54 | # (could still be GZIP'd, in which case the extra block is harmless)
55 | data = h.read(len(header))
56 | if data != header:
57 | sys.stderr.write("File %s is not a BGZF/BAM file\n" % filename)
58 | sys.exit(4)
59 | # Check if it has the EOF already
60 | h.seek(size - 28)
61 | data = h.read(28)
62 | h.close()
63 | if data == eof:
64 | sys.stderr.write("Good, BGZF EOF already present in %s\n" % filename)
65 | else:
66 | sys.stderr.write("Missing EOF marker in BGZF/BAM file %s\n" % filename)
67 | sys.exit(5)
68 |
69 |
70 | if len(sys.argv) == 1:
71 | sys.stderr.write(
72 |         "Takes one or more BGZF/BAM filenames as arguments\n"
73 | )
74 | sys.exit(1)
75 | for bam_filename in sys.argv[1:]:
76 | check_bam(bam_filename)
77 |
--------------------------------------------------------------------------------
/blooming_reads/stack_coverage_plot.py:
--------------------------------------------------------------------------------
1 | """Script to produce stacked coverage plot with matplotlib."""
2 |
3 | from __future__ import print_function
4 |
5 | import sys
6 |
7 | import numpy as np
8 | from matplotlib import pyplot as plt
9 |
10 |
11 | def load(filename):
12 | h = open(filename)
13 | line = h.readline()
14 | assert line.startswith(">")
15 | while line and line[0] == ">":
16 | name = line[1:].split(None, 1)[0]
17 | values = []
18 | while line:
19 | line = h.readline()
20 | if not line or line[0] == ">":
21 | break
22 | values.append([float(v) for v in line.rstrip("\n").split("\t")])
23 |         yield name, np.array(values, float)
24 | h.close()
25 |
26 |
27 | def make_colors(start, end, steps):
28 | delta = (end - start) / float(steps - 1)
29 | return ["#%02x%02x%02x" % tuple(start + i * delta) for i in range(steps)]
30 |
31 |
32 | def stack(data, filename, colors=None):
33 | total = len(data)
34 | max_value = 0
35 | for names, values in data:
36 | max_value = max(max_value, values.sum(axis=0).max())
37 | plt.ylim([0, max_value])
38 |
39 | fig = plt.figure(figsize=(12, 2 * total))
40 | if not colors:
41 | # Assumes all the examples have same number of colors:
42 | if data[0][1].shape[0] == 3:
43 | colors = ["#CC6666", "#1DACD6", "#6E5160"]
44 | elif data[0][1].shape[0] == 5:
45 | colors = ["#CDCDC1", "#8B8B83", "#FF6A6A", "#F0E68C", "#CDC673"]
46 | else:
47 | colors = make_colors(
48 | np.array([0xCC, 0x66, 0x66]),
49 | # np.array([0x6E, 0x51, 0x60]),
50 | np.array([0x90, 0x41, 0x50]),
51 | # np.array([0x20, 0xF0, 0x60]),
52 | data[0][1].shape[0],
53 | )
54 | print(colors)
55 | for i, (name, values) in enumerate(data):
56 | x = range(values.shape[1])
57 | print(i, name, values.shape, "coverage:")
58 | print("\t".join("%0.1f" % v for v in values.sum(axis=1)))
59 | y_stack = np.cumsum(values, axis=0)
60 | ax1 = fig.add_subplot(total, 1, i + 1)
61 | ax1.set_autoscaley_on(False)
62 | ax1.set_ylim([0, max_value])
63 | ax1.set_title(name.split(None, 1)[0], fontsize="xx-small")
64 | ax1.fill_between(x, 0, y_stack[0, :], facecolor=colors[0], alpha=0.7)
65 | for i in range(0, values.shape[0] - 1):
66 | ax1.fill_between(
67 | x, y_stack[i, :], y_stack[i + 1, :], facecolor=colors[i + 1], alpha=0.7
68 | )
69 | # fig.tight_layout()
70 | plt.show()
71 | plt.savefig(filename)
72 |
73 |
74 | for filename in sys.argv[1:]:
75 | if not filename.endswith(".cov"):
76 | continue
77 | print("-" * 60)
78 | print(filename)
79 | print("-" * 60)
80 | data = list(load(filename))
81 | stack(data, filename + ".png")
82 | print("Done")
83 |
--------------------------------------------------------------------------------
/sambam/fastq_to_sam.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script to turn FASTQ into unaligned SAM/BAM files.
3 |
4 | This script is designed to be used as part of a Unix pipeline. It
5 | works with Python 2 and Python 3, e.g.
6 |
7 | $ python fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam
8 | Done, 532 pairs
9 |
10 | Or:
11 |
12 | $ python3 fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam
13 | Done, 532 pairs
14 |
15 | As long as the Python script is marked as executable you can do:
16 |
17 | $ ./fastq_to_sam.py R1.fastq R2.fastq > unmapped.sam
18 | Done, 532 pairs
19 |
20 | Simple usage with BAM files with conversion from SAM via samtools:
21 |
22 | $ ./fastq_to_sam.py R1.fastq R2.fastq | samtools view -S -b - > unmapped.bam
23 | [samopen] no @SQ lines in the header.
24 | Done, 532 pairs
25 |
26 | Note that no @SQ lines are expected in SAM/BAM files with only unaligned reads.
27 |
28 | WARNING: This assumes your FASTQ files use the Sanger quality encoding.
29 |
30 | Todo:
31 | ----
32 | - Test cases
33 | - Galaxy wrapper?
34 | - Proper command line API
35 | - Support for gzipped FASTQ (detected via filename?)
36 | - Support for interlaced FASTQ
37 | - Support for setting read groups
38 | - Support for multiple FASTQ input pairs (and read groups)
39 |
40 | Copyright Peter Cock 2015. All rights reserved. See:
41 | https://github.com/peterjc/picobio
42 |
43 | """
44 |
45 | import sys
46 |
47 | if "-v" in sys.argv or "--version" in sys.argv:
48 | print("This is fastq_to_sam.py version 0.0.1")
49 | sys.exit(0)
50 |
51 | # TODO - proper API, allow interleaved FASTQ, read group, etc
52 | if len(sys.argv) != 3:
53 | sys.stderr.write("Expects two arguments, a pair of FASTQ filenames\n")
54 | sys.exit(1)
55 |
56 | try:
57 |     from Bio.SeqIO.QualityIO import FastqGeneralIterator
58 | except ImportError:
59 |     sys.exit("ERROR: This requires Biopython.\n")
60 | 
61 | # Note the builtin zip is already an iterator on Python 3.
62 |
63 | fastq1 = FastqGeneralIterator(open(sys.argv[1]))
64 | fastq2 = FastqGeneralIterator(open(sys.argv[2]))
65 |
66 | # Paired, unmapped, mate unmapped, either first or second in pair:
67 | flag1 = "77"
68 | flag2 = "141"
69 | rname = "*"
70 | pos = "0"
71 | mapq = "0"
72 | cigar = "*"
73 | rnext = "*"
74 | pnext = "0"
75 | tlen = "0"
76 |
77 | pairs = 0
78 | for (t1, s1, q1), (t2, s2, q2) in zip(fastq1, fastq2):
79 | id1 = t1.split(None, 1)[0]
80 | id2 = t2.split(None, 1)[0]
81 | if id1 == id2:
82 | # Good, should we check the description follows Illumina naming?
83 | qname = id1
84 | else:
85 | assert id1.endswith("/1"), t1
86 | assert id2.endswith("/2"), t2
87 | qname = id1[:-2]
88 |
89 | print(
90 | "\t".join([qname, flag1, rname, pos, mapq, cigar, rnext, pnext, tlen, s1, q1])
91 | )
92 | print(
93 |     "\t".join([qname, flag2, rname, pos, mapq, cigar, rnext, pnext, tlen, s2, q2])
94 | )
95 | pairs += 1
96 | sys.stderr.write("Done, %i pairs\n" % pairs)
97 |
--------------------------------------------------------------------------------
/sambam/sam_drop_long_cigar.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 |
5 | usage = """Python script to remove SAM reads with long CIGAR strings.
6 |
7 | The BAM format (currently) uses an unsigned 16bit integer for the
8 | number of CIGAR operations in a read, and therefore BAM files can
9 | only hold reads with up to 65535 CIGAR operators. SAM does not have
10 | this limit, but the samtools implementation (reasonably) also has
11 | the same 16bit limit. See also:
12 | https://github.com/samtools/samtools/pull/39
13 |
14 | This script is designed to be used as part of a Unix pipeline. It
15 | takes no command line arguments. It reads SAM format data from stdin,
16 | and writes SAM format data to stdout.
17 |
18 | The only change made to the SAM data is to drop reads with over
19 | 65535 CIGAR operations. These are logged to stderr.
20 |
21 | $ ./sam_drop_long_cigar.py < original.sam > no_long_cigar.sam
22 |
23 | Simple usage with BAM files with conversion to/from SAM via samtools:
24 |
25 | $ samtools view -h original.bam | ./sam_drop_long_cigar.py | samtools view -S -b - > no_long_cigar.bam
26 |
27 | Copyright Peter Cock 2014. All rights reserved. See:
28 | https://github.com/peterjc/picobio
29 | """
30 |
31 | if len(sys.argv) != 1:
32 | sys.stderr.write("ERROR: Bad arguments.\n\n")
33 |     sys.stderr.write(usage)
34 | sys.exit(1)
35 |
36 | # def decode_cigar(cigar):
37 | # """Returns a list of 2-tuples, integer count and operator char."""
38 | # count = ""
39 | # answer = []
40 | # for letter in cigar:
41 | # if letter.isdigit():
42 | # count += letter #string addition
43 | # elif letter in "MIDNSHP=X":
44 | # answer.append((int(count), letter))
45 | # count = ""
46 | # else:
47 | # raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
48 | # return answer
49 | #
50 | # assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14,'S'),(15,'M'),(1,'P'),(1,'D'),(3,'P'),(54,'M'),(1,'D'),(34,'M'),(5,'S')]
51 |
52 |
53 | def cigar_length(cigar):
54 | """Return number of cigar operators (integer)."""
55 | answer = 0
56 | for letter in cigar:
57 | if letter.isdigit():
58 | pass
59 | elif letter in "MIDNSHP=X":
60 | answer += 1
61 | else:
62 | raise ValueError("Invalid character %s in CIGAR %s" % (letter, cigar))
63 | return answer
64 |
65 |
66 | count = 0
67 | longs = 0
68 | for line in sys.stdin:
69 | if line[0] != "@":
70 | # Should be a read
71 | count += 1
72 | qname, flag, rname, pos, mapq, cigar, rest = line.split("\t", 6)
73 | if cigar != "*":
74 | len_cigar = cigar_length(cigar)
75 | if len_cigar > 65535:
76 | longs += 1
77 | sys.stderr.write(
78 | "Dropping read %s with %i CIGAR operators\n" % (qname, len_cigar)
79 | )
80 | continue
81 | sys.stdout.write(line)
82 | sys.stderr.write("Dropped %i out of %i reads\n" % (longs, count))
83 |
--------------------------------------------------------------------------------
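
For comparison, the same filter could be written with pysam rather than
hand-parsing SAM text. This is an untested sketch assuming a reasonably
recent pysam/htslib; the script above deliberately avoids the dependency:

    import sys

    import pysam

    # Read SAM from stdin, write SAM to stdout, dropping long-CIGAR reads.
    with pysam.AlignmentFile("-", "r") as reader:
        with pysam.AlignmentFile("-", "w", template=reader) as writer:
            for read in reader:
                if read.cigartuples and len(read.cigartuples) > 65535:
                    sys.stderr.write("Dropping read %s\n" % read.query_name)
                    continue
                writer.write(read)
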
/snakemake/snakemake_progress_bar_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Demonstration of calling a snakemake workflow with a progress bar.
3 |
4 | Written and tested using snakemake 8.20.6 under macOS.
5 |
6 | Currently the snakemake API doesn't have any obvious way
7 | to get callbacks or an iterator approach to running a
8 | workflow which would allow direct updates to a progress
9 | bar. Improvements to their logging system may allow this?
10 |
11 | Instead, this demonstrates running snakemake in a subprocess,
12 | and monitoring the creation of the expected output files
13 | as a proxy to update a progress bar. This works, but would
14 | put some additional load on the file system.
15 |
16 | This uses the rich library's progress bar, but the same idea would
17 | work with another library like tqdm (see the sketch after this file).
18 | We must explicitly update the progress bar whenever a new output file is found.
19 | """
20 |
21 | from multiprocessing import Process
22 | from pathlib import Path
23 |
24 | from rich.progress import Progress # or use tqdm, or ...
25 |
26 | from snakemake.api import DAGSettings
27 | from snakemake.api import ResourceSettings
28 | from snakemake.api import SnakemakeApi
29 | from snakemake.settings.types import OutputSettings
30 | from snakemake.settings.types import Quietness
31 |
32 | inputs = Path(".").glob("*.fna")
33 | targets = [str(_) + ".md5" for _ in inputs]
34 |
35 |
36 | def black_box(output_files):
37 | """Black-box function which generates known files (here snakemake)."""
38 | snakefile = Path("demo.smk")
39 | with SnakemakeApi(OutputSettings(quiet={Quietness.ALL})) as snakemake_api:
40 | workflow_api = snakemake_api.workflow(
41 | snakefile=snakefile,
42 | resource_settings=ResourceSettings(),
43 | # config_settings=ConfigSettings(config=config_args),
44 | # workdir=workdir,
45 | )
46 | dag_api = workflow_api.dag(
47 | dag_settings=DAGSettings(targets=output_files),
48 | )
49 | dag_api.unlock()
50 | dag_api.execute_workflow()
51 |
52 |
53 | def with_progress_bar(function, output_files, interval=0.5):
54 | """Run given function via subprocess with a progress bar.
55 |
56 | The function must accept a single argument, the given file list.
57 | The appearance of those files on disk is used to update the progress
58 | bar. This runs the function in a process via multiprocessing, and
59 | returns the process exit code (should be zero for success).
60 | """
61 |     pending = [Path(_) for _ in output_files]
62 | p = Process(target=function, args=(output_files,))
63 | p.start()
64 | with Progress() as progress:
65 | task = progress.add_task("Snakemake...", total=len(pending))
66 | while pending:
67 | p.join(interval)
68 | for t in pending[:]:
69 | if t.is_file():
70 | print(f"Done: {t}")
71 | pending.remove(t)
72 | progress.update(task, advance=1)
73 | if p.exitcode is not None:
74 | # Should be finished, but was it success or failure?
75 | pending = [] # to break the loop
76 | p.join() # Should be immediate as should have finished
77 | assert not p.is_alive()
78 | print(f"Snakemake return code {p.exitcode}")
79 | return p.exitcode
80 |
81 |
82 | if __name__ == "__main__":
83 | # black_box(targets)
84 | with_progress_bar(black_box, targets)
85 |
--------------------------------------------------------------------------------
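
As the docstring says, the same file-polling idea works with tqdm instead of
rich. A minimal sketch of that variant (the with_tqdm_progress_bar name is
made up; error handling is as thin as in the demo above):

    from multiprocessing import Process
    from pathlib import Path

    from tqdm import tqdm

    def with_tqdm_progress_bar(function, output_files, interval=0.5):
        # Same polling loop as the rich version, but updating a tqdm bar.
        pending = [Path(_) for _ in output_files]
        p = Process(target=function, args=(output_files,))
        p.start()
        with tqdm(total=len(pending), desc="Snakemake") as bar:
            while pending:
                p.join(interval)
                for t in pending[:]:
                    if t.is_file():
                        pending.remove(t)
                        bar.update(1)
                if p.exitcode is not None:
                    break  # snakemake finished (success or failure)
        p.join()
        return p.exitcode
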
/seq_manipulation/shred_contigs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Python script for shredding contigs into fake reads.
3 |
4 | e.g. for input into Newbler.
5 | """
6 |
7 | import os
8 | import sys
9 | from optparse import OptionParser
10 |
11 | from Bio import SeqIO
12 |
13 | usage = """Basic usage: python shred_contigs.py assembly.fasta -o shredded.fasta
14 |
15 | Multiple input FASTA files are accepted, see -h for more details.
16 |
17 | Using Roche 454 Newbler, non-SFF input reads are limited to 1999 bp, thus
18 | you might wish to use something like this on an Illumina assembly:
19 |
20 | $ python shred_contigs.py other_assembly.fasta -o shredded.fasta -m 1999 -l 1999 -s 500
21 | """
22 |
23 | parser = OptionParser(usage=usage)
24 | parser.add_option(
25 | "-m",
26 | "--min-contig-len",
27 | dest="max_contig",
28 | type="int",
29 | help="Max contig length to reuse as is (default 2000)",
30 | default=2000,
31 | )
32 | parser.add_option(
33 | "-l",
34 | "--shred-length",
35 | dest="shred_length",
36 | type="int",
37 | help="Length of fake reads to generate (default 1000 bp)",
38 | default=1000,
39 | )
40 | parser.add_option(
41 | "-s",
42 | "--shred-step",
43 | dest="shred_step",
44 | type="int",
45 | help="Offset between fake reads (default 500 bp)",
46 | default=500,
47 | )
48 | parser.add_option(
49 | "-o",
50 | "--output",
51 | dest="output_filename",
52 | help="FASTA output filename for fake reads (required)",
53 | default=None,
54 | metavar="FILE",
55 | )
56 | (options, args) = parser.parse_args()
57 | if not args:
58 | sys.exit("Requires at least one input FASTA filename\n\n" + usage)
59 |
60 | max_contig = int(options.max_contig)
61 | shred_length = int(options.shred_length)
62 | shred_step = int(options.shred_step)
63 | output_fasta = options.output_filename
64 | if not output_fasta:
65 |     sys.exit("Missing output FASTA filename (option -o), see -h for details")
66 | if shred_step < 1:
67 |     sys.exit("Shred step should be positive")
68 | if shred_length < shred_step:
69 |     sys.exit("Shred step should not exceed shred length")
70 | print("Accepting contigs up to length %i as they are (option -m)" % max_contig)
71 | print(
72 | "Shredding longer contigs into reads of %i bp (option -l), step %i (option -s)"
73 | % (shred_length, shred_step)
74 | )
75 |
76 | for assembly_fasta in args:
77 | if not os.path.isfile(assembly_fasta):
78 | sys.exit("Assembly FASTA file not found: %r" % assembly_fasta)
79 |
80 |
81 | def shred(input_filename):
82 | global as_is, shredded
83 | for record in SeqIO.parse(input_filename, "fasta"):
84 | length = len(record)
85 | if length <= max_contig:
86 | as_is += 1
87 | yield record
88 | else:
89 | # Shred it!
90 | shredded += 1
91 | for i, start in enumerate(range(0, length - shred_step, shred_step)):
92 | fragment = record[start : start + shred_length]
93 | fragment.id = "%s_fragment%i" % (record.id, i + 1)
94 | yield fragment
95 |
96 |
97 | count = 0
98 | as_is = 0
99 | shredded = 0
100 | with open(output_fasta, "w") as output_handle:
101 | for assembly_fasta in args:
102 | count += SeqIO.write(shred(assembly_fasta), output_handle, "fasta")
103 | print("Shredded %i FASTA files, containing %i contigs" % (len(args), as_is + shredded))
104 | print(
105 | "Accepted %i short contigs, shredded %i long contigs, giving %i reads"
106 | % (as_is, shredded, count)
107 | )
108 |
--------------------------------------------------------------------------------
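
To make the shredding windows concrete, with the defaults -l 1000 -s 500 a
hypothetical 2500 bp contig gives four overlapping fragments; this snippet
repeats the same range() arithmetic used in shred() above:

    length, shred_length, shred_step = 2500, 1000, 500
    for i, start in enumerate(range(0, length - shred_step, shred_step)):
        print(i + 1, start, min(start + shred_length, length))
    # 1 0 1000
    # 2 500 1500
    # 3 1000 2000
    # 4 1500 2500
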
/blast/wwwblast2loc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Short Python script to parse the blastwww setup files to extract a list of
3 | # nucleotide and protein BLAST databases (with descriptions) and write them
4 | # out as location files for use in Galaxy.
5 | #
6 | # Copyright 2010, Peter Cock.
7 | #
8 | # v001 - First version
9 | # v002 - Use print as function
10 | from __future__ import print_function
11 |
12 | import os
13 |
14 | # This gives us the list of databases and their type (nt vs aa):
15 | blastrc = "/var/www/html/blast/blast.rc"
16 | # This gives us a sensible order and their descriptions:
17 | blastwww = "/var/www/html/blast/blast.html"
18 |
19 | # BLAST DB path:
20 | blastpath = "/data/blastdb"
21 |
22 | # Output files
23 | blast_nt = "blastdb.loc"
24 | blast_aa = "blastdb_p.loc"
25 |
26 |
27 | def load_blast_db_list(filename):
28 | nt = set()
29 | aa = set()
30 | handle = open(filename)
31 | for line in handle:
32 | if line.startswith("#") or not line.strip():
33 | continue
34 | elif line.startswith("NumCpuToUse"):
35 | continue
36 | elif line.startswith(("blastn ", "tblastn", "tblastx ")):
37 | nt.update(line.rstrip().split()[1:])
38 | elif line.startswith(("blastp ", "blastx")):
39 | aa.update(line.rstrip().split()[1:])
40 | else:
41 | raise ValueError(line)
42 | handle.close()
43 | return nt, aa
44 |
45 |
46 | nt, aa = load_blast_db_list(blastrc)
47 | # print(nt)
48 | # print(aa)
49 |
50 |
51 | def load_blast_db_descr(html_filename, nt, aa):
52 | nt_list = []
53 | aa_list = []
54 | handle = open(html_filename)
55 | for line in handle:
56 | line = line.strip()
57 | if not line.startswith("