# ├── .gitignore
# ├── bin
# │   ├── babelfish
# │   ├── concatenate-annotation
# │   ├── diff-genes
# │   ├── fasta-to-phylip
# │   ├── gbk-cds-to-fasta
# │   ├── gbk-ref-to-fasta
# │   ├── gc-content
# │   ├── kraken-table.py
# │   ├── print-gbk-products
# │   ├── qseq-to-fastq
# │   ├── quick-blast
# │   ├── revcomp-stdin
# │   ├── seq-eval
# │   ├── seq-filter
# │   ├── sequence-statistics
# │   ├── split-seqs-by-id
# │   ├── split-sequences
# │   └── wrap-fasta
# ├── bioinformatics.jpg
# ├── install.sh
# ├── license.md
# └── readme.md
#
# /.gitignore:
# --------------------------------------------------------------------------------
# *.pyc
# .DS_Store
# --------------------------------------------------------------------------------
# /bin/babelfish:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse

from Bio import SeqIO


def parse_args():
    """Build the command-line parser and return the parsed options.

    ``--in-format`` and ``--out-format`` are mandatory; ``--input`` and
    ``--output`` default to stdin and stdout.  ``--log`` is accepted but is
    not read anywhere in this script.
    """

    cli = argparse.ArgumentParser(
        description="""

    Convert between bioinformatics file formats for genome annotations:

    - genbank (gbk)
    - gff
    - fasta

    """
    )
    cli.add_argument("--log", default="/dev/stderr", help="log file (default=stderr)")
    cli.add_argument("--output", default="/dev/stdout")
    cli.add_argument("--in-format", required=True)
    cli.add_argument("--out-format", required=True)
    cli.add_argument("--input", default="/dev/stdin")
    return cli.parse_args()


def main():
    """Read records in ``--in-format`` and re-emit them in ``--out-format``."""

    options = parse_args()

    with open(options.input) as source, open(options.output, "w") as sink:
        parsed = SeqIO.parse(source, options.in_format)

        if options.out_format != "gff":
            SeqIO.write(parsed, sink, options.out_format)
            return

        # GFF is not a SeqIO output format; the bcbio-gff package writes it.
        # Imported here so it is only needed when GFF output is requested.
        from BCBio import GFF

        GFF.write(parsed, sink)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/concatenate-annotation:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Concatenate every record (scaffold) of a GenBank file into a single record
# and print it as GenBank.
#
#   concatenate-annotation annotation.gbk > concatenated.gbk

import sys
from functools import reduce  # reduce() is not a builtin on Python 3

# Number of "N" bases appended after each record.
L = 1000


def spacer(l):
    """Return a SeqRecord made of *l* "N" bases, used as padding after a scaffold."""
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    try:
        from Bio.Alphabet import IUPAC
    except ImportError:
        # Bio.Alphabet was removed in Biopython 1.78; sequences carry no alphabet.
        return SeqRecord(Seq("N" * l))
    return SeqRecord(Seq("N" * l, IUPAC.ambiguous_dna))


def main():
    """Read the GenBank file named on the command line and print the joined record."""
    # Imported lazily so a usage error is reported even without Biopython.
    from Bio import SeqIO

    if len(sys.argv) < 2:
        sys.exit("usage: concatenate-annotation annotation.gbk")

    with open(sys.argv[1]) as handle:
        records = list(SeqIO.parse(handle, "genbank"))

    if not records:
        # reduce() on an empty list would raise an opaque TypeError.
        sys.exit("no GenBank records found in {}".format(sys.argv[1]))

    # A spacer is appended after every record, including the last one.
    records = [record + spacer(L) for record in records]

    concatenated = reduce(lambda x, y: x + y, records)

    # Newer Biopython needs a molecule type to write GenBank, and adding the
    # annotation-free spacer discards it.  NOTE(review): assumes DNA input.
    concatenated.annotations.setdefault("molecule_type", "DNA")

    print(concatenated.format("genbank"))


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/diff-genes:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

#
# Given two genbank files A and B (in that order), output the genes that are
# annotated in B but not in A.
#

import sys


def cds_gene_names(records):
    """Return the set of gene names found on CDS features of *records*.

    Only the first value of each ``gene`` qualifier is used; CDS features
    without a ``gene`` qualifier are ignored.
    """
    return {
        feature.qualifiers["gene"][0]
        for record in records
        for feature in record.features
        if feature.type == "CDS" and "gene" in feature.qualifiers
    }


def main():
    """Print each gene present in the second GenBank file but not the first."""
    from Bio import SeqIO

    if len(sys.argv) < 3:
        sys.exit("usage: diff-genes a.gbk b.gbk")

    gene_sets = []
    for path in sys.argv[1:]:
        with open(path) as handle:
            gene_sets.append(cds_gene_names(SeqIO.parse(handle, "genbank")))

    for gene in gene_sets[1] - gene_sets[0]:
        print(gene)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/fasta-to-phylip:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

__DESCRIPTION__ = """

Convert a FASTA alignment to Phylip format.

Dependencies: BioPython

fasta-to-phylip --input-fasta file.fasta --output-phy file.phy

"""

import argparse


def parse_args():
    """Return parsed options; input and output default to stdin and stdout."""
    parser = argparse.ArgumentParser(
        description=__DESCRIPTION__,
        # keep the usage example on its own line in --help
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument("--input-fasta", default="/dev/stdin")
    parser.add_argument("--output-phy", default="/dev/stdout")

    return parser.parse_args()


def main():
    """Read the FASTA alignment(s) and write them in Phylip format."""
    from Bio import AlignIO

    args = parse_args()

    # AlignIO.parse is lazy, so the input must stay open while writing.
    with open(args.input_fasta) as handle, open(args.output_phy, "w") as output_handle:
        AlignIO.write(AlignIO.parse(handle, "fasta"), output_handle, "phylip")


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/gbk-cds-to-fasta:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# (thrifted from: https://www.biostars.org/p/83058/)

import argparse
import logging


def parse_args():
    """Return parsed options for extracting features from a GenBank file."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log", default="/dev/stderr", help="log file (default=stderr)"
    )
    parser.add_argument("--output", default="/dev/stdout")
    parser.add_argument("--input", default="/dev/stdin")
    parser.add_argument(
        "--molecule",
        default="nucl",
        choices=["nucl", "prot"],  # any other value used to leave `seq` unbound
        help="nucl or prot",
    )
    parser.add_argument("--name", default=None, help="prepend name to sequence IDs")
    parser.add_argument("--type", default="CDS")
    return parser.parse_args()


def fasta_entry(identifier, description, sequence):
    """Return one FASTA record (header line plus sequence line) as a string."""
    return ">{} {}\n{}\n".format(identifier, description, sequence)


def main():
    """Write every feature of the requested type as a FASTA record.

    The header is ``<prefix><locus_tag> <product>``.  With ``--molecule nucl``
    the sequence is extracted from the record; with ``prot`` it is taken from
    the feature's ``translation`` qualifier.  A feature lacking a needed
    qualifier raises ``KeyError``.
    """
    from Bio import SeqIO

    args = parse_args()
    logging.basicConfig(filename=args.log, level=logging.INFO)

    if args.name is not None:
        logging.info("prepending {} to sequence IDs".format(args.name))
        prefix = "{}_".format(args.name)
    else:
        prefix = ""

    logging.info("writing features as {}".format(args.molecule))

    with open(args.input) as handle, open(args.output, "w") as output:
        for record in SeqIO.parse(handle, "genbank"):
            for feature in record.features:
                if feature.type != args.type:
                    continue

                if args.molecule == "nucl":
                    seq = str(feature.extract(record).seq)
                else:
                    seq = str(feature.qualifiers["translation"][0])

                # The prefix goes on the ID that is actually written; it used
                # to be applied to feature.id, which never reached the output.
                output.write(
                    fasta_entry(
                        prefix + feature.qualifiers["locus_tag"][0],
                        feature.qualifiers["product"][0],
                        seq,
                    )
                )


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/gbk-ref-to-fasta:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# (thrifted from: https://www.biostars.org/p/83058/)

import argparse


def parse_args():
    """Return parsed options for dumping GenBank record sequences as FASTA."""

    parser = argparse.ArgumentParser()
    parser.add_argument("--output", default="/dev/stdout")
    parser.add_argument("--input", default="/dev/stdin")
    parser.add_argument("--name", default=None, help="prepend name to sequence IDs")
    return parser.parse_args()


def main():
    """Write each GenBank record as ``><prefix><id>`` followed by its sequence."""
    from Bio import SeqIO

    args = parse_args()

    prefix = "" if args.name is None else "{}_".format(args.name)

    # --output used to be parsed but ignored (records always went to stdout).
    with open(args.input) as handle, open(args.output, "w") as output:
        for record in SeqIO.parse(handle, "genbank"):
            output.write(">{}{}\n{}\n".format(prefix, record.id, record.seq))


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/gc-content:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Measure how often a motif (default: G or C) occurs along each sequence in
# sliding windows.  Output is one tab-separated line per window:
#
#   record id, window start, window end, fraction of window matching the motif

import argparse
import logging
import re


def parse_args():
    """Return parsed options.

    ``--motif`` is compiled with ``re.compile`` by argparse (the string
    default included), so ``args.motif`` is always a compiled pattern.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log", default="/dev/stderr", help="log file (default=stderr)"
    )
    parser.add_argument("--output", default="/dev/stdout")
    parser.add_argument("--input", default="/dev/stdin")
    parser.add_argument("--format", default="fasta")
    parser.add_argument("--width", default=1000, type=int)
    parser.add_argument("--step", default=10, type=int)
    parser.add_argument("--motif", default="[GC]", type=re.compile)
    return parser.parse_args()


def sliding_windows(seq, width, step):
    """Yield ``(start, end, window)`` for windows of *width* every *step* positions.

    ``end`` is always ``start + width``, so for the last windows it can lie
    beyond ``len(seq)`` while ``window`` itself is shorter than *width*.
    """
    for i in range(0, len(seq), step):
        j = i + width
        yield (i, j, seq[i:j])


def sliding_window_count_matches(seq, width, step, motif):
    """Yield ``(start, end, fraction)`` per window.

    ``fraction`` is the number of non-overlapping *motif* matches divided by
    the actual window length.
    """
    for i, j, sub in sliding_windows(seq, width, step):
        matches = len(motif.findall(sub))
        # sub is never empty: every window starts inside the sequence
        p = matches / float(len(sub))
        yield (i, j, p)


def write_windows(records, output, width, step, motif):
    """Write one ``id<TAB>start<TAB>end<TAB>fraction`` line per window to *output*.

    *records* is any iterable of objects with ``id`` and ``seq`` attributes.
    """
    for record in records:
        seq = str(record.seq)
        for i, j, p in sliding_window_count_matches(seq, width, step, motif):
            output.write("%s\t%s\t%s\t%s\n" % (record.id, i, j, p))


def main():
    """Parse the input sequences and write the per-window motif table."""
    # Imported lazily so the window helpers work without Biopython installed.
    from Bio import SeqIO

    args = parse_args()
    logging.basicConfig(filename=args.log, level=logging.INFO)

    logging.info(args)

    # The old `print >> output, ...` is Python 2 syntax and raises TypeError
    # on Python 3; records are now written with output.write().
    with open(args.input) as handle, open(args.output, "w") as output:
        write_windows(
            SeqIO.parse(handle, args.format),
            output,
            args.width,
            args.step,
            args.motif,
        )


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/kraken-table.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# # kraken-table
#
# Generates a table from [Kraken](https://ccb.jhu.edu/software/kraken/)
# output(s). Like [kraken-biom](https://github.com/smdabdoub/kraken-biom) but
# doesn't require you to install biom-format, SciPy and Numpy just to generate a
# table.
#
# ## Usage
#
# ```
# ./kraken-table.py \
#   --inputs \
#   kraken_output_1.txt \
#   kraken_output_2.txt \
#   > otus.csv
# ```

import argparse
import logging
from collections import Counter
from csv import DictWriter


def parse_args():
    """Return parsed options; ``--inputs`` takes any number of Kraken output files."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log", default="/dev/stderr", help="log file (default=stderr)"
    )
    parser.add_argument("--output", default="/dev/stdout")
    parser.add_argument("--inputs", nargs="*", default=[])
    return parser.parse_args()


def parse_kraken_file(handle):
    """Yield one dict per tab-separated line of *handle*.

    Keys: ``classified`` (bool from the C/U flag), ``read_id``,
    ``ncbi_taxid`` (kept as a string), ``length`` (int) and ``assignments``
    (the fifth column split on whitespace).  Blank lines are skipped.
    """
    for line in handle:
        # Strip only the line ending so an empty trailing column survives.
        fields = line.rstrip("\r\n").split("\t")
        if fields == [""]:
            continue

        yield {
            "classified": {"C": True, "U": False}[fields[0]],
            "read_id": fields[1],
            "ncbi_taxid": fields[2],
            "length": int(fields[3]),
            "assignments": fields[4].split(),
        }


def write_table(input_otu_counts, handle):
    """Write a CSV with one row per input and one column per taxid.

    *input_otu_counts* maps input name -> {taxid: count}.  Taxids an input
    never saw are written as 0; previously they came out as empty cells.
    """
    taxids = sorted({taxid for counts in input_otu_counts.values() for taxid in counts})
    writer = DictWriter(handle, fieldnames=["input"] + taxids, restval=0)

    writer.writeheader()

    for path, counts in input_otu_counts.items():
        writer.writerow(dict(counts, input=path))


def main():
    """Count reads per taxid in every input file and write the table."""

    args = parse_args()
    logging.basicConfig(filename=args.log, level=logging.INFO)

    input_otu_counts = {}

    for path in args.inputs:
        with open(path) as handle:
            # setdefault + update so a path listed twice accumulates, as before
            input_otu_counts.setdefault(path, Counter()).update(
                row["ncbi_taxid"] for row in parse_kraken_file(handle)
            )

    # newline="" is what the csv module expects of the files it writes to
    with open(args.output, "w", newline="") as handle:
        write_table(input_otu_counts, handle)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/print-gbk-products:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Print the product name(s) of every CDS feature in a GenBank file on stdin.


def cds_products(records):
    """Yield each value of the ``product`` qualifier of every CDS feature."""
    for record in records:
        for feature in record.features:
            if feature.type == "CDS":
                for product in feature.qualifiers.get("product", []):
                    yield product


def main():
    """Read GenBank from stdin and print one product per line."""
    from Bio import SeqIO

    # Printing happens inside the `with` because the parser reads lazily.
    with open("/dev/stdin") as handle:
        for product in cds_products(SeqIO.parse(handle, "genbank")):
            print(product)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/qseq-to-fastq:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

__DESCRIPTION__ = """

qseq to fastq - convert two qseq files from paired-end sequencing
to a single, interleaved fastq file

qseq_to_fastq --left left_reads.txt --right right_reads.txt --out interleaved.fastq

or

qseq_to_fasta --left left_reads.txt --out left_reads.fastq

"""

import argparse
from contextlib import ExitStack
from itertools import repeat


def parse_args():
    """Return parsed options; ``--right`` and ``--barcode`` are optional qseq files."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--left", required=True, default="/dev/stdin")
    parser.add_argument("--right", help="optional (will interleave)")
    parser.add_argument("--barcode", default=None, help="optional (added to header)")

    parser.add_argument("--out", default="/dev/stdout")

    return parser.parse_args()


def to_fasta(s, label=None):
    """Convert one qseq line *s* into a four-line FASTQ record (no trailing newline).

    The first eight tab-separated fields are joined with ":" to form the
    header, field 9 is the sequence ("." becomes "N") and field 10 the
    quality string.  *label*, when given, is appended to the header.
    """

    s = s.strip().split("\t")

    header = ":".join(s[0:8])
    sequence = s[8].replace(".", "N")
    quality = s[9]

    if label is not None:
        header += ":" + label

    return "@%s\n%s\n+%s\n%s" % (header, sequence, header, quality)


def add_barcode_to_header(seq, barcode):
    """Append *barcode* to ``seq.description`` in place."""
    seq.description += barcode


def flatten(iterable):
    """ Flatten a list of lists (but go no further) """
    for i in iterable:
        for j in i:
            yield j


def interleave(left, right=None, barcodes=None):
    """Yield FASTQ records converted from qseq lines.

    With *right*, records alternate left, right, left, right.  With
    *barcodes* (qseq lines read in lockstep), the barcode sequence is added
    to the headers of the matching reads.  Iteration stops with the shortest
    input.
    """
    mates = zip(left) if right is None else zip(left, right)
    if barcodes is None:
        labels = repeat(None)
    else:
        labels = (line.split("\t")[8].replace(".", "N") for line in barcodes)

    for reads, label in zip(mates, labels):
        for read in reads:
            yield to_fasta(read, label=label)


def main():
    """Convert the qseq file(s) named on the command line and write FASTQ.

    The previous version imported ``itertools.izip`` and called ``.next()``
    (both Python 2 only), always opened ``--right``, and never wrote the
    converted records anywhere.
    """

    args = parse_args()

    # ExitStack closes whichever of the optional files were opened.
    with ExitStack() as stack:
        left = stack.enter_context(open(args.left))
        right = stack.enter_context(open(args.right)) if args.right else None
        barcodes = (
            stack.enter_context(open(args.barcode))
            if args.barcode is not None
            else None
        )
        out_handle = stack.enter_context(open(args.out, "w"))

        for record in interleave(left, right, barcodes):
            out_handle.write(record + "\n")


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/quick-blast:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

import tempfile
import argparse
import logging
import subprocess
import os


def parse_args():
    """Return parsed options for a one-off BLAST of ``--query`` against ``--db``."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log", default="/dev/stderr", help="log file (default=stderr)"
    )
    # NOTE(review): defaulting the query to /dev/stdout looks unintended, but
    # stdin is already the default for --db; left unchanged.
    parser.add_argument("--query", default="/dev/stdout")
    parser.add_argument("--db", default="/dev/stdin")
    parser.add_argument("--dbtype", default="nucl")
    return parser.parse_args()


def blast_commands(query, db, dbtype, db_prefix):
    """Return the ``makeblastdb`` and ``blastn`` argument lists, in run order.

    Commands are built as lists, not by splitting a formatted string, so
    paths containing spaces stay intact.
    """
    make_db = ["makeblastdb", "-dbtype", dbtype, "-in", db, "-out", db_prefix]
    search = ["blastn", "-query", query, "-db", db_prefix]
    return make_db, search


def main():
    """Build a temporary BLAST database from ``--db`` and search it with ``--query``."""

    args = parse_args()
    logging.basicConfig(filename=args.log, level=logging.INFO)

    # Each step is awaited (and checked), so blastn never starts before the
    # database exists and the directory is only removed afterwards.
    with tempfile.TemporaryDirectory() as tempdir:
        db_prefix = os.path.join(tempdir, os.path.basename(args.query))

        # --dbtype used to be ignored in favour of a hard-coded "nucl".
        # NOTE(review): the search program is still always blastn.
        for command in blast_commands(args.query, args.db, args.dbtype, db_prefix):
            subprocess.run(command, check=True)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/revcomp-stdin:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Print the reverse complement of every line read from stdin.


def main():
    """Treat each stdin line as a bare sequence and print its reverse complement."""
    from Bio.Seq import Seq

    with open("/dev/stdin") as handle:
        for line in handle:
            print(Seq(line.strip()).reverse_complement())


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/seq-eval:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import re


def parse_args():
    """Return parsed options; ``--proc`` is the expression evaluated per record."""
    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--input", default="/dev/stdin")
    # required: eval(None) only ever produced a TypeError per record
    parser.add_argument("-p", "--proc", type=str, required=True)
    parser.add_argument(
        "-s",
        "--skip-fail",
        default=False,
        action="store_true",
        help="Skip sequence if theres an exception",
    )
    parser.add_argument("--format", default="fasta")

    return parser.parse_args()


def eval_records(proc, records, skip_fail=False):
    """Yield the value of expression *proc* for every record.

    The expression sees ``r`` (the record), ``n`` (its 0-based index) and the
    ``re`` module.  With *skip_fail*, records whose evaluation raises are
    dropped; otherwise the exception propagates.

    NOTE: *proc* is executed with eval(); only run expressions you trust.
    """
    code = compile(proc, "<--proc>", "eval")  # compile once, not per record

    for n, r in enumerate(records):
        try:
            result = eval(code, {"re": re, "n": n, "r": r})
        except Exception:
            if skip_fail:
                continue
            raise
        yield result


def main():
    """Evaluate ``--proc`` on every record of ``--input`` and print each result."""
    from Bio import SeqIO

    args = parse_args()

    # was open(args.fasta_file): no such option exists, so it always failed
    with open(args.input) as handle:
        records = SeqIO.parse(handle, args.format)

        for result in eval_records(args.proc, records, args.skip_fail):
            print(result)


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/seq-filter:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Filter sequence file by some arbitrary parameter.

# Example

# Keep only the reads longer than 50nt in input.fastq and save to output.fastq
#
# bin/seq-filter --input input.fastq --output output.fastq --format fastq --param 'len(r) > 50'
#

import argparse
import re


def parse_args():
    """Return parsed options; ``--param`` is the expression deciding which records to keep."""
    parser = argparse.ArgumentParser()

    parser.add_argument("--input", default="/dev/stdin")
    parser.add_argument("--format", default="fasta")
    parser.add_argument("--output", default="/dev/stdout")
    # The default must be a string: eval(True) raises TypeError.
    parser.add_argument("--param", default="True", type=str)
    parser.add_argument(
        "--skip-fail",
        help="do not print record if an exception is raised",
        default=False,
        action="store_true",
    )

    return parser.parse_args()


def filter_records(param, records, skip_fail=False):
    """Yield the records for which expression *param* evaluates truthy.

    The expression sees ``r`` (the record), ``n`` (its 0-based index) and the
    ``re`` module.  With *skip_fail*, records whose evaluation raises are
    dropped; otherwise the exception propagates.

    NOTE: *param* is executed with eval(); only run expressions you trust.
    """
    code = compile(param, "<--param>", "eval")  # compile once, not per record

    for n, r in enumerate(records):
        try:
            keep = eval(code, {"re": re, "n": n, "r": r})
        except Exception:
            if skip_fail:
                continue
            raise
        if keep:
            yield r


def main():
    """Copy the records of ``--input`` that satisfy ``--param`` to ``--output``."""
    from Bio import SeqIO

    args = parse_args()

    with open(args.input) as handle, open(args.output, "w") as out_handle:
        records = SeqIO.parse(handle, args.format)

        for record in filter_records(args.param, records, args.skip_fail):
            out_handle.write(record.format(args.format))


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/sequence-statistics:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Calculate various statistics on a fasta/fastq file

import argparse
import logging

import numpy


class Stats:
    """ Wrapper class for statistical functions.
    In the future, remove numpy dependency and replace
    with the statistics in the Python 3.4 stdlib.

    Results are converted to plain Python floats so their printed form does
    not depend on the installed numpy version.
    """

    @staticmethod
    def mean(a):
        """ Return mean of a list of integers

        >>> Stats.mean([1, 2, 3])
        2.0
        >>> Stats.mean([0, 1, -1])
        0.0
        >>> Stats.mean([0, 0, 0])
        0.0

        """

        return float(numpy.mean(a))

    @staticmethod
    def median(a):
        """ Return median of a list of integers

        >>> Stats.median([1, 2, 3])
        2.0
        >>> Stats.median([0, 1, 1])
        1.0
        >>> Stats.median([0, 0, 1])
        0.0
        """

        return float(numpy.median(a))

    @staticmethod
    def n50(a):
        """ Return N50 of a list of integers

        Values are visited from largest to smallest; the first one whose
        running total exceeds half of the overall sum is returned.  Returns
        None when no value does (an empty or all-zero list).

        >>> Stats.n50([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        7
        >>> Stats.n50([0, 100, 100, 100, 100, 100, 100, 101])
        100
        """

        half = sum(a) / 2.0
        running = 0
        for length in sorted(a, reverse=True):
            running += length
            if half < running:
                return length
        return None

    @staticmethod
    def standard_deviation(a):
        """ Return (population) standard deviation of a list of numbers

        >>> round(Stats.standard_deviation([1, 1, 0, -1, -1]), 6)
        0.894427

        """
        return float(numpy.std(a))


def parse_args():
    """Return parsed options for the length-statistics report."""

    parser = argparse.ArgumentParser()
    parser.add_argument("--log", default="/dev/stderr", help="log file (default=stderr)")
    parser.add_argument("--verbose", default=False, action="store_true")
    parser.add_argument("--input", "-i", default="/dev/stdin")
    parser.add_argument("--format", default="fasta", help="input format: fasta (default) or fastq")
    parser.add_argument("--output", "-o", default="/dev/stdout")

    return parser.parse_args()


def setup_logging(logfile="/dev/stderr", verbose=False):
    """Log to *logfile* at DEBUG level when *verbose*, otherwise at ERROR level."""

    level = logging.DEBUG if verbose else logging.ERROR

    return logging.basicConfig(filename=logfile, level=level)


def main():
    """Print mean, median, standard deviation and N50 of the sequence lengths."""
    from Bio import SeqIO

    args = parse_args()

    setup_logging(logfile=args.log, verbose=args.verbose)

    logging.info("args: %s" % args)

    with open(args.input) as handle:
        lengths = [len(record) for record in SeqIO.parse(handle, args.format)]

    if not lengths:
        # numpy would only emit warnings and NaN for an empty list
        raise SystemExit("no sequences found in %s" % args.input)

    print("LENGTH STATISTICS:")
    print("mean: %s" % Stats.mean(lengths))
    print("median: %s" % Stats.median(lengths))
    print("std: %s" % Stats.standard_deviation(lengths))
    print("n50: %s" % Stats.n50(lengths))


if __name__ == "__main__":
    import doctest

    # The doctests above are self-checks that run on every invocation.
    doctest.testmod()
    main()
# --------------------------------------------------------------------------------
# /bin/split-seqs-by-id:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

import argparse
import logging


def parse_args():
    """Return parsed options.

    ``--output`` is accepted but not used: one file per record is always
    written to the current directory.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log", default="/dev/stderr", help="log file (default=stderr)"
    )
    parser.add_argument("--output", default="/dev/stdout")
    parser.add_argument("--input", default="/dev/stdin")
    return parser.parse_args()


def output_path(record_id):
    """Return the file name for *record_id*: ``<id>.fasta``.

    A "/" in the id is replaced by "_" so an id can never point into
    another directory.
    """
    return "{}.fasta".format(record_id.replace("/", "_"))


def main():
    """Write every FASTA record of ``--input`` to its own ``<id>.fasta`` file."""
    from Bio import SeqIO

    args = parse_args()
    logging.basicConfig(filename=args.log, level=logging.INFO)

    with open(args.input) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            with open(output_path(record.id), "w") as output:
                output.write(record.format("fasta"))


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/split-sequences:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Split a sequence file into numbered chunk files of --chunk-size records each.

import argparse
import logging
import os
from itertools import groupby


def parse_args():
    """Return parsed options; ``--chunk-size`` is the number of records per file."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--log", default="/dev/stderr", help="log file (default=stderr)"
    )
    parser.add_argument("--input", default="/dev/stdin")
    # required: with the old None default the first record hit `i % None`
    parser.add_argument("--chunk-size", type=int, required=True)
    parser.add_argument("--out-dir", default="chunks")
    parser.add_argument("--format", default="fasta")
    return parser.parse_args()


def iter_chunks(records, chunk_size):
    """Yield ``(chunk_number, records_in_chunk)`` pairs, numbering from 1.

    Each chunk is a lazy iterator over at most *chunk_size* consecutive
    records and must be consumed before the next pair is requested.

    Raises:
        ValueError: if *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk size must be at least 1, got %s" % chunk_size)

    numbered = enumerate(records)
    for index, group in groupby(numbered, key=lambda pair: pair[0] // chunk_size):
        yield index + 1, (record for _, record in group)


def main():
    """Write the records of ``--input`` into ``<out-dir>/chunk-<n>.<format>`` files."""
    # Imported lazily so --help and iter_chunks work without Biopython.
    from Bio import SeqIO

    args = parse_args()
    logging.basicConfig(filename=args.log, level=logging.INFO)

    os.mkdir(args.out_dir)

    with open(args.input) as handle:
        records = SeqIO.parse(handle, args.format)

        for chunk, chunk_records in iter_chunks(records, args.chunk_size):
            logging.info("writing to chunk %s" % chunk)
            path = "%s/chunk-%s.%s" % (args.out_dir, chunk, args.format)

            # one `with` per chunk, so each file is closed before the next opens
            with open(path, "w") as out:
                for record in chunk_records:
                    out.write(record.format(args.format))


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bin/wrap-fasta:
# --------------------------------------------------------------------------------
#!/usr/bin/env python3

# Re-print a FASTA file from stdin using Biopython's FASTA formatting.


def main():
    """Read FASTA from stdin and print every record via ``record.format("fasta")``."""
    from Bio import SeqIO

    with open("/dev/stdin") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            print(record.format("fasta"))


if __name__ == "__main__":
    main()
# --------------------------------------------------------------------------------
# /bioinformatics.jpg:
# --------------------------------------------------------------------------------
https://raw.githubusercontent.com/audy/bioinformatics-hacks/1def5b44dbc258115b2c4d03a829311953e57c30/bioinformatics.jpg -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | set -x 5 | 6 | for i in bin/*; do 7 | cp ${i} /usr/local/bin/$(basename ${i}) 8 | done 9 | -------------------------------------------------------------------------------- /license.md: -------------------------------------------------------------------------------- 1 | ## License 2 | 3 | The MIT License (MIT) 4 | Copyright (c) 2013-2017 Austin Davis-Richardson 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # BioHacks 2 | 3 | Austin G. Davis-Richardson 4 | 5 | A collection of Python scripts I use for random bioinformatics-related 6 | tasks. 7 | 8 | ``` 9 | bin 10 | ├── babelfish - convert between fasta/gbk/gff or whatever BioPython supports 11 | ├── seq-eval - evaluate arbitrary Python code on records in a fasta/q file 12 | ├── fasta-to-phylip - convert FASTA alignment to Phylip format 13 | ├── fastq-to-fasta - convert FASTQ file to FASTA file 14 | ├── seq-filter - filter sequence file based on arbitrary Python expression 15 | ├── qseq-to-fastq - convert QSEQ file to FASTQ file 16 | ├── gc-content - measure GC content along a sliding window (can measure any "motif") 17 | ├── seq-convert - arbitrarily convert between BioPython SeqIO formats 18 | ├── concatenate-annotation - concatenate scaffolds in a genbank annotation 19 | └── sequence-statistics - calculate various stats on sequence lengths (mean, median, n50) 20 | ``` 21 | 22 | ![jurassic park dna manipulation](bioinformatics.jpg) 23 | --------------------------------------------------------------------------------