├── ctbBio
│   ├── __init__.py
│   ├── fasta_stats.py
│   ├── entrez_genome.sh
│   ├── fasta_stats.sh
│   ├── rp16_retreive.sh
│   ├── fastq2fasta.py
│   ├── lookup.py
│   ├── strip_align_inserts.py
│   ├── parallel.py
│   ├── lookup-word.py
│   ├── fastq_split.py
│   ├── fastq_merge.py
│   ├── fix_fasta.py
│   ├── stats.py
│   ├── stockholm2fa.py
│   ├── rec_best_blast.py
│   ├── fasta_length.py
│   ├── name2fasta.py
│   ├── name2faa.py
│   ├── concat_align.py
│   ├── unmapped.py
│   ├── rc.py
│   ├── stockholm2oneline.py
│   ├── fasta_region.py
│   ├── sixframe.py
│   ├── subset_reads.py
│   ├── numblast-pident.py
│   ├── fasta.py
│   ├── nr_fasta.py
│   ├── n50.py
│   ├── numblast.py
│   ├── strip_masked.py
│   ├── strip_align.py
│   ├── subset_sam.py
│   ├── orthologer_summary.py
│   ├── calculate_coverage.py
│   ├── sam2fastq.py
│   ├── shuffle_genome.py
│   ├── genome_coverage.py
│   ├── filter_fastq_sam.py
│   ├── crossmap.py
│   ├── rRNA_copies.py
│   ├── genome_abundance.py
│   ├── transform.py
│   ├── rp16.py
│   ├── search.py
│   ├── rRNA_insertions_gff.py
│   ├── orthologer.py
│   ├── ncbi_download.py
│   ├── mapped.py
│   ├── compare_aligned.py
│   ├── rax.py
│   ├── besthits.py
│   ├── cluster_ani.py
│   └── 23SfromHMM.py
├── requirements.txt
├── LICENSE
├── MANIFEST
├── .gitignore
├── setup.py
└── README.md

/ctbBio/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# required packages
networkx
python-Levenshtein
numpy
pandas
biopython
tqdm
--------------------------------------------------------------------------------
/ctbBio/fasta_stats.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for printing summary stats for the sequence lengths in a fasta file
"""

import sys
from ctbBio.fasta import iterate_fasta as fasta_parser
from ctbBio.stats import stats

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('usage: fasta_stats.py <fasta>')
        exit()
    fasta = sys.argv[1]
    # summarize the distribution of sequence lengths
    lengths = [len(seq[1]) for seq in fasta_parser(fasta)]
    for stat, value in stats(lengths).items():
        print('%s:\t%s' % (stat, value))
--------------------------------------------------------------------------------
/ctbBio/entrez_genome.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ "$#" -lt 1 ]
then
    echo "usage: entrez_genome.sh <query>"
    exit 1
fi

esearch -db genome -query "$1" | efetch -db BioSample -format docsum
--------------------------------------------------------------------------------
/ctbBio/fasta_stats.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# sed expands each '!' to '!|' (tr cannot map one character to two)
for i in "$@";
do
    echo $i | tr '\n' '\t';
    n50.py -i $i | tr '\n' '\t';
    echo 'scaffolds: ' $(grep -c "^>" $i) | tr '\n' '\t';
    echo 'largest scaffold: ' $(fasta_length.py $i 0 | grep "^>" | cut -f 2 | sort -n -r | head -n 1);
done | tr '\t' '!' | sed 's/!/!|/g' | tr '!' '\n' | tr '|' '\t'
--------------------------------------------------------------------------------
/ctbBio/rp16_retreive.sh:
--------------------------------------------------------------------------------
#!/bin/bash

if [ "$#" -lt 2 ]
then
    echo "usage: rp16_retreive.sh <rp16 table> <fasta>"
    exit 1
fi

table=$1
fa=$2

IFS=$'\n'

for i in $(cat $table | grep -v database | head -n 1 | cut -f 2- | tr '\t' '\n' | cat -n)
do
    out=$(echo `basename $fa .faa`.rp$(echo $i | cut -f 2).faa)
    c=$(echo 1+$(echo $i | rev | cut -d ' ' -f 1 | cut -f 2 | rev) | bc)
    cat $table | grep -v "^#" | cut -f $c | pullseq -i $fa -N > $out
done
--------------------------------------------------------------------------------
/ctbBio/fastq2fasta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for converting fastq file to fasta file
"""

import sys
import os
from itertools import cycle

def fq2fa(fq):
    """
    convert fq to fa
    """
    c = cycle([1, 2, 3, 4])
    for line in fq:
        n = next(c)
        if n == 1:
            seq = ['>%s' % (line.strip().split('@', 1)[1])]
        if n == 2:
            seq.append(line.strip())
            yield seq

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('specify fastq file')
        exit()
    fq = sys.argv[1]
    if fq == '-':
        fq = sys.stdin
    else:
        fq = open(fq)
    for seq in fq2fa(fq):
        print('\n'.join(seq))
--------------------------------------------------------------------------------
/ctbBio/lookup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for finding and replacing elements in a file
"""

import sys
import os

def f2lookup(f, lookup):
    """
    find and replace elements in lookup within file
    """
    lookup = {i: r for i, r in [l.strip().split('\t')[0:2] for l in lookup]}
    for line in f:
        line = line.strip()
        for find, replace in list(lookup.items()):
            line = line.replace(find, replace)
        yield line

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('specify file and lookup')
        exit()
    for c, i in enumerate(sys.argv[1:], 1):
        if i == '-':
            i = sys.stdin
        else:
            i = open(i)
        sys.argv[c] = i
    f, lookup = sys.argv[1:]
    for line in f2lookup(f, lookup):
        print(line)
--------------------------------------------------------------------------------
/ctbBio/strip_align_inserts.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for removing insertion columns ('.' and lower case bases) from
alignment fasta file
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

def strip_inserts(fasta):
    """
    remove insertion columns from aligned fasta file
    """
    for seq in parse_fasta(fasta):
        seq[1] = ''.join([b for b in seq[1] if b == '-' or b.isupper()])
        yield seq

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('specify aligned fasta file')
        exit()
    fasta = sys.argv[1]
    if fasta == '-':
        fasta = sys.stdin
    else:
        fasta = open(fasta)
    for seq in strip_inserts(fasta):
        print('\n'.join(seq))
--------------------------------------------------------------------------------
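
A minimal usage sketch for strip_inserts() (an editorial illustration, not a repository file; it assumes the ctbBio package is importable, and the sequences are made up):

#!/usr/bin/env python3
# insertion states ('.' and lowercase bases) are dropped;
# match states (uppercase) and gaps ('-') are kept
from ctbBio.strip_align_inserts import strip_inserts

aln = ['>seq1', 'AC.g-T', '>seq2', 'AC-a-T']  # iterate_fasta also accepts a list of lines
for header, seq in strip_inserts(aln):
    print(header, seq)
# >seq1 AC-T
# >seq2 AC--T
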
/ctbBio/parallel.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for parallel execution of scripts
"""

import sys
import os
from multiprocessing import Pool as multithread
from subprocess import Popen

def run_process(process):
    """
    execute process
    """
    p = Popen(process, shell = True)
    return p.communicate()

def parallel(processes, threads):
    """
    execute jobs in processes using N threads
    """
    pool = multithread(threads)
    pool.map(run_process, processes)
    pool.close()
    pool.join()

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('usage: <list of commands> | parallel.py <threads>')
        exit()
    threads = int(sys.argv[1])
    processes = sys.stdin
    parallel(processes, threads)
--------------------------------------------------------------------------------
/ctbBio/lookup-word.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for finding and replacing elements in a file
"""

import sys
import os

def f2lookup(f, lookup):
    """
    find and replace elements in lookup within file
    """
    lookup = {i: r for i, r in [l.strip().split('\t')[0:2] for l in lookup]}
    for line in f:
        line = line.strip().split()
        for i, w in enumerate(line):
            if w in lookup:
                line[i] = lookup[w]
        yield ' '.join(line)

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('specify file and lookup')
        exit()
    for c, i in enumerate(sys.argv[1:], 1):
        if i == '-':
            i = sys.stdin
        else:
            i = open(i)
        sys.argv[c] = i
    f, lookup = sys.argv[1:]
    for line in f2lookup(f, lookup):
        print(line)
--------------------------------------------------------------------------------
/ctbBio/fastq_split.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

'''
script for taking an interleaved fastq file and printing the left and
right reads in separate fastq files
'''

import sys
import os
from itertools import cycle

def split(fastq, prefix):
    f1 = open('%s.R1.fastq' % (prefix), 'w')
    f2 = open('%s.R2.fastq' % (prefix), 'w')
    c = cycle([1, 1, 1, 1, 2, 2, 2, 2])
    for line in fastq:
        n = next(c)
        if n == 1:
            f1.write(line)
        else:
            f2.write(line)
    f1.close()
    f2.close()
    return [f1.name, f2.name]

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('specify fastq file and file prefix')
        exit()
    fastq, prefix = sys.argv[1], sys.argv[2]
    if fastq == '-':
        fastq = sys.stdin
    else:
        fastq = open(fastq)
    split(fastq, prefix)
--------------------------------------------------------------------------------
/ctbBio/fastq_merge.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for merging separate fastq files into
an interleaved fastq file
"""

import sys
import os
import itertools
import gzip

def fq_merge(R1, R2):
    """
    merge separate fastq files
    """
    c = itertools.cycle([1, 2, 3, 4])
    for r1, r2 in zip(R1, R2):
        n = next(c)
        if n == 1:
            pair = [[], []]
        pair[0].append(r1.strip())
        pair[1].append(r2.strip())
        if n == 4:
            yield pair

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('usage: fastq_merge.py <R1.fastq> <R2.fastq>')
        exit()
    R1, R2 = sys.argv[1], sys.argv[2]
    if R1.rsplit('.', 1)[1] == 'gz':
        R1, R2 = gzip.open(R1, 'rt'), gzip.open(R2, 'rt')
    else:
        R1, R2 = open(R1), open(R2)
    for pair in fq_merge(R1, R2):
        print('\n'.join(itertools.chain(*pair)))
--------------------------------------------------------------------------------
/ctbBio/fix_fasta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
fix fasta headers so that they don't have weird characters
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

remove_characters = ['/', '\\', ':', ',', '(', ')', ' ', '|', ';'] # characters to remove from headers

def remove_char(header):
    for character in remove_characters:
        header = header.replace(character, '_')
    return header

def fix_fasta(fasta):
    """
    remove pesky characters from fasta file header
    """
    for seq in parse_fasta(fasta):
        seq[0] = remove_char(seq[0])
        if len(seq[1]) > 0:
            yield seq

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('specify fasta file')
        exit()
    fasta = sys.argv[1]
    if fasta == '-':
        fasta = sys.stdin
    else:
        fasta = open(fasta)
    for seq in fix_fasta(fasta):
        print('\n'.join(seq))
--------------------------------------------------------------------------------
/ctbBio/stats.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import sys
import os
import numpy

def stats(nums):
    stat = {}
    stat['lines'] = len(nums)
    stat['mean'] = numpy.average(nums)
    stat['median'] = numpy.median(nums)
    stat['variance'] = numpy.var(nums)
    stat['std dev'] = numpy.std(nums)
    stat['sum'] = sum(nums)
    stat['min'] = numpy.amin(nums)
    stat['max'] = numpy.amax(nums)
    return stat

if __name__ == '__main__':
    if len(sys.argv) == 1:
        nums = [float(i) for i in sys.stdin]
    elif len(sys.argv) == 2:
        nums = [float(i) for i in open(sys.argv[1])]
    else:
        print('specify file or use stdin')
        exit()
    if len(nums) > 0:
        results = stats(nums)
    else:
        print('no nums')
        exit()
    for stat in ['lines', 'mean', 'median',
                 'variance', 'std dev',
                 'sum', 'min', 'max']:
        print('%s:\t%s' % (stat, format(results[stat])))
--------------------------------------------------------------------------------
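
A quick worked example for stats() (a sketch assuming the package is importable; the values are chosen so the results are easy to verify by hand):

from ctbBio.stats import stats

s = stats([2, 4, 4, 4, 5, 5, 7, 9])
print(s['mean'])     # 5.0
print(s['std dev'])  # 2.0 (population standard deviation, numpy's default)
print(s['sum'])      # 40
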
/ctbBio/stockholm2fa.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for converting a stockholm formatted alignment to fasta
"""

import os
import re
import sys

def stock2fa(stock):
    """
    convert stockholm to fasta
    """
    seqs = {}
    for line in stock:
        if line.startswith('#') is False and line.startswith(' ') is False and len(line) > 3:
            id, seq = line.strip().split()
            id = id.rsplit('/', 1)[0]
            id = re.split(r'[0-9]\|', id, 1)[-1]
            if id not in seqs:
                seqs[id] = []
            seqs[id].append(seq)
        if line.startswith('//'):
            break
    return seqs

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('specify stockholm formatted alignment')
        exit()
    stock = sys.argv[1]
    if stock == '-':
        stock = sys.stdin
    else:
        stock = open(stock)
    for id, seq in list(stock2fa(stock).items()):
        print('\n'.join(['>%s' % (id), ''.join(seq)]))
--------------------------------------------------------------------------------
/ctbBio/rec_best_blast.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for getting rec. best blast hits
"""

import sys
import os
from ctbBio.numblast import best as bestblast

def rec_hits(blast, evalue = 0.01, bit = False):
    genes = {}
    rec_hits = []
    for out in blast:
        out = open(out)
        for hit in bestblast(out, 1, evalue, bit):
            query, match = hit[0].split()[0], hit[1].split()[0]
            if query not in genes:
                genes[query] = {}
            genes[query][match] = 0
    for out in blast:
        out = open(out)
        for hit in bestblast(out, 1, evalue, bit):
            query, match = hit[0].split()[0], hit[1].split()[0]
            if match in genes and query in genes[match]:
                genes[match][query] = 1
    for out in blast:
        out = open(out)
        for hit in bestblast(out, 1, evalue, bit):
            query, match = hit[0].split()[0], hit[1].split()[0]
            if genes[query][match] == 1:
                rec_hits.append('\t'.join(hit))
    return set(rec_hits)

if __name__ == '__main__':
    blast = sys.argv[1:]
    for rec in rec_hits(blast):
        print(rec)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016 christophertbrown

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/ctbBio/fasta_length.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for reporting the length of sequences in a fasta file
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

def get_length(sequence):
    return [sequence[0], len(sequence[1].replace('-', '').replace('.', ''))]

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print('usage: fasta_length.py <fasta> <length threshold>', file=sys.stderr)
        exit()
    fasta = sys.argv[1]
    threshold = float(sys.argv[2])
    if fasta == '-':
        fasta = sys.stdin
    for sequence in parse_fasta(fasta):
        length = get_length(sequence)
        if threshold == 0:
            # report the full header and length in the printed header line
            length = [length[0].split('>')[1].split()[0], length[0].split('>')[1], str(length[1])]
            print('>%s' % ('\t'.join(length[1:])))
            print(sequence[1])
        elif length[1] >= threshold:
            print('\n'.join(sequence))
--------------------------------------------------------------------------------
/ctbBio/name2fasta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

def split_fasta(f, id2f):
    """
    split fasta file into separate fasta files based on list of scaffolds
    that belong to each separate file
    """
    opened = {}
    for seq in parse_fasta(f):
        id = seq[0].split('>')[1].split()[0]
        if id not in id2f:
            continue
        fasta = id2f[id]
        if fasta not in opened:
            opened[fasta] = '%s.fa' % fasta
        seq[1] += '\n'
        with open(opened[fasta], 'a+') as f_out:
            f_out.write('\n'.join(seq))

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('usage: name2fasta.py <fasta> <scaffold-to-fasta name map>')
        exit()
    fasta, id2fasta = sys.argv[1:]
    if fasta == '-':
        fasta = sys.stdin
    else:
        fasta = open(fasta)
    if id2fasta == '-':
        id2fasta = sys.stdin
    else:
        id2fasta = open(id2fasta)
    id2fasta = {s.strip().split()[0]:s.strip().replace(' ', '_').split()[1] for s in id2fasta}
    split_fasta(fasta, id2fasta)
--------------------------------------------------------------------------------
/ctbBio/name2faa.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

def split_fasta(f, id2f):
    """
    split fasta file into separate fasta files based on list of scaffolds
    that belong to each separate file
    """
    opened = {}
    for seq in parse_fasta(f):
        id = seq[0].split('>')[1].split()[0].rsplit('_', 1)[0]
        if id not in id2f:
            continue
        fasta = id2f[id]
        if fasta not in opened:
            opened[fasta] = '%s.faa' % fasta
        seq[1] += '\n'
        with open(opened[fasta], 'a+') as f_out:
            f_out.write('\n'.join(seq))

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('usage: name2faa.py <faa> <scaffold-to-fasta name map>')
        exit()
    fasta, id2fasta = sys.argv[1:]
    if fasta == '-':
        fasta = sys.stdin
    else:
        fasta = open(fasta)
    if id2fasta == '-':
        id2fasta = sys.stdin
    else:
        id2fasta = open(id2fasta)
    id2fasta = {s.strip().split()[0]:s.strip().replace(' ', '_').split()[1] for s in id2fasta}
    split_fasta(fasta, id2fasta)
--------------------------------------------------------------------------------
/ctbBio/concat_align.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for concatenating alignments
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

def concat_align(fastas):
    """
    concatenate alignments
    """
    # read in sequences
    fa2len = {}
    seqs = {}
    IDs = []
    for fasta in fastas:
        seqs[fasta] = {}
        for seq in parse_fasta(fasta):
            ID = seq[0].split('>')[1].split()[0]
            IDs.append(ID)
            seqs[fasta][ID] = seq[1]
            fa2len[fasta] = len(seq[1])
    # concat sequences
    IDs = set(IDs)
    concat = {}
    for fasta in fastas:
        for ID in IDs:
            if ID not in concat:
                concat[ID] = []
            if ID not in seqs[fasta]:
                concat[ID].append('-'*fa2len[fasta])
            else:
                concat[ID].append(seqs[fasta][ID])
    return concat

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print('usage: concat_align.py <alignment1.fa> <alignment2.fa> ...')
        exit()
    fastas = sys.argv[1:]
    for id, c in list(concat_align(fastas).items()):
        print('\n'.join(['>%s' % (id), ''.join(c)]))
--------------------------------------------------------------------------------
/ctbBio/unmapped.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
get unmapped reads
"""

import sys
import os
import argparse

def unmapped(sam, mates):
    """
    get unmapped reads
    """
    for read in sam:
        if read.startswith('@') is True:
            continue
        read = read.strip().split()
        if read[2] == '*' and read[6] == '*':
            yield read
        elif mates is True:
            if read[2] == '*' or read[6] == '*':
                yield read
            for i in read:
                if i == 'YT:Z:UP':
                    yield read

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = '# unmapped reads from sam file')
    parser.add_argument(
        '-s', required = True, help = 'path to sam file (- for stdin)')
    parser.add_argument(
        '--mates', action = 'store_true', help = 'return both mates if one did not map (default: return neither mate if one mapped)')
    args = vars(parser.parse_args())
    sam, mates = args['s'], args['mates']
    if sam == '-':
        sam = sys.stdin
    else:
        sam = open(sam)
    for read in unmapped(sam, mates):
        print('\t'.join(read))
--------------------------------------------------------------------------------
/ctbBio/rc.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for getting the reverse complement of a nucleotide sequence
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

rc = {'A': 'T',
      'T': 'A',
      'G': 'C',
      'C': 'G',
      'N': 'N',
      'a': 't',
      't': 'a',
      'g': 'c',
      'c': 'g',
      'n': 'n'}

def complement(seq):
    rev_c = []
    for base in seq[1]:
        if base not in rc:
            rev_c.append('N')
        else:
            rev_c.append(rc[base])
    return [seq[0], ''.join(rev_c)]

def reverse_complement(seq):
    rev_c = []
    for base in seq[1][::-1]:
        if base not in rc:
            rev_c.append('N')
        else:
            rev_c.append(rc[base])
    return [seq[0], ''.join(rev_c)]

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('specify fasta or - if from stdin and c (for complement) or rc (for reverse complement)')
        exit()
    fasta, option = sys.argv[1], sys.argv[2]
    if fasta == '-':
        fasta = sys.stdin
    else:
        fasta = open(fasta)
    if option == 'c':
        for seq in parse_fasta(fasta):
            print('\n'.join(complement(seq)))
    elif option == 'rc':
        for seq in parse_fasta(fasta):
            print('\n'.join(reverse_complement(seq)))
    else:
        print('specify fasta or - if from stdin and c (for complement) or rc (for reverse complement)')
        exit()
--------------------------------------------------------------------------------
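
A small sketch of reverse_complement() (assumes the package is importable; sequences take the [header, sequence] form used throughout the repo):

from ctbBio.rc import reverse_complement

seq = ['>example', 'ATGCn']
print(reverse_complement(seq)[1])  # nGCAT (bases missing from the lookup become N)
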
/MANIFEST:
--------------------------------------------------------------------------------
# file GENERATED by distutils, do NOT edit
setup.py
ctbBio/16SfromHMM.py
ctbBio/23SfromHMM.py
ctbBio/__init__.py
ctbBio/besthits.py
ctbBio/calculate_coverage.py
ctbBio/cluster_ani.py
ctbBio/compare_aligned.py
ctbBio/concat_align.py
ctbBio/crossmap.py
ctbBio/fasta.py
ctbBio/fasta_length.py
ctbBio/fasta_region.py
ctbBio/fasta_stats.py
ctbBio/fastq2fasta.py
ctbBio/fastq_merge.py
ctbBio/fastq_split.py
ctbBio/filter_fastq_sam.py
ctbBio/fix_fasta.py
ctbBio/genome_abundance.py
ctbBio/genome_coverage.py
ctbBio/genome_variation.py
ctbBio/lookup-word.py
ctbBio/lookup.py
ctbBio/mapped.py
ctbBio/n50.py
ctbBio/name2faa.py
ctbBio/name2fasta.py
ctbBio/ncbi_download.py
ctbBio/neto.py
ctbBio/nr_fasta.py
ctbBio/numblast-pident.py
ctbBio/numblast.py
ctbBio/orthologer.py
ctbBio/orthologer_summary.py
ctbBio/parallel.py
ctbBio/rRNA_copies.py
ctbBio/rRNA_insertions.py
ctbBio/rRNA_insertions_gff.py
ctbBio/rax.py
ctbBio/rc.py
ctbBio/rec_best_blast.py
ctbBio/rp16.py
ctbBio/rp16_retreive.sh
ctbBio/sam2fastq.py
ctbBio/search.py
ctbBio/shuffle_genome.py
ctbBio/sixframe.py
ctbBio/stats.py
ctbBio/stockholm2fa.py
ctbBio/stockholm2oneline.py
ctbBio/strip_align.py
ctbBio/strip_align_inserts.py
ctbBio/strip_masked.py
ctbBio/subset_sam.py
ctbBio/transform.py
ctbBio/unmapped.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# vim
.*.swp

# Ignore Mac DS_Store files
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/ctbBio/stockholm2oneline.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for converting a stockholm formatted alignment to single line
"""

import os
import re
import sys

def print_line(l):
    """
    print line if starts with ...
    """
    print_lines = ['# STOCKHOLM', '#=GF', '#=GS', ' ']
    if len(l.split()) == 0:
        return True
    for start in print_lines:
        if l.startswith(start):
            return True
    return False

def stock2one(stock):
    """
    convert stockholm to single line format
    """
    lines = {}
    for line in stock:
        line = line.strip()
        if print_line(line) is True:
            yield line
            continue
        if line.startswith('//'):
            continue
        ID, seq = line.rsplit(' ', 1)
        if ID not in lines:
            lines[ID] = ''
        else:
            # remove preceding white space
            seq = seq.strip()
        lines[ID] += seq
    for ID, line in lines.items():
        yield '\t'.join([ID, line])
    yield '\n//'

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('convert to single line stockholm formatted alignment')
        exit()
    stock = sys.argv[1]
    if stock == '-':
        stock = sys.stdin
    else:
        stock = open(stock)
    for line in stock2one(stock):
        print(line)
--------------------------------------------------------------------------------
/ctbBio/fasta_region.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for getting a specific region of a fasta file
"""

import sys
import os
import ctbBio.fasta as fasta_parser

def positions(region, seq_len):
    region = region.split('-')
    start = int(region[0])
    if region[1] == '':
        # an open-ended region (e.g. 100-) runs to the end of the sequence
        stop = seq_len
    else:
        stop = int(region[1])
    return start, stop

def extract(sequence, region, length=0):
    start, stop = positions(region, len(sequence[1]))
    header = '%s %s' % (sequence[0], region)
    cut = sequence[1][start:stop]
    yield fasta_parser.format_print([header, cut], length)

if __name__ == "__main__":
    if len(sys.argv) == 2:
        region = sys.argv[1]
        for sequence in fasta_parser.iterate_fasta(sys.stdin):
            for split in extract(sequence, region):
                print('\n'.join(split))
    elif len(sys.argv) == 3:
        region, length = sys.argv[1], int(sys.argv[2])
        for sequence in fasta_parser.iterate_fasta(sys.stdin):
            for split in extract(sequence, region, length):
                print('\n'.join(split))
    elif len(sys.argv) != 4:
        print('please specify the fasta file, the region that you would like to extract, '
              'and the number of characters that you want per line, or 0 for one line, '
              'e.g.: 0-500 or 0- or 100-500')
        exit()
    else:
        fasta, region, length = sys.argv[1], sys.argv[2], int(sys.argv[3])
        if fasta == '-':
            fasta = sys.stdin
        else:
            fasta = open(fasta)
        for sequence in fasta_parser.iterate_fasta(fasta):
            for split in extract(sequence, region, length):
                print('\n'.join(split))
--------------------------------------------------------------------------------
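
A usage sketch for extract() (assuming the package is importable; coordinates follow Python slicing, so '2-6' is zero-based and end-exclusive):

from ctbBio.fasta_region import extract

seq = ['>scaffold_1', 'ACGTACGTAC']
for sub in extract(seq, '2-6'):
    print('\n'.join(sub))
# >scaffold_1 2-6
# GTAC
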
/ctbBio/sixframe.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for generating six frame translations
"""

import sys
import os
from Bio.Seq import Seq
from ctbBio.fasta import iterate_fasta as parse_fasta

def six_frame(genome, table, minimum = 10):
    """
    translate each sequence into six reading frames
    """
    for seq in parse_fasta(genome):
        # Seq no longer takes an alphabet argument (Bio.Alphabet was removed from Biopython)
        dna = Seq(seq[1].upper().replace('U', 'T'))
        counter = 0
        for sequence in ['f', dna], ['rc', dna.reverse_complement()]:
            direction, sequence = sequence
            for frame in range(0, 3):
                for prot in \
                        sequence[frame:].\
                        translate(table = table, to_stop = False).split('*'):
                    if len(prot) < minimum:
                        continue
                    counter += 1
                    header = '%s_%s table=%s frame=%s-%s %s' % \
                            (seq[0].split()[0], counter, table, frame+1, \
                            direction, ' '.join(seq[0].split()[1:]))
                    yield [header, prot]

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('usage: sixframe.py <fasta> <translation table>')
        exit()
    genome, table = sys.argv[1:]
    if genome == '-':
        genome = sys.stdin
    else:
        genome = open(genome)
    for seq in six_frame(genome, table, minimum = 10):
        print('%s\n%s' % (seq[0], seq[1]))
--------------------------------------------------------------------------------
/ctbBio/subset_reads.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for randomly subsetting reads
in fastq file
"""

import os
import sys
import gzip
import random
import argparse
from itertools import cycle

def parse_fq(fq):
    """
    parse fastq file
    """
    c = cycle([1, 2, 3, 4])
    read = []
    for line in fq:
        n = next(c)
        read.append(line.strip())
        if n == 4:
            yield read
            read = []

def sub_fq(R1, R2, percent):
    """
    randomly subset fastq files
    """
    pool = [1 for i in range(0, percent)] + [0 for i in range(0, 100 - percent)]
    for r1, r2 in zip(parse_fq(R1), parse_fq(R2)):
        if random.choice(pool) == 1:
            yield r1
            yield r2

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = '# randomly subset fastq files')
    parser.add_argument(
        '-1', required = True, type = str,
        help = 'path to forward reads')
    parser.add_argument(
        '-2', required = True, type = str,
        help = 'path to reverse reads')
    parser.add_argument(
        '-p', required = True, type = int,
        help = 'percent of reads to report, e.g. 50 (approximate)')
    args = vars(parser.parse_args())
    R1, R2, percent = args['1'], args['2'], args['p']
    if R1.rsplit('.', 1)[1] == 'gz':
        R1, R2 = gzip.open(R1, 'rt'), gzip.open(R2, 'rt')
    else:
        R1, R2 = open(R1), open(R2)
    for read in sub_fq(R1, R2, percent):
        print('\n'.join(read))
--------------------------------------------------------------------------------
/ctbBio/numblast-pident.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for getting a specified number of hits from a blast tsv file
under a specific evalue or bit score threshold, if provided
"""

import sys, os
from operator import itemgetter

def top_hits(hits, max):
    hits = sorted(hits, key = itemgetter(-1, 11), reverse = True)
    for hit in hits[0:max]:
        yield [str(i) for i in hit[0:-1]]

def best(blast, max, pident = False):
    prev, hits = None, []
    for line in blast:
        line = line.strip().split('\t')
        if line[10] == '*':
            line[10] = line[11] = float(line[2])
        else:
            line[10], line[11] = float(line[10]), float(line[11])
        id = line[0]
        line.append(float(line[10]) / -1)
        if id != prev:
            if len(hits) > 0:
                for hit in top_hits(hits, max):
                    yield hit
            hits = []
        if pident == False:
            hits.append(line)
        elif float(line[2]) >= pident:
            hits.append(line)
        prev = id
    for hit in top_hits(hits, max):
        yield hit

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print('usage: numblast-pident.py <blast> <number of hits> <min percent identity>')
        exit()
    blast, max, thresholds = sys.argv[1], int(sys.argv[2]), sys.argv[3:]
    if blast == '-':
        blast = sys.stdin
    else:
        blast = open(blast)
    for i, t in enumerate(thresholds):
        if t == '-':
            t = False
        else:
            t = float(t)
        thresholds[i] = t
    pident = thresholds[0]
    for hit in best(blast, max, pident):
        print('\t'.join(hit))
--------------------------------------------------------------------------------
/ctbBio/fasta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for parsing a fasta file
"""

import sys
import os

def iterate_fasta(fasta, length=0, string = False):
    sequence = []
    if type(fasta) is str and string is False:
        fasta = open(fasta)
    elif type(fasta) is str and string is True:
        fasta = fasta.split('\n')
    for line in fasta:
        if line == '\n':
            continue
        sequence, formatted = parse_fasta(line, sequence, length)
        if formatted != []:
            yield formatted
    yield format_print(sequence, length)

def parse_fasta(line, sequence, length=0):
    line = line.strip()
    formatted = []
    if line.startswith('>'):
        if sequence != []:
            formatted = format_print(sequence, length)
        sequence = [line, []]
    else:
        sequence[1].append(line.replace(' ', ''))
    return sequence, formatted

def format_print(sequence, length=0):
    if sequence == []:
        return [[], []]
    if length == 0:
        formatted = [sequence[0], ''.join(sequence[1])]
    else:
        sequence[1] = ''.join(sequence[1])
        formatted = [sequence[0], '\n'.join(sequence[1][i:i+length] for i in range(0, len(sequence[1]), length))]
    return formatted

if __name__ == "__main__":
    if len(sys.argv) == 1:
        for sequence in iterate_fasta(sys.stdin):
            print('\n'.join(sequence))
    elif len(sys.argv) == 2:
        length = int(sys.argv[1])
        for sequence in iterate_fasta(sys.stdin, length):
            print('\n'.join(sequence))
    elif len(sys.argv) != 3:
        print('please specify the fasta file and the number of characters '
              'to print on each line, or 0 to print all characters on one line')
        exit()
    else:
        fasta, length = sys.argv[1], int(sys.argv[2])
        for sequence in iterate_fasta(fasta, length):
            print('\n'.join(sequence))
--------------------------------------------------------------------------------
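
A short sketch of iterate_fasta()'s string mode (assuming the package is importable; string=True parses an in-memory string instead of opening a path):

from ctbBio.fasta import iterate_fasta

fa = '>a\nACGT\nACGT\n>b\nGGGG\n'
for header, seq in iterate_fasta(fa, string=True):
    print(header, len(seq))
# >a 8
# >b 4
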
/ctbBio/nr_fasta.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for dereplicating fasta files based on sequence names (not sequence similarity)
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

def append_index_id(id, ids):
    """
    add index to id to make it unique wrt ids
    """
    index = 1
    mod = '%s_%s' % (id, index)
    while mod in ids:
        index += 1
        mod = '%s_%s' % (id, index)
    ids.append(mod)
    return mod, ids

def de_rep(fastas, append_index, return_original = False):
    """
    de-replicate fastas based on sequence names
    """
    ids = []
    for fasta in fastas:
        for seq in parse_fasta(fasta):
            header = seq[0].split('>')[1].split()
            id = header[0]
            if id not in ids:
                ids.append(id)
                if return_original is True:
                    yield [header, seq]
                else:
                    yield seq
            elif append_index == True:
                new, ids = append_index_id(id, ids)
                if return_original is True:
                    yield [header, ['>%s %s' % (new, ' '.join(header[1::])), seq[1]]]
                else:
                    yield ['>%s %s' % (new, ' '.join(header[1::])), seq[1]]

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print('usage: nr_fasta.py <rename or exclude> <fasta1> <fasta2> ...')
        exit()
    option, fastas = sys.argv[1], sys.argv[2:]
    for fasta in fastas:
        if fasta == '-':
            fastas = [sys.stdin]
    if option == 'rename':
        append_index = True
    elif option == 'exclude':
        append_index = False
    else:
        print('specify rename or exclude for redundant sequences')
        exit()
    for seq in de_rep(fastas, append_index):
        print('\n'.join(seq))
--------------------------------------------------------------------------------
/ctbBio/n50.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for calculating n50 and other genome stats
"""

import sys
import os
import argparse

# ctb scripts
from ctbBio.fasta import iterate_fasta as parse_fasta

def gc(sequence):
    count = 0
    for base in sequence:
        base = base.lower()
        if base == 'g' or base == 'c':
            count += 1
    return float(float(count) / float(len(sequence))) * float(100)

def n50(fasta):
    length_list = []
    sequences = []
    for sequence in parse_fasta(fasta):
        length_list.append(float(len(sequence[1])))
        sequences.append(sequence[1])
    length_list.sort()
    length_list.reverse()
    total = float(sum(length_list))
    n = total * float(0.50)
    # N50 = length of the contig at which the running total
    # (largest contigs first) reaches half of the assembly size
    running = 0
    for length in length_list:
        running += length
        if running >= n:
            return length, total, \
                    len(length_list), gc(''.join(sequences))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = '# calculate n50 and other genome stats')
    parser.add_argument(
        '-i', nargs = '*', action = 'store', required = True,
        help = 'fasta(s)')
    args = vars(parser.parse_args())
    genomes = []
    for genome in args['i']:
        if genome == '-':
            genome = sys.stdin
        else:
            genome = open(genome)
        genomes.append(genome)
    print('\t'.join(['#genome', 'contigs', 'bases', 'gc', 'n50']))
    for genome in genomes:
        n50_value, total_bases, num_contigs, gc_cont = n50(genome)
        total_bases = '{:,}'.format(int(total_bases))
        gc_cont = round(gc_cont, 2)
        n50_value = '{:,}'.format(int(n50_value))
        print('\t'.join([str(i) for i in [genome.name, num_contigs, total_bases, gc_cont, n50_value]]))
--------------------------------------------------------------------------------
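
A hand-checkable sketch of n50() (assumes the package is importable; io.StringIO stands in for an open fasta file):

import io
from ctbBio.n50 import n50

fa = io.StringIO('>a\n' + 'A' * 50 + '\n>b\n' + 'G' * 30 + '\n>c\n' + 'C' * 20 + '\n')
n50_value, bases, contigs, gc_content = n50(fa)
print(n50_value, bases, contigs, gc_content)  # 50.0 100.0 3 50.0
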
/ctbBio/numblast.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for getting a specified number of hits from a blast tsv file
under a specific evalue or bit score threshold, if provided
"""

import sys, os
from operator import itemgetter

def top_hits(hits, max):
    hits = sorted(hits, key = itemgetter(-1, 11), reverse = True)
    for hit in hits[0:max]:
        yield [str(i) for i in hit[0:-1]]

def best(blast, max, evalue = False, bit = False):
    prev, hits = None, []
    for line in blast:
        line = line.strip().split('\t')
        if line[10] == '*':
            line[10] = line[11] = float(line[2])
        else:
            line[10], line[11] = float(line[10]), float(line[11])
        ID = line[0].split()[0]
        line.append(float(line[10]) / -1) # sort key: negated e-value (stripped off again in top_hits)
        if ID != prev:
            if len(hits) > 0:
                for hit in top_hits(hits, max):
                    yield hit
            hits = []
        if evalue == False and bit == False:
            hits.append(line)
        elif line[10] <= evalue and bit == False:
            hits.append(line)
        elif line[10] <= evalue and line[11] >= bit:
            hits.append(line)
        elif evalue == False and line[11] >= bit:
            hits.append(line)
        prev = ID
    for hit in top_hits(hits, max):
        yield hit

if __name__ == "__main__":
    if len(sys.argv) != 5:
        print('usage: numblast.py <blast> <number of hits> <max e-value> <min bit score>')
        print('# use - if from stdin or in place of a threshold')
        exit()
    blast, max, thresholds = sys.argv[1], int(sys.argv[2]), sys.argv[3:5]
    if blast == '-':
        blast = sys.stdin
    else:
        blast = open(blast)
    for i, t in enumerate(thresholds):
        if t == '-':
            t = False
        else:
            t = float(t)
        thresholds[i] = t
    e, bit = thresholds
    for hit in best(blast, max, e, bit):
        print('\t'.join(hit))
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

# python setup.py sdist upload -r pypi

from distutils.core import setup

version = '0.47'

packages = ['ctbBio']

scripts = ['ctbBio/16SfromHMM.py', 'ctbBio/23SfromHMM.py', 'ctbBio/numblast.py', 'ctbBio/besthits.py',
           'ctbBio/calculate_coverage.py', 'ctbBio/cluster_ani.py', 'ctbBio/compare_aligned.py',
           'ctbBio/concat_align.py', 'ctbBio/crossmap.py', 'ctbBio/fasta.py', 'ctbBio/fasta_length.py',
           'ctbBio/fasta_region.py', 'ctbBio/fasta_stats.py', 'ctbBio/fastq2fasta.py', 'ctbBio/fastq_merge.py',
           'ctbBio/fastq_split.py', 'ctbBio/filter_fastq_sam.py', 'ctbBio/fix_fasta.py', 'ctbBio/genome_abundance.py',
           'ctbBio/genome_coverage.py', 'ctbBio/genome_variation.py', 'ctbBio/lookup-word.py', 'ctbBio/lookup.py',
           'ctbBio/mapped.py', 'ctbBio/n50.py', 'ctbBio/name2fasta.py',
           'ctbBio/name2faa.py', 'ctbBio/neto.py', 'ctbBio/rec_best_blast.py',
           'ctbBio/nr_fasta.py', 'ctbBio/numblast-pident.py', 'ctbBio/orthologer.py', 'ctbBio/orthologer_summary.py',
           'ctbBio/parallel.py', 'ctbBio/rRNA_copies.py', 'ctbBio/rRNA_insertions.py', 'ctbBio/rax.py',
           'ctbBio/rc.py', 'ctbBio/rp16.py', 'ctbBio/rp16_retreive.sh', 'ctbBio/sam2fastq.py', 'ctbBio/search.py',
           'ctbBio/shuffle_genome.py', 'ctbBio/sixframe.py', 'ctbBio/stats.py', 'ctbBio/stockholm2fa.py',
           'ctbBio/stockholm2oneline.py', 'ctbBio/strip_align.py', 'ctbBio/strip_align_inserts.py',
           'ctbBio/strip_masked.py', 'ctbBio/subset_sam.py', 'ctbBio/subset_reads.py', 'ctbBio/transform.py',
           'ctbBio/unmapped.py', 'ctbBio/rRNA_insertions_gff.py',
           'ctbBio/ncbi_download.py']

classifiers = ['Programming Language :: Python', 'Programming Language :: Python :: 3']

requirements = ['networkx', 'python-Levenshtein', 'numpy', 'pandas', 'biopython', 'tqdm']

setup(name='ctbBio',
      author='Chris Brown',
      author_email='ctb@berkeley.edu',
      packages=packages,
      scripts=scripts,
      version=version,
      license='MIT',
      url='https://github.com/christophertbrown/bioscripts',
      description='scripts for working with sequencing data',
      install_requires=requirements,
      classifiers=classifiers
      )
--------------------------------------------------------------------------------
/ctbBio/strip_masked.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for stripping out masked sequences if the masked sequence
is above a specified length
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta
import argparse

def parse_masked(seq, min_len):
    """
    parse masked sequence into non-masked and masked regions
    """
    nm, masked = [], [[]]
    prev = None
    for base in seq[1]:
        if base.isupper():
            nm.append(base)
            # restore the last masked run if it was too short to strip
            # (guard against an empty list after a previous removal)
            if masked and masked[-1] and len(masked[-1]) < min_len:
                nm.extend(masked[-1])
                del masked[-1]
            prev = False
        elif base.islower():
            if prev is False:
                masked.append([])
            masked[-1].append(base)
            prev = True
    return nm, masked

def strip_masked(fasta, min_len, print_masked):
    """
    remove masked regions from fasta file as long as
    they are longer than min_len
    """
    for seq in parse_fasta(fasta):
        nm, masked = parse_masked(seq, min_len)
        nm = ['%s removed_masked >=%s' % (seq[0], min_len), ''.join(nm)]
        yield [0, nm]
        if print_masked is True:
            for i, m in enumerate([i for i in masked if i != []], 1):
                m = ['%s insertion:%s' % (seq[0], i), ''.join(m)]
                yield [1, m]

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = '# remove masked portion of sequences in fasta file')
    parser.add_argument(
        '-f', required = True,
        help = 'fasta file')
    parser.add_argument(
        '-l', default = 0,
        type = int,
        help = 'minimum length of masked region required for removal')
    parser.add_argument(
        '--print-masked', action = 'store_true',
        help = 'print masked sequences to stderr')
    args = vars(parser.parse_args())
    fasta, min_len, print_masked = \
            args['f'], args['l'], args['print_masked']
    if fasta == '-':
        fasta = sys.stdin
    else:
        fasta = open(fasta)
    for i in strip_masked(fasta, min_len, print_masked):
        if i[0] == 0:
            print('\n'.join(i[1]))
        else:
            print('\n'.join(i[1]), file=sys.stderr)
--------------------------------------------------------------------------------
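
A sketch of parse_masked() (assuming the package is importable; lowercase runs shorter than min_len are restored to the unmasked sequence, longer ones are kept aside):

from ctbBio.strip_masked import parse_masked

seq = ['>s', 'ACGTacgtacACGT']
nm, masked = parse_masked(seq, 5)
print(''.join(nm))                        # ACGTACGT
print([''.join(m) for m in masked if m])  # ['acgtac']
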
/ctbBio/strip_align.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for 'stripping' out columns in a MSA that represent gaps for X percent of sequences
"""

import sys
import os
from ctbBio.fasta import iterate_fasta as parse_fasta

def plot_gaps(plot, columns):
    """
    plot % of gaps at each position
    """
    from plot_window import window_plot_convolve as plot_window
    # plot_window([columns], len(columns)*.01, plot)
    plot_window([[100 - i for i in columns]], len(columns)*.01, plot)

def strip_msa_100(msa, threshold, plot = False):
    """
    strip out columns of a MSA that represent gaps for X percent (threshold) of sequences
    """
    msa = [seq for seq in parse_fasta(msa)]
    columns = [[0, 0] for pos in msa[0][1]] # [[#bases, #gaps], [#bases, #gaps], ...]
    for seq in msa:
        for position, base in enumerate(seq[1]):
            if base == '-' or base == '.':
                columns[position][1] += 1
            else:
                columns[position][0] += 1
    columns = [float(float(g)/float(g+b)*100) for b, g in columns] # convert to percent gaps
    for seq in msa:
        stripped = []
        for position, base in enumerate(seq[1]):
            if columns[position] < threshold:
                stripped.append(base)
        yield [seq[0], ''.join(stripped)]
    if plot is not False:
        plot_gaps(plot, columns)

def strip_msa(msa, threshold, plot = False):
    """
    strip out columns of a MSA that represent gaps for X percent (threshold) of sequences
    """
    msa = [seq for seq in parse_fasta(msa)]
    columns = [[0, 0] for pos in msa[0][1]] # [[#bases, #gaps], [#bases, #gaps], ...]
    for seq in msa:
        for position, base in enumerate(seq[1]):
            if base == '-' or base == '.':
                columns[position][1] += 1
            else:
                columns[position][0] += 1
    columns = [float(float(g)/float(g+b)*100) for b, g in columns] # convert to percent gaps
    for seq in msa:
        stripped = []
        for position, base in enumerate(seq[1]):
            if columns[position] <= threshold:
                stripped.append(base)
        yield [seq[0], ''.join(stripped)]
    if plot is not False:
        plot_gaps(plot, columns)

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print('specify MSA, threshold, and file name for pdf or False')
        exit()
    msa, threshold, plot = sys.argv[1], float(sys.argv[2]), sys.argv[3]
    if msa == '-':
        msa = sys.stdin
    else:
        msa = open(msa)
    if plot == 'False':
        plot = False
    if threshold == 100:
        for seq in strip_msa_100(msa, threshold, plot):
            print('\n'.join(seq))
    else:
        for seq in strip_msa(msa, threshold, plot):
            print('\n'.join(seq))
--------------------------------------------------------------------------------
/ctbBio/subset_sam.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
script for randomly subsetting a sam file
"""

import sys
import os
from itertools import cycle
from subprocess import Popen, PIPE
import argparse
import random

def sort_sam(sam, sort, sbuffer):
    """
    sort sam file
    """
    tempdir = '%s/' % (os.path.abspath(sam).rsplit('/', 1)[0])
    if sort is True:
        mapping = '%s.sorted.sam' % (sam.rsplit('.', 1)[0])
        if sam != '-':
            if os.path.exists(mapping) is False:
                os.system("\
                        sort -k1 --buffer-size=%sG -T %s -o %s %s\
                        " % (sbuffer, tempdir, mapping, sam))
        else:
            mapping = 'stdin-sam.sorted.sam'
            p = Popen("sort -k1 --buffer-size=%sG -T %s -o %s" \
                    % (sbuffer, tempdir, mapping), stdin = sys.stdin, shell = True)
            p.communicate()
        mapping = open(mapping)
    else:
        if sam == '-':
            mapping = sys.stdin
        else:
            mapping = open(sam)
    return mapping

def sub_sam(sam, percent, sort = True, sbuffer = False):
    """
    randomly subset sam file
    """
    mapping = sort_sam(sam, sort, sbuffer)
    pool = [1 for i in range(0, percent)] + [0 for i in range(0, 100 - percent)]
    c = cycle([1, 2])
    for line in mapping:
        line = line.strip().split()
        if line[0].startswith('@'): # get the sam header
            yield line
            continue
        if int(line[1]) <= 20: # is this from a single read?
            if random.choice(pool) == 1:
                yield line
        else:
            n = next(c)
            if n == 1:
                prev = line
            if n == 2 and random.choice(pool) == 1:
                yield prev
                yield line

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description = '# randomly subset sam file')
    parser.add_argument(
        '-s', required = True, help = 'path to sorted sam file (- for stdin)')
    parser.add_argument(
        '-p', required = True, type = int,
        help = 'percent of reads to report, e.g. 50 (approximate)')
    parser.add_argument(
        '--sort', action = 'store_true', help = 'sort the sam file')
    parser.add_argument(
        '-b', default = "100", help = 'buffer size (GB) to use when sorting sam file (default = 100)')
    args = vars(parser.parse_args())
    sam, percent, sort, buff = args['s'], args['p'], args['sort'], args['b']
    for line in sub_sam(sam, percent, sort, buff):
        print('\t'.join(line))
--------------------------------------------------------------------------------
/ctbBio/orthologer_summary.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
summarize orthologer output
"""

import sys, os
import numpy as np

def count_genes(counts, genes):
    index = 0
    for gene in genes:
        if gene != '-':
            counts[index] += 1
        index += 1
    return counts

def count_orthologs(counts, genes):
    index = 1
    if genes[0] != '-':
        for gene in genes[1:]:
            if gene != '-':
                counts[index] += 1
            index += 1
    return counts

def append_scores(scores, current):
    current = current[0].split(' ** ')
    index = 0
    for score in current:
        if score == '!':
            scores[index].append(100)
        elif score == '-' or score[0] == '-':
            continue
        else:
            pident = float(score.split()[2])
            scores[index].append(pident)
        index += 1
    return scores

def print_summary(genomes, genes, orthologs, pident):
    if len(genomes) > 2:
        header = ['# genome', 'genes', 'orthologs', 'average percent identity of orthologs']
        print('\t'.join(header))
        index = 0
        for genome in genomes:
            out = [genome]
            out.append(genes[index])
            out.append(orthologs[index])
            out.append(pident[index])
            out = [str(i) for i in out]
            print('\t'.join(out))
            index += 1
    else:
        header = ['# query genome', 'genes in query', 'reference genome', 'genes in reference', 'number of orthologs', 'average percent identity of orthologs']
        print('\t'.join(header))
        out = [genomes[0], genes[0], genomes[1], genes[1], orthologs[1], pident[1]]
        out = [str(i) for i in out]
        print('\t'.join(out))

def summarize(file):
    switch = 0
    for line in file:
        line = line.strip().split('\t')
        if line[0].startswith('### output'):
            switch = 1
            continue
        if switch == 0:
            continue
        if len(line) == 1:
            continue
        if line[0].startswith('#'):
            line[0] = line[0].split('# ')[1]
            genomes = line[::3]
            gene_counts = [0 for i in genomes]
            ortholog_counts = [0 for i in genomes]
            scores = [[] for i in genomes]
            continue
        gene_counts = count_genes(gene_counts, line[::3])
        ortholog_counts = count_orthologs(ortholog_counts, line[::3])
        scores = append_scores(scores, line[1::3])
    average_pident = [np.average(i) for i in scores]
    return genomes, gene_counts, ortholog_counts, average_pident

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('usage: orthologer_summary.py <orthologer output>')
        exit()
    file = sys.argv[1]
    if file == '-':
        file = sys.stdin
    else:
        file = open(file)
    genomes, gene_counts, ortholog_counts, average_pident = summarize(file)
    print_summary(genomes, gene_counts, ortholog_counts, average_pident)
--------------------------------------------------------------------------------
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import argparse 6 | 7 | def length_and_bases(coverage, sam): 8 | for line in sam: 9 | line = line.strip() 10 | if line.startswith('@SQ'): 11 | line = line.strip().split() 12 | scaffold, length = line[1].split(':', 1)[1], float(line[-1].split(':', 1)[1]) 13 | if scaffold not in coverage: 14 | coverage[scaffold] = [length, {}] 15 | elif line.startswith('@') is False: 16 | line = line.split('\t') 17 | scaffold, bases = line[2], float(len(line[9])) 18 | if scaffold not in coverage: 19 | coverage[scaffold] = [length, {}] 20 | map = int(line[1]) 21 | if map != 4 and map != 8: 22 | if scaffold != '*': 23 | if sam not in coverage[scaffold][1]: 24 | coverage[scaffold][1][sam] = 0 25 | coverage[scaffold][1][sam] += bases 26 | return coverage 27 | 28 | def combine_by_sample(coverage): 29 | combined_coverage = {} 30 | sams = [] 31 | for scaffold in coverage: 32 | length, combined = coverage[scaffold][0], {} 33 | for sam in coverage[scaffold][1]: 34 | comb = '.'.join(sam.name.split('.')[0:2]) 35 | if comb not in sams: 36 | sams.append(comb) 37 | if comb not in combined: 38 | combined[comb] = 0 39 | combined[comb] += coverage[scaffold][1][sam] 40 | combined_coverage[scaffold] = [length, combined] 41 | return combined_coverage, sams 42 | 43 | def calculate_coverage(bases): 44 | coverage = {} 45 | for scaffold in bases: 46 | length, counts = bases[scaffold][0], bases[scaffold][1] 47 | scaffold_coverage = {} 48 | for count in counts: 49 | scaffold_coverage[count] = float(counts[count] / length) 50 | coverage[scaffold] = [length, scaffold_coverage] 51 | return coverage 52 | 53 | def print_coverage(coverage, sams): 54 | out = ['# scaffold: length'] 55 | for sam in sams: 56 | out.append(sam.name) 57 | yield out 58 | for scaffold in coverage: 59 | length, cov = coverage[scaffold][0], coverage[scaffold][1] 60 | out = ['%s: %s' % (scaffold, length)] 61 | for sam in sams: 62 | if sam in cov: 63 | out.append(cov[sam]) 64 | else: 65 | out.append(0) 66 | yield out 67 | 68 | def iterate_sams(sams, combine = False): 69 | coverage = {} 70 | for sam in sams: 71 | coverage = length_and_bases(coverage, sam) 72 | if combine is True: 73 | coverage, sams = combine_by_sample(coverage) 74 | coverage = calculate_coverage(coverage) 75 | return coverage, sams 76 | 77 | if __name__ == '__main__': 78 | parser = argparse.ArgumentParser(description = '# calculate coverage from sam file') 79 | parser.add_argument(\ 80 | '-s', nargs = '*', action = 'store', required = True, \ 81 | help = 'sam(s)') 82 | args = vars(parser.parse_args()) 83 | sams = [] 84 | for sam in sorted(args['s']): 85 | if sam == '-': 86 | sam = sys.stdin 87 | else: 88 | sam = open(sam) 89 | sams.append(sam) 90 | coverage, sams = iterate_sams(sams) 91 | for i in print_coverage(coverage, sams): 92 | print('\t'.join([str(j) for j in i])) 93 | -------------------------------------------------------------------------------- /ctbBio/sam2fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | convert sam file to fastq file 5 | """ 6 | 7 | import sys 8 | import os 9 | 10 | # ctb 11 | from ctbBio.rc import reverse_complement as rc 12 | 13 | def print_single(line, rev): 14 | """ 15 | print single reads to stderr 16 | """ 17 | if rev is True: 18 | seq = rc(['', line[9]])[1] 19 | qual = line[10][::-1] 20 | else: 21 | seq = line[9] 22 | qual = line[10] 23 | fq 
= ['@%s' % line[0], seq, '+%s' % line[0], qual] 24 | print('\n'.join(fq), file = sys.stderr) 25 | 26 | def sam2fastq(sam, singles = False, force = False): 27 | """ 28 | convert sam to fastq 29 | """ 30 | L, R = None, None 31 | for line in sam: 32 | if line.startswith('@') is True: 33 | continue 34 | line = line.strip().split() 35 | bit = [True if i == '1' else False \ 36 | for i in bin(int(line[1])).split('b')[1][::-1]] 37 | while len(bit) < 8: 38 | bit.append(False) 39 | bit = bit[0:8] 40 | pair, proper, na, nap, rev, mrev, left, right = bit 41 | # make sure read is paired 42 | if pair is False: 43 | if singles is True: 44 | print_single(line, rev) 45 | continue 46 | # check if sequence is reverse-complemented 47 | if rev is True: 48 | seq = rc(['', line[9]])[1] 49 | qual = line[10][::-1] 50 | else: 51 | seq = line[9] 52 | qual = line[10] 53 | # check if read is forward or reverse, return when both have been found 54 | if left is True: 55 | if L is not None and force is False: 56 | print('sam file is not sorted', file = sys.stderr) 57 | print('\te.g.: %s' % (line[0]), file = sys.stderr) 58 | exit() 59 | if L is not None: 60 | L = None 61 | continue 62 | L = ['@%s' % line[0], seq, '+%s' % line[0], qual] 63 | if R is not None: 64 | yield L 65 | yield R 66 | L, R = None, None 67 | if right is True: 68 | if R is not None and force is False: 69 | print('sam file is not sorted', file = sys.stderr) 70 | print('\te.g.: %s' % (line[0]), file = sys.stderr) 71 | exit() 72 | if R is not None: 73 | R = None 74 | continue 75 | R = ['@%s' % line[0], seq, '+%s' % line[0], qual] 76 | if L is not None: 77 | yield L 78 | yield R 79 | L, R = None, None 80 | 81 | if __name__ == '__main__': 82 | if len(sys.argv) != 4: 83 | print('usage: sam2fastq.py ', 84 | file = sys.stderr) 85 | exit() 86 | sam, singles, force = sys.argv[1], sys.argv[2], sys.argv[3] 87 | if sam == '-': 88 | sam = sys.stdin 89 | else: 90 | sam = open(sam) 91 | if singles == 'True': 92 | singles = True 93 | else: 94 | singles = False 95 | if force == 'True': 96 | force = True 97 | else: 98 | force = False 99 | for seq in sam2fastq(sam, singles, force): 100 | print('\n'.join(seq)) 101 | -------------------------------------------------------------------------------- /ctbBio/shuffle_genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | script for randomly messing up a genome 5 | """ 6 | 7 | import os 8 | import sys 9 | import random 10 | import argparse 11 | import numpy as np 12 | 13 | # ctb 14 | from ctbBio.fasta import iterate_fasta as parse_fasta 15 | 16 | def plot_dist_normal(s, mu, sigma): 17 | """ 18 | plot distribution 19 | """ 20 | import matplotlib.pyplot as plt 21 | count, bins, ignored = plt.hist(s, 30, normed=True) 22 | plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) \ 23 | * np.exp( - (bins - mu)**2 / (2 * sigma**2) ), \ 24 | linewidth = 2, color = 'r') 25 | plt.show() 26 | 27 | def rev_c(read): 28 | """ 29 | return reverse completment of read 30 | """ 31 | rc = [] 32 | rc_nucs = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N'} 33 | for base in read: 34 | rc.extend(rc_nucs[base.upper()]) 35 | return rc[::-1] 36 | 37 | def shuffle_genome(genome, cat, fraction = float(100), plot = True, \ 38 | alpha = 0.1, beta = 100000, \ 39 | min_length = 1000, max_length = 200000): 40 | """ 41 | randomly shuffle genome 42 | """ 43 | header = '>randomized_%s' % (genome.name) 44 | sequence = list(''.join([i[1] for i in parse_fasta(genome)])) 45 | length = len(sequence) 46 | 
shuffled = [] 47 | # break genome into pieces 48 | while sequence is not False: 49 | s = int(random.gammavariate(alpha, beta)) 50 | if s <= min_length or s >= max_length: 51 | continue 52 | if len(sequence) < s: 53 | seq = sequence[0:] 54 | else: 55 | seq = sequence[0:s] 56 | sequence = sequence[s:] 57 | # if bool(random.getrandbits(1)) is True: 58 | # seq = rev_c(seq) 59 | # print('fragment length: %s reverse complement: True' % ('{:,}'.format(s)), file=sys.stderr) 60 | # else: 61 | # print('fragment length: %s reverse complement: False' % ('{:,}'.format(s)), file=sys.stderr) 62 | shuffled.append(''.join(seq)) 63 | if sequence == []: 64 | break 65 | # shuffle pieces 66 | random.shuffle(shuffled) 67 | # subset fragments 68 | if fraction == float(100): 69 | subset = shuffled 70 | else: 71 | max_pieces = int(length * fraction/100) 72 | subset, total = [], 0 73 | for fragment in shuffled: 74 | length = len(fragment) 75 | if total + length <= max_pieces: 76 | subset.append(fragment) 77 | total += length 78 | else: 79 | diff = max_pieces - total 80 | subset.append(fragment[0:diff]) 81 | break 82 | # combine sequences, if requested 83 | if cat is True: 84 | yield [header, ''.join(subset)] 85 | else: 86 | for i, seq in enumerate(subset): 87 | yield ['%s fragment:%s' % (header, i), seq] 88 | 89 | if __name__ == '__main__': 90 | parser = argparse.ArgumentParser(description = '# randomly re-arrange genome') 91 | parser.add_argument(\ 92 | '-f', nargs = '*', action = 'store', required = True, \ 93 | help = 'fasta(s)') 94 | parser.add_argument(\ 95 | '-p', type = float, default = 100, 96 | help = 'percent of genome to return (default = 100)') 97 | parser.add_argument(\ 98 | '--cat', action = 'store_true', \ 99 | help = 'concatenate random fragments') 100 | args = vars(parser.parse_args()) 101 | for genome in args['f']: 102 | if genome == '-': 103 | genome = sys.stdin 104 | else: 105 | genome = open(genome) 106 | for seq in shuffle_genome(genome, args['cat'], fraction = args['p']): 107 | print('\n'.join(seq)) 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bioscripts 2 | 3 | * some useful scripts for working with genomics and sequencing data 4 | 5 | * see also [bioscripts27](https://github.com/christophertbrown/bioscripts27) 6 | 7 | # installation 8 | 9 | `pip install ctbBio` 10 | 11 | ## rRNA identification using 16SfromHMM.py and 23SfromHMM.py 12 | 13 | The scripts use `cmsearch` from the Infernal package to do a sequence-based HMM search against 16S and 23S covariance models. The curated model from SSU-Align is used for 16S, and a custom-built model for 23S. The method is similar to what SSU-Align does by default, but accounts for the fact that rRNA genes may contain large insertion sequences. 14 | 15 | The method is described in: 16 | 17 | ["Unusual biology across a group comprising more than 15% of domain Bacteria"](http://dx.doi.org/10.1038/nature14486) - Christopher T. Brown, Laura A. Hug, Brian C. Thomas, Itai Sharon, Cindy J. Castelle, Andrea Singh, Michael J. Wilkins, Kelly C. Wrighton, Kenneth H. Williams & Jillian F. Banfield (*Nature* 2015). 18 | 19 | If using this software, please cite our paper along with Infernal and SSU-Align. 20 | 21 | E. P. Nawrocki, D. L. Kolbe, and S. R. Eddy, "Infernal 1.0: inference of RNA alignments.," Bioinformatics, vol. 25, no. 10, pp. 1335–1337, May 2009. 22 | 23 | E. P. 
Nawrocki, "Structural RNA Homology Search and Alignment using Covariance Models," Washington University in Saint Louis, School of Medicine, 2009.
24 | 
25 | ### requirements
26 | 
27 | * python3
28 | * [infernal](http://eddylab.org/infernal/)
29 | * rRNA_insertions.py requires HMMER3 and Pfam (use databases env. variable: ~/databases/pfam/Pfam-A.hmm).
30 | 
31 | ### databases
32 | 
33 | * 16S CM: databases/ssu-align-0p1.1.cm
34 | * 23S CM: databases/23S.cm
35 | 
36 | ### use env. variable to reference databases (optional)
37 | 
38 | `export ssucmdb="databases/ssu-align-0p1.1.cm"`
39 | 
40 | `export lsucmdb="databases/23S.cm"`
41 | 
42 | ### example usage for finding and analyzing rRNA insertions
43 | 
44 | * find 16S rRNA genes and insertions
45 | 
46 | `16SfromHMM.py -f <fasta> -m <16S CM> -t 6 > <16S fasta>`
47 | 
48 | * remove insertions (useful for phylogenetic analyses)
49 | 
50 | `strip_masked.py -f <16S fasta> -l 10 > <stripped 16S fasta>`
51 | 
52 | note: -l 10 specifies that insertions >= 10 bp are removed
53 | 
54 | ### analyze insertions
55 | 
56 | `rRNA_insertions.py <16S fasta>`
57 | 
58 | ## ortholog identification between pairs of genomes using orthologer.py
59 | 
60 | * orthologer.py conducts reciprocal usearch similarity searches between pairs of provided genomes to identify reciprocal best hits
61 | 
62 | * genomes can be supplied as either gene or protein multi-fasta files (one file per genome; each ORF must have a unique identifier)
63 | 
64 | ### requirements
65 | 
66 | * python3
67 | * `usearch`
68 | 
69 | ### usage
70 | 
71 | `orthologer.py <mode> <genome1.fasta> <genome2.fasta> [...] > orthologer.out`
72 | 
73 | Mode can be either "reference" or "global."
74 | 
75 | In "reference" mode, all searches are conducted against the first genome that is listed. In "global" mode, all possible pairwise searches are conducted between the listed genomes (# of searches = #genomes^2).
76 | 
77 | ## download genomes from NCBI
78 | 
79 | for usage see: `ncbi_download.py -h`
80 | 
81 | ### requirements
82 | * python3
83 | * tqdm
84 | * `wget`
85 | * `esearch` and `efetch` from [Entrez Direct](https://www.ncbi.nlm.nih.gov/books/NBK179288/) (optional)
86 | 
87 | ### download genomes
88 | `cat accessions.list | ncbi_download.py -g -`
89 | 
90 | ### download proteins
91 | `cat accessions.list | ncbi_download.py -g - -s "*protein.faa.gz"`
92 | 
93 | ### options
94 | The --test option can be used to check whether the accession numbers can be found without downloading the genomes.
95 | The --convert option attempts to convert accession numbers when necessary, but requires that `esearch` and `efetch` from [Entrez Direct](https://www.ncbi.nlm.nih.gov/books/NBK179288/) be installed to the system PATH.
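
### scaffold and genome coverage (example)

A sketch of a typical coverage workflow with these scripts (file names are placeholders; the flags match each script's argparse options):

`calculate_coverage.py -s sample1.sam sample2.sam > scaffold_coverage.tsv`

`genome_coverage.py -c scaffold_coverage.tsv -f genome1.fa genome2.fa > genome_coverage.tsv`

A scaffold-to-bin table can be supplied to genome_coverage.py with `-s` in place of the per-genome fasta files (`-f`).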
96 | -------------------------------------------------------------------------------- /ctbBio/genome_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | script for calculating genome coverage 5 | """ 6 | 7 | import os 8 | import sys 9 | import argparse 10 | import pandas as pd 11 | from ctbBio.fasta import iterate_fasta as parse_fasta 12 | 13 | def parse_cov(cov_table, scaffold2genome): 14 | """ 15 | calculate genome coverage from scaffold coverage table 16 | """ 17 | size = {} # size[genome] = genome size 18 | mapped = {} # mapped[genome][sample] = mapped bases 19 | # parse coverage files 20 | for line in open(cov_table): 21 | line = line.strip().split('\t') 22 | if line[0].startswith('#'): 23 | samples = line[1:] 24 | samples = [i.rsplit('/', 1)[-1].split('.', 1)[0] for i in samples] 25 | continue 26 | scaffold, length = line[0].split(': ') 27 | length = float(length) 28 | covs = [float(i) for i in line[1:]] 29 | bases = [c * length for c in covs] 30 | if scaffold not in scaffold2genome: 31 | continue 32 | genome = scaffold2genome[scaffold] 33 | if genome not in size: 34 | size[genome] = 0 35 | mapped[genome] = {sample:0 for sample in samples} 36 | # keep track of genome size 37 | size[genome] += length 38 | # keep track of number of mapped bases 39 | for sample, count in zip(samples, bases): 40 | mapped[genome][sample] += count 41 | # calculate coverage from base counts and genome size 42 | coverage = {'genome':[], 'genome size (bp)':[], 'sample':[], 'coverage':[]} 43 | for genome, length in size.items(): 44 | for sample in samples: 45 | cov = mapped[genome][sample] / length 46 | coverage['genome'].append(genome) 47 | coverage['genome size (bp)'].append(length) 48 | coverage['sample'].append(sample) 49 | coverage['coverage'].append(cov) 50 | return pd.DataFrame(coverage) 51 | 52 | def genome_coverage(covs, s2b): 53 | """ 54 | calculate genome coverage from scaffold coverage 55 | """ 56 | COV = [] 57 | for cov in covs: 58 | COV.append(parse_cov(cov, s2b)) 59 | return pd.concat(COV) 60 | 61 | def parse_s2bs(s2bs): 62 | """ 63 | convert s2b files to dictionary 64 | """ 65 | s2b = {} 66 | for s in s2bs: 67 | for line in open(s): 68 | line = line.strip().split('\t') 69 | s, b = line[0], line[1] 70 | s2b[s] = b 71 | return s2b 72 | 73 | def fa2s2b(fastas): 74 | """ 75 | convert fastas to s2b dictionary 76 | """ 77 | s2b = {} 78 | for fa in fastas: 79 | for seq in parse_fasta(fa): 80 | s = seq[0].split('>', 1)[1].split()[0] 81 | s2b[s] = fa.rsplit('/', 1)[-1].rsplit('.', 1)[0] 82 | return s2b 83 | 84 | if __name__ == '__main__': 85 | parser = argparse.ArgumentParser(description = '# calculate genome coverage from scaffold coverage') 86 | parser.add_argument(\ 87 | '-c', required = True, nargs = '*', \ 88 | help = 'calculate_coverage.py scaffold coverage file(s) - required') 89 | parser.add_argument(\ 90 | '-s', required = False, nargs = '*', \ 91 | help = 'scaffold to bin files(s)') 92 | parser.add_argument(\ 93 | '-f', required = False, nargs = '*', \ 94 | help = 'fasta file(s) for each genome - use instead of -s') 95 | args = vars(parser.parse_args()) 96 | s2bs, fastas, coverages = args['s'], args['f'], args['c'] 97 | if s2bs is None and fastas is None: 98 | print('-s or -f is required') 99 | exit() 100 | if s2bs is not None: 101 | s2b = parse_s2bs(s2bs) 102 | else: 103 | s2b = fa2s2b(fastas) 104 | df = genome_coverage(coverages, s2b) 105 | df['genome: length (bp)'] = ['%s: %s' % (g, l) for g, l in 
zip(df['genome'], df['genome size (bp)'])] 106 | print(df.pivot('genome: length (bp)', 'sample', 'coverage').to_csv(sep = '\t')) 107 | -------------------------------------------------------------------------------- /ctbBio/filter_fastq_sam.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys, os 4 | from glob import glob as glob 5 | from itertools import cycle as cycle 6 | 7 | def sam_list(sam): 8 | """ 9 | get a list of mapped reads 10 | """ 11 | list = [] 12 | for file in sam: 13 | for line in file: 14 | if line.startswith('@') is False: 15 | line = line.strip().split() 16 | id, map = line[0], int(line[1]) 17 | if map != 4 and map != 8: 18 | list.append(id) 19 | return set(list) 20 | 21 | def sam_list_paired(sam): 22 | """ 23 | get a list of mapped reads 24 | require that both pairs are mapped in the sam file in order to remove the reads 25 | """ 26 | list = [] 27 | pair = ['1', '2'] 28 | prev = '' 29 | for file in sam: 30 | for line in file: 31 | if line.startswith('@') is False: 32 | line = line.strip().split() 33 | id, map = line[0], int(line[1]) 34 | if map != 4 and map != 8: 35 | read = id.rsplit('/')[0] 36 | if read == prev: 37 | list.append(read) 38 | prev = read 39 | return set(list) 40 | 41 | def filter_paired(list): 42 | """ 43 | require that both pairs are mapped in the sam file in order to remove the reads 44 | """ 45 | pairs = {} 46 | filtered = [] 47 | for id in list: 48 | read = id.rsplit('/')[0] 49 | if read not in pairs: 50 | pairs[read] = [] 51 | pairs[read].append(id) 52 | for read in pairs: 53 | ids = pairs[read] 54 | if len(ids) == 2: 55 | filtered.extend(ids) 56 | return set(filtered) 57 | 58 | def filter_fastq(fastq, list): 59 | c = cycle([1, 2, 3, 4]) 60 | for file in fastq: 61 | if '/' in file: 62 | filtered = '%s.filtered.fastq' % (file.rsplit('.', 1)[0].rsplit('/', 1)[1]) 63 | matched = '%s.matched.fastq' % (file.rsplit('.', 1)[0].rsplit('/', 1)[1]) 64 | else: 65 | filtered = '%s.filtered.fastq' % (file.rsplit('.', 1)[0]) 66 | matched = '%s.matched.fastq' % (file.rsplit('.', 1)[0]) 67 | filtered, matched = open(filtered, 'w'), open(matched, 'w') 68 | switch = 1 69 | for line in open(file): 70 | n = next(c) 71 | if n == 1: 72 | id = line.strip().split()[0].split('@')[1] 73 | if id in list: 74 | switch = 0 75 | else: 76 | switch = 1 77 | if switch == 1: 78 | print(line.strip(), file=filtered) 79 | else: 80 | print(line.strip(), file=matched) 81 | filtered.close() 82 | matched.close() 83 | 84 | def filter_fastq_paired(fastq, list): 85 | c = cycle([1, 2, 3, 4]) 86 | for file in fastq: 87 | if '/' in file: 88 | filtered = '%s.filtered.fastq' % (file.rsplit('.', 1)[0].rsplit('/', 1)[1]) 89 | matched = '%s.matched.fastq' % (file.rsplit('.', 1)[0].rsplit('/', 1)[1]) 90 | else: 91 | filtered = '%s.filtered.fastq' % (file.rsplit('.', 1)[0]) 92 | matched = '%s.matched.fastq' % (file.rsplit('.', 1)[0]) 93 | filtered, matched = open(filtered, 'w'), open(matched, 'w') 94 | switch = 1 95 | for line in open(file): 96 | n = next(c) 97 | if n == 1: 98 | read = line.strip().split()[0].split('@')[1].rsplit('/', 1)[0] 99 | if read in list: 100 | switch = 0 101 | else: 102 | switch = 1 103 | if switch == 1: 104 | print(line.strip(), file=filtered) 105 | else: 106 | print(line.strip(), file=matched) 107 | filtered.close() 108 | matched.close() 109 | 110 | 111 | def filter(fastq, sam, paired = False): 112 | """ 113 | filter sequences that are shown to be mapped in the sam file 114 | reads not in sam file 
are in *.filtered.fastq
115 |     reads that are in the sam file are in *.matched.fastq
116 |     """
117 |     if paired is False:
118 |         list = sam_list(sam)
119 |     else:
120 |         list = sam_list_paired(sam)
121 |     if paired is False:
122 |         filter_fastq(fastq, list)
123 |     else:
124 |         filter_fastq_paired(fastq, list)
125 | 
126 | if __name__ == '__main__':
127 |     if len(sys.argv) != 4:
128 |         print('specify fastq, sam, and paired or unpaired')
129 |         exit()
130 |     fastq, sam, paired = glob(sys.argv[1]), glob(sys.argv[2]), sys.argv[3]
131 |     if paired == 'paired':
132 |         paired = True
133 |     else:
134 |         paired = False
135 |     sam = [open(i) for i in sam]
136 |     filter(fastq, sam, paired) # writes *.filtered.fastq and *.matched.fastq
137 | --------------------------------------------------------------------------------
/ctbBio/crossmap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | script for mapping reads against scaffolds
5 | """
6 | 
7 | import os
8 | import sys
9 | import json
10 | import numpy
11 | import shutil
12 | import random
13 | import argparse
14 | import subprocess
15 | 
16 | def bowtiedb(fa, keepDB):
17 |     """
18 |     make bowtie db
19 |     """
20 |     btdir = '%s/bt2' % (os.getcwd())
21 |     # make directory for the bowtie2 databases
22 |     if not os.path.exists(btdir):
23 |         os.mkdir(btdir)
24 |     btdb = '%s/%s' % (btdir, fa.rsplit('/', 1)[-1])
25 |     if keepDB is True:
26 |         if os.path.exists('%s.1.bt2' % (btdb)):
27 |             return btdb
28 |     p = subprocess.Popen('bowtie2-build -q %s %s' \
29 |             % (fa, btdb), shell = True)
30 |     p.communicate()
31 |     return btdb
32 | 
33 | def bowtie(sam, btd, f, r, u, opt, no_shrink, threads):
34 |     """
35 |     generate bowtie2 command
36 |     """
37 |     bt2 = 'bowtie2 -x %s -p %s ' % (btd, threads)
38 |     if f is not False:
39 |         bt2 += '-1 %s -2 %s ' % (f, r)
40 |     if u is not False:
41 |         bt2 += '-U %s ' % (u)
42 |     bt2 += opt
43 |     if no_shrink is False:
44 |         if f is False:
45 |             bt2 += ' | shrinksam -u -k %s-shrunk.sam ' % (sam)
46 |         else:
47 |             bt2 += ' | shrinksam -k %s-shrunk.sam ' % (sam)
48 |     else:
49 |         bt2 += ' > %s.sam' % (sam)
50 |     return bt2
51 | 
52 | def chunks(l, n):
53 |     return numpy.array_split(numpy.array(l), n)
54 | 
55 | def crossmap(fas, reads, options, no_shrink, keepDB, threads, cluster, nodes):
56 |     """
57 |     map all read sets against all fasta files
58 |     """
59 |     if cluster is True:
60 |         threads = '48'
61 |     btc = []
62 |     for fa in fas:
63 |         btd = bowtiedb(fa, keepDB)
64 |         F, R, U = reads
65 |         if F is not False:
66 |             if U is False:
67 |                 u = False
68 |             for i, f in enumerate(F):
69 |                 r = R[i]
70 |                 if U is not False:
71 |                     u = U[i]
72 |                 sam = '%s/%s-vs-%s' % (os.getcwd(), \
73 |                         fa.rsplit('/', 1)[-1], f.rsplit('/', 1)[-1].rsplit('.', 3)[0])
74 |                 btc.append(bowtie(sam, btd, f, r, u, options, no_shrink, threads))
75 |         else:
76 |             f = False
77 |             r = False
78 |             for u in U:
79 |                 sam = '%s/%s-vs-%s' % (os.getcwd(), \
80 |                         fa.rsplit('/', 1)[-1], u.rsplit('/', 1)[-1].rsplit('.', 3)[0])
81 |                 btc.append(bowtie(sam, btd, f, r, u, options, no_shrink, threads))
82 |     if cluster is False:
83 |         for i in btc:
84 |             p = subprocess.Popen(i, shell = True)
85 |             p.communicate()
86 |     else:
87 |         ID = ''.join(random.choice([str(i) for i in range(0, 9)]) for _ in range(5))
88 |         for node, commands in enumerate(chunks(btc, nodes), 1):
89 |             bs = open('%s/crossmap-qsub.%s.%s.sh' % (os.getcwd(), ID, node), 'w')
90 |             print('\n'.join(commands), file=bs)
91 |             bs.close()
92 |             p = subprocess.Popen(\
93 |                     'qsub -V -N crossmap %s' \
94 |                     % (bs.name), \
95 |                     shell = True)
96 |             p.communicate()
97 | 
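# NOTE (illustrative, not part of the original script): an example of the
# command string bowtie() assembles, using placeholder file names:
#
#   >>> bowtie('ref.fa-vs-sample', 'bt2/ref.fa', 'fwd.fq', 'rev.fq', False,
#   ...        '--very-fast --reorder --quiet', True, '6')
#   'bowtie2 -x bt2/ref.fa -p 6 -1 fwd.fq -2 rev.fq --very-fast --reorder --quiet > ref.fa-vs-sample.sam'
#
# with no_shrink = False the command instead pipes through shrinksam.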
98 | if __name__ == '__main__':
99 |     parser = argparse.ArgumentParser(description = '# cross map using bowtie')
100 |     parser.add_argument(\
101 |             '-f', nargs = '*', action = 'store', \
102 |             required = True, help = 'path to fasta(s)')
103 |     parser.add_argument(\
104 |             '-1', nargs = '*', action = 'store', \
105 |             default = False, help = 'path to forward reads')
106 |     parser.add_argument(\
107 |             '-2', nargs = '*', action = 'store', \
108 |             default = False, help = 'path to reverse reads')
109 |     parser.add_argument(\
110 |             '-U', nargs = '*', action = 'store', \
111 |             default = False, help = 'path to single reads')
112 |     parser.add_argument(\
113 |             '-p', default = '--very-fast --reorder --quiet', \
114 |             help = 'bowtie options (default = "--very-fast --reorder --quiet")')
115 |     parser.add_argument(\
116 |             '--no-shrink', action = 'store_true', help = 'do not use shrinksam')
117 |     parser.add_argument(\
118 |             '--keepDB', action = 'store_true', help ='do not overwrite bowtie database')
119 |     parser.add_argument(\
120 |             '-t', default = '6', help = 'number of cpus (default = 6)')
121 |     parser.add_argument(\
122 |             '--cluster', action = 'store_true', help = 'run on cluster')
123 |     parser.add_argument(\
124 |             '-n', default = 1, type = int, help = 'number of cluster nodes (default = 1)')
125 |     args = vars(parser.parse_args())
126 |     fa = [os.path.abspath(i) for i in args['f']]
127 |     if (args['1'] is False or args['2'] is False) and args['U'] is False:
128 |         print('# specify -1 and -2 and/or -U', file=sys.stderr)
129 |         exit()
130 |     if args['1'] is not False:
131 |         f = [os.path.abspath(i) for i in args['1']]
132 |         r = [os.path.abspath(i) for i in args['2']]
133 |     else:
134 |         f = r = False
135 |     if args['U'] is not False:
136 |         u = [os.path.abspath(i) for i in args['U']]
137 |     else:
138 |         u = False
139 |     crossmap(fa, [f, r, u], args['p'], args['no_shrink'], args['keepDB'], args['t'], \
140 |             args['cluster'], args['n'])
141 | --------------------------------------------------------------------------------
/ctbBio/rRNA_copies.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | script for determining 16S copy number from read mapping
5 | """
6 | 
7 | import sys
8 | import os
9 | from ctbBio.fasta import iterate_fasta as parse_fasta
10 | from ctbBio.mapped import count_mismatches as count_mismatches
11 | 
12 | def get_overlap(a, b):
13 |     """
14 |     get overlap between ranges
15 |     """
16 |     return max(0, min(a[1], b[1]) - max(a[0], b[0]))
17 | 
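# NOTE (illustrative, not part of the original script): a worked example of
# the range-overlap formula above:
#
#   get_overlap([10, 20], [15, 30]) = max(0, min(20, 30) - max(10, 15)) = 5
#
# disjoint ranges, e.g. [1, 5] and [8, 9], give max(0, 5 - 8) = 0.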
18 | def rna_bases(rna_cov, scaffold, bases, line):
19 |     """
20 |     determine if read overlaps with rna, if so count bases
21 |     """
22 |     start = int(line[3])
23 |     stop = start + bases - 1
24 |     if scaffold not in rna_cov:
25 |         return rna_cov
26 |     for pos in rna_cov[scaffold][2]:
27 |         ol = get_overlap([start, stop], pos)
28 |         rna_cov[scaffold][0] += ol
29 |     return rna_cov
30 | 
31 | def parse_s2bins(s2bins):
32 |     """
33 |     parse ggKbase scaffold-to-bin mapping
34 |     - scaffolds-to-bins and bins-to-scaffolds
35 |     """
36 |     s2b = {}
37 |     b2s = {}
38 |     for line in s2bins:
39 |         line = line.strip().split()
40 |         s, b = line[0], line[1]
41 |         if 'UNK' in b:
42 |             continue
43 |         if len(line) > 2:
44 |             g = ' '.join(line[2:])
45 |         else:
46 |             g = 'n/a'
47 |         b = '%s\t%s' % (b, g)
48 |         s2b[s] = b
49 |         if b not in b2s:
50 |             b2s[b] = []
51 |         b2s[b].append(s)
52 |     return s2b, b2s
53 | 
54 | def parse_rna(rna, s2bins, min_rna):
55 |     """
56 |     parse [16,23]SfromHMM.py output
57 |     - rna_cov[scaffold] = [0, 0, []] # [bases, length, ranges]
58 |     """
59 |     rna_cov = {}
60 |     for seq in parse_fasta(rna):
61 |         # check that length passes threshold
62 |         length = len(seq[1])
63 |         if length < min_rna:
64 |             continue
65 |         # check if sequence is binned
66 |         s = seq[0].split('>')[1].split()[0]
67 |         if s not in s2bins:
68 |             continue
69 |         if s not in rna_cov:
70 |             rna_cov[s] = [0, 0, []]
71 |         position = [int(i) for i in seq[0].rsplit('pos=', 1)[1].split()[0].split('-')]
72 |         rna_cov[s][2].append(position)
73 |         rna_cov[s][1] += length
74 |     return rna_cov
75 | 
76 | def filter_missing_rna(s2bins, bins2s, rna_cov):
77 |     """
78 |     remove any bins that don't have 16S
79 |     """
80 |     for bin, scaffolds in list(bins2s.items()):
81 |         c = 0
82 |         for s in scaffolds:
83 |             if s in rna_cov:
84 |                 c += 1
85 |         if c == 0:
86 |             del bins2s[bin]
87 |     for scaffold, bin in list(s2bins.items()):
88 |         if bin not in bins2s:
89 |             del s2bins[scaffold]
90 |     return s2bins, bins2s
91 | 
92 | def calc_bin_cov(scaffolds, cov):
93 |     """
94 |     calculate bin coverage
95 |     """
96 |     bases = sum([cov[i][0] for i in scaffolds if i in cov])
97 |     length = sum([cov[i][1] for i in scaffolds if i in cov])
98 |     if length == 0:
99 |         return 0
100 |     return float(float(bases)/float(length))
101 | 
102 | def copies(mapping, s2bins, rna, min_rna = 800, mismatches = 0):
103 |     """
104 |     1. determine bin coverage
105 |     2. determine rRNA gene coverage
106 |     3. compare
107 |     """
108 |     cov = {} # cov[scaffold] = [bases, length]
109 |     s2bins, bins2s = parse_s2bins(s2bins)
110 |     rna_cov = parse_rna(rna, s2bins, min_rna)
111 |     s2bins, bins2s = filter_missing_rna(s2bins, bins2s, rna_cov)
112 |     # count bases mapped to scaffolds and rRNA gene regions
113 |     for line in mapping:
114 |         line = line.strip().split()
115 |         # get scaffold lengths
116 |         if line[0].startswith('@'):
117 |             if line[0].startswith('@SQ') is False:
118 |                 continue
119 |             s = line[1].split(':')[1]
120 |             l = int(line[2].split(':')[1])
121 |             # check if scaffold is binned
122 |             if s not in s2bins:
123 |                 continue
124 |             if s not in cov:
125 |                 cov[s] = [0, l]
126 |         # check mismatch threshold
127 |         mm = count_mismatches(line)
128 |         if mm is False or mm > mismatches:
129 |             continue
130 |         # check that scaffold is in bin
131 |         s, bases = line[2], len(line[9])
132 |         if s not in cov:
133 |             continue
134 |         cov[s][0] += bases
135 |         rna_cov = rna_bases(rna_cov, s, bases, line)
136 |     print('# mismatches threshold: %s' % (mismatches))
137 |     header = ['#rRNA scaffold', 'rRNA genes >=%sbp on scaffold' % (min_rna), \
138 |             'rRNA coverage', \
139 |             'bin', 'bin info', 'bin coverage', \
140 |             'rRNAs >=%sbp in bin' % (min_rna), \
141 |             'rRNA coverage/bin coverage', \
142 |             'estimated number of copies']
143 |     print('\t'.join(header))
144 |     for bin, scaffolds in list(bins2s.items()):
145 |         rna_count = sum([len(rna_cov[s][2]) for s in scaffolds if s in rna_cov])
146 |         for s in scaffolds:
147 |             if s not in rna_cov:
148 |                 continue
149 |             out = []
150 |             counts = rna_cov[s]
151 |             bin_cov = calc_bin_cov(bins2s[bin], cov)
152 |             num_genes = len(counts[2])
153 |             rna_coverage = float(float(counts[0])/float(counts[1]))
154 |             if bin_cov == 0:
155 |                 rna_div_bin = 0
156 |             else:
157 |                 rna_div_bin = float(rna_coverage/bin_cov)
158 |             est = int(max([rna_count, rna_div_bin])) # estimated copies: larger of bin-wide gene count and coverage ratio
159 |             out = [s, num_genes, rna_coverage, bin, bin_cov, rna_count, rna_div_bin, est]
160 |             print('\t'.join([str(i) for i in out]))
161 | 
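# NOTE (illustrative, not part of the original script): a worked example of
# the copy-number estimate above. If a bin's average read coverage is 40x and
# the 16S gene region on one of its scaffolds is covered at 120x, then
# rna_div_bin = 120 / 40 = 3, suggesting roughly 3 rRNA gene copies.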
162 | if __name__ == '__main__':
163 |     if len(sys.argv) != 4:
164 |         print('usage: rRNA_copies.py <sam> <scaffold-to-bin file> <[16S,23S]fromHMM.py sequences>')
165 |         exit()
166 |     mapping = sys.argv[1]
167 |     if mapping == '-':
168 |         mapping = sys.stdin
169 |     else:
170 |         mapping = open(mapping)
171 |     s2bins, rna = [open(i) for i in sys.argv[2:]]
172 |     copies(mapping, s2bins, rna)
173 | --------------------------------------------------------------------------------
/ctbBio/genome_abundance.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | calculate genome coverage, relative abundance, and absolute abundance
5 | """
6 | 
7 | import sys
8 | import os
9 | from ctbBio.fasta import iterate_fasta as parse_fasta
10 | from glob import glob as glob
11 | import numpy
12 | 
13 | def calc_custom(custom, genome, scaffold, sequence, scaffold_coverage, total_bases):
14 |     """
15 |     custom = {(reads mapped to scaffold)/(total reads for sample)}/(length of scaffold)
16 |     """
17 |     index = 0
18 |     if scaffold in scaffold_coverage: # what if the scaffold does not have bases mapped back to it? (this *should* not happen)
19 |         if genome not in custom:
20 |             custom[genome] = [[] for i in scaffold_coverage[scaffold]]
21 |         for cov in scaffold_coverage[scaffold]:
22 |             length = float(len(sequence[1]))
23 |             bases = cov * length
24 |             custom_value = ((bases) / (total_bases[index])) / length
25 |             custom[genome][index].append(custom_value)
26 |             index += 1
27 |     return custom
28 | 
29 | def sum_coverage(coverage, std, genome, scaffold, sequence, scaffold_coverage):
30 |     index = 0
31 |     if scaffold in scaffold_coverage: # what if the scaffold does not have bases mapped back to it? (this *should* not happen)
32 |         if genome not in std:
33 |             std[genome] = [[] for i in scaffold_coverage[scaffold]]
34 |         if genome not in coverage:
35 |             coverage[genome] = [[0, 0] for i in scaffold_coverage[scaffold]]
36 |         for cov in scaffold_coverage[scaffold]:
37 |             length = float(len(sequence[1]))
38 |             bases = cov * length
39 |             coverage[genome][index][0] += bases
40 |             coverage[genome][index][1] += length
41 |             std[genome][index].append(cov)
42 |             index += 1
43 |     return coverage, std
44 | 
45 | def absolute_abundance(coverage, total_bases):
46 |     """
47 |     absolute abundance = (number of bases mapped to genome / total number of bases in sample) * 100
48 |     """
49 |     absolute = {}
50 |     for genome in coverage:
51 |         absolute[genome] = []
52 |         index = 0
53 |         for calc in coverage[genome]:
54 |             bases = calc[0]
55 |             total = total_bases[index]
56 |             absolute[genome].append((bases / total) * float(100))
57 |             index += 1
58 |     total_assembled = [0 for i in absolute[genome]]
59 |     for genome in absolute:
60 |         index = 0
61 |         for cov in absolute[genome]:
62 |             total_assembled[index] += cov
63 |             index += 1
64 |     absolute['Unassembled'] = [(100 - i) for i in total_assembled]
65 |     return absolute
66 | 
67 | def relative_abundance(coverage):
68 |     """
69 |     cov = number of bases / length of genome
70 |     relative abundance = [(cov) / sum(cov for all genomes)] * 100
71 |     """
72 |     relative = {}
73 |     sums = []
74 |     for genome in coverage:
75 |         for cov in coverage[genome]:
76 |             sums.append(0)
77 |         break
78 |     for genome in coverage:
79 |         index = 0
80 |         for cov in coverage[genome]:
81 |             sums[index] += cov
82 |             index += 1
83 |     for genome in coverage:
84 |         index = 0
85 |         relative[genome] = []
86 |         for cov in coverage[genome]:
87 |             if sums[index] == 0:
88 |                 relative[genome].append(0)
89 |             else:
90 |                 relative[genome].append((cov / sums[index]) * float(100))
91 |             index += 1
92 |     return relative
93 | 
94 | def calc_total_mapped_bases(coverage):
95 |     total = []
96 |     for genome in coverage:
97 |         for cov in coverage[genome]:
98 |             total.append(0)
99 | break 100 | for genome in coverage: 101 | index = 0 102 | for cov in coverage[genome]: 103 | bases = cov[0] 104 | total[index] += bases 105 | index += 1 106 | return total 107 | 108 | def calc_std(cov): 109 | std = {} 110 | for genome in cov: 111 | std[genome] = [] 112 | for sample in cov[genome]: 113 | std[genome].append(numpy.std(sample)) 114 | return std 115 | 116 | def genome_coverage(genomes, scaffold_coverage, total_bases): 117 | """ 118 | coverage = (number of bases / length of genome) * 100 119 | """ 120 | coverage = {} 121 | custom = {} 122 | std = {} 123 | for genome in genomes: 124 | for sequence in parse_fasta(genome): 125 | scaffold = sequence[0].split('>')[1].split()[0] 126 | coverage, std = sum_coverage(coverage, std, genome, scaffold, sequence, scaffold_coverage) 127 | custom = calc_custom(custom, genome, scaffold, sequence, scaffold_coverage, total_bases) 128 | std = calc_std(std) 129 | custom_std = calc_std(custom) 130 | custom_av = {} 131 | for genome in custom: 132 | custom_av[genome] = [] 133 | for sample in custom[genome]: 134 | custom_av[genome].append(numpy.mean(sample)) 135 | for genome in coverage: 136 | print('%s\t%s' % (genome, coverage[genome][0][1])) 137 | if total_bases is True: 138 | total_bases = calc_total_mapped_bases(coverage) 139 | absolute = absolute_abundance(coverage, total_bases) 140 | for genome in coverage: 141 | calculated = [] 142 | for calc in coverage[genome]: 143 | calculated.append(calc[0] / calc[1]) 144 | coverage[genome] = calculated 145 | relative = relative_abundance(coverage) 146 | return coverage, std, absolute, relative, custom_av, custom_std 147 | 148 | def print_genome_calculations(genomes, scaffold_coverage, total_bases, samples): 149 | coverage, std, absolute, relative, custom, custom_std = genome_coverage(genomes, scaffold_coverage, total_bases) 150 | for i in ['coverage', coverage], ['coverage_std', std], ['absolute abundance', absolute], ['relative abundance', relative], ['custom', custom], ['custom_std', custom_std]: 151 | out = open('%s.tsv' % (i[0]).replace(' ', '_'), 'w') 152 | i = i[1] 153 | print('#\t%s\ttotal' % ('\t'.join(samples)), file=out) 154 | for genome in i: 155 | print('%s\t%s' % (genome, '\t'.join([str(j) for j in i[genome]])), file=out) 156 | out.close() 157 | 158 | def parse_scaffold_coverage(coverage): 159 | cov = {} 160 | for line in coverage: 161 | if line.startswith('#') is False: 162 | line = line.strip().split('\t') 163 | scaffold, coverage = line[0].split(':')[0], [float(i) for i in line[1:]] 164 | coverage.append(sum(coverage)) 165 | cov[scaffold] = coverage 166 | return cov 167 | 168 | def parse_total_bases(total_bases, samples): 169 | total = {} 170 | for line in total_bases: 171 | if line.startswith('#') is False: 172 | line = line.strip().split() 173 | sample, bases = '.'.join(line[0].split('.')[0:2]), float(line[2]) 174 | if sample not in total: 175 | total[sample] = 0 176 | total[sample] += bases 177 | totals = [float(total[i]) for i in samples if 'total' not in i] 178 | totals.append(sum(totals)) 179 | return totals 180 | 181 | def sample_names(file): 182 | for line in file: 183 | if line.startswith('#') is True: 184 | header = line.strip().split('\t')[1:] 185 | return header 186 | break 187 | 188 | if __name__ == '__main__': 189 | if len(sys.argv) == 1: 190 | print('please provide coverage file, total reads file, and fasta files for each genome for calculating coverage') 191 | exit() 192 | genomes = glob(sys.argv[3]) 193 | scaffold_coverage, total_bases = sys.argv[1], sys.argv[2] 194 | # 
scaffold_coverage = '/Users/ctb/banfield_lab/dora/5.assembly/coverage_ctb.tsv' 195 | print('using %s for coverage information' % (scaffold_coverage)) 196 | scaffold_coverage = open(scaffold_coverage) 197 | # total_bases = '/Users/ctb/banfield_lab/dora/5.assembly/dora_base_counts.tsv' 198 | print('using %s for total bases' % (total_bases)) 199 | print('coverage = (number of bases mapped to genome / length of genome) * 100') 200 | print('absolute abundance = (number of bases mapped to genome / total number of bases in sample) * 100') 201 | print('relative abundance = (coverage / sum(coverage for all genomes in sample)) * 100') 202 | print('custom = average({(reads mapped to scaffold)/(total reads for sample)} / (length of scaffold))') 203 | total_bases = open(total_bases) 204 | samples = sample_names(scaffold_coverage) 205 | scaffold_coverage = parse_scaffold_coverage(scaffold_coverage) 206 | total_bases = parse_total_bases(total_bases, samples) 207 | print_genome_calculations(genomes, scaffold_coverage, total_bases, samples) 208 | -------------------------------------------------------------------------------- /ctbBio/transform.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | script for normalizing/standardizing data 5 | 6 | based on rows, column, or table: 7 | 1. 0-1 8 | 2. percent of total 9 | 3. standardize (X - mean)/(standard deviation) 10 | 11 | example: normalize rows 0-1 | standardize rows | standardize table 12 | """ 13 | 14 | import sys 15 | import os 16 | import numpy as np 17 | 18 | def zero_to_one(table, option): 19 | """ 20 | normalize from zero to one for row or table 21 | """ 22 | if option == 'table': 23 | m = min(min(table)) 24 | ma = max(max(table)) 25 | t = [] 26 | for row in table: 27 | t_row = [] 28 | if option != 'table': 29 | m, ma = min(row), max(row) 30 | for i in row: 31 | if ma == m: 32 | t_row.append(0) 33 | else: 34 | t_row.append((i - m)/(ma - m)) 35 | t.append(t_row) 36 | return t 37 | 38 | def pertotal(table, option): 39 | """ 40 | calculate percent of total 41 | """ 42 | if option == 'table': 43 | total = sum([i for line in table for i in line]) 44 | t = [] 45 | for row in table: 46 | t_row = [] 47 | if option != 'table': 48 | total = sum(row) 49 | for i in row: 50 | if total == 0: 51 | t_row.append(0) 52 | else: 53 | t_row.append(i/total*100) 54 | t.append(t_row) 55 | return t 56 | 57 | def standardize(table, option): 58 | """ 59 | standardize 60 | Z = (X - mean) / (standard deviation) 61 | """ 62 | if option == 'table': 63 | mean = np.mean(table) 64 | std = np.std(table) 65 | t = [] 66 | for row in table: 67 | t_row = [] 68 | if option != 'table': 69 | mean = np.mean(row) 70 | std = np.std(row) 71 | for i in row: 72 | if std == 0: 73 | t_row.append(0) 74 | else: 75 | t_row.append((i - mean)/std) 76 | t.append(t_row) 77 | return t 78 | 79 | def scale(table): 80 | """ 81 | scale table based on the column with the largest sum 82 | """ 83 | t = [] 84 | columns = [[] for i in table[0]] 85 | for row in table: 86 | for i, v in enumerate(row): 87 | columns[i].append(v) 88 | sums = [float(sum(i)) for i in columns] 89 | scale_to = float(max(sums)) 90 | scale_factor = [scale_to/i for i in sums if i != 0] 91 | for row in table: 92 | t.append([a * b for a,b in zip(row, scale_factor)]) 93 | return t 94 | 95 | def norm(table): 96 | """ 97 | fit to normal distribution 98 | """ 99 | print('# norm dist is broken', file=sys.stderr) 100 | exit() 101 | from matplotlib.pyplot import hist as hist 102 
| t = [] 103 | for i in table: 104 | t.append(np.ndarray.tolist(hist(i, bins = len(i), normed = True)[0])) 105 | return t 106 | 107 | def log_trans(table): 108 | """ 109 | log transform each value in table 110 | """ 111 | t = [] 112 | all = [item for sublist in table for item in sublist] 113 | if min(all) == 0: 114 | scale = min([i for i in all if i != 0]) * 10e-10 115 | else: 116 | scale = 0 117 | for i in table: 118 | t.append(np.ndarray.tolist(np.log10([j + scale for j in i]))) 119 | return t 120 | 121 | def box_cox(table): 122 | """ 123 | box-cox transform table 124 | """ 125 | from scipy.stats import boxcox as bc 126 | t = [] 127 | for i in table: 128 | if min(i) == 0: 129 | scale = min([j for j in i if j != 0]) * 10e-10 130 | else: 131 | scale = 0 132 | t.append(np.ndarray.tolist(bc(np.array([j + scale for j in i]))[0])) 133 | return t 134 | 135 | def inh(table): 136 | """ 137 | inverse hyperbolic sine transformation 138 | """ 139 | t = [] 140 | for i in table: 141 | t.append(np.ndarray.tolist(np.arcsinh(i))) 142 | return t 143 | 144 | def diri(table): 145 | """ 146 | from SparCC - "randomly draw from the corresponding posterior 147 | Dirichlet distribution with a uniform prior" 148 | """ 149 | t = [] 150 | for i in table: 151 | a = [j + 1 for j in i] 152 | t.append(np.ndarray.tolist(np.random.mtrand.dirichlet(a))) 153 | return t 154 | 155 | def transpose(table): 156 | """ 157 | transpose matrix 158 | """ 159 | t = [] 160 | for i in range(0, len(table[0])): 161 | t.append([row[i] for row in table]) 162 | return t 163 | 164 | def transform(table, option, mode): 165 | """ 166 | transform data table: 167 | option: 168 | rows 169 | columns 170 | table 171 | mode: 172 | 0: 0-1 173 | pertotal: percent of total 174 | stand: standardize 175 | scale: scale based on largest column 176 | """ 177 | if option != 'rows' and option != 'columns' and option != 'table': 178 | print('# specify: rows, columns, or table', file=sys.stderr) 179 | exit() 180 | if option == 'columns': 181 | table = transpose(table) 182 | if mode == '0': 183 | transformed = zero_to_one(table, option) 184 | elif mode == 'pertotal': 185 | transformed = pertotal(table, option) 186 | elif mode == 'stand': 187 | transformed = standardize(table, option) 188 | elif mode == 'scale': 189 | if option != 'table': 190 | print('# scaling is done on an entire table, based on the column with the largest sum - use table', file=sys.stderr) 191 | exit() 192 | transformed = scale(table) 193 | elif mode == 'log': 194 | if option != 'table': 195 | print('# scaling is done on a per-value basis - use table', file=sys.stderr) 196 | exit() 197 | transformed = log_trans(table) 198 | elif mode == 'box': 199 | transformed = box_cox(table) 200 | elif mode == 'inh': 201 | transformed = inh(table) 202 | elif mode == 'norm': 203 | transformed = norm(table) 204 | elif mode == 'diri': 205 | transformed = diri(table) 206 | else: 207 | print('# specify: 0 (for 0-1), pertotal (for percent of total), stand (for standardize), scale (scale table to largest column), norm (fit to normal distribution), log (log10(x+1)-transform), box (box-cox), inh (inverse hyperbolic sine), diri (Dirichlet distribution)', file=sys.stderr) 208 | exit() 209 | if option == 'columns': 210 | transformed = transpose(transformed) 211 | return transformed 212 | 213 | if __name__ == '__main__': 214 | if len(sys.argv) != 3: 215 | print('usage: cat table.tsv | transform.py \n', file=sys.stderr) 216 | print('methods: 0 (0-1), pertotal (percent of total), stand (standardize), scale (scale table 
to largest column), norm (fit to normal distribution), log (log10(x+1)-transform), box (box-cox), inh (inverse hyperbolic sine transformation), diri (draw from posterior Dirichlet distribution)\n', file=sys.stderr)
217 |         print('# example: cat table.tsv | transform.py rows 0 > table.trans.tsv', file=sys.stderr)
218 |         exit()
219 |     option, mode = sys.argv[1:]
220 |     data = [i.strip().split('\t') for i in sys.stdin]
221 |     if data[0][0][0] == '%':
222 |         header, table, names = [], [], []
223 |         for line in data:
224 |             if line[0][0] == '%':
225 |                 header.append(line)
226 |             else:
227 |                 names.append(line[0])
228 |                 table.append([float(i) for i in line[1::]])
229 |     else:
230 |         header = [data[0]]
231 |         names = [i[0] for i in data[1::]]
232 |         table = [[float(j) for j in i[1:]] for i in data[1:]]
233 |     names.insert(0, header[-1][0])
234 |     transformed = transform(table, option, mode)
235 |     for line in header:
236 |         print('\t'.join(line))
237 |     for i, line in enumerate(transformed, 1):
238 |         print('%s\t%s' % (names[i], '\t'.join([str(i) for i in line])))
239 | --------------------------------------------------------------------------------
/ctbBio/rp16.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | """
4 | * script for finding the set of 16 syntenic ribosomal proteins
5 | in ORFs predicted for scaffolds (Prodigal)
6 | * evaluates each scaffold independently
7 | (does not assume that the input is from a single genome)
8 | """
9 | 
10 | import sys
11 | import os
12 | import argparse
13 | from random import choice
14 | from operator import itemgetter
15 | 
16 | # ctb
17 | from ctbBio.search import search as search
18 | from ctbBio.numblast import best as numblast
19 | from ctbBio.fasta import iterate_fasta as parse_fasta
20 | 
21 | def find_databases(databases):
22 |     """
23 |     define ribosomal proteins and location of curated databases
24 |     """
25 |     # 16 ribosomal proteins in their expected order
26 |     proteins = ['L15', 'L18', 'L6', 'S8', 'L5', 'L24', 'L14',
27 |             'S17', 'L16', 'S3', 'L22', 'S19', 'L2', 'L4', 'L3', 'S10']
28 |     # curated databases
29 |     protein_databases = {
30 |         'L14': 'rpL14_JGI_MDM.filtered.faa',
31 |         'L15': 'rpL15_JGI_MDM.filtered.faa',
32 |         'L16': 'rpL16_JGI_MDM.filtered.faa',
33 |         'L18': 'rpL18_JGI_MDM.filtered.faa',
34 |         'L22': 'rpL22_JGI_MDM.filtered.faa',
35 |         'L24': 'rpL24_JGI_MDM.filtered.faa',
36 |         'L2': 'rpL2_JGI_MDM.filtered.faa',
37 |         'L3': 'rpL3_JGI_MDM.filtered.faa',
38 |         'L4': 'rpL4_JGI_MDM.filtered.faa',
39 |         'L5': 'rpL5_JGI_MDM.filtered.faa',
40 |         'L6': 'rpL6_JGI_MDM.filtered.faa',
41 |         'S10': 'rpS10_JGI_MDM.filtered.faa',
42 |         'S17': 'rpS17_JGI_MDM.filtered.faa',
43 |         'S19': 'rpS19_JGI_MDM.filtered.faa',
44 |         'S3': 'rpS3_JGI_MDM.filtered.faa',
45 |         'S8': 'rpS8_JGI_MDM.filtered.faa'}
46 |     protein_databases = {key: '%s/%s' % (databases, database) \
47 |             for key, database in list(protein_databases.items())}
48 |     return proteins, protein_databases
49 | 
50 | def scaffold_hits(searches, fasta, max_hits):
51 |     """
52 |     get hits from each search against each RP
53 |     scaffolds[scaffold] = # ORFs
54 |     s2rp[scaffold] = {rp:[hits]}
55 |     """
56 |     # initialize
57 |     ## scaffolds[scaffold] = # ORFs
58 |     scaffolds = {}
59 |     for seq in parse_fasta(fasta):
60 |         scaffold = seq[0].split()[0].split('>', 1)[1].rsplit('_', 1)[0]
61 |         if scaffold not in scaffolds:
62 |             scaffolds[scaffold] = 0
63 |         scaffolds[scaffold] += 1
64 |     s2rp = {s: {r[0]: []
65 |             for r in searches}
66 |             for s in scaffolds}
67 |     # get hits from blast
68 |     for 
search in searches: 69 | rp, blast = search 70 | hits = [i for i in numblast(open(blast), max_hits, evalue_thresh, bit_thresh)] 71 | for hit in hits: 72 | s = hit[0].split()[0].rsplit('_', 1)[0] 73 | hit[10], hit[11] = float(hit[10]), float(hit[11]) 74 | s2rp[s][rp].append(hit) 75 | return scaffolds, s2rp 76 | 77 | def find_next(start, stop, i2hits): 78 | """ 79 | which protein has the best hit, the one to the 'right' or to the 'left?' 80 | """ 81 | if start not in i2hits and stop in i2hits: 82 | index = stop 83 | elif stop not in i2hits and start in i2hits: 84 | index = start 85 | elif start not in i2hits and stop not in i2hits: 86 | index = choice([start, stop]) 87 | i2hits[index] = [[False]] 88 | else: 89 | A, B = i2hits[start][0], i2hits[stop][0] 90 | if B[10] <= A[10]: 91 | index = stop 92 | else: 93 | index = start 94 | if index == start: 95 | nstart = start - 1 96 | nstop = stop 97 | else: 98 | nstop = stop + 1 99 | nstart = start 100 | match = i2hits[index][0] 101 | rp = match[-1] 102 | return index, nstart, nstop, rp, match 103 | 104 | def find_block(rps, num_orfs, hits, best, max_errors): 105 | best_index, best_rp = int(best[0].split()[0].rsplit('_', 1)[1]), best[-1] 106 | index2hits = {i: [] for i in range(1, num_orfs + 1)} 107 | errors = 0 108 | found = [best_rp] 109 | block = {best_index: best} 110 | for rp, matches in list(hits.items()): 111 | for match in matches: 112 | i = int(match[0].split()[0].rsplit('_', 1)[1]) 113 | index2hits[i].append(match + [rp]) 114 | for i, matches in list(index2hits.items()): 115 | if matches == []: 116 | del index2hits[i] 117 | continue 118 | index2hits[i] = sorted(matches, key = itemgetter(10)) 119 | index, start, stop, rp = best_index, best_index - 1, best_index + 1, best_rp 120 | while errors < max_errors and len(set(found)) < 17: # and index <= num_orfs: 121 | new_index, start, stop, rp, match = find_next(start, stop, index2hits) 122 | if rp is False: # count as error if there is no hit 123 | errors += 1 124 | index = new_index 125 | found.append(rp) 126 | block[index] = match 127 | trimmed_block = {} 128 | for i in sorted(block): 129 | if block[i] is not False: 130 | out = [str(j) for j in block[i]] 131 | trimmed_block[out[-1]] = out 132 | return trimmed_block 133 | 134 | def find_ribosomal(rps, scaffolds, s2rp, min_hits, max_hits_rp, max_errors): 135 | """ 136 | determine which hits represent real ribosomal proteins, identify each in syntenic block 137 | max_hits_rp = maximum number of hits to consider per ribosomal protein per scaffold 138 | """ 139 | for scaffold, proteins in list(s2rp.items()): 140 | # for each scaffold, get best hits for each rp 141 | hits = {p: [i for i in sorted(hits, key = itemgetter(10))][0:max_hits_rp] 142 | for p, hits in list(proteins.items()) if len(hits) > 0} 143 | # skip if fewer than min_hits RPs are identified 144 | if len(hits) < min_hits: 145 | continue 146 | best = sorted([hit[0] + [p] 147 | for p, hit in list(hits.items())], key = itemgetter(10))[0] 148 | block = find_block(rps, scaffolds[scaffold], hits, best, max_errors) 149 | if (len(block) - 1) >= min_hits: 150 | yield scaffold, block 151 | 152 | def ribosomal(scaffolds, DBdir, min_hits, evalue_thresh, bit_thresh, \ 153 | method = 'usearch', threads = 6, \ 154 | max_hits = 1, max_hits_rp = 1, max_errors = 35): 155 | """ 156 | find ribosomal proteins 157 | max_hits = maximum number of blast hits to consider for an orf 158 | if 1, only consider best blast hit for each ORF 159 | max_hits_rp = maximum number of hits to consider per ribosomal protein 
per scaffold 160 | if 1, only consider best RP match to contig 161 | max_errors = maximum number of errors when looking for block of proteins (e.g. out of order or gap) 162 | """ 163 | # rps = list (in syntenic order) of ribosomal proteins 164 | # rp_db = dictionary to find the database files 165 | rps, rp_db = find_databases(DBdir) 166 | searches = [[rp, search(scaffolds, rp_db[rp], method = method, threads = str(threads), max_hits = 10)] 167 | for rp in rp_db] 168 | scaffolds, scaffold2rp = scaffold_hits(searches, scaffolds, max_hits) 169 | print('# scaffold\t%s' % ('\t'.join(rps))) 170 | for scaffold, block in \ 171 | find_ribosomal(rps, scaffolds, scaffold2rp, min_hits, max_hits_rp, max_errors): 172 | id_rps = [] 173 | for rp in rps: 174 | if rp in block: 175 | id_rps.append(block[rp][0].split()[0]) 176 | else: 177 | id_rps.append('-') 178 | print('%s\t%s' % (scaffold, '\t'.join(id_rps))) 179 | 180 | 181 | if __name__ == '__main__': 182 | desc = '# find syntenic group of 16 ribosomal proteins' 183 | parser = argparse.ArgumentParser(description = desc) 184 | parser.add_argument(\ 185 | '-f', required = True, type = str, \ 186 | help = 'ORF predictions (Prodigal-format fasta)') 187 | parser.add_argument(\ 188 | '-d', required = False, default = False, type = str, \ 189 | help = 'directory with ribosomal protein databases \ 190 | (default = check for databases env. variable)') 191 | parser.add_argument(\ 192 | '-m', required = False, default = 3, type = int, \ 193 | help = 'min. # of RPs to include as match (default = 3)') 194 | parser.add_argument(\ 195 | '-e', required = False, default = float(1e-6), type = float, \ 196 | help = 'maximum evalue to consider as hit (default = 1e-6)') 197 | parser.add_argument(\ 198 | '-a', required = False, default = 'usearch', type = str, \ 199 | help = 'algorithm: usearch (default), usearch-cluster, blast') 200 | parser.add_argument(\ 201 | '-t', required = False, default = 6, type = int, \ 202 | help = 'threads') 203 | parser.add_argument(\ 204 | '-b', required = False, default = float(40), type = float, \ 205 | help = 'minimum bit score to consider as hit (default = 40)') 206 | args = vars(parser.parse_args()) 207 | evalue_thresh, bit_thresh = args['e'], args['b'] 208 | scaffolds, min_hits = args['f'], args['m'] 209 | method = args['a'] 210 | threads = args['t'] 211 | DBdir = args['d'] 212 | if 'databases' not in os.environ and DBdir is False: 213 | print('# specify databases directory', file = sys.stderr) 214 | exit() 215 | if DBdir is False: 216 | DBdir = '%s/rp16/Laura/' % (os.environ['databases']) 217 | ribosomal(scaffolds, DBdir, min_hits, evalue_thresh, bit_thresh, method = method, threads = threads) 218 | -------------------------------------------------------------------------------- /ctbBio/search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | script for searching a query fasta against a database using either 5 | usearch or blast 6 | """ 7 | 8 | import sys 9 | import os 10 | import argparse 11 | from ctbBio.fasta import iterate_fasta as parse_fasta 12 | 13 | def check_type(fasta): 14 | nucl = ['A', 'T', 'G', 'C'] 15 | junk = ['N', 'U', '.', '-', ' '] 16 | type = 'nucl' 17 | for seq in parse_fasta(fasta): 18 | seq = seq[1].upper() 19 | for residue in seq: 20 | if residue in junk: 21 | continue 22 | if residue not in nucl: 23 | type = 'prot' 24 | break 25 | break 26 | return type 27 | 28 | def blastdb(fasta, maxfile = 10000000): 29 | """ 30 | make blast db 31 | """ 
32 |     db = fasta.rsplit('.', 1)[0]
33 |     type = check_type(fasta)
34 |     if type == 'nucl':
35 |         type = ['nhr', type]
36 |     else:
37 |         type = ['phr', type]
38 |     if os.path.exists('%s.%s' % (db, type[0])) is False \
39 |             and os.path.exists('%s.00.%s' % (db, type[0])) is False:
40 |         print('# ... making blastdb for: %s' % (fasta), file=sys.stderr)
41 |         os.system('makeblastdb \
42 |                 -in %s -out %s -dbtype %s -max_file_sz %s >> log.txt' \
43 |                 % (fasta, db, type[1], maxfile))
44 |     else:
45 |         print('# ... database found for: %s' % (fasta), file=sys.stderr)
46 |     return db
47 | 
48 | def usearchdb5(fasta):
49 |     """
50 |     make usearch db
51 |     """
52 |     if '.udb' in fasta:
53 |         print('# ... database found: %s' % (fasta), file=sys.stderr)
54 |         return fasta
55 |     type = check_type(fasta)
56 |     if type == 'nucl':
57 |         type = ['wdb', type]
58 |     else:
59 |         type = ['udb', type]
60 |     db = '%s.%s' % (fasta.rsplit('.', 1)[0], type[0])
61 |     if os.path.exists(db) is False:
62 |         print('# ... making usearch db for: %s' % (fasta), file=sys.stderr)
63 |         os.system('usearch -make%s %s -output %s >> log.txt' % (type[0], fasta, db))
64 |     else:
65 |         print('# ... database found for: %s' % (fasta), file=sys.stderr)
66 |     return db
67 | 
68 | def usearchdb(fasta, alignment = 'local', usearch_loc = 'usearch'):
69 |     """
70 |     make usearch db
71 |     """
72 |     if '.udb' in fasta:
73 |         print('# ... database found: %s' % (fasta), file=sys.stderr)
74 |         return fasta
75 |     type = check_type(fasta)
76 |     db = '%s.%s.udb' % (fasta.rsplit('.', 1)[0], type)
77 |     if os.path.exists(db) is False:
78 |         print('# ... making usearch db for: %s' % (fasta), file=sys.stderr)
79 |         if alignment == 'local':
80 |             os.system('%s -makeudb_ublast %s -output %s >> log.txt' % (usearch_loc, fasta, db))
81 |         elif alignment == 'global':
82 |             os.system('%s -makeudb_usearch %s -output %s >> log.txt' % (usearch_loc, fasta, db))
83 |     else:
84 |         print('# ... database found for: %s' % (fasta), file=sys.stderr)
85 |     return db
86 | 
87 | def phmmer2blast(phmmer, out):
88 |     out = open(out, 'w')
89 |     na = 'n/a'
90 |     for line in open(phmmer):
91 |         if line.startswith('#'):
92 |             continue
93 |         line = line.strip().split()
94 |         na = 'n/a'
95 |         if len(line) >= 6:
96 |             blast = [line[2], line[0], na, na, na, na, na, na, na, na, line[4], line[5]]
97 |             print('\t'.join(blast), file=out)
98 |     out.close()
99 | 
100 | def phmmer(query, db, type, out, threads = '4', evalue = '0.01'):
101 |     """
102 |     run phmmer
103 |     """
104 |     if os.path.exists(out) is False:
105 |         print('# ... running phmmer with %s as query and %s as database' % (query, db))
106 |         os.system('phmmer -o %s.ph1 --tblout %s.ph2 --acc --noali --notextw -E %s --cpu %s %s %s' % (out, out, evalue, threads, query, db))
107 |     else:
108 |         print('# ... phmmer output found for %s as query and %s as database' % (query, db))
109 |     phmmer2blast('%s.ph2' % out, out)
110 | 
111 | def blast(query, db, type, out, threads = '4', maxtargets = '100', megablast = False):
112 |     """
113 |     run blast
114 |     """
115 |     if os.path.exists(out) is False:
116 |         db = blastdb(db) # make the database file, if necessary
117 |         print('# ... running blast with %s as query and %s as database' % (query, db))
118 |         if type == 'nucl':
119 |             blast = 'blastn'
120 |             if megablast == True:
121 |                 blast = 'blastn -task megablast'
122 |         else:
123 |             blast = 'blastp'
124 |         os.system('%s \
125 |                 -query %s -db %s -out %s -outfmt 6 \
126 |                 -max_target_seqs %s -num_threads %s >> log.txt' \
127 |                 % (blast, query, db, out, maxtargets, threads))
128 |     else:
129 |         print('# ... 
blast output found for %s as query and %s as database' % (query, db)) 130 | 131 | def usearch5(query, db, type, out, threads = '4', evalue = '100', alignment = 'local'): 132 | """ 133 | run usearch 134 | """ 135 | if os.path.exists(out) is False: 136 | print('# ... running usearch with %s as query and %s as database' % (query, db)) 137 | if type[1] == 'nucl': 138 | threads = '' 139 | else: 140 | threads = '-threads %s' % (threads) 141 | os.system('usearch \ 142 | -query %s -%s %s -blast6out %s \ 143 | -evalue %s %s -%s >> log.txt' \ 144 | % (query, type[0], db, out, evalue, threads, alignment)) 145 | else: 146 | print('# ... usearch output found for %s as query and %s as database' % (query, db)) 147 | 148 | def usearch(query, db, type, out, threads = '6', evalue = '100', alignment = 'local', max_hits = 100, cluster = False): 149 | """ 150 | run usearch 151 | """ 152 | if 'usearch64' in os.environ: 153 | usearch_loc = os.environ['usearch64'] 154 | else: 155 | usearch_loc = 'usearch' 156 | if os.path.exists(out) is False: 157 | db = usearchdb(db, alignment, usearch_loc) # make the database file, if neceesary 158 | print('# ... running usearch with %s as query and %s as database' % (query, db), file=sys.stderr) 159 | if type == 'nucl': 160 | strand = '-strand both' 161 | else: 162 | strand = '' 163 | if alignment == 'local' and cluster is False: 164 | os.system('%s \ 165 | -ublast %s -db %s -blast6out %s \ 166 | -evalue %s -threads %s %s -maxhits %s >> log.txt' \ 167 | % (usearch_loc, query, db, out, evalue, threads, strand, max_hits)) 168 | elif alignment == 'global' and cluster is False: 169 | os.system('%s \ 170 | -usearch_global %s -db %s -blast6out %s \ 171 | -id 0.10 -threads %s %s >> log.txt' \ 172 | % (usearch_loc, query, db, out, threads, strand)) 173 | elif alignment == 'local' and cluster is True: 174 | qsub = 'qsub -V -N usearch' 175 | os.system('echo "%s -ublast `pwd`/%s -db %s -blast6out `pwd`/%s -evalue %s -threads %s %s -maxhits %s >> `pwd`/log.txt" | %s' \ 176 | % (usearch_loc, query, db, out, evalue, threads, strand, max_hits, qsub)) 177 | else: 178 | print('specify local or global alignment', file=sys.stderr) 179 | exit() 180 | else: 181 | print('# ... 
usearch output found for %s as query and %s as database' % (query, db), file=sys.stderr) 182 | 183 | def outfile(query, method, database, prefix): 184 | type = check_type(query) 185 | query = query.rsplit('.', 1)[0] 186 | database = database.rsplit('.', 1)[0] 187 | if '/' in query: 188 | query = query.rsplit('/', 1)[1] 189 | if '/' in database: 190 | database = database.rsplit('/', 1)[1] 191 | out = '%s-%s_%s-%s.b6' % (query, method.split('-')[0], type, database) 192 | if prefix is not False: 193 | out = '%s%s' % (prefix, out) 194 | return out, type 195 | 196 | def search(query, database, method = 'usearch', alignment = 'local', max_hits = 100, threads = '6', prefix = False): 197 | out, type = outfile(query, method, database, prefix) 198 | if method == 'usearch': 199 | usearch(query, database, type, out, alignment = alignment, max_hits = max_hits, threads = threads) 200 | elif method == 'usearch-cluster': 201 | usearch(query, database, type, out, alignment = alignment, max_hits = max_hits, threads = threads, cluster = True) 202 | elif method == 'blast': 203 | blast(query, database, type, out, threads = threads) 204 | elif method == 'phmmer': 205 | phmmer(query, database, type, out, threads = threads) 206 | return out 207 | 208 | if __name__ == "__main__": 209 | parser = argparse.ArgumentParser(description = \ 210 | '# search sequences against database') 211 | parser.add_argument(\ 212 | '-q', required = True, \ 213 | help = 'query') 214 | parser.add_argument(\ 215 | '-d', required = True, \ 216 | help = 'database') 217 | parser.add_argument(\ 218 | '-a', default = 'usearch', \ 219 | help = 'algorithm: usearch (default), usearch-cluster, blast, phmmer') 220 | parser.add_argument(\ 221 | '-m', required = False, default = 100, type = int, \ 222 | help = 'max. 
number of hits (default = 100)') 223 | parser.add_argument(\ 224 | '-t', default = "6", \ 225 | help = 'threads (default = 6)') 226 | args = vars(parser.parse_args()) 227 | threads, query, database, method = args['t'], args['q'], args['d'], args['a'] 228 | if method != 'usearch-cluster': 229 | os.system('cat %s' % (search(query, database, method, threads = threads))) 230 | else: 231 | search(query, database, method, threads = '48') 232 | -------------------------------------------------------------------------------- /ctbBio/rRNA_insertions_gff.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | convert rRNA_insertions.py iTable to gff 5 | """ 6 | 7 | import os 8 | import sys 9 | import argparse 10 | import pandas as pd 11 | from ctbBio.fasta import iterate_fasta as parse_fasta 12 | 13 | def parse_catalytic(insertion, gff): 14 | """ 15 | parse catalytic RNAs to gff format 16 | """ 17 | offset = insertion['offset'] 18 | GeneStrand = insertion['strand'] 19 | if type(insertion['intron']) is not str: 20 | return gff 21 | for intron in parse_fasta(insertion['intron'].split('|')): 22 | ID, annot, strand, pos = intron[0].split('>')[1].split() 23 | Start, End = [int(i) for i in pos.split('-')] 24 | if strand != GeneStrand: 25 | if strand == '+': 26 | strand = '-' 27 | else: 28 | strand = '+' 29 | Start, End = End - 2, Start - 2 30 | Start, End = abs(Start + offset) - 1, abs(End + offset) - 1 31 | gff['#seqname'].append(insertion['ID']) 32 | gff['source'].append('Rfam') 33 | gff['feature'].append('Catalytic RNA') 34 | gff['start'].append(Start) 35 | gff['end'].append(End) 36 | gff['score'].append('.') 37 | gff['strand'].append(strand) 38 | gff['frame'].append('.') 39 | gff['attribute'].append('ID=%s; Name=%s' % (ID, annot)) 40 | return gff 41 | 42 | def parse_orf(insertion, gff): 43 | """ 44 | parse ORF to gff format 45 | """ 46 | offset = insertion['offset'] 47 | if type(insertion['orf']) is not str: 48 | return gff 49 | for orf in parse_fasta(insertion['orf'].split('|')): 50 | ID = orf[0].split('>')[1].split()[0] 51 | Start, End, strand = [int(i) for i in orf[0].split(' # ')[1:4]] 52 | if strand == 1: 53 | strand = '+' 54 | else: 55 | strand = '-' 56 | GeneStrand = insertion['strand'] 57 | if strand != GeneStrand: 58 | if strand == '+': 59 | strand = '-' 60 | else: 61 | strand = '+' 62 | Start, End = End - 2, Start - 2 63 | Start, End = abs(Start + offset) - 1, abs(End + offset) - 1 64 | annot = orf[0].split()[1] 65 | if annot == 'n/a': 66 | annot = 'unknown' 67 | gff['#seqname'].append(insertion['ID']) 68 | gff['source'].append('Prodigal and Pfam') 69 | gff['feature'].append('CDS') 70 | gff['start'].append(Start) 71 | gff['end'].append(End) 72 | gff['score'].append('.') 73 | gff['strand'].append(strand) 74 | gff['frame'].append('.') 75 | gff['attribute'].append('ID=%s; Name=%s' % (ID, annot)) 76 | return gff 77 | 78 | def parse_insertion(insertion, gff): 79 | """ 80 | parse insertion to gff format 81 | """ 82 | offset = insertion['offset'] 83 | for ins in parse_fasta(insertion['insertion sequence'].split('|')): 84 | strand = insertion['strand'] 85 | ID = ins[0].split('>')[1].split()[0] 86 | Start, End = [int(i) for i in ins[0].split('gene-pos=', 1)[1].split()[0].split('-')] 87 | Start, End = abs(Start + offset), abs(End + offset) 88 | if strand == '-': 89 | Start, End = End, Start 90 | gff['#seqname'].append(insertion['ID']) 91 | gff['source'].append(insertion['source']) 92 | gff['feature'].append('IVS') 93 | 
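# (coordinate sketch, illustrative values only: in --contigs mode with a '+'
#  strand gene starting at contig position 100, iTable2GFF() sets offset = 100,
#  so an insertion at gene-pos 1-120 maps to abs(1+100)-abs(120+100) = 101-220
#  on the contig; for '-' strand genes the offset is the negative gene end and
#  abs() folds the arithmetic back onto positive contig coordinates)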
gff['start'].append(Start) 94 | gff['end'].append(End) 95 | gff['score'].append('.') 96 | gff['strand'].append(strand) # same as rRNA 97 | gff['frame'].append('.') 98 | gff['attribute'].append('ID=%s' % (ID)) 99 | return gff 100 | 101 | def parse_masked(seq, min_len): 102 | """ 103 | parse masked sequence into non-masked and masked regions 104 | """ 105 | nm, masked = [[]], [[]] 106 | prev = None 107 | for base in seq[1]: 108 | if base.isupper(): 109 | nm[-1].append(base) 110 | if masked != [[]] and len(masked[-1]) < min_len: 111 | nm.extend(masked[-1]) 112 | del masked[-1] 113 | prev = False 114 | elif base.islower(): 115 | if prev is False: 116 | masked.append([]) 117 | nm.append([]) 118 | masked[-1].append(base) 119 | prev = True 120 | return nm, masked 121 | 122 | def parse_rRNA(insertion, seq, gff): 123 | """ 124 | parse rRNA to gff format 125 | """ 126 | offset = insertion['offset'] 127 | strand = insertion['strand'] 128 | for rRNA in parse_masked(seq, 0)[0]: 129 | rRNA = ''.join(rRNA) 130 | Start = seq[1].find(rRNA) + 1 131 | End = Start + len(rRNA) - 1 132 | if strand == '-': 133 | Start, End = End - 2, Start - 2 134 | pos = (abs(Start + offset) - 1, abs(End + offset) - 1) 135 | Start, End = min(pos), max(pos) 136 | source = insertion['source'] 137 | annot = '%s rRNA' % (source.split('from', 1)[0]) 138 | gff['#seqname'].append(insertion['ID']) 139 | gff['source'].append(source) 140 | gff['feature'].append('rRNA') 141 | gff['start'].append(Start) 142 | gff['end'].append(End) 143 | gff['score'].append('.') 144 | gff['strand'].append(strand) 145 | gff['frame'].append('.') 146 | gff['attribute'].append('Name=%s' % (annot)) 147 | return gff 148 | 149 | def iTable2GFF(iTable, fa, contig = False): 150 | """ 151 | convert iTable to gff file 152 | """ 153 | columns = ['#seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'] 154 | gff = {c:[] for c in columns} 155 | for insertion in iTable.iterrows(): 156 | insertion = insertion[1] 157 | if insertion['ID'] not in fa: 158 | continue 159 | # rRNA strand 160 | strand = insertion['sequence'].split('strand=', 1)[1].split()[0] 161 | # set rRNA positions for reporting features on contig or extracted sequence 162 | if contig is True: 163 | gene = [int(i) for i in insertion['sequence'].split('pos=', 1)[1].split()[0].split('-')] 164 | if strand == '-': 165 | offset = -1 * (gene[1]) 166 | else: 167 | offset = gene[0] 168 | else: 169 | strand = '+' 170 | gene = [1, int(insertion['sequence'].split('total-len=', 1)[1].split()[0])] 171 | offset = gene[0] 172 | insertion['strand'] = strand 173 | insertion['offset'] = offset 174 | # source for prediction 175 | source = insertion['sequence'].split('::model', 1)[0].rsplit(' ', 1)[-1] 176 | insertion['source'] = source 177 | # rRNA gene 178 | geneAnnot = '%s rRNA gene' % (source.split('from', 1)[0]) 179 | geneNum = insertion['sequence'].split('seq=', 1)[1].split()[0] 180 | gff['#seqname'].append(insertion['ID']) 181 | gff['source'].append(source) 182 | gff['feature'].append('Gene') 183 | gff['start'].append(gene[0]) 184 | gff['end'].append(gene[1]) 185 | gff['score'].append('.') 186 | gff['strand'].append(strand) 187 | gff['frame'].append('.') 188 | gff['attribute'].append('ID=%s; Name=%s' % (geneNum, geneAnnot)) 189 | # rRNA 190 | gff = parse_rRNA(insertion, fa[insertion['ID']], gff) 191 | # insertions 192 | gff = parse_insertion(insertion, gff) 193 | # orfs 194 | gff = parse_orf(insertion, gff) 195 | # catalytic RNAs 196 | gff = parse_catalytic(insertion, gff) 197 | return 
pd.DataFrame(gff)[columns].drop_duplicates() 198 | 199 | def name2id(name): 200 | """ 201 | convert header to id (check gene #) 202 | """ 203 | try: 204 | return '%s_%s' % (name.split()[0], name.split('seq=', 1)[1].split()[0]) 205 | except: 206 | print('name error:', name) 207 | 208 | if __name__ == '__main__': 209 | parser = argparse.ArgumentParser(description='# convert rRNA_insertions.py iTable to gff file') 210 | parser.add_argument(\ 211 | '-i', required = True, \ 212 | help = 'path to rRNA iTable') 213 | parser.add_argument(\ 214 | '-f', required = True, \ 215 | help = 'path to rRNA fasta file') 216 | parser.add_argument(\ 217 | '--contigs', action = 'store_true', \ 218 | help = 'report positions relative to contigs, not rRNA genes') 219 | args = vars(parser.parse_args()) 220 | # load iTable 221 | rn = {'insertion.1':'insertion sequence', '#sequence':'sequence'} 222 | iTable = pd.read_csv(args['i'], sep = '\t').rename(columns = rn) 223 | if args['contigs'] == True: 224 | iTable['ID'] = [name.split()[0] for name in iTable['sequence']] 225 | else: 226 | iTable['ID'] = [name2id(name) for name in iTable['sequence']] 227 | iTable['insertion ID'] = [name.split('>')[1].split()[0] for name in iTable['insertion sequence']] 228 | # load sequences 229 | fa = args['f'] 230 | if args['contigs'] == True: 231 | fa = {seq[0].split('>')[1].split()[0]:seq for seq in parse_fasta(fa)} 232 | else: 233 | fa = {name2id(seq[0].split('>')[1]):seq for seq in parse_fasta(fa)} 234 | if args['contigs'] is False: 235 | gffFile = '%s.gff' % (args['i'].rsplit('.', 1)[0]) 236 | # print fasta with seq IDs 237 | faFile = '%s.gff.fa' % (args['i'].rsplit('.', 1)[0]) 238 | faFile = open(faFile, 'w') 239 | for ID, seq in fa.items(): 240 | print('>%s' % (ID), file = faFile) 241 | print(seq[1], file = faFile) 242 | # convert to gff 243 | else: 244 | gffFile = '%s.contigs.gff' % (args['i'].rsplit('.', 1)[0]) 245 | gff = iTable2GFF(iTable, fa, contig = args['contigs']) 246 | gff.to_csv(open(gffFile, 'w'), sep = '\t', index = False) 247 | -------------------------------------------------------------------------------- /ctbBio/orthologer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | chris brown 5 | banfield lab 6 | ctb@berkeley.edu 7 | 8 | draft script 9 | 10 | script for finding orthologs and making a table for comparing 11 | fasta files or genomes (.faa or .fna ORF predictions) by ordering the output 12 | 13 | global or reference modes 14 | blast / usearch every genome against every other genome 15 | make a dictionary using every gene ID as a key: 16 | genes[ID] = [sort id, fasta, description], match 17 | match[ID] = [rec. hit?, [alignment length, pident, evalue, bitscore]] 18 | output = [id, scores, description, id2, scores2, description2, id3, scores3, description3, ...] 19 | scores = [! ** alignment lenght2 | pidnet2 | evalue2 | bitscore2 ** alignment length3 | pidnet3 | evalue3 | bitscore3 ** ...] 
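    (in the scores string, '!' marks the gene's own column, ' ** ' separates the
    scores against each of the other genomes, and ' | ' separates the four
    per-hit values: alignment length, percent identity, e-value, bit score)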
20 | """ 21 | 22 | # improve sorting to use a list (priority sort the list) 23 | # improve sorting module 24 | # documentation 25 | # for global - must examine every combination of possible orthologs 26 | # - do this in a non-redundant manner (eg only half of total gene list) 27 | # - will not need to convert to a set 28 | # - will need to evaluate the list, and the complete list for redundnacies 29 | 30 | import os 31 | import sys 32 | from operator import itemgetter 33 | from datetime import datetime as datetime 34 | from multiprocessing import Pool as multithread 35 | from itertools import permutations as permutations 36 | 37 | #ctbBio 38 | from ctbBio.search import search as search 39 | from ctbBio.fasta import iterate_fasta as parse_fasta 40 | from ctbBio.rec_best_blast import rec_hits as rec_best_blast 41 | 42 | # global variables 43 | threads = 4 44 | e = float(0.01) 45 | bit = float(40) 46 | length = float(.65) 47 | algorithm = 'usearch' 48 | genes = {} # info and search results for each gene and if match is a rec. best match (under threshold) 49 | g2index = {} # index value for each gene ID 50 | 51 | def log(mode, fastas): 52 | print('### running: %s' % (sys.argv[0])) 53 | print('### start time: %s' % (datetime.now())) 54 | print('### mode: %s' % (mode)) 55 | print('### algorithm: %s' % (algorithm)) 56 | print('### evalue threshold: %s' % (e)) 57 | print('### bit score threshold: %s' % (bit)) 58 | print('### alignment length threshold (currently only for global mode): %s' % (length)) 59 | print('### comparing files: \n ... %s \n' % ('\n ... '.join(fastas))) 60 | 61 | def find_genes(fastas): 62 | index = 0 63 | for fasta in fastas: 64 | previous = 0 65 | for sequence in parse_fasta(fasta): 66 | header = sequence[0].split('>')[1].split(' ', 1) 67 | id = header[0] 68 | if len(header) > 1: 69 | description = header[1] 70 | else: 71 | description = 'n/a' 72 | genes[id] = [[id, fasta, description, previous], {}] 73 | g2index[id] = index 74 | index += 1 75 | previous = id 76 | 77 | def run_search(comparison): 78 | query, database = comparison[0], comparison[1] 79 | return search(query, database, algorithm, max_hits = 5) 80 | 81 | def search_fastas(fastas, mode): 82 | if mode == 'global': 83 | comparisons = [comparison for comparison in permutations(fastas, 2)] 84 | elif mode == 'reference': 85 | comparisons = [[fastas[0], fasta] for fasta in fastas if fasta != fastas[0]] 86 | reverse = [[i[1], i[0]] for i in comparisons] 87 | comparisons.extend(reverse) 88 | search_output = [] # list of blast or usearch output files 89 | pool = multithread(threads) 90 | search_output.append(pool.map(run_search, comparisons)) 91 | pool.close() 92 | pool.join() 93 | return search_output 94 | 95 | def rec_hits(outfile): 96 | for hit in rec_best_blast(outfile, evalue = e, bit = bit): 97 | hit = hit.split('\t') 98 | query, match = hit[0].split()[0], hit[1].split()[0] 99 | length, pidnet, evalue, bitscore = hit[3], hit[2], hit[-2], hit[-1] 100 | info = [length, pidnet, evalue, bitscore] 101 | genes[query][1][match] = [1, info] 102 | 103 | def global_orthologs(fastas): 104 | from neto import neto as neto 105 | import networkx as nx 106 | graph = neto(fastas, algorithm = algorithm, e = e, bit = bit, length = length, norm_bit = False) 107 | results = [] 108 | for group in nx.connected_components(graph): 109 | f2g = {} 110 | r = [] 111 | for g in group: 112 | f2g[genes[g][0][1]] = g 113 | for other in group: 114 | if other != g: 115 | if other in graph[g]: 116 | s = graph[g][other] 117 | scores = 
[s['length_fraction'], s['percent_id'], s['e_value'], s['bit_score']] 118 | scores = [str(i) for i in scores] 119 | rbh = 1 120 | else: 121 | rbh = 0 122 | scores = ['-', '-', '-', '-'] 123 | if other not in genes[g][1]: 124 | genes[g][1][other] = [rbh, []] 125 | genes[g][1][other][1] = scores 126 | for fasta in fastas: 127 | if fasta not in f2g: 128 | r.append('-') 129 | else: 130 | r.append(f2g[fasta]) 131 | results.append('*'.join(r)) 132 | return set(results) 133 | 134 | def reference_orthologs(fastas): 135 | results = [] 136 | for gene in genes: 137 | gene_info = genes[gene][0] 138 | gene_matches = genes[gene][1] 139 | fasta_index = fastas.index(gene_info[1]) 140 | query = [gene, fasta_index] 141 | orthologs = ['-' for i in fastas] 142 | if len(gene_matches) == 0: 143 | orthologs[fasta_index] = gene 144 | elif fasta_index == 0: 145 | for match in gene_matches: 146 | if gene_matches[match][0] == 1: 147 | gene_info = genes[match][0] 148 | fasta_index = fastas.index(gene_info[1]) 149 | orthologs[query[1]] = query[0] 150 | orthologs[fasta_index] = match 151 | if orthologs != ['-' for i in fastas]: 152 | results.append('*'.join(orthologs)) 153 | return set(results) 154 | 155 | def get_scores(match, matches): 156 | scores = [] 157 | for hit in matches: 158 | if hit == match: 159 | scores.append('!') 160 | elif hit == '-': 161 | scores.append('-') 162 | elif hit in genes[match][1]: 163 | scores.append(' | '.join(genes[match][1][hit][1])) 164 | else: 165 | scores.append('-') 166 | scores = ' ** '.join(scores) 167 | return scores 168 | 169 | def find_previous(matches, fasta): 170 | matches = [match for match in matches if match != '-'] 171 | candidates = [] 172 | for match in matches: 173 | previous = '-' 174 | while previous == '-': 175 | other_previous = genes[match][0][3] 176 | if other_previous == 0: 177 | previous = [100, match] 178 | else: 179 | for hit in genes[other_previous][1]: 180 | if other_previous in genes[hit][1]: 181 | if genes[hit][0][1] == fasta and genes[hit][1][other_previous][0] == 1: 182 | score = genes[hit][1][other_previous][1][2] 183 | previous = [float(score), hit] 184 | match = other_previous 185 | candidates.append(previous) 186 | previous = sorted(candidates)[0][1] 187 | return previous 188 | 189 | def find_sort_id(matches): 190 | id = [] 191 | index = 0 192 | for gene in matches: 193 | if gene == '-': 194 | fasta = fastas[index] 195 | id.append(g2index[find_previous(matches, fasta)]) 196 | else: 197 | id.append(g2index[gene]) 198 | index += 1 199 | return id 200 | # return ''.join(id) 201 | 202 | def format_results(results): 203 | formatted = [] 204 | for matches in results: 205 | matches = matches.split('*') 206 | format = [] 207 | sort_id = find_sort_id(matches) 208 | for match in matches: 209 | if match != '-': 210 | description = genes[match][0][2] 211 | scores = get_scores(match, matches) 212 | format.append('\t'.join([match, scores, description])) 213 | else: 214 | format.append('\t'.join(['-', '-', '-'])) 215 | formatted.append(sort_id + ['\t'.join(format)]) 216 | return sorted(formatted) 217 | 218 | def print_results(results, fastas): 219 | header = ['%s\tscores\tdescription' % fasta for fasta in fastas] 220 | results = format_results(results) 221 | print('### end time: %s \n' % (datetime.now())) 222 | print('### output: \n') 223 | print('# %s' % ('\t'.join(header))) 224 | for result in results: 225 | print(result[-1]) 226 | 227 | def orthologer(mode, fastas): 228 | find_genes(fastas) # find all genes in all fasta files 229 | if mode == 'global': 230 | 
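# global mode, sketched: every fasta is searched against every other, rec. best
# hits become edges in a networkx graph (built by neto), and each connected
# component is reported as one ortholog group with one column per input fasta;
# rows are later ordered by find_sort_id(), which fills empty '-' columns with
# the index of the nearest upstream gene's rec.-best partner (find_previous).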
results = global_orthologs(fastas) 231 | if mode == 'reference': 232 | search_output = search_fastas(fastas, mode)[0] 233 | # search fasta files against one another and get a list of output files 234 | rec_hits(search_output) 235 | # find rec. best blast hits under e-value threshold 236 | results = reference_orthologs(fastas) 237 | print_results(results, fastas) 238 | 239 | if __name__ == "__main__": 240 | if len(sys.argv) == 1: 241 | print('please specify \'global\' or \'reference\' mode and the fasta files to compare') 242 | exit() 243 | mode = sys.argv[1] 244 | fastas = sys.argv[2:] 245 | if mode != 'global' and mode != 'reference': 246 | print('please specify \'global\' or \'reference\' mode and the fasta files to compare') 247 | exit() 248 | log(mode, fastas) 249 | orthologer(mode, fastas) 250 | -------------------------------------------------------------------------------- /ctbBio/ncbi_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | script for downloading genomes from NCBI 5 | """ 6 | 7 | # python modules 8 | import os 9 | import sys 10 | import argparse 11 | import pandas as pd 12 | from tqdm import tqdm 13 | from subprocess import Popen 14 | from glob import glob as glob 15 | from multiprocessing import Pool 16 | from subprocess import Popen, PIPE 17 | 18 | def calcMD5(path, md5): 19 | """ 20 | calc MD5 based on path 21 | """ 22 | # check that file exists 23 | if os.path.exists(path) is False: 24 | yield False 25 | else: 26 | command = [md5, path] 27 | p = Popen(command, stdout = PIPE) 28 | for line in p.communicate()[0].splitlines(): 29 | line = line.decode('ascii').strip().split() 30 | # check if `md5` output 31 | if line[0] == 'MD5': 32 | yield line[-1] 33 | # else assume md5sum output 34 | else: 35 | yield line[0] 36 | p.wait() 37 | yield False 38 | 39 | def md5check(f, ftp, md5, md5p, exclude): 40 | """ 41 | * comfirm that downloaded files match md5 checksum on server 42 | * if md5 is False, only check path for the download 43 | """ 44 | files = glob(f) 45 | # if no md5 file is specified: download files if path does not exist 46 | if md5 is False: 47 | if len(files) == 0: 48 | return False 49 | print('## already downloaded:', f) 50 | return True 51 | # get md5s from server 52 | ## path to md5 file on ftp server 53 | md5 = '%s/%s' % (ftp.rsplit('/', 1)[0], md5) 54 | ## read md5 table from server 55 | try: 56 | md5 = pd.read_csv(md5, delim_whitespace = True, names = ['ftp md5', 'file']) 57 | except: 58 | return False 59 | ## filter for md5 files that match file type 60 | t = f.split('*')[1] 61 | md5 = md5[md5['file'].str.contains(t)] 62 | ## remove preceding characters from file paths 63 | md5['file'] = [i.replace('./', '') for i in md5['file']] 64 | ## exclude md5s for sub directories 65 | md5 = md5[~md5['file'].str.contains('/')] 66 | ## exclude files 67 | md5 = md5[~md5['file'].str.contains(exclude.replace('*', ''))] 68 | # get local md5s 69 | md5['local md5'] = [[j for j in calcMD5(i, md5p)][0] for i in md5['file']] 70 | # return false if md5s do not match 71 | for i, File in md5.iterrows(): 72 | if File['ftp md5'] != File['local md5']: 73 | try: 74 | os.remove(File['file']) 75 | return False 76 | except: 77 | return False 78 | print('## already downloaded:', f) 79 | return True 80 | 81 | 82 | def wget(ftp, f = False, exclude = False, name = False, 83 | md5 = False, md5p = 'md5sum', tries = 10): 84 | """ 85 | download files with wget 86 | """ 87 | # file name 88 | if f is False: 89 | f 
= ftp.rsplit('/', 1)[-1] 90 | # downloaded file if it does not already exist 91 | # check md5s on server (optional) 92 | t = 0 93 | while md5check(f, ftp, md5, md5p, exclude) is not True: 94 | t += 1 95 | if name is not False: 96 | print('# downloading:', name, f) 97 | if exclude is False: 98 | command = 'wget -q --random-wait %s' % (ftp) 99 | else: 100 | command = 'wget -q --random-wait -R %s %s' % (exclude, ftp) 101 | p = Popen(command, shell = True) 102 | p.communicate() 103 | if t >= tries: 104 | print('not downloaded:', name, f) 105 | return [f, False] 106 | return [f, True] 107 | 108 | def check(line, queries): 109 | """ 110 | check that at least one of 111 | queries is in list, l 112 | """ 113 | line = line.strip() 114 | spLine = line.replace('.', ' ').split() 115 | matches = set(spLine).intersection(queries) 116 | if len(matches) > 0: 117 | return matches, line.split('\t') 118 | return matches, False 119 | 120 | def entrez(db, acc): 121 | """ 122 | search entrez using specified database 123 | and accession 124 | """ 125 | c1 = ['esearch', '-db', db, '-query', acc] 126 | c2 = ['efetch', '-db', 'BioSample', '-format', 'docsum'] 127 | p1 = Popen(c1, stdout = PIPE, stderr = PIPE) 128 | p2 = Popen(c2, stdin = p1.stdout, stdout = PIPE, stderr = PIPE) 129 | return p2.communicate() 130 | 131 | def searchAccession(acc): 132 | """ 133 | attempt to use NCBI Entrez to get 134 | BioSample ID 135 | """ 136 | # try genbank file 137 | # genome database 138 | out, error = entrez('genome', acc) 139 | for line in out.splitlines(): 140 | line = line.decode('ascii').strip() 141 | if 'Assembly_Accession' in line or 'BioSample' in line: 142 | newAcc = line.split('>')[1].split('<')[0].split('.')[0].split(',')[0] 143 | if len(newAcc) > 0: 144 | return (True, acc, newAcc) 145 | # nucleotide database 146 | out, error = entrez('nucleotide', acc) 147 | for line in out.splitlines(): 148 | line = line.decode('ascii').strip() 149 | if 'Assembly_Accession' in line or 'BioSample' in line: 150 | newAcc = line.split('>')[1].split('<')[0].split('.')[0].split(',')[0] 151 | if len(newAcc) > 0: 152 | return (True, acc, newAcc) 153 | # assembly database 154 | out, error = entrez('assembly', acc) 155 | for line in out.splitlines(): 156 | line = line.decode('ascii').strip() 157 | if 'Assembly_Accession' in line or 'BioSample' in line: 158 | newAcc = line.split('>')[1].split('<')[0].split('.')[0].split(',')[0] 159 | if len(newAcc) > 0: 160 | return (True, acc, newAcc) 161 | for error in error.splitlines(): 162 | error = error.decode('ascii').strip() 163 | if '500 Can' in error: 164 | return (False, acc, 'no network') 165 | return (False, acc, 'efetch failed') 166 | 167 | def getFTPs(accessions, ftp, search, exclude, convert = False, threads = 1, attempt = 1, 168 | max_attempts = 2): 169 | """ 170 | download genome info from NCBI 171 | """ 172 | info = wget(ftp)[0] 173 | allMatches = [] 174 | for genome in open(info, encoding = 'utf8'): 175 | genome = str(genome) 176 | matches, genomeInfo = check(genome, accessions) 177 | if genomeInfo is not False: 178 | f = genomeInfo[0] + search 179 | Gftp = genomeInfo[19] 180 | Gftp = Gftp + '/' + search 181 | allMatches.extend(matches) 182 | yield [Gftp, f, exclude, matches] 183 | # print accessions that could not be matched 184 | # and whether or not they could be converted (optional) 185 | newAccs = [] 186 | missing = accessions.difference(set(allMatches)) 187 | if convert is True: 188 | pool = Pool(threads) 189 | pool = pool.imap_unordered(searchAccession, missing) 190 | for newAcc in 
tqdm(pool, total = len(missing)): 191 | status, accession, newAcc = newAcc 192 | if status is True: 193 | newAccs.append(newAcc) 194 | print('not found:', accession, '->', newAcc) 195 | else: 196 | for accession in missing: 197 | print('not found:', accession) 198 | # re-try after converting accessions (optional) 199 | if len(newAccs) > 0 and attempt <= max_attempts: 200 | print('convert accession attempt', attempt) 201 | attempt += 1 202 | for hit in getFTPs(set(newAccs), ftp, search, exclude, convert, 203 | threads = 1, attempt = attempt): 204 | yield hit 205 | 206 | def wgetGenome(pars, md5 = 'md5checksums.txt'): 207 | """ 208 | """ 209 | ftp, f, exclude, matches, md5p = pars 210 | name = ';'.join(list(matches)) 211 | return wget(ftp, f, exclude, name, md5 = md5, md5p = md5p) 212 | 213 | def download(args): 214 | """ 215 | download genomes from NCBI 216 | """ 217 | accessions, infoFTP = set(args['g']), args['i'] 218 | search, exclude, md5p = args['s'], args['e'], args['m'] 219 | FTPs = getFTPs(accessions, infoFTP, search, exclude, threads = args['t'], 220 | convert = args['convert']) 221 | FTPs = [ftp + [md5p] for ftp in FTPs] 222 | if args['test'] is True: 223 | for genome in FTPs: 224 | print('found:', ';'.join(genome[-1]), genome[0]) 225 | return FTPs 226 | pool = Pool(args['t']) 227 | pool = pool.imap_unordered(wgetGenome, FTPs) 228 | files = [] 229 | for f in tqdm(pool, total = len(accessions)): 230 | files.append(f) 231 | return files 232 | 233 | if __name__ == '__main__': 234 | ftp = 'ftp://ftp.ncbi.nih.gov/genomes/genbank/assembly_summary_genbank.txt' 235 | parser = argparse.ArgumentParser(description='# download genomes from NCBI') 236 | parser.add_argument(\ 237 | '-g', nargs = '*', action = 'store', 238 | required = True, help = 'list of genome accession numbers (- for stdin)') 239 | parser.add_argument(\ 240 | '-s', default = '*.fna.gz', 241 | required = False, help = 'search term for download (default = "*.fna.gz")') 242 | parser.add_argument(\ 243 | '-e', default = '*from_genomic*', 244 | required = False, 245 | help = 'search exclusion term, or False (default = "*from_genomic*")') 246 | parser.add_argument(\ 247 | '-i', default = ftp, 248 | required = False, help = 'genome info FTP (default: %s)' % (ftp)) 249 | parser.add_argument(\ 250 | '-m', default = 'md5sum', type = str, 251 | required = False, help = 'md5 program (default = md5sum, md5 on Mac)') 252 | parser.add_argument(\ 253 | '-t', default = 3, type = int, 254 | required = False, help = 'threads (default = 3)') 255 | parser.add_argument(\ 256 | '--convert', action = 'store_true', required = False, 257 | help = 'convert missing accessions using Entrez Direct (slow; requires `esearch` and `efetch`)') 258 | parser.add_argument(\ 259 | '--test', action = 'store_true', required = False, 260 | help = 'look for genomes, but do not download them') 261 | args = vars(parser.parse_args()) 262 | if args['e'] == 'False' or args['e'] == 'FALSE': 263 | args['e'] = False 264 | if args['g'][0] == '-': 265 | args['g'] = [i.strip() for i in sys.stdin] 266 | print('# downloading genome info:', args['i']) 267 | download(args) 268 | 269 | -------------------------------------------------------------------------------- /ctbBio/mapped.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | script for sorting and filtering a sam file 5 | """ 6 | 7 | import sys 8 | import os 9 | from itertools import cycle 10 | from subprocess import Popen, PIPE 11 | import argparse 
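# sam2fastq() below rebuilds a fastq record from SAM fields 1 (QNAME), 10 (SEQ)
# and 11 (QUAL). A minimal sketch (the alignment line is made up):
#
#     line = ['read1', '0', 'scaf_1', '100', '42', '8M', '*', '0', '0',
#             'ACGTACGT', 'IIIIIIII', 'NM:i:0']
#     sam2fastq(line)  # -> ['@read1', 'ACGTACGT', '+read1', 'IIIIIIII']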
12 | 13 | def sam2fastq(line): 14 | """ 15 | print fastq from sam 16 | """ 17 | fastq = [] 18 | fastq.append('@%s' % line[0]) 19 | fastq.append(line[9]) 20 | fastq.append('+%s' % line[0]) 21 | fastq.append(line[10]) 22 | return fastq 23 | 24 | def count_mismatches(read): 25 | """ 26 | look for NM:i: flag to determine number of mismatches 27 | """ 28 | if read is False: 29 | return False 30 | mm = [int(i.split(':')[2]) for i in read[11:] if i.startswith('NM:i:')] 31 | if len(mm) > 0: 32 | return sum(mm) 33 | else: 34 | return False 35 | 36 | def check_mismatches(read, pair, mismatches, mm_option, req_map): 37 | """ 38 | - check to see if the read maps with <= threshold number of mismatches 39 | - mm_option = 'one' or 'both' depending on whether or not one or both reads 40 | in a pair need to pass the mismatch threshold 41 | - pair can be False if read does not have a pair 42 | - make sure alignment score is not 0, which would indicate that the read was not aligned to the reference 43 | """ 44 | # if read is not paired, make sure it is mapped and that mm <= thresh 45 | if pair is False: 46 | mm = count_mismatches(read) 47 | if mm is False: 48 | return False 49 | # if no threshold is supplied, return True 50 | if mismatches is False: 51 | return True 52 | # passes threshold? 53 | if mm <= mismatches: 54 | return True 55 | # paired reads 56 | r_mm = count_mismatches(read) 57 | p_mm = count_mismatches(pair) 58 | # if neither read is mapped, return False 59 | if r_mm is False and p_mm is False: 60 | return False 61 | # if no threshold, return True 62 | if mismatches is False: 63 | return True 64 | # if req_map is True, both reads have to map 65 | if req_map is True: 66 | if r_mm is False or p_mm is False: 67 | return False 68 | ## if option is 'one,' only one read has to pass threshold 69 | if mm_option == 'one': 70 | if (r_mm is not False and r_mm <= mismatches) or (p_mm is not False and p_mm <= mismatches): 71 | return True 72 | ## if option is 'both,' both reads have to pass threshold 73 | if mm_option == 'both': 74 | ## if one read in pair does not map to the scaffold, 75 | ## make sure the other read passes threshold 76 | if r_mm is False: 77 | if p_mm <= mismatches: 78 | return True 79 | elif p_mm is False: 80 | if r_mm <= mismatches: 81 | return True 82 | elif (r_mm is not False and r_mm <= mismatches) and (p_mm is not False and p_mm <= mismatches): 83 | return True 84 | return False 85 | 86 | def get_overlap(a, b): 87 | """ 88 | report overlap of coordinates 89 | """ 90 | return max(0, min(a[1], b[1]) - max(a[0], b[0])) 91 | 92 | def check_region(read, pair, region): 93 | """ 94 | determine whether or not reads map to specific region of scaffold 95 | """ 96 | if region is False: 97 | return True 98 | for mapping in read, pair: 99 | if mapping is False: 100 | continue 101 | start, length = int(mapping[3]), len(mapping[9]) 102 | r = [start, start + length - 1] 103 | if get_overlap(r, region) > 0: 104 | return True 105 | return False 106 | 107 | def reads_from_mapping(mapping, contigs, mismatches, mm_option, req_map, region): 108 | c = cycle([1, 2]) 109 | for line in mapping: 110 | line = line.strip().split('\t') 111 | if line[0].startswith('@'): # get the sam header 112 | if line[0].startswith('@SQ'): 113 | contig = line[1].split('SN:', 1)[1] 114 | if contigs is not False: 115 | if contig in contigs: 116 | yield [0, line] 117 | else: 118 | yield [0, line] 119 | else: 120 | yield [0, line] 121 | continue 122 | if int(line[1]) <= 20: # is this from a single read? 
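# (FLAG note: values <= 20 can only combine bits 0x1/0x2/0x4/0x10, while reads
#  flagged as part of a pair also carry 0x40 or 0x80 and therefore score > 20;
#  this cheap unpaired-read test assumes the aligner sets the first/second-in-pair
#  bits, as standard aligners do for paired input)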
123 | if contigs is False or line[2] in contigs: 124 | if check_mismatches(line, False, mismatches, mm_option, req_map) is True: 125 | if check_region(line, False, region) is False: 126 | continue 127 | yield [1, sam2fastq(line)] 128 | yield [10, line] 129 | else: 130 | n = next(c) 131 | if n == 2: 132 | if contigs is False: 133 | if prev[2] != '*' or line[2] != '*': 134 | if check_mismatches(line, prev, mismatches, mm_option, req_map) is True: 135 | if check_region(line, prev, region) is False: 136 | continue 137 | yield [2, sam2fastq(prev)] 138 | yield [20, prev] 139 | yield [2, sam2fastq(line)] 140 | yield [20, line] 141 | else: 142 | if prev[2] in contigs or line[2] in contigs: 143 | if check_mismatches(line, prev, mismatches, mm_option, req_map) is True: 144 | if check_region(line, prev, region) is False: 145 | continue 146 | yield [2, sam2fastq(prev)] 147 | yield [20, prev] 148 | yield [2, sam2fastq(line)] 149 | yield [20, line] 150 | prev = line 151 | 152 | def get_reads(sam, \ 153 | contigs = False, mismatches = False, mm_option = False, \ 154 | sort_sam = True, req_map = False, region = False, sbuffer = False): 155 | """ 156 | get mapped reads (and their pairs) from an unsorted sam file 157 | """ 158 | tempdir = '%s/' % (os.path.abspath(sam).rsplit('/', 1)[0]) 159 | if sort_sam is True: 160 | mapping = '%s.sorted.sam' % (sam.rsplit('.', 1)[0]) 161 | if sam != '-': 162 | if os.path.exists(mapping) is False: 163 | os.system("\ 164 | sort -k1 --buffer-size=%sG -T %s -o %s %s\ 165 | " % (sbuffer, tempdir, mapping, sam)) 166 | else: 167 | mapping = 'stdin-sam.sorted.sam' 168 | p = Popen("sort -k1 --buffer-size=%sG -T %s -o %s" \ 169 | % (sbuffer, tempdir, mapping), stdin = sys.stdin, shell = True) 170 | p.communicate() 171 | mapping = open(mapping) 172 | else: 173 | if sam == '-': 174 | mapping = sys.stdin 175 | else: 176 | mapping = open(sam) 177 | for read in reads_from_mapping(mapping, contigs, mismatches, mm_option, req_map, region): 178 | yield read 179 | 180 | if __name__ == '__main__': 181 | parser = argparse.ArgumentParser(description = '# filter sam file based on mismatches') 182 | parser.add_argument(\ 183 | '-s', required = True, help = 'path to sorted sam file (- for stdin)') 184 | parser.add_argument(\ 185 | '-m', required = True, help = 'maximum number of mismatches (or False to include all mapped)') 186 | parser.add_argument(\ 187 | '-p', required = True, help = 'require that "one" or "both" reads in pair have <= m mismatches') 188 | parser.add_argument(\ 189 | '--require-mapping', action = 'store_true', help = 'require both reads are mapped') 190 | parser.add_argument(\ 191 | '-o', default = False, help = 'name for new sam file') 192 | parser.add_argument(\ 193 | '-f', default = False, help = 'filter based on scaffold name (single name or - if list from stdin)') 194 | parser.add_argument(\ 195 | '-c', default = False, help = 'report reads mapped to region of scaffold (e.g. 1-500)') 196 | parser.add_argument(\ 197 | '-r', action = 'store_true', help = 'print mapped paired reads to stdout and single reads to stderr') 198 | parser.add_argument(\ 199 | '--sort', action = 'store_true', help = 'sort the sam file') 200 | parser.add_argument(\ 201 | '-b', default = "100", help = 'buffer size (GB) to use when sorting sam file (default = 100)') 202 | args = vars(parser.parse_args()) 203 | # is -o or -r specified? 
If not, script won't output anything
204 |     if args['o'] is False and args['r'] is False:
205 |         print('# specify -o and/or -r')
206 |         exit()
207 |     sam, contigs, mismatches, mm_option, new_sam = args['s'], args['f'], args['m'], args['p'], args['o']
208 |     print_reads, sort_sam, req_map = args['r'], args['sort'], args['require_mapping']
209 |     region = args['c']
210 |     sbuffer = args['b']
211 |     # convert region to list
212 |     if region is not False:
213 |         if '-' not in region:
214 |             print('# specify range with -c (e.g. 1-500)')
215 |             exit()
216 |         region = [int(i) for i in region.split('-')]
217 |     if mismatches == 'False' or mismatches == 'FALSE' or mismatches == 'false':
218 |         mismatches = False
219 |     if mismatches is False: # make sure mm_option is not used if mismatches is not specified
220 |         mm_option = False
221 |     else:
222 |         mismatches = int(mismatches)
223 |         if mm_option != 'one' and mm_option != 'both':
224 |             print('# specify one or both for mismatch option', file=sys.stderr)
225 |             print('# i.e. should the mismatches threshold apply to one or both reads in a pair?', file=sys.stderr)
226 |             exit()
227 |     if contigs == '-':
228 |         contigs = [i.strip() for i in sys.stdin]
229 |     if new_sam is not False:
230 |         new_sam = open(new_sam, 'w')
231 |     for type, read in get_reads(sam, contigs, mismatches, mm_option, sort_sam, req_map, region, sbuffer):
232 |         if type == 1:
233 |             if print_reads is True:
234 |                 print('\n'.join(read), file=sys.stderr)
235 |         elif type == 2:
236 |             if print_reads is True:
237 |                 print('\n'.join(read))
238 |         elif new_sam is not False and (type == 0 or type == 10 or type == 20):
239 |             print('\t'.join(read), file=new_sam)
240 |     if new_sam is not False:
241 |         new_sam.close()
242 | 
--------------------------------------------------------------------------------
/ctbBio/compare_aligned.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import os
4 | import sys
5 | import argparse
6 | import itertools
7 | import numpy as np
8 | from tqdm import tqdm as tqdm
9 | from multiprocessing import Pool as multithread
10 | from Levenshtein import ratio as lr # required for --leven (python-Levenshtein, see requirements.txt)
11 | 
12 | from ctbBio.nr_fasta import de_rep as nr_fasta
13 | 
14 | def calc_pident(a, b):
15 |     """
16 |     calculate percent identity
17 |     """
18 |     m = 0 # matches
19 |     mm = 0 # mismatches
20 |     for A, B in zip(list(a), list(b)):
21 |         if A == '.' or B == '.':
22 |             continue
23 |         if A == '-' and B == '-':
24 |             continue
25 |         if A == B:
26 |             m += 1
27 |         else:
28 |             mm += 1
29 |     try:
30 |         return float(float(m)/float((m + mm))) * 100
31 |     except:
32 |         return 0
33 | 
34 | def calc_pident_ignore_gaps(a, b):
35 |     """
36 |     calculate percent identity, ignoring gap columns
37 |     """
38 |     m = 0 # matches
39 |     mm = 0 # mismatches
40 |     for A, B in zip(list(a), list(b)):
41 |         if A == '-' or A == '.' or B == '-' or B == '.':
42 |             continue
43 |         if A == B:
44 |             m += 1
45 |         else:
46 |             mm += 1
47 |     try:
48 |         return float(float(m)/float((m + mm))) * 100
49 |     except:
50 |         return 0
51 | 
52 | def remove_gaps(A, B):
53 |     """
54 |     skip column if either is a gap
55 |     """
56 |     a_seq, b_seq = [], []
57 |     for a, b in zip(list(A), list(B)):
58 |         if a == '-' or a == '.'
or b == '-' or b == '.': 59 | continue 60 | a_seq.append(a) 61 | b_seq.append(b) 62 | return ''.join(a_seq), ''.join(b_seq) 63 | 64 | def compare_seqs(seqs): 65 | """ 66 | compare pairs of sequences 67 | """ 68 | A, B, ignore_gaps = seqs 69 | a, b = A[1], B[1] # actual sequences 70 | if len(a) != len(b): 71 | print('# reads are not the same length', file=sys.stderr) 72 | exit() 73 | if ignore_gaps is True: 74 | pident = calc_pident_ignore_gaps(a, b) 75 | else: 76 | pident = calc_pident(a, b) 77 | return A[0], B[0], pident 78 | 79 | def compare_seqs_leven(seqs): 80 | """ 81 | calculate Levenshtein ratio of sequences 82 | """ 83 | A, B, ignore_gaps = seqs 84 | a, b = remove_gaps(A[1], B[1]) # actual sequences 85 | if len(a) != len(b): 86 | print('# reads are not the same length', file=sys.stderr) 87 | exit() 88 | pident = lr(a, b) * 100 89 | return A[0], B[0], pident 90 | 91 | def pairwise_compare(afa, leven, threads, print_list, ignore_gaps): 92 | """ 93 | make pairwise sequence comparisons between aligned sequences 94 | """ 95 | # load sequences into dictionary 96 | seqs = {seq[0]: seq for seq in nr_fasta([afa], append_index = True)} 97 | num_seqs = len(seqs) 98 | # define all pairs 99 | pairs = ((i[0], i[1], ignore_gaps) for i in itertools.combinations(list(seqs.values()), 2)) 100 | pool = multithread(threads) 101 | # calc percent identity between all pairs - parallelize 102 | if leven is True: 103 | pident = pool.map(compare_seqs_leven, pairs) 104 | else: 105 | compare = pool.imap_unordered(compare_seqs, pairs) 106 | pident = [i for i in tqdm(compare, total = (num_seqs*num_seqs)/2)] 107 | pool.close() 108 | pool.terminate() 109 | pool.join() 110 | return to_dictionary(pident, print_list) 111 | 112 | def to_dictionary(pw, print_list): 113 | """ 114 | - convert list of comparisons to dictionary 115 | - print list of pidents (if requested) to stderr 116 | """ 117 | pairs = {} 118 | for p in pw: 119 | a, b, pident = p 120 | if a not in pairs: 121 | pairs[a] = {a: '-'} 122 | if b not in pairs: 123 | pairs[b] = {b: '-'} 124 | pairs[a][b] = pident 125 | pairs[b][a] = pident 126 | if print_list is True: 127 | A, B = a.split('>')[1], b.split('>')[1] 128 | print('\t'.join([str(i) for i in [A, B, pident]]), file=sys.stderr) 129 | print('\t'.join([str(i) for i in [B, A, pident]]), file=sys.stderr) 130 | return pairs 131 | 132 | def print_pairwise(pw, median = False): 133 | """ 134 | print matrix of pidents to stdout 135 | """ 136 | names = sorted(set([i for i in pw])) 137 | if len(names) != 0: 138 | if '>' in names[0]: 139 | yield ['#'] + [i.split('>')[1] for i in names if '>' in i] 140 | else: 141 | yield ['#'] + names 142 | for a in names: 143 | if '>' in a: 144 | yield [a.split('>')[1]] + [pw[a][b] for b in names] 145 | else: 146 | out = [] 147 | for b in names: 148 | if b in pw[a]: 149 | if median is False: 150 | out.append(max(pw[a][b])) 151 | else: 152 | out.append(np.median(pw[a][b])) 153 | else: 154 | out.append('-') 155 | yield [a] + out 156 | 157 | def print_comps(comps): 158 | """ 159 | print stats for comparisons 160 | """ 161 | if comps == []: 162 | print('n/a') 163 | else: 164 | print('# min: %s, max: %s, mean: %s' % \ 165 | (min(comps), max(comps), np.mean(comps))) 166 | 167 | def compare_clades(pw): 168 | """ 169 | print min. pident within each clade and then matrix of between-clade max. 
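    (output rows are tagged 'wi:<rank>' / 'bt:<rank>' for within- and
    between-clade comparisons, where <rank> is 0-3 over the first four
    ';'-separated fields of the header)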
170 | """ 171 | names = sorted(set([i for i in pw])) 172 | for i in range(0, 4): 173 | wi, bt = {}, {} 174 | for a in names: 175 | for b in pw[a]: 176 | if ';' not in a or ';' not in b: 177 | continue 178 | pident = pw[a][b] 179 | cA, cB = a.split(';')[i], b.split(';')[i] 180 | if i == 0 and '_' in cA and '_' in cB: 181 | cA = cA.rsplit('_', 1)[1] 182 | cB = cB.rsplit('_', 1)[1] 183 | elif '>' in cA or '>' in cB: 184 | cA = cA.split('>')[1] 185 | cB = cB.split('>')[1] 186 | if cA == cB: 187 | if cA not in wi: 188 | wi[cA] = [] 189 | wi[cA].append(pident) 190 | else: 191 | if cA not in bt: 192 | bt[cA] = {} 193 | if cB not in bt[cA]: 194 | bt[cA][cB] = [] 195 | bt[cA][cB].append(pident) 196 | print('\n# min. within') 197 | for clade, pidents in list(wi.items()): 198 | print('\t'.join(['wi:%s' % str(i), clade, str(min(pidents))])) 199 | # print matrix of maximum between groups 200 | comps = [] 201 | print('\n# max. between') 202 | for comp in print_pairwise(bt): 203 | if comp is not None: 204 | print('\t'.join(['bt:%s' % str(i)] + [str(j) for j in comp])) 205 | if comp[0] != '#': 206 | comps.extend([j for j in comp[1:] if j != '-']) 207 | print_comps(comps) 208 | # print matrix of median between groups 209 | comps = [] 210 | print('\n# median between') 211 | for comp in print_pairwise(bt, median = True): 212 | if comp is not None: 213 | print('\t'.join(['bt:%s' % str(i)] + [str(j) for j in comp])) 214 | if comp[0] != '#': 215 | comps.extend([j for j in comp[1:] if j != '-']) 216 | print_comps(comps) 217 | 218 | def matrix2dictionary(matrix): 219 | """ 220 | convert matrix to dictionary of comparisons 221 | """ 222 | pw = {} 223 | for line in matrix: 224 | line = line.strip().split('\t') 225 | if line[0].startswith('#'): 226 | names = line[1:] 227 | continue 228 | a = line[0] 229 | for i, pident in enumerate(line[1:]): 230 | b = names[i] 231 | if a not in pw: 232 | pw[a] = {} 233 | if b not in pw: 234 | pw[b] = {} 235 | if pident != '-': 236 | pident = float(pident) 237 | pw[a][b] = pident 238 | pw[b][a] = pident 239 | return pw 240 | 241 | if __name__ == '__main__': 242 | parser = argparse.ArgumentParser(description = '# calculate percent identity of aligned reads') 243 | parser.add_argument(\ 244 | '-a', default = False, \ 245 | help = 'aligned fasta file') 246 | parser.add_argument(\ 247 | '-m', default = False, \ 248 | help = 'matrix of comparisons (for clade calcs.)') 249 | parser.add_argument(\ 250 | '--list', action = 'store_true', \ 251 | help = 'print list of pair-wise identities to stderr') 252 | parser.add_argument(\ 253 | '--no-matrix', action = 'store_false', \ 254 | help = 'do not print matrix') 255 | parser.add_argument(\ 256 | '--clades', action = 'store_true', \ 257 | help = 'compare clades based on header, e.g. 
>[0]Bacteria;[1]OD1;[2]unknown or >Bacteria;OD1;unknown') 258 | parser.add_argument(\ 259 | '--ignore-gaps', action = 'store_true', \ 260 | help = 'ignore gaps in alignment') 261 | parser.add_argument(\ 262 | '--leven', action = 'store_true', \ 263 | help = 'calculate Levenshtein ratio') 264 | parser.add_argument(\ 265 | '-t', default = 6, type = int,\ 266 | help = 'number of threads (default: 6)') 267 | args = vars(parser.parse_args()) 268 | afa, matrix, print_list, print_matrix, clades, ignore_gaps, leven, threads = \ 269 | args['a'], args['m'], args['list'], args['no_matrix'], \ 270 | args['clades'], args['ignore_gaps'], args['leven'], args['t'] 271 | if (afa is False and matrix is False) or (afa is not False and matrix is not False): 272 | print('# use -a or -m; -h for help', file=sys.stderr) 273 | exit() 274 | if afa is not False: 275 | if afa == '-': 276 | afa = sys.stdin 277 | else: 278 | afa = open(afa) 279 | pairwise = pairwise_compare(afa, leven, threads, print_list, 280 | ignore_gaps) 281 | if print_matrix is True: 282 | for i in print_pairwise(pairwise): 283 | print('\t'.join([str(j) for j in i])) 284 | if clades is True: 285 | compare_clades(pairwise) 286 | if matrix is not False: 287 | pairwise = matrix2dictionary(open(matrix)) 288 | compare_clades(pairwise) 289 | -------------------------------------------------------------------------------- /ctbBio/rax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | script for quickly running FastTree and raxml 5 | """ 6 | 7 | import sys 8 | import os 9 | import random 10 | import argparse 11 | from Bio import AlignIO 12 | import string as string_gen 13 | from subprocess import Popen 14 | 15 | # ctb 16 | from ctbBio.fasta import iterate_fasta as parse_fasta 17 | 18 | def check_type(fasta): 19 | nucl = ['A', 'T', 'G', 'C'] 20 | junk = ['N', 'U', '.', '-', ' '] 21 | type = 'nucl' 22 | for seq in parse_fasta(fasta): 23 | seq = seq[1].upper() 24 | for residue in seq: 25 | if residue in junk: 26 | continue 27 | if residue not in nucl: 28 | type = 'prot' 29 | break 30 | break 31 | return type 32 | 33 | def check(file): 34 | """ 35 | if a file exists, return 'True,' else, 'False' 36 | """ 37 | try: 38 | open(file) 39 | return True 40 | except (OSError, IOError) as e: 41 | return False 42 | 43 | def remove_bad(string): 44 | """ 45 | remove problem characters from string 46 | """ 47 | remove = [':', ',', '(', ')', ' ', '|', ';', '\''] 48 | for c in remove: 49 | string = string.replace(c, '_') 50 | return string 51 | 52 | def id_generator(size = 10, chars = string_gen.ascii_uppercase): 53 | return ''.join(random.choice(chars) for _ in range(size)) 54 | 55 | def get_ids(a): 56 | """ 57 | make copy of sequences with short identifier 58 | """ 59 | a_id = '%s.id.fa' % (a.rsplit('.', 1)[0]) 60 | a_id_lookup = '%s.id.lookup' % (a.rsplit('.', 1)[0]) 61 | if check(a_id) is True: 62 | return a_id, a_id_lookup 63 | a_id_f = open(a_id, 'w') 64 | a_id_lookup_f = open(a_id_lookup, 'w') 65 | ids = [] 66 | for seq in parse_fasta(open(a)): 67 | id = id_generator() 68 | while id in ids: 69 | id = id_generator() 70 | ids.append(id) 71 | header = seq[0].split('>')[1] 72 | name = remove_bad(header) 73 | seq[0] = '>%s %s' % (id, header) 74 | print('\n'.join(seq), file=a_id_f) 75 | print('%s\t%s\t%s' % (id, name, header), file=a_id_lookup_f) 76 | return a_id, a_id_lookup 77 | 78 | def convert2phylip(convert): 79 | """ 80 | convert fasta to phylip because RAxML is ridiculous 81 | """ 82 | out 
= '%s.phy' % (convert.rsplit('.', 1)[0]) 83 | if check(out) is False: 84 | convert = open(convert, 'rU') 85 | out_f = open(out, 'w') 86 | alignments = AlignIO.parse(convert, "fasta") 87 | AlignIO.write(alignments, out, "phylip") 88 | return out 89 | 90 | def run_fast(aligned, threads, cluster, node): 91 | """ 92 | run FastTree 93 | """ 94 | tree = '%s.fasttree.nwk' % (aligned.rsplit('.', 1)[0]) 95 | if check(tree) is False: 96 | if 'FastTreeV' in os.environ: 97 | ft = os.environ['FastTreeV'] 98 | os.environ['OMP_NUM_THREADS'] = str(threads) 99 | else: 100 | ft = 'FastTreeMP' 101 | os.environ['OMP_NUM_THREADS'] = str(threads) 102 | if check_type(aligned) == 'nucl': 103 | type = '-nt -gamma -spr 4 -mlacc 2 -slownni' 104 | else: 105 | type = '-spr 4 -mlacc 2 -slownni' 106 | dir = os.getcwd() 107 | command = 'cat %s/%s | cut -d \' \' -f 1 | %s -log %s/%s.log %s > %s/%s 2>>%s/%s.log' % \ 108 | (dir, aligned, ft, dir, tree, type, dir, tree, dir, tree) 109 | if cluster is False: 110 | p = Popen(command, shell = True) 111 | else: 112 | if int(threads) > 24: 113 | ppn = 24 114 | else: 115 | ppn = threads 116 | re_call = 'cd %s; %s --no-rax' % (dir.rsplit('/', 1)[0], ' '.join(sys.argv)) 117 | if node is False: 118 | node = '1' 119 | qsub = 'qsub -l nodes=%s:ppn=%s -m e -N FastTree' % (node, ppn) 120 | p = Popen('echo "%s;%s" | %s' % (command, re_call, qsub), shell = True) 121 | p.communicate() 122 | return tree 123 | 124 | def run_raxml(rax_out, boot, a_id_phylip, threads, aligned, model, cluster, node): 125 | """ 126 | run raxml 127 | """ 128 | # set ppn based on threads 129 | if threads > 24: 130 | ppn = 24 131 | else: 132 | ppn = threads 133 | if 'raxml' in os.environ: 134 | raxml = os.environ['raxml'] 135 | threads = '-T %s' % (threads) 136 | else: 137 | raxml = 'raxml' 138 | threads = '' 139 | rax_tree = 'RAxML_bipartitions.%s' % (rax_out) 140 | if check(rax_tree) is False: 141 | seed = random.randint(123456789, 12345678910000000) 142 | print(seed, file=open('seed.txt', 'w')) 143 | if check_type(aligned) == 'nucl' and model is False: 144 | model = 'GTRCAT' 145 | elif model is False: 146 | model = 'PROTCATJTT' 147 | dir = os.getcwd() 148 | command = '%s -f a -m %s -n %s -N %s -s %s -x %s -p %s %s > %s.log 2>>%s.log' % \ 149 | (raxml, model, rax_out, boot, a_id_phylip, seed, seed, threads, rax_out, rax_out) 150 | if cluster is False: 151 | p = Popen(command, shell = True) 152 | else: 153 | if node is False: 154 | node = '1' 155 | qsub = 'qsub -l nodes=%s:ppn=%s -m e -N raxml' % (node, ppn) 156 | command = 'cd /tmp; mkdir raxml_%s; cd raxml_%s; cp %s/%s .; %s; mv * %s/; rm -r ../raxml_%s' \ 157 | % (seed, seed, dir, a_id_phylip, command, dir, seed) 158 | re_call = 'cd %s; %s --no-fast' % (dir.rsplit('/', 1)[0], ' '.join(sys.argv)) 159 | p = Popen('echo "%s;%s" | %s' % (command, re_call, qsub), shell = True) 160 | p.communicate() 161 | return rax_tree 162 | 163 | def run_iqtree(phy, model, threads, cluster, node): 164 | """ 165 | run IQ-Tree 166 | """ 167 | # set ppn based on threads 168 | if threads > 24: 169 | ppn = 24 170 | else: 171 | ppn = threads 172 | tree = '%s.treefile' % (phy) 173 | if check(tree) is False: 174 | if model is False: 175 | model = 'TEST' 176 | dir = os.getcwd() 177 | command = 'iqtree-omp -s %s -m %s -nt %s -quiet' % \ 178 | (phy, model, threads) 179 | if cluster is False: 180 | p = Popen(command, shell = True) 181 | else: 182 | if node is False: 183 | node = '1' 184 | qsub = 'qsub -l nodes=%s:ppn=%s -m e -N iqtree' % (node, ppn) 185 | command = 'cd /tmp; mkdir iqtree; cd 
iqtree; cp %s/%s .; %s; mv * %s/; rm -r ../iqtree' \ 186 | % (dir, phy, command, dir) 187 | re_call = 'cd %s; %s --no-fast --iq' % (dir.rsplit('/', 1)[0], ' '.join(sys.argv)) 188 | p = Popen('echo "%s;%s" | %s' % (command, re_call, qsub), shell = True) 189 | p.communicate() 190 | return tree 191 | 192 | def fix_tree(tree, a_id_lookup, out): 193 | """ 194 | get the names for sequences in the raxml tree 195 | """ 196 | if check(out) is False and check(tree) is True: 197 | tree = open(tree).read() 198 | for line in open(a_id_lookup): 199 | id, name, header = line.strip().split('\t') 200 | tree = tree.replace(id+':', name+':') 201 | out_f = open(out, 'w') 202 | print(tree.strip(), file=out_f) 203 | return out 204 | 205 | def rax(a, boot, threads, \ 206 | fast = False, run_rax = False, run_iq = False, model = False, cluster = False, node = False): 207 | """ 208 | run raxml on 'a' (alignment) with 'boot' (bootstraps) and 'threads' (threads) 209 | store all files in raxml_a_b 210 | 1. give every sequence a short identifier 211 | 2. convert fasta to phylip 212 | 3. run raxml 213 | 4. convert ids in raxml tree to original names 214 | """ 215 | a = os.path.abspath(a) 216 | a_base = a.rsplit('/', 1)[1] 217 | out_dir = '%s/%s_rax_boots_%s' % \ 218 | (a.rsplit('/', 1)[0], a_base.rsplit('.', 1)[0], boot) 219 | os.system('mkdir -p %s' % (out_dir)) 220 | os.system('ln -sf %s %s/%s' % (os.path.abspath(a), out_dir, a.rsplit('/', 1)[1])) 221 | os.chdir(out_dir) 222 | a_id, a_id_lookup = get_ids(a_base) 223 | a_id_phylip = convert2phylip(a_id) 224 | rax_out = '%s.raxml.txt' % (a_id_phylip) 225 | if fast is True: 226 | final_fast = '%s.fasttree.tree' % (a_id_lookup.rsplit('.', 2)[0]) 227 | fast_tree = run_fast(a_id, threads, cluster, node) 228 | good_fast = fix_tree(fast_tree, a_id_lookup, final_fast) 229 | yield '%s/%s' % (out_dir, final_fast) 230 | # run IQ-Tree or RAxML 231 | if run_iq is True: 232 | final_iq = '%s.iq.tree' % (a_id_lookup.rsplit('.', 2)[0]) 233 | iq_out = '%s.iq.out' % (a_id_phylip) 234 | iq_tree = run_iqtree(a_id_phylip, model, threads, cluster, node) 235 | good_tree = fix_tree(iq_tree, a_id_lookup, final_iq) 236 | yield '%s/%s' % (out_dir, final_iq) 237 | elif run_rax is True: 238 | final_rax = '%s.raxml.tree' % (a_id_lookup.rsplit('.', 2)[0]) 239 | rax_tree = run_raxml(rax_out, boot, a_id_phylip, threads, a_id, model, cluster, node) 240 | good_tree = fix_tree(rax_tree, a_id_lookup, final_rax) 241 | yield '%s/%s' % (out_dir, final_rax) 242 | 243 | if __name__ == '__main__': 244 | parser = argparse.ArgumentParser(description = \ 245 | '# run RAxML and FastTree on aligned fasta file') 246 | parser.add_argument(\ 247 | '-a', required = True, \ 248 | help = 'aligned fasta file') 249 | parser.add_argument(\ 250 | '-b', default = 100, required = False, \ 251 | help = 'bootstraps (default: 100)') 252 | parser.add_argument(\ 253 | '-t', default = 6, required = False, type = int, \ 254 | help = 'threads (default: 6)') 255 | parser.add_argument(\ 256 | '-m', default = False, required = False, \ 257 | help = 'model (only for RAxML, default: GTRCAT/PROTCATJTT)') 258 | parser.add_argument(\ 259 | '--no-fast', action = 'store_false', required = False, \ 260 | help = 'do not run FastTree') 261 | parser.add_argument(\ 262 | '--no-rax', action = 'store_false', required = False, \ 263 | help = 'do not run RAxML') 264 | parser.add_argument(\ 265 | '--iq', action = 'store_true', required = False, \ 266 | help = 'run IQ-Tree instead of RAxML (auto model unless specified)') 267 | parser.add_argument(\ 268 | 
'--cluster', action = 'store_true', required = False, \
269 | help = 'run on cluster')
270 | parser.add_argument(\
271 | '-node', default = False, required = False, \
272 | help = 'name of cluster node (optional: for use with --cluster and -t )')
273 | args = vars(parser.parse_args())
274 | alignment, bootstraps, threads, \
275 | fasttree, run_rax, run_iq, model, cluster, node = \
276 | args['a'], args['b'], args['t'], args['no_fast'], \
277 | args['no_rax'], args['iq'], args['m'], args['cluster'], args['node']
278 | if cluster is False and node is not False:
279 | print('# use --cluster with -node', file=sys.stderr)
280 | exit()
281 | if cluster is True:
282 | if node is False:
283 | threads = 24
284 | else:
285 | threads = args['t']
286 | [i for i in rax(alignment, bootstraps, threads, fasttree, run_rax, run_iq, model, cluster, node)]
287 |
--------------------------------------------------------------------------------
/ctbBio/besthits.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | filter specified number of hits from a blast or HMM search
5 | that pass evalue and bit score thresholds
6 |
7 | note: file must be sorted by query ID (and domain # for domtblout),
8 | but does not have to be sorted by significance
9 | """
10 |
11 | import os
12 | import sys
13 | import argparse
14 | import pandas as pd
15 | from operator import itemgetter
16 |
17 | def top_hits(hits, num, column, reverse):
18 | """
19 | get top hits after sorting by column number
20 | """
21 | hits.sort(key = itemgetter(column), reverse = reverse)
22 | for hit in hits[0:num]:
23 | yield hit
24 |
25 | def numBlast_sort(blast, numHits, evalueT, bitT):
26 | """
27 | parse b6 output with sorting
28 | """
29 | header = ['#query', 'target', 'pident', 'alen', 'mismatch', 'gapopen',
30 | 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bitscore']
31 | yield header
32 | hmm = {h:[] for h in header}
33 | for line in blast:
34 | if line.startswith('#'):
35 | continue
36 | line = line.strip().split('\t')
37 | # Evalue and Bitscore thresholds
38 | line[10], line[11] = float(line[10]), float(line[11])
39 | evalue, bit = line[10], line[11]
40 | if evalueT is not False and evalue > evalueT:
41 | continue
42 | if bitT is not False and bit < bitT:
43 | continue
44 | for i, h in zip(line, header):
45 | hmm[h].append(i)
46 | hmm = pd.DataFrame(hmm)
47 | for query, df in hmm.groupby(by = ['#query']):
48 | df = df.sort_values(by = ['bitscore'], ascending = False)
49 | for hit in df[header].values[0:numHits]:
50 | yield hit
51 |
52 | def numBlast(blast, numHits, evalueT = False, bitT = False, sort = False):
53 | """
54 | parse b6 output
55 | """
56 | if sort is True:
57 | for hit in numBlast_sort(blast, numHits, evalueT, bitT):
58 | yield hit
59 | return
60 | header = ['#query', 'target', 'pident', 'alen', 'mismatch', 'gapopen',
61 | 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bitscore']
62 | yield header
63 | prev, hits = None, []
64 | for line in blast:
65 | line = line.strip().split('\t')
66 | ID = line[0]
67 | line[10], line[11] = float(line[10]), float(line[11])
68 | evalue, bit = line[10], line[11]
69 | if ID != prev:
70 | if len(hits) > 0:
71 | # column 11 is the bitscore field (0-based index)
72 | for hit in top_hits(hits, numHits, 11, True):
73 | yield hit
74 | hits = []
75 | if evalueT == False and bitT == False:
76 | hits.append(line)
77 | elif evalue <= evalueT and bitT == False:
78 | hits.append(line)
79 | elif evalue <= evalueT and bit >= bitT:
80 | hits.append(line)
81 |
elif evalueT == False and bit >= bitT:
82 | hits.append(line)
83 | prev = ID
84 | for hit in top_hits(hits, numHits, 11, True):
85 | yield hit
86 |
87 | def numDomtblout_sort(domtblout, numHits, evalueT, bitT):
88 | """
89 | parse hmm domain table output with sorting
90 | """
91 | header = ['#target name', 'target accession', 'tlen',
92 | 'query name', 'query accession', 'qlen',
93 | 'full E-value', 'full score', 'full bias',
94 | 'domain #', '# domains',
95 | 'domain c-Evalue', 'domain i-Evalue', 'domain score', 'domain bias',
96 | 'hmm from', 'hmm to', 'seq from', 'seq to', 'env from', 'env to',
97 | 'acc', 'target description']
98 | yield header
99 | hmm = {h:[] for h in header}
100 | for line in domtblout:
101 | if line.startswith('#'):
102 | continue
103 | line = line.strip().split()
104 | # domain c-Evalue and domain score thresholds
105 | line[11], line[13] = float(line[11]), float(line[13])
106 | evalue, bit = line[11], line[13]
107 | if evalueT is not False and evalue > evalueT:
108 | continue
109 | if bitT is not False and bit < bitT:
110 | continue
111 | line, desc = line[0:22], ' '.join(line[22:])
112 | line.append(desc)
113 | for i, h in zip(line, header):
114 | hmm[h].append(i)
115 | hmm = pd.DataFrame(hmm)
116 | for query, df in hmm.groupby(by = ['#target name', 'domain #']):
117 | df = df.sort_values(by = ['domain score'], ascending = False)
118 | for hit in df[header].values[0:numHits]:
119 | yield hit
120 |
121 | def numDomtblout(domtblout, numHits, evalueT, bitT, sort):
122 | """
123 | parse hmm domain table output
124 | this version is faster but does not work unless the table is sorted
125 | """
126 | if sort is True:
127 | for hit in numDomtblout_sort(domtblout, numHits, evalueT, bitT):
128 | yield hit
129 | return
130 | header = ['#target name', 'target accession', 'tlen',
131 | 'query name', 'query accession', 'qlen',
132 | 'full E-value', 'full score', 'full bias',
133 | 'domain #', '# domains',
134 | 'domain c-Evalue', 'domain i-Evalue', 'domain score', 'domain bias',
135 | 'hmm from', 'hmm to', 'seq from', 'seq to', 'env from', 'env to',
136 | 'acc', 'target description']
137 | yield header
138 | prev, hits = None, []
139 | for line in domtblout:
140 | if line.startswith('#'):
141 | continue
142 | # parse line and get description
143 | line = line.strip().split()
144 | desc = ' '.join(line[22:])
145 | line = line[0:22]
146 | line.append(desc)
147 | # create ID based on target name and domain number
148 | ID = line[0] + line[9]
149 | # domain c-Evalue and domain score thresholds
150 | line[11], line[13] = float(line[11]), float(line[13])
151 | evalue, bit = line[11], line[13]
152 | line[11], line[13] = evalue, bit
153 | if ID != prev:
154 | if len(hits) > 0:
155 | for hit in top_hits(hits, numHits, 13, True):
156 | yield hit
157 | hits = []
158 | if evalueT == False and bitT == False:
159 | hits.append(line)
160 | elif evalue <= evalueT and bitT == False:
161 | hits.append(line)
162 | elif evalue <= evalueT and bit >= bitT:
163 | hits.append(line)
164 | elif evalueT == False and bit >= bitT:
165 | hits.append(line)
166 | prev = ID
167 | for hit in top_hits(hits, numHits, 13, True):
168 | yield hit
169 |
170 | def numTblout_sort(tblout, numHits, evalueT, bitT):
171 | """
172 | parse hmm table output with sorting
173 | """
174 | header = ['#target name', 'target accession',
175 | 'query name', 'query accession',
176 | 'full E-value', 'full score', 'full bias',
177 | 'best E-value', 'best score', 'best bias',
178 | 'exp', 'reg', 'clu', 'ov', 'env',
'dom', 'rep', 'inc',
179 | 'description of target']
180 | yield header
181 | hmm = {h:[] for h in header}
182 | for line in tblout:
183 | if line.startswith('#'):
184 | continue
185 | line = line.strip().split()
186 | # full E-value and full score thresholds
187 | line[4], line[5] = float(line[4]), float(line[5])
188 | evalue, bit = line[4], line[5]
189 | if evalueT is not False and evalue > evalueT:
190 | continue
191 | if bitT is not False and bit < bitT:
192 | continue
193 | line, desc = line[0:18], ' '.join(line[18:])
194 | line.append(desc)
195 | for i, h in zip(line, header):
196 | hmm[h].append(i)
197 | hmm = pd.DataFrame(hmm)
198 | for query, df in hmm.groupby(by = ['#target name']):
199 | df = df.sort_values(by = ['full score'], ascending = False)
200 | for hit in df[header].values[0:numHits]:
201 | yield hit
202 |
203 | def numTblout(tblout, numHits, evalueT, bitT, sort):
204 | """
205 | parse hmm table output
206 | this version is faster but does not work unless the table is sorted
207 | """
208 | if sort is True:
209 | for hit in numTblout_sort(tblout, numHits, evalueT, bitT):
210 | yield hit
211 | return
212 | header = ['#target name', 'target accession',
213 | 'query name', 'query accession',
214 | 'full E-value', 'full score', 'full bias',
215 | 'best E-value', 'best score', 'best bias',
216 | 'exp', 'reg', 'clu', 'ov', 'env', 'dom', 'rep', 'inc',
217 | 'description of target']
218 | yield header
219 | prev, hits = None, []
220 | for line in tblout:
221 | if line.startswith('#'):
222 | continue
223 | # parse line and get description
224 | line = line.strip().split()
225 | desc = ' '.join(line[18:])
226 | line = line[0:18]
227 | line.append(desc)
228 | # ID and scores
229 | ID = line[0]
230 | line[4], line[5] = float(line[4]), float(line[5])
231 | evalue, bit = line[4], line[5]
232 | line[4], line[5] = evalue, bit
233 | if ID != prev:
234 | if len(hits) > 0:
235 | for hit in top_hits(hits, numHits, 5, True):
236 | yield hit
237 | hits = []
238 | if evalueT == False and bitT == False:
239 | hits.append(line)
240 | elif evalue <= evalueT and bitT == False:
241 | hits.append(line)
242 | elif evalue <= evalueT and bit >= bitT:
243 | hits.append(line)
244 | elif evalueT == False and bit >= bitT:
245 | hits.append(line)
246 | prev = ID
247 | for hit in top_hits(hits, numHits, 5, True):
248 | yield hit
249 |
250 | if __name__ == '__main__':
251 | parser = argparse.ArgumentParser(description = '# filter blast or HMM tab output')
252 | parser.add_argument(\
253 | '-i', default = '-', \
254 | help = 'path to search results (sorted by query; default = stdin)')
255 | parser.add_argument(\
256 | '-n', default = 1, type = int, \
257 | help = 'number of hits (default = 1)')
258 | parser.add_argument(\
259 | '-e', default = False, type = float, \
260 | help = 'e-value threshold (default = None)')
261 | parser.add_argument(\
262 | '-b', default = False, type = float, \
263 | help = 'bit score threshold (default = None)')
264 | parser.add_argument(\
265 | '-f', default = 'b6', type = str,\
266 | help = 'format (default = b6, options: b6, domtblout, tblout)')
267 | parser.add_argument(\
268 | '--sort', action = 'store_true', \
269 | help = 'sort hits by query name (use if input is not already sorted by query)')
270 | args = vars(parser.parse_args())
271 | # check if file is from stdin
272 | if args['i'] == '-':
273 | args['i'] = sys.stdin
274 | else:
275 | args['i'] = open(args['i'])
276 | if args['f'] == 'b6':
277 | for hit in numBlast(args['i'], args['n'], args['e'], args['b'],
278 |
args['sort']):
279 | print('\t'.join([str(i) for i in hit]))
280 | elif args['f'] == 'domtblout':
281 | for hit in numDomtblout(args['i'], args['n'], args['e'], args['b'],
282 | args['sort']):
283 | print('\t'.join([str(i) for i in hit]))
284 | elif args['f'] == 'tblout':
285 | for hit in numTblout(args['i'], args['n'], args['e'], args['b'],
286 | args['sort']):
287 | print('\t'.join([str(i) for i in hit]))
288 | else:
289 | print('unsupported format:', args['f'])
290 |
--------------------------------------------------------------------------------
/ctbBio/cluster_ani.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | script for clustering genomes based on average nucleotide
5 | identity
6 | """
7 | import os
8 | import sys
9 | import argparse
10 | import subprocess
11 | from networkx import Graph as Graph
12 | from networkx import connected_components as connected_components
13 |
14 | # ctb
15 | from ctbBio.fasta import iterate_fasta as parse_fasta
16 |
17 | def make_mashes(fastas, mash_file, threads, kmer = 21, force = False):
18 | """
19 | Create mash files for multiple fasta files
20 | Input:
21 | fastas -- paths to fasta files
22 | mash_file -- path to output mash file
23 | threads -- # threads for parallelization
24 | kmer -- kmer size for mash sketching
25 | force -- force overwrite of all mash files
26 | """
27 | mash_processes = set()
28 | sketches = [fasta + '.msh' for fasta in fastas]
29 | devnull = open(os.devnull, 'w')
30 | # Perform the sketching
31 | for fasta, sketch in zip(fastas, sketches):
32 | if os.path.isfile(sketch):
33 | continue
34 | mash_cmd = ['/opt/bin/bio/mash', 'sketch', '-o', fasta, '-k', str(kmer), fasta]
35 | mash_processes.add(subprocess.Popen(mash_cmd, stderr=devnull))
36 | if len(mash_processes) >= threads:
37 | os.wait()
38 | mash_processes.difference_update([mp for mp in mash_processes if mp.poll() is not None])
39 | # Collect stragglers
40 | for mp in mash_processes:
41 | if mp.poll() is None:
42 | mp.wait()
43 | # Paste sketches into single mash
44 | paste_mashes(sketches, mash_file, force = force)
45 | return
46 |
47 | def paste_mashes(sketches, pasted_mash, force = False):
48 | """
49 | Combine mash files into single sketch
50 | Input:
51 | sketches -- paths to sketch files
52 | pasted_mash -- path to output mash file
53 | force -- force overwrite of all mash file
54 | """
55 | if os.path.isfile(pasted_mash):
56 | if force:
57 | subprocess.Popen(['rm', pasted_mash]).wait()
58 | else:
59 | return
60 | pasted_mash = pasted_mash.rsplit('.msh')[0]
61 | mash_cmd = ['/opt/bin/bio/mash', 'paste', pasted_mash]
62 | mash_cmd.extend(sketches)
63 | process = subprocess.Popen(mash_cmd)
64 | process.wait()
65 | return
66 |
67 | def ani(fastas, mash_file, sim_threshold, threads):
68 | """
69 | Use mash to estimate ANI of genomes
70 | Input:
71 | fastas -- paths to fasta files
72 | sim_threshold -- cutoff % identity for cluster joining
73 | mash_file -- pasted sketch file of all fastas being compared
74 | threads -- number threads for distance estimation
75 | """
76 | ANI = Graph()
77 | # use Mash to estimate ANI
78 | for fasta in fastas:
79 | indiv_mash = fasta + '.msh'
80 | if os.path.isfile(indiv_mash):
81 | cmp_file = indiv_mash
82 | else:
83 | cmp_file = fasta
84 | mash_cmd = ['/opt/bin/bio/mash', 'dist', cmp_file, mash_file]
85 | process = subprocess.Popen(mash_cmd, stdout = subprocess.PIPE)
86 | for pair in process.communicate()[0].splitlines():
87 | a, b, dist, p,
shared = pair.decode().strip().split()
88 | a = a.rsplit('.', 1)[0].rsplit('/', 1)[-1].rsplit('.contigs')[0]
89 | b = b.rsplit('.', 1)[0].rsplit('/', 1)[-1].rsplit('.contigs')[0]
90 | p = float(p)
91 | similarity = (1 - float(dist)) * 100
92 | if similarity >= sim_threshold:
93 | ANI.add_edge(a, b, si = similarity, pval = p, sharedK = shared)
94 | process.wait()
95 | return ANI
96 |
97 | def genome_info(genome, info):
98 | """
99 | return genome info for choosing representative
100 |
101 | if ggKbase table provided - choose rep based on SCGs and genome length
102 | - priority for most SCGs (minus duplicates), then largest genome
103 |
104 | otherwise, based on largest genome
105 | """
106 | try:
107 | scg = info['#SCGs']
108 | dups = info['#SCG duplicates']
109 | length = info['genome size (bp)']
110 | return [scg - dups, length, genome]
111 | except:
112 | return [False, info['genome size (bp)'], genome]
113 |
114 | def print_clusters(fastas, info, ANI):
115 | """
116 | choose representative genome and
117 | print cluster information
118 |
119 | *if ggKbase table is provided, use SCG info to choose best genome
120 | """
121 | header = ['#cluster', 'num. genomes', 'rep.', 'genome', '#SCGs', '#SCG duplicates', \
122 | 'genome size (bp)', 'fragments', 'list']
123 | yield header
124 | in_cluster = []
125 | for cluster_num, cluster in enumerate(connected_components(ANI)):
126 | cluster = sorted([genome_info(genome, info[genome]) \
127 | for genome in cluster], \
128 | key = lambda x: x[0:], reverse = True)
129 | rep = cluster[0][-1]
130 | cluster = [i[-1] for i in cluster]
131 | size = len(cluster)
132 | for genome in cluster:
133 | in_cluster.append(genome)
134 | try:
135 | stats = [size, rep, genome, \
136 | info[genome]['#SCGs'], info[genome]['#SCG duplicates'], \
137 | info[genome]['genome size (bp)'], info[genome]['# contigs'], cluster]
138 | except:
139 | stats = [size, rep, genome, \
140 | 'n/a', 'n/a', \
141 | info[genome]['genome size (bp)'], info[genome]['# contigs'], cluster]
142 | if rep == genome:
143 | stats = ['*%s' % (cluster_num)] + stats
144 | else:
145 | stats = [cluster_num] + stats
146 | yield stats
147 | # print singletons
148 | try:
149 | start = cluster_num + 1
150 | except:
151 | start = 0
152 | fastas = set([i.rsplit('.', 1)[0].rsplit('/', 1)[-1].rsplit('.contigs')[0] for i in fastas])
153 | for cluster_num, genome in \
154 | enumerate(fastas.difference(set(in_cluster)), start):
155 | try:
156 | stats = ['*%s' % (cluster_num), 1, genome, genome, \
157 | info[genome]['#SCGs'], info[genome]['#SCG duplicates'], \
158 | info[genome]['genome size (bp)'], info[genome]['# contigs'], [genome]]
159 | except:
160 | stats = ['*%s' % (cluster_num), 1, genome, genome, \
161 | 'n/a', 'n/a', \
162 | info[genome]['genome size (bp)'], info[genome]['# contigs'], [genome]]
163 | yield stats
164 |
165 | def to_int(i):
166 | """
167 | convert to integer, if possible
168 | """
169 | try:
170 | return int(i)
171 | except:
172 | return i
173 |
174 | def parse_ggKbase_tables(tables, id_type):
175 | """
176 | convert ggKbase genome info tables to dictionary
177 | """
178 | g2info = {}
179 | for table in tables:
180 | for line in open(table):
181 | line = line.strip().split('\t')
182 | if line[0].startswith('name'):
183 | header = line
184 | header[4] = 'genome size (bp)'
185 | header[12] = '#SCGs'
186 | header[13] = '#SCG duplicates'
187 | continue
188 | name, code, info = line[0], line[1], line
189 | info = [to_int(i) for i in info]
190 | if id_type is False: # try to use name and code ID
191
| if 'UNK' in code or 'unknown' in code:
192 | code = name
193 | if (name != code) and (name in g2info or code in g2info):
194 | print('# duplicate name or code in table(s)', file=sys.stderr)
195 | print('# %s and/or %s' % (name, code), file=sys.stderr)
196 | exit()
197 | if name not in g2info:
198 | g2info[name] = {item:stat for item, stat in zip(header, info)}
199 | if code not in g2info:
200 | g2info[code] = {item:stat for item, stat in zip(header, info)}
201 | else:
202 | if id_type == 'name':
203 | ID = name
204 | elif id_type == 'code':
205 | ID = code
206 | else:
207 | print('# specify name or code column using -id', file=sys.stderr)
208 | exit()
209 | ID = ID.replace(' ', '')
210 | g2info[ID] = {item:stat for item, stat in zip(header, info)}
211 | if g2info[ID]['genome size (bp)'] == '':
212 | g2info[ID]['genome size (bp)'] = 0
213 | return g2info
214 |
215 | def parse_checkM_tables(tables):
216 | """
217 | convert checkM genome info tables to dictionary
218 | """
219 | g2info = {}
220 | for table in tables:
221 | for line in open(table):
222 | line = line.strip().split('\t')
223 | if line[0].startswith('Bin Id'):
224 | header = line
225 | header[8] = 'genome size (bp)'
226 | header[5] = '#SCGs'
227 | header[6] = '#SCG duplicates'
228 | continue
229 | ID, info = line[0], line
230 | info = [to_int(i) for i in info]
231 | ID = ID.replace(' ', '')
232 | g2info[ID] = {item:stat for item, stat in zip(header, info)}
233 | if g2info[ID]['genome size (bp)'] == '':
234 | g2info[ID]['genome size (bp)'] = 0
235 | return g2info
236 |
237 | def genome_lengths(fastas, info):
238 | """
239 | get genome lengths
240 | """
241 | if info is False:
242 | info = {}
243 | for genome in fastas:
244 | name = genome.rsplit('.', 1)[0].rsplit('/', 1)[-1].rsplit('.contigs')[0]
245 | if name in info:
246 | continue
247 | length = 0
248 | fragments = 0
249 | for seq in parse_fasta(genome):
250 | length += len(seq[1])
251 | fragments += 1
252 | info[name] = {'genome size (bp)':length, '# contigs':fragments}
253 | return info
254 |
255 | if __name__ == '__main__':
256 | parser = argparse.ArgumentParser(description = \
257 | '# cluster genomes based on average nucleotide identity (ani)')
258 | parser.add_argument(\
259 | '-f', nargs = '*', action = 'store', required = True, \
260 | help = 'fastas')
261 | parser.add_argument(\
262 | '-m', action = 'store', required = True, type = str, \
263 | help = 'mash file (will be created if it does not exist)')
264 | parser.add_argument(\
265 | '-s', default = 98, type = float, required = False, \
266 | help = 'percent similarity (default = 98)')
267 | parser.add_argument(\
268 | '-g', nargs = '*', action = 'store', required = False, \
269 | default = False, \
270 | help = 'ggKbase genome table for selecting representative (optional)')
271 | parser.add_argument(\
272 | '-c', nargs = '*', action = 'store', required = False, \
273 | default = False, \
274 | help = 'checkM genome table for selecting representative (optional)')
275 | parser.add_argument(\
276 | '-id', default = False, \
277 | help = 'use name or code column in ggKbase table (default: try both)')
278 | parser.add_argument(\
279 | '-t', required = False, default = 6, type = int, \
280 | help = 'threads (default = 6)')
281 | args = vars(parser.parse_args())
282 | fastas, similarity, id_type, threads, mash_file = \
283 | args['f'], args['s'], args['id'], args['t'], args['m']
284 | gg, cm = args['g'], args['c']
285 | if '.msh' not in mash_file:
286 | mash_file = '%s.msh' % (mash_file)
287 | info = False # assume no marker gene
file is given (either ggKbase or checkM)
288 | if gg is not False:
289 | info = parse_ggKbase_tables(gg, id_type)
290 | elif cm is not False:
291 | info = parse_checkM_tables(cm)
292 | info = genome_lengths(fastas, info)
293 | make_mashes(fastas, mash_file, threads)
294 | ANI = ani(fastas, mash_file, similarity, threads)
295 | for genome in print_clusters(fastas, info, ANI):
296 | print('\t'.join([str(i) for i in genome]))
297 |
--------------------------------------------------------------------------------
/ctbBio/23SfromHMM.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | """
4 | script for 1) finding 23S rRNA gene sequences from contigs based on ssu-cmsearch or cmsearch tabular output and 2)
5 | identifying 23S rRNA gene insertions
6 |
7 | ctb@berkeley.edu
8 | """
9 |
10 | # python modules
11 | import sys
12 | import os
13 | from operator import itemgetter
14 | import argparse
15 | from subprocess import Popen
16 |
17 | # ctb modules
18 | from ctbBio.fasta import iterate_fasta as parse_fasta
19 | from ctbBio.rc import reverse_complement as rc
20 |
21 | def best_model(seq2hmm):
22 | """
23 | determine the best model: archaea, bacteria, eukarya (best score)
24 | """
25 | for seq in seq2hmm:
26 | best = []
27 | for model in seq2hmm[seq]:
28 | best.append([model, sorted([i[-1] for i in seq2hmm[seq][model]], reverse = True)[0]])
29 | best_model = sorted(best, key = itemgetter(1), reverse = True)[0][0]
30 | seq2hmm[seq] = [best_model] + [seq2hmm[seq][best_model]]
31 | return seq2hmm
32 |
33 | def check_gaps(matches, gap_threshold = 0):
34 | """
35 | check for large gaps between alignment windows
36 | """
37 | gaps = []
38 | prev = None
39 | for match in sorted(matches, key = itemgetter(0)):
40 | if prev is None:
41 | prev = match
42 | continue
43 | if match[0] - prev[1] >= gap_threshold:
44 | gaps.append([prev, match])
45 | prev = match
46 | return [[i[0][1], i[1][0]] for i in gaps]
47 |
48 | def get_overlap(a, b):
49 | return max(0, min(a[1], b[1]) - max(a[0], b[0]))
50 |
51 | def check_overlap(current, hit, overlap = 200):
52 | """
53 | determine if sequence has already hit the same part of the model,
54 | indicating that this hit is for another 23S rRNA gene
55 | """
56 | for prev in current:
57 | p_coords = prev[2:4]
58 | coords = hit[2:4]
59 | if get_overlap(coords, p_coords) >= overlap:
60 | return True
61 | return False
62 |
63 | def check_order(current, hit, overlap = 200):
64 | """
65 | determine if hits are sequential on model and on the
66 | same strand
67 | * if not, they should be split into different groups
68 | """
69 | prev_model = current[-1][2:4]
70 | prev_strand = current[-1][-2]
71 | hit_model = hit[2:4]
72 | hit_strand = hit[-2]
73 | # make sure they are on the same strand
74 | if prev_strand != hit_strand:
75 | return False
76 | # check for sequential hits on + strand
77 | if prev_strand == '+' and (prev_model[1] - hit_model[0] >= overlap):
78 | return False
79 | # check for sequential hits on - strand
80 | if prev_strand == '-' and (hit_model[1] - prev_model[0] >= overlap):
81 | return False
82 | else:
83 | return True
84 |
85 | def hit_groups(hits):
86 | """
87 | * each sequence may have more than one 23S rRNA gene
88 | * group hits for each gene
89 | """
90 | groups = []
91 | current = False
92 | for hit in sorted(hits, key = itemgetter(0)):
93 | if current is False:
94 | current = [hit]
95 | elif check_overlap(current, hit) is True or check_order(current, hit) is False:
96 |
groups.append(current)
97 | current = [hit]
98 | else:
99 | current.append(hit)
100 | groups.append(current)
101 | return groups
102 |
103 | def find_coordinates(hmms, bit_thresh):
104 | """
105 | find 23S rRNA gene sequence coordinates
106 | """
107 | # get coordinates from cmsearch output
108 | seq2hmm = parse_hmm(hmms, bit_thresh)
109 | seq2hmm = best_model(seq2hmm)
110 | group2hmm = {} # group2hmm[seq][group] = [model, strand, coordinates, matches, gaps]
111 | for seq, info in list(seq2hmm.items()):
112 | group2hmm[seq] = {}
113 | # info = [model, [hit1], [hit2], ...]
114 | for group_num, group in enumerate(hit_groups(info[1])):
115 | # group is a group of hits to a single 23S gene
116 | # determine matching strand based on best hit
117 | best = sorted(group, reverse = True, key = itemgetter(-1))[0]
118 | strand = best[5]
119 | coordinates = [i[0] for i in group] + [i[1] for i in group]
120 | coordinates = [min(coordinates), max(coordinates), strand]
121 | # make sure all hits are to the same strand
122 | matches = [i for i in group if i[5] == strand]
123 | # gaps = [[gstart, gend], [gstart2, gend2]]
124 | gaps = check_gaps(matches)
125 | group2hmm[seq][group_num] = [info[0], strand, coordinates, matches, gaps]
126 | return group2hmm
127 |
128 | def get_info(line, bit_thresh):
129 | """
130 | get info from either ssu-cmsearch or cmsearch output
131 | """
132 | if len(line) >= 18: # output is from cmsearch
133 | id, model, bit, inc = line[0].split()[0], line[2], float(line[14]), line[16]
134 | sstart, send, strand = int(line[7]), int(line[8]), line[9]
135 | mstart, mend = int(line[5]), int(line[6])
136 | elif len(line) == 9: # output is from ssu-cmsearch
137 | if bit_thresh == 0:
138 | print('# ssu-cmsearch does not include a model-specific inclusion threshold, ', file=sys.stderr)
139 | print('# please specify a bit score threshold', file=sys.stderr)
140 | exit()
141 | id, model, bit = line[1].split()[0], line[0], float(line[6])
142 | inc = '!'
# this is not a feature of ssu-cmsearch
143 | sstart, send = int(line[2]), int(line[3])
144 | mstart, mend = int(line[4]), int(line[5])
145 | if send >= sstart:
146 | strand = '+'
147 | else:
148 | strand = '-'
149 | else:
150 | print('# unsupported hmm format:', file=sys.stderr)
151 | print('# only tabular output from ssu-cmsearch and cmsearch is supported', file=sys.stderr)
152 | exit()
153 | coords = [sstart, send]
154 | sstart, send = min(coords), max(coords)
155 | mcoords = [mstart, mend]
156 | mstart, mend = min(mcoords), max(mcoords)
157 | return id, model, bit, sstart, send, mstart, mend, strand, inc
158 |
159 | def parse_hmm(hmms, bit_thresh):
160 | seq2hmm = {}
161 | for hmm in hmms:
162 | for line in hmm:
163 | if line.startswith('#'):
164 | continue
165 | line = line.strip().split()
166 | id, model, bit, sstart, send, mstart, mend, strand, inc = get_info(line, bit_thresh)
167 | if bit >= bit_thresh and inc == '!':
168 | if id not in seq2hmm:
169 | seq2hmm[id] = {}
170 | if model not in seq2hmm[id]:
171 | seq2hmm[id][model] = []
172 | length = abs(sstart - send) + 1
173 | seq2hmm[id][model].append([sstart, send, mstart, mend, length, strand, bit])
174 | return seq2hmm
175 |
176 | def mask_sequence(seq, gaps):
177 | """
178 | mask (make lower case) regions of sequence found in gaps between model alignments
179 | """
180 | seq = [i.upper() for i in seq]
181 | for gap in gaps:
182 | for i in range(gap[0] - 1, gap[1]):
183 | seq[i] = seq[i].lower()
184 | return ''.join(seq)
185 |
186 | def check_buffer(coords, length, buffer):
187 | """
188 | check to see how much of the buffer is being used
189 | """
190 | s = min(coords[0], buffer)
191 | e = min(length - coords[1], buffer)
192 | return [s, e]
193 |
194 | def find_23S(fastas, hmms, bit_thresh = float(20), length_thresh = 500, masking = True, buffer = 0):
195 | """
196 | 1) parse hmm output into dictionary (sequence must pass bit_thresh and inc == '!')
197 | seq2hmm[seq] = {model: [sstart, ssend, length, strand, score]}
198 | 2) determine which model (archaea, bacteria, eukarya) the sequence most closely matches
199 | seq2hmm[seq] = [[model, sstart, send, length, strand, score], [model2, sstart2, send2, length2, strand2, score2], ...]
200 | 3) identify regions that match to 23S (for best model)
201 | 4) mask internal regions that do not align to model
202 | 5) length threshold applies to aligned regions of 23S sequence
203 | 6) export 23S sequence based on complete gene (including masked insertions)
204 | """
205 | # identify start/stop positions
206 | # group2hmm[seq][group] = [model, strand, coordinates, matches, gaps]
207 | group2hmm = find_coordinates(hmms, bit_thresh)
208 | # get sequences from fasta file
209 | for fasta in fastas:
210 | for seq in parse_fasta(fasta):
211 | id = seq[0].split('>')[1].split()[0]
212 | if id not in group2hmm:
213 | continue
214 | seq[1] = seq[1].upper()
215 | count = 0 # how many 23S genes are there on the contig?
216 | for group, info in list(group2hmm[id].items()):
217 | model, strand, coords, matches, gaps = info
218 | # count insertion bases (ib) from gaps
219 | ib = sum([i[1] - i[0] + 1 for i in gaps])
220 | # calculate length of non-insertion regions (don't include buffer)
221 | tl = coords[1] - coords[0] + 1
222 | length = tl - ib
223 | if length < length_thresh:
224 | continue
225 | # count sequence
226 | count += 1
227 | # set retrieval coords based on buffer
228 | ret_coords = [max([coords[0] - buffer, 1]), \
229 | min([coords[1] + buffer, len(seq[1])]), coords[2]]
230 | buffer_ends = check_buffer(coords, len(seq[1]), buffer)
231 | # mask insertion sequences
232 | if masking is True:
233 | seq[1] = mask_sequence(seq[1], gaps)
234 | S = seq[1][(ret_coords[0] - 1):(ret_coords[1])]
235 | inserts = [gap[1] - gap[0] + 1 for gap in gaps]
236 | inserts.append('end')
237 | model_pos = ';'.join(['%s-%s(%s)' % (match[2], match[3], insert) for match, insert in zip(matches, inserts)])
238 | header = '%s 23SfromHMM::model=%s seq=%s pos=%s-%s strand=%s total-len=%s 23S-len=%s model-pos(ins-len)=%s buffer-len=%s/%s ins-bases=%s' % \
239 | (seq[0], model, count, ret_coords[0], ret_coords[1], strand, tl, length, model_pos, buffer_ends[0], buffer_ends[1], ib)
240 | # reverse complement if strand is reverse
241 | if strand == '-':
242 | S = rc(['', S])[1]
243 | yield [header, S]
244 |
245 | def run_cmsearch(fastas, threads, cm):
246 | """
247 | run cmsearch: compare 23S sequences to ssu-align's CM
248 | """
249 | out = []
250 | for fasta in fastas:
251 | cmsearch = '%s.23S.cmsearch' % (fasta.name.rsplit('.', 1)[0])
252 | if os.path.exists(cmsearch) is False:
253 | p = Popen('\
254 | cmsearch --cpu %s --hmmonly --acc --noali -T -1 --tblout %s %s %s >> cmsearch.log' \
255 | % (threads, cmsearch, cm, fasta.name), shell = True)
256 | p.communicate()
257 | else:
258 | print('# cmsearch output found: %s' % (cmsearch), file=sys.stderr)
259 | out.append(open(cmsearch))
260 | return out
261 |
262 | if __name__ == '__main__':
263 | parser = argparse.ArgumentParser(description='# find 23S rRNA genes using cmsearch')
264 | parser.add_argument(\
265 | '-f', nargs = '*', action = 'store', \
266 | required = True, help = 'path to fasta file(s) for searching')
267 | parser.add_argument(\
268 | '-c', nargs = '*', action = 'store', default = False, \
269 | help = 'path to cmsearch table(s)')
270 | parser.add_argument(\
271 | '-t', default = False, help = 'number of cpus to use')
272 | parser.add_argument(\
273 | '-b', default = 0, help = 'size of flanking region to collect, default = 0')
274 | parser.add_argument(\
275 | '-m', action = 'store_true', \
276 | help = 'mask insertion sequences, default = False')
277 | parser.add_argument(\
278 | '-d', default = False, help = 'path to cm')
279 | parser.add_argument(\
280 | '-bit', default = 0, type = float, help = 'minimum bit score to consider, default = 0')
281 | parser.add_argument(\
282 | '-l', default = 500, type = int, \
283 | help = 'minimum length 23S rRNA gene sequence to report, default = 500')
284 | args = vars(parser.parse_args())
285 | # make sure either a cmsearch table or the number of cpus to use for running cmsearch is specified
286 | if args['c'] is False and args['t'] is False:
287 | print('specify: -c or -t ', file=sys.stderr)
288 | exit()
289 | # make sure that fasta is not from stdin if cmsearch table is not provided
290 | if args['c'] is False and args['f'][0] == '-':
291 | print('specify: -c or -f ', file=sys.stderr)
292 | exit()
293 | for i in ['f', 'c']:
294 | if args[i] is False:
295 | continue
296 | if args[i][0] == '-':
297 | args[i] = [sys.stdin]
298 | else:
299 | args[i] = [open(j) for j in args[i]]
300 | # if cmsearch output not specified, run cmsearch
301 | if args['c'] is False:
302 | if args['d'] is False and 'lsucmdb' not in os.environ:
303 | print('specify: -d ', file=sys.stderr)
304 | exit()
305 | if args['d'] is False:
306 | cm = os.environ['lsucmdb']
307 | print('# 23S rRNA CM: %s' % (cm), file=sys.stderr)
308 | else:
309 | cm = args['d']
310 | args['c'] = run_cmsearch(args['f'], args['t'], cm)
311 | fastas, hmms, bit_thresh, length_thresh, masking, buffer = \
312 | args['f'], args['c'], float(args['bit']), int(args['l']), args['m'], int(args['b'])
313 | for seq in find_23S(fastas, hmms, bit_thresh, length_thresh, masking, buffer):
314 | print('\n'.join(seq))
315 |
--------------------------------------------------------------------------------
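Since these scripts double as importable modules, here is a minimal usage sketch for the besthits.py filtering above (assuming the package is installed and that a query-sorted BLAST -outfmt 6 file named example.b6 exists; both names are illustrative):

    from ctbBio.besthits import numBlast

    # keep the 5 best hits per query with e-value <= 1e-10;
    # numBlast yields the header row first, then one list per passing hit
    with open('example.b6') as blast:
        for hit in numBlast(blast, 5, evalueT = 1e-10, bitT = False, sort = False):
            print('\t'.join(str(i) for i in hit))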
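And a toy sketch of the clustering arithmetic in cluster_ani.py: each Mash distance is converted to an approximate percent identity, edges above the -s cutoff are kept, and clusters are the connected components of the resulting graph (the genome names and distances below are made up):

    from networkx import Graph, connected_components

    edges = [('gA', 'gB', 0.010), ('gB', 'gC', 0.015), ('gD', 'gE', 0.120)]
    ANI = Graph()
    for a, b, dist in edges:
        similarity = (1 - dist) * 100  # mash distance -> approximate % ANI
        if similarity >= 98:           # default -s cutoff
            ANI.add_edge(a, b, si = similarity)
    # gA/gB/gC form one cluster; the gD-gE edge is dropped (88 < 98),
    # so those genomes would be reported as singletons by print_clusters
    print([sorted(c) for c in connected_components(ANI)])  # [['gA', 'gB', 'gC']]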