├── align_io ├── __init__.py ├── __pycache__ │ ├── msa.cpython-36.pyc │ ├── msa.cpython-37.pyc │ ├── seq_ali.cpython-36.pyc │ ├── seq_ali.cpython-37.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── xmfa_mummer4_io.cpython-36.pyc │ └── xmfa_mummer4_io.cpython-37.pyc ├── maf_io.py ├── msa.py ├── xmfa_parsnp_io.py ├── xmfa_io.py ├── xmfa_mummer4_io.py └── seq_ali.py ├── snps_io ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── gen_msa.cpython-310.pyc │ ├── gen_msa.cpython-36.pyc │ ├── gen_msa.cpython-37.pyc │ ├── vcf_io.cpython-310.pyc │ ├── vcf_io.cpython-36.pyc │ ├── vcf_io.cpython-37.pyc │ ├── __init__.cpython-310.pyc │ ├── id_centroid.cpython-310.pyc │ ├── id_centroid.cpython-36.pyc │ ├── id_centroid.cpython-37.pyc │ ├── align_assembly.cpython-310.pyc │ ├── align_assembly.cpython-36.pyc │ ├── align_assembly.cpython-37.pyc │ ├── concat_alleles.cpython-310.pyc │ ├── concat_alleles.cpython-36.pyc │ ├── id_genome_clusters.cpython-36.pyc │ ├── id_genome_clusters.cpython-37.pyc │ └── id_genome_clusters.cpython-310.pyc ├── id_centroid.py ├── vcf_var_io.py ├── gen_msa_single.py ├── gen_msa.py ├── vcf_io.py ├── id_genome_clusters.py ├── concat_alleles.py └── align_assembly.py ├── bin ├── iso_gt_mtar ├── callm_db_build ├── callm_db_val └── maast.py ├── db_io ├── __pycache__ │ ├── build_db.cpython-310.pyc │ ├── build_db.cpython-36.pyc │ └── build_db.cpython-37.pyc ├── example.sh └── build_db.py ├── conda_recipe ├── build.sh └── meta.yaml ├── Makefile ├── maast ├── LICENSE ├── src ├── callm_db_build.cpp └── callm_db_val.cpp └── README.md /align_io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /snps_io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bin/iso_gt_mtar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/bin/iso_gt_mtar -------------------------------------------------------------------------------- /bin/callm_db_build: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/bin/callm_db_build -------------------------------------------------------------------------------- /bin/callm_db_val: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/bin/callm_db_val -------------------------------------------------------------------------------- /align_io/__pycache__/msa.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/msa.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/msa.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/msa.cpython-37.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/seq_ali.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/seq_ali.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/seq_ali.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/seq_ali.cpython-37.pyc -------------------------------------------------------------------------------- /db_io/__pycache__/build_db.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/db_io/__pycache__/build_db.cpython-310.pyc -------------------------------------------------------------------------------- /db_io/__pycache__/build_db.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/db_io/__pycache__/build_db.cpython-36.pyc -------------------------------------------------------------------------------- /db_io/__pycache__/build_db.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/db_io/__pycache__/build_db.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/gen_msa.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/gen_msa.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/gen_msa.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/gen_msa.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/gen_msa.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/gen_msa.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/vcf_io.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/vcf_io.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/vcf_io.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/vcf_io.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/vcf_io.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/vcf_io.cpython-37.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_centroid.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_centroid.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_centroid.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_centroid.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_centroid.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_centroid.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/align_assembly.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/align_assembly.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/align_assembly.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/align_assembly.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/align_assembly.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/align_assembly.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/concat_alleles.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/concat_alleles.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/concat_alleles.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/concat_alleles.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/xmfa_mummer4_io.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/xmfa_mummer4_io.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/xmfa_mummer4_io.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/xmfa_mummer4_io.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_genome_clusters.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_genome_clusters.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_genome_clusters.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_genome_clusters.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_genome_clusters.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_genome_clusters.cpython-310.pyc -------------------------------------------------------------------------------- /conda_recipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## change to source dir 4 | cd ${SRC_DIR} 5 | 6 | ## compile 7 | make 8 | 9 | ## install 10 | mkdir -p $PREFIX/bin 11 | cp -r ${SRC_DIR}/* ${PREFIX}/bin/ 12 | -------------------------------------------------------------------------------- /db_io/example.sh: -------------------------------------------------------------------------------- 1 | find -maxdepth 2 -name '*nr*' | cut -d'/' -f2 | awk '{printf "igvam_dbval -d %s/nr-snp-kmer.tsv -n %s -t 2 -L <(cut -f1 ../snp_calling_hqsubsets/%s.path.list) 1> %s/kmer_profiles.tsv 2> %s/kmer_profiles.log\n", $1, $1, $1, $1, $1}' | head -n 1 2 | 3 | cat missing_species.list | xargs -I[] -n1 -P7 bash -c "python /home/ubuntu/proj/snpMLST/snp_mlst/build_db_new.py tt extract --ref-genome ./[]/reference.fna --vcf ./[]/core_snps.vcf --msa ./[]/temp/mummer4/[]/msa.fa --out ./[]/nr --kmer-type all --coords ./[]/coords.tsv --no-reduction &> ./[]/kmer_xtract.log" & 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: callm_db_build callm_db_val iso_gt_mtar 2 | @echo "Maast build completed." 

callm_db_build: ./src/callm_db_build.cpp Makefile
	g++ -std=c++11 ./src/callm_db_build.cpp -o ./bin/callm_db_build -O3 -lpthread

callm_db_val: ./src/callm_db_val.cpp Makefile
	g++ -std=c++11 ./src/callm_db_val.cpp -o ./bin/callm_db_val -O3 -lpthread

iso_gt_mtar: ./src/iso_gt_mtar.cpp Makefile
	g++ -std=c++11 ./src/iso_gt_mtar.cpp -o ./bin/iso_gt_mtar -O3 -lpthread

clean:
	rm ./bin/callm_db_build ./bin/callm_db_val ./bin/iso_gt_mtar
--------------------------------------------------------------------------------
/maast:
--------------------------------------------------------------------------------
#!/bin/bash

Version()
{
	# Display version
	echo "Maast version 1.0.8"
}

# automatic path exporting
REALME=`realpath $0`
MAAST=`dirname ${REALME}`
EBSROOT=`dirname ${MAAST}`
export PATH=$PATH:${MAAST}/bin/
export PYTHONPATH=$PYTHONPATH:${MAAST}

# controller calling functional modules
if [ "$1" = "-v" ] || [ "$1" = "--version" ] || [ "$1" = "version" ] || [ "$1" = "-version" ]; then
	Version
elif [ "$1" = "-h" ] || [ "$1" = "-help" ] || [ "$1" = "help" ] || [ "$1" = "--help" ]; then
	maast.py -h
else
	maast.py "$@"
fi
--------------------------------------------------------------------------------
/conda_recipe/meta.yaml:
--------------------------------------------------------------------------------
package:
  name: maast
  version: 1.0.8

source:
  git_url: https://github.com/zjshi/Maast.git

requirements:
  build:
    - python ==3.9.6
    - {{ compiler('cxx') }}
    - setuptools
    - pip
  host:
    - python ==3.9.6
    - setuptools
    - pip
    - numpy
    - scipy
    - biopython
    - networkx
  run:
    - python ==3.9.6
    - pip
    - numpy
    - scipy
    - networkx
    - biopython
    - fasttree
    - mash
    - mummer4
    - pigz
    - lz4
    - lbzip2

test:
  imports:
    - numpy
    - scipy
    - Bio
    - networkx

about:
  home: https://github.com/zjshi/Maast
  license: MIT License
  license_file: LICENSE
  summary: Maast for efficient genotyping of microbial SNPs
  doc_url: https://github.com/zjshi/Maast
  dev_url: https://github.com/zjshi/Maast
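
The recipe's test block smoke-tests only the Python-side runtime dependencies; conda expects importable module names there, which is why the biopython package is checked as Bio. A minimal equivalent check, runnable inside the built environment:

# mirrors the recipe's test.imports section; these are module names, not package names
import numpy, scipy, networkx, Bio

print("conda runtime imports ok:", numpy.__version__, scipy.__version__)
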
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Zhou (Jason) Shi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/align_io/maf_io.py:
--------------------------------------------------------------------------------


class Alignment:
    def __init__(self, line):
        self.desc = line
        self.seqs = []

class Sequence:
    def __init__(self, line):
        # MAF 's' line fields: s, src, start, size, strand, srcSize, text
        values = line.rstrip().split()
        self.chrom = values[1]
        self.start = int(values[2])
        self.end = int(values[5])
        self.length = int(values[3])
        self.strand = values[4]

        self.seq = values[6].upper()

def parse(fpath, max_sample=float('inf')):
    # max_sample is accepted for interface parity with the XMFA parsers
    with open(fpath) as file:
        for line in file:
            if line[0] == '#': continue
            else: break
        alignment = Alignment(line)
        for line in file:
            if line[0] == 'a':
                yield alignment
                alignment = Alignment(line)
            elif line[0] == 's':
                sequence = Sequence(line)
                alignment.seqs.append(sequence)
        yield alignment

# parse is already a lazy generator; expose it under both names for API compatibility
iter_parse = parse
--------------------------------------------------------------------------------
/align_io/msa.py:
--------------------------------------------------------------------------------

def parse_control(msa_path, msa_type, max_sample=float('inf')):
    if msa_type == 'xmfa-parsnp':
        from align_io.xmfa_parsnp_io import parse
    elif msa_type == 'xmfa-mummer4':
        from align_io.xmfa_mummer4_io import parse
    elif msa_type == 'xmfa-mauve':
        from align_io.xmfa_mauve_io import parse
    elif msa_type == 'maf-mugsy':
        from align_io.maf_io import parse
    else:
        import sys
        sys.exit("\nError: invalid value for --msa-format: %s\n" % msa_type)
    return parse(msa_path, max_sample)

def monolithic_parse(msa_path, msa_type, max_sample=float('inf')):
    return parse_control(msa_path, msa_type, max_sample)

def iter_parse_control(msa_path, msa_type, max_sample=float('inf')):
    if msa_type == 'xmfa-parsnp':
        from align_io.xmfa_parsnp_io import iter_parse
    elif msa_type == 'xmfa-mummer4':
        from align_io.xmfa_mummer4_io import iter_parse
    elif msa_type == 'xmfa-mauve':
        from align_io.xmfa_mauve_io import iter_parse
    elif msa_type == 'maf-mugsy':
        from align_io.maf_io import iter_parse
    else:
        import sys
        sys.exit("\nError: invalid value for --msa-format: %s\n" % msa_type)
    return iter_parse(msa_path, max_sample)

def iter_parse(msa_path, msa_type, max_sample=float('inf')):
    for align in iter_parse_control(msa_path, msa_type, max_sample):
        yield align

def iterate_cols(msa_path, msa_type):
    for align in iter_parse(msa_path, msa_type):
        for column in align.fetch_columns():
            yield column
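
msa.py is a thin dispatcher: it resolves the parser module named by --msa-format and returns that module's generator, so callers can stream alignment blocks without knowing the underlying format. A minimal consumption sketch (the path and format value are hypothetical):

from align_io import msa

# iter_parse lazily yields one seq_ali.Alignment per alignment block
for aln in msa.iter_parse("tmp/mummer4/msa.fa", "xmfa-mummer4"):
    print(aln.chrom, aln.nseqs, aln.ncols)
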
--------------------------------------------------------------------------------
/snps_io/id_centroid.py:
--------------------------------------------------------------------------------
from __future__ import division

import sys, os
import argparse, operator

import numpy as np

from time import time

def read_tags(tag_paths):
    tag_map = dict()

    for tag_genome in tag_paths:
        tag_map[tag_genome] = 0

    return tag_map

def calc_tag_weights(tag_map, dist_path, dist_type):
    sys.stderr.write("[clustering] start\n")

    with open(dist_path, 'r') as fh:
        for line in fh:
            items = line.rstrip().split('\t')
            genome1, genome2, d = items[0], items[1], float(items[2])

            if genome1 >= genome2:
                #sys.stderr.write("{} {}\n".format(genome1, genome2))
                continue
            # sys.stderr.write("{} {}\n".format(genome1, genome2))

            if genome1 in tag_map and genome2 in tag_map:
                if dist_type == "L1":
                    tag_map[genome1] += d
                    tag_map[genome2] += d
                elif dist_type == "L2":
                    tag_map[genome1] = (tag_map[genome1] ** 2 + d ** 2) ** 0.5
                    tag_map[genome2] = (tag_map[genome2] ** 2 + d ** 2) ** 0.5
                elif dist_type == "Linf":
                    if tag_map[genome1] < d:
                        tag_map[genome1] = d
                    if tag_map[genome2] < d:
                        tag_map[genome2] = d
                else:
                    sys.exit("Error: distance type {} is not supported for centroid genome picking".format(dist_type))

    sys.stderr.write("[clustering] done\n")

    return tag_map

def centroid_from_map(tag_map):
    centroid = None

    for tag in tag_map.keys():
        if centroid is None:
            centroid = tag
        else:
            if tag_map[tag] < tag_map[centroid]:
                centroid = tag

    return centroid

def identify(tag_paths, dist_path, dist_type="L1"):
    tag_map = read_tags(tag_paths)
    tag_map = calc_tag_weights(tag_map, dist_path, dist_type)

    centroid = centroid_from_map(tag_map)

    return centroid
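
identify() streams a tab-separated file of pairwise genome distances and keeps one running aggregate per candidate under the chosen norm (L1 sum, L2 root-of-squares, or Linf max); the candidate with the smallest aggregate is returned as the centroid. A minimal sketch (file names invented):

from snps_io import id_centroid

# each line of dist.tsv: genomeA<TAB>genomeB<TAB>0.0123
candidates = ["g1.fna", "g2.fna", "g3.fna"]
centroid = id_centroid.identify(candidates, "dist.tsv", dist_type="L1")
print("centroid genome:", centroid)
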
--------------------------------------------------------------------------------
/snps_io/vcf_var_io.py:
--------------------------------------------------------------------------------
class SNP:
    def __init__(self, chrom, variant_id, pos, ref="", alt="", info=None, fmt=None, sample_ids=None):
        self.chrom = chrom
        self.var_id = variant_id
        self.pos = pos

        self.ref_allele = ref
        self.alt_allele = alt

        if info is None:
            self.info = {}
            self.info['NS'] = -1
            self.info['DP'] = -1
            self.info['AF'] = -1
        else:
            self.info = info

        if fmt is None:
            self.format = {}
            self.format['AF'] = ""
        else:
            self.format = fmt

        if sample_ids is None:
            self.sample_ids = []
        else:
            self.sample_ids = sample_ids


def format_header(sample_ids):
    import time
    header = ""
    header += """##fileformat=VCFv4.1\n"""
    header += """##fileDate=%s\n""" % time.strftime("%Y-%m-%d %H:%M")
    header += """##source=https://github.com/snayfach/snpMLST\n"""
    header += """##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">\n"""
    header += """##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n"""
    header += """##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n"""
    header += """##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n"""
    header += """##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n"""
    header += """#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n""" % "\t".join(sample_ids)
    return header

def format_snp(snp):
    record = ""
    record += str(snp.chrom) + "\t"   # CHROM
    record += str(snp.pos) + "\t"     # POS
    record += str(snp.var_id) + "\t"  # ID
    record += snp.ref_allele + "\t"   # REF
    record += snp.alt_allele + "\t"   # ALT
    record += ".\t"                   # QUAL
    record += "PASS\t"                # FILTER
    record += "%s\t" % format_info(snp)             # INFO
    record += "%s\t" % ":".join(snp.format.keys())  # FORMAT
    record += "%s\n" % format_samples(snp)          # GENOTYPES
    return record

def format_info(snp):
    return ";".join([key + "=" + str(value) for key, value in snp.info.items()])

def format_samples(snp):
    formats = snp.format.keys()
    indexes = range(len(snp.sample_ids))
    return "\t".join([":".join([str(snp.format[f][i]) for f in formats]) for i in indexes])

def write_vcf(snps, outdir):
    path = outdir+'/core_snps.vcf'
    if len(snps) > 0:
        with open(path, 'w') as file:
            file.write(format_header(snps[0].sample_ids))
            for snp in snps:
                file.write(format_snp(snp))
    else:
        print("Empty set of SNPs was found for the dataset, the file writing was skipped")
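
format_snp() serializes one SNP per VCF data line; per-sample values are drawn from parallel lists stored under each FORMAT key. A minimal sketch (values invented):

from snps_io import vcf_var_io

snp = vcf_var_io.SNP("chr1", 1, 42, ref="A", alt="G", sample_ids=["s1", "s2"])
snp.format["AF"] = [0.0, 1.0]  # one entry per sample, parallel to sample_ids
print(vcf_var_io.format_snp(snp), end="")  # a single tab-separated VCF record
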
--------------------------------------------------------------------------------
/snps_io/gen_msa_single.py:
--------------------------------------------------------------------------------
import sys, argparse

from Bio import SeqIO

def parse_args():
    """ Return dictionary of command line arguments
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        usage=argparse.SUPPRESS)

    parser.add_argument('--delta', type=str, dest='delta_path', required=True,
        help="""Path to the delta file describing the whole genome alignment; this is the output of mummer4""")
    parser.add_argument('--ref-seq', type=str, dest='ref_path', required=True,
        help="""Path to reference genome file as input in multiple fasta format""")
    parser.add_argument('--qry-seq', type=str, dest='qry_path', required=True,
        help="""Path to query genome file as input in multiple fasta format""")
    parser.add_argument('--ref-name', type=str, dest='ref_name', required=True,
        help="""Specify reference genome name""")
    parser.add_argument('--qry-name', type=str, dest='qry_name', required=True,
        help="""Specify query genome name""")
    parser.add_argument('--out', type=str, dest='out', default="/dev/stdout",
        help="""Path to which the output (pairwise genome alignment) will be written in MSA format""")

    return vars(parser.parse_args())


def rc(seq):
    # complement map; lowercase input is normalized to an uppercase complement
    base_map = {
        'A': 'T', 'a': 'T',
        'C': 'G', 'c': 'G',
        'G': 'C', 'g': 'C',
        'T': 'A', 't': 'A',
        'N': 'N', 'n': 'N'
    }

    return ''.join([base_map[c] for c in seq[::-1]])

def read_genome(genome_path):
    ordered_chroms = []
    genome_seqs = dict()

    for seq in SeqIO.parse(genome_path, "fasta"):
        ordered_chroms.append(seq.id)
        genome_seqs[seq.id] = seq.seq

    return genome_seqs, ordered_chroms

def parse_delta(delta_path):
    align_blocs = []

    with open(delta_path, 'r') as fh:
        fh.readline()
        fh.readline()

        r_tag = ""
        q_tag = ""

        r_len = ""
        q_len = ""

        bloc = []

        for line in fh:
            if line[0] == '>':
                items = line[1:].rstrip().split(' ')
                r_tag = items[0]
                q_tag = items[1]

                r_len = int(items[2])
                q_len = int(items[3])
            else:
                if ' ' in line:
                    items = line.rstrip().split(' ')
                    bloc = [ int(item) for item in items[:4] ]
                else:
                    diff = line.rstrip()

                    if diff == '0':
                        align_blocs.append([r_tag, q_tag, r_len, q_len] + bloc)
                        bloc = []
                    else:
                        bloc.append(int(diff))

    return align_blocs

def main():
    args = parse_args()

    ref_genome, ref_chroms = read_genome(args['ref_path'])
    qry_genome, qry_chroms = read_genome(args['qry_path'])

    align_blocs = parse_delta(args['delta_path'])

    aligned_qry = dict()

    for chrom_id in ref_genome.keys():
        aligned_qry[chrom_id] = '-' * len(ref_genome[chrom_id])

    for bloc in align_blocs:
        r_tag = bloc[0]
        q_tag = bloc[1]

        r_len = bloc[2]
        q_len = bloc[3]

        r_start = bloc[4]
        r_end = bloc[5]

        assert r_end > r_start

        q_start = bloc[6]
        q_end = bloc[7]

        q_seq = ""
        if q_end < q_start:
            q_start = q_len - q_start + 1
            q_end = q_len - q_end + 1

            q_seq = rc(qry_genome[q_tag])[q_start-1:q_end]
        else:
            q_seq = qry_genome[q_tag][q_start-1:q_end]


        pos = 0
        for diff in bloc[8:]:
            if diff < 0:
                pos = pos+abs(diff)
                q_seq = q_seq[:pos-1] + q_seq[pos:]
                pos = pos - 1
            else:
                pos = pos + diff
                q_seq = q_seq[:pos-1] + '-' + q_seq[pos-1:]

        # print([r_start, r_end, r_end - r_start, q_start, q_end, q_end - q_start, len(q_seq)])

        # stitch the decoded block into the query track at reference coordinates
        aligned_qry[r_tag] = aligned_qry[r_tag][:r_start-1] + str(q_seq) + aligned_qry[r_tag][r_end:]

    # write the pairwise alignment; the ">name chrom" records closed by '=' are
    # assumed to mirror the MSA layout produced by snps_io/gen_msa.py
    with open(args['out'], 'w') as fh:
        for chrom_id in ref_chroms:
            fh.write('>{} {}\n{}\n'.format(args['ref_name'], chrom_id, str(ref_genome[chrom_id])))
            fh.write('>{} {}\n{}\n'.format(args['qry_name'], chrom_id, aligned_qry[chrom_id]))
            fh.write('=\n')

if __name__ == "__main__":
    main()
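
The signed offsets in a nucmer delta block encode indels relative to the reference: a positive value means the reference carries an extra base at that offset, so a gap is opened in the query; a negative value means the query carries an extra base, which is dropped when projecting onto reference coordinates; 0 terminates the block. A toy run of the same decoding loop used in main() above (sequence invented):

# query block whose reference partner carries one extra base at offset 3
q_seq = "ACGTAC"
pos = 0
for diff in [3]:  # the block's offsets with the trailing 0 already stripped
    if diff < 0:
        pos = pos + abs(diff)
        q_seq = q_seq[:pos-1] + q_seq[pos:]  # drop the query insertion
        pos = pos - 1
    else:
        pos = pos + diff
        q_seq = q_seq[:pos-1] + '-' + q_seq[pos-1:]  # gap opposite the ref base

print(q_seq)  # AC-GTAC: now colinear with the reference block
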
--------------------------------------------------------------------------------
/align_io/xmfa_parsnp_io.py:
--------------------------------------------------------------------------------
from __future__ import division
import sys
import numpy as np
from align_io import seq_ali

class Sequence:
    def __init__(self, line):
        values = line.rstrip().lstrip('>').lstrip().split()
        self.index = values[0].split(':')[0]
        self.start = int(values[0].split(':')[1].split('-')[0])
        self.end = int(values[0].split(':')[1].split('-')[1])
        self.length = self.end - self.start + 1
        self.strand = values[1]
        self.chrom = values[2]
        self.seq = ''

def extract_genome_info(fpath):
    # check version
    with open(fpath) as file:
        version = next(file).rstrip().split(' ', 1)[-1]
        if version != 'Parsnp v1.1':
            sys.exit("\nError: expected XMFA version 'Parsnp v1.1' but got '%s'\n" % version)

    """
    this section reads the header of the parsnp xmfa file;
    header lines begin with ## and #,
    and map each genome index to its info,
    keys (field in code) = ['SequenceLength', 'SequenceFile', 'SequenceHeader']
    """
    genome_info = {}
    with open(fpath) as file:
        header = ''
        for line in file:
            if line.startswith('##'): header += line.lstrip('#')
            elif line.startswith('#'): continue
            else: break

        for h in header.split('SequenceIndex ')[1:]:
            genome_index, info_string = h.rstrip('\n').split('\n', 1)
            genome_info[genome_index] = {}
            for info_record in info_string.split('\n'):
                field, value = info_record.split(' ', 1)
                genome_info[genome_index][field] = value

    return genome_info

def parse(fpath, max_sample=float('inf')):

    genome_info = extract_genome_info(fpath)

    """
    this section reads the body of the parsnp xmfa file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a Sequence object,
    and the sequence lines are appended to that object
    """
    alns = []
    with open(fpath) as file:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        for line in file:
            if line.startswith('#'):
                continue
            elif line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom
                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                alns.append(last_aln)
            elif len(cur_aln.seqs) <= max_sample:
                if line.startswith('>'):
                    seq = Sequence(line)
                    seq.id = genome_info[seq.index]['SequenceFile']
                    cur_aln.seqs.append(seq)
                else:
                    cur_aln.seqs[-1].seq += line.rstrip().upper()
            else:
                if len(cur_aln.seqs) > max_sample:
                    cur_aln.seqs.pop()
                pass
    return alns

def iter_parse(fpath, max_sample=float('inf')):
    genome_info = extract_genome_info(fpath)

    """
    this section reads the body of the parsnp xmfa file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a Sequence object,
    and the sequence lines are appended to that object
    """

    with open(fpath) as file:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        for line in file:
            if line.startswith('#'):
                continue
            elif line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom
                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                yield last_aln
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = Sequence(line)
                        seq.id = genome_info[seq.index]['SequenceFile']
                        cur_aln.seqs.append(seq)
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass
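
Downstream code only needs the per-block Alignment objects, so a caller typically streams them. A usage sketch (the XMFA path is hypothetical; each record header looks like '>1:100-250 + cluster7' and indices are mapped to genome files via the '##SequenceIndex' header):

from align_io import xmfa_parsnp_io

for aln in xmfa_parsnp_io.iter_parse("parsnp.xmfa"):
    for seq in aln.seqs:
        print(seq.id, seq.chrom, seq.start, seq.end, seq.strand)
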
--------------------------------------------------------------------------------
/align_io/xmfa_io.py:
--------------------------------------------------------------------------------
import sys
from operator import itemgetter

class Alignment:
    def __init__(self):
        self.desc = ''
        self.seqs = []

    def fetch_columns(self):
        # ncols is the length of the first seq in the Alignment;
        # all seqs in an Alignment are assumed to have the same length
        for col_index in range(self.ncols):
            pos = col_index + 1
            chars = [seq.seq[col_index] for seq in self.seqs]
            sample_ids = [seq.id for seq in self.seqs]
            yield AlignColumn(pos, chars, sample_ids)

class AlignColumn:
    def __init__(self, pos, chars, sample_ids):
        self.chrom = None
        self.pos = pos
        self.chars = chars
        self.sample_ids = sample_ids
        self.total_samples = len(chars)
        self.pooled_counts = {'A':0, 'T':0, 'G':0, 'C':0}
        self.pool_counts()
        self.pooled_depth = sum(self.pooled_counts.values())
        self.present_samples = sum(self.pooled_counts.values())
        self.call_snp()
        self.prev = self.present_samples / float(self.total_samples)
        self.allele_freqs = self.genotype()

    def pool_counts(self):
        """ Tally the column's characters across the 4 nucleotides """
        for char in self.chars:
            if char in self.pooled_counts:
                self.pooled_counts[char] += 1

    def call_snp(self):
        """ Identify major and minor alleles """
        counts = sorted(list(self.pooled_counts.items()), key=itemgetter(1), reverse=True)
        if self.present_samples == 0:
            self.cons_allele = 'N'
            self.alt_allele = None
            self.cons_count = 0
            self.cons_freq = 0.0
            self.alt_freq = 0.0
        else:
            self.cons_allele, self.cons_count = counts.pop(0)
            self.cons_freq = self.cons_count/float(self.present_samples)
            if len(counts) > 0:
                self.alt_allele, self.alt_count = counts[0]
                self.alt_freq = self.alt_count/float(self.present_samples)
            else:
                self.alt_freq = 0.0

    def consensus(self):
        from collections import Counter
        return Counter(self.chars).most_common(1)[0][0]

    def percent_aligned(self, nseqs):
        """ Compute the % of genomes with observed data """
        gaps = self.chars.count('-')
        missing = self.chars.count('N')
        return (nseqs - gaps - missing) / float(nseqs)

    def genotype(self):
        genotypes = []
        for char in self.chars:
            if char == self.cons_allele:
                genotypes.append(0)
            elif char == self.alt_allele:
                genotypes.append(1)
            else:
                genotypes.append(None)
        return genotypes


class Sequence:
    def __init__(self, line):
        values = line.rstrip().lstrip('>').lstrip().split()
        self.index = values[0].split(':')[0]
        self.start = int(values[0].split(':')[1].split('-')[0])
        self.end = int(values[0].split(':')[1].split('-')[1])
        self.length = self.end - self.start + 1
        self.strand = values[1]
        self.seq = ''


def parse(fpath):

    # check version
    with open(fpath) as file:
        version = next(file).rstrip().split(' ', 1)[-1]
        if version != 'Parsnp v1.1':
            sys.exit("\nError: expected XMFA version 'Parsnp v1.1' but got '%s'\n" % version)

    # map genome index to info
    # keys = ['SequenceLength', 'SequenceFile', 'SequenceHeader']
    with open(fpath) as file:
        genome_info = {}
        header = ''
        for line in file:
            if line.startswith('##'): header += line.lstrip('#')
            elif line.startswith('#'): continue
            else: break
        for h in header.split('SequenceIndex ')[1:]:
            genome_index, info_string = h.rstrip('\n').split('\n', 1)
            genome_info[genome_index] = {}
            for info_record in info_string.split('\n'):
                field, value = info_record.split(' ', 1)
                genome_info[genome_index][field] = value

    # yield alignment blocks
    with open(fpath) as file:
        last = None
        current = Alignment()
        for line in file:
            if line.startswith('#'):
                continue
            elif line.startswith('='):
                current.nseqs = len(current.seqs)
                current.ncols = len(current.seqs[0].seq)
                last = current
                current = Alignment()
                yield last
            elif line.startswith('>'):
                seq = Sequence(line)
                seq.id = genome_info[seq.index]['SequenceFile']
                current.seqs.append(seq)
            else:
                current.seqs[-1].seq += line.rstrip().upper()
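
fetch_columns() yields one AlignColumn per alignment site, and genotype() encodes every sample as 0 (consensus allele), 1 (top alternate allele), or None (gap, N, or a minor allele). A toy column (values invented):

from align_io.xmfa_io import AlignColumn

col = AlignColumn(pos=1, chars=['A', 'A', 'G', '-'], sample_ids=['s1', 's2', 's3', 's4'])
print(col.cons_allele, col.alt_allele)  # A G
print(col.allele_freqs)                 # [0, 0, 1, None]
print(col.prev)                         # 0.75: three of four samples have a called base
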
--------------------------------------------------------------------------------
/snps_io/gen_msa.py:
--------------------------------------------------------------------------------
import os, time, sys
import numpy as np

def parse_seqs(path):
    with open(path) as file:
        try: id = next(file).split()[0].lstrip('>')
        except (StopIteration, IndexError): return
        seq = ''
        for line in file:
            if line[0]=='>':
                yield id, seq
                try: id = line.split()[0].lstrip('>')
                except (StopIteration, IndexError): return
                seq = ''
            else:
                seq += line.rstrip()
        yield id, seq

def parse_coords(fpath):
    fields = [('s1',int),('e1',int),
              ('s2',int),('e2',int),
              ('len1',int),('len2',int),
              ('pid',float),
              ('c1',str),('c2',str)]
    with open(fpath) as f:
        for i in range(5):
            next(f)
        for l in f:
            values = l.replace(' | ', ' ').split()
            yield dict([(f[0],f[1](v)) for f,v in zip(fields, values)])

def parse_snps(fpath):
    fields = [('p1',int),('b1',str),('b2',str),('p2',int),
              ('buf',int),('dist',int),
              ('r',int),('q',int),
              ('s1',int),('s2',int),
              ('c1',str),('c2',str)]
    with open(fpath) as f:
        for i in range(5):
            next(f)
        for l in f:
            values = l.replace(' | ', ' ').split()
            yield dict([(f[0],f[1](v)) for f,v in zip(fields, values)])

def build_msa(indir, overwrite=True, max_genomes=None, max_sites=None, msa_id=None, subset=None):
    start = time.time()

    aln_dir = os.path.join(indir, 'aln')

    if not os.path.exists(indir):
        sys.exit("Error: dir does not exist: %s" % indir)

    print("Reading reference genome")
    ref = {}
    chroms = []
    local_pos = np.array([])
    for id, seq in parse_seqs(os.path.join(indir, 'reference.fna')):
        chroms.append(id)
        ref[id] = np.array(list(seq.upper()))
        if len(local_pos) == 0:
            local_pos = np.arange(len(seq))
        else:
            local_pos = np.concatenate([local_pos, np.arange(len(seq))])
    print(" count contigs: %s" % len(ref))
    print(" count sites: %s" % sum([len(_) for _ in ref.values()]))

    print("Initializing alignments")
    genome_ids = os.listdir(aln_dir)

    if max_genomes is not None:
        genome_ids = genome_ids[:max_genomes]

    if subset:
        # restrict to genomes whose fasta file name appears in the subset
        genome_ids = []
        for genome_id in os.listdir(aln_dir):
            if "{}.fna".format(genome_id) in subset or \
                "{}.fasta".format(genome_id) in subset or \
                "{}.fsa".format(genome_id) in subset or \
                "{}.fa".format(genome_id) in subset:
                genome_ids.append(genome_id)

    print(" count genomes: %s" % len(genome_ids))
    genomes = {}
    for genome_id in genome_ids:
        genomes[genome_id] = {}
        for id, seq in ref.items():
            genomes[genome_id][id] = np.array(['-']*len(seq))

    print("Reading alignment blocks")
    for genome_id in genome_ids:
        fpath = '%s/%s/coords' % (aln_dir, genome_id)
        aln_length = 0
        for r in parse_coords(fpath):
            aln_length += (r['e1'] - r['s1'])
            genomes[genome_id][r['c1']][r['s1']-1:r['e1']] = ref[r['c1']][r['s1']-1:r['e1']]

    print("Reading SNPs")
    for genome_id in genome_ids:
        fpath = '%s/%s/snps' % (aln_dir, genome_id)
        for r in parse_snps(fpath):
            if r['b1'] == '.':
                continue
            elif r['b2'] == '.':
                genomes[genome_id][r['c1']][r['p1']-1] = '-'
            else:
                genomes[genome_id][r['c1']][r['p1']-1] = r['b2']

    chrom_aligns = {}
    for chrom in chroms:
        chrom_aligns[chrom] = ''
        for genome_id in genomes:
            chrom_aligns[chrom] = chrom_aligns[chrom] + '>{} {}\n{}\n'.format(genome_id, chrom, ''.join(genomes[genome_id][chrom]))

        chrom_aligns[chrom] = chrom_aligns[chrom] + '=\n'

    print("Writing fasta")

    fname = "msa.fa"

    if msa_id is not None:
        fname = "{}.fa".format(msa_id)

    msa_path = os.path.join(indir, fname)

    if not overwrite:
        # find the first unused msa.<n>.fa name instead of clobbering an existing file
        indx = 1
        while os.path.isfile(msa_path):
            fname = "msa.{}.fa".format(indx)
            msa_path = os.path.join(indir, fname)
            indx += 1

    print(" path: %s" % msa_path)
    with open(msa_path, 'w') as f:
        for chrom in chroms:
            f.write(chrom_aligns[chrom])

    print("\nDone!")
    print("Time (s):", round(time.time()-start,2))

    return msa_path
    """
    print("Writing fasta")
    msa_path = os.path.join(indir, 'msa.fa')
    print(" path: %s" % msa_path)
    with open(msa_path, 'w') as f:
        for chrom in chroms:
            for genome_id in genomes:
                f.write('>%s %s\n' % (genome_id, chrom))
                f.write(''.join(genomes[genome_id][chrom])+'\n')
            f.write("=\n")

    print("\nDone!")
    print("Time (s):", round(time.time()-start,2))
    """
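
parse_coords() and parse_snps() read the pipe-and-whitespace tables written by show-coords and show-snps, turning each row into a dict keyed by the field tuples declared at the top of each function. A toy show-coords row (values invented):

# start1 end1 | start2 end2 | len1 len2 | %identity | ref-tag qry-tag
line = "1 4641 | 1 4644 | 4641 4644 | 99.98 | chr1 chr1"
fields = [('s1',int),('e1',int),('s2',int),('e2',int),
          ('len1',int),('len2',int),('pid',float),('c1',str),('c2',str)]
values = line.replace(' | ', ' ').split()
row = dict([(f[0], f[1](v)) for f, v in zip(fields, values)])
print(row['s1'], row['e1'], row['pid'])  # 1 4641 99.98
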
--------------------------------------------------------------------------------
/align_io/xmfa_mummer4_io.py:
--------------------------------------------------------------------------------
from __future__ import division
import numpy as np
import copy
from align_io import seq_ali

class SimpleSequence:
    def __init__(self, indx, id="", seq=""):
        self.index = indx
        self.id = id
        self.seq = seq
        self.chrom = ""

def parse(fpath, max_sample=float('inf')):
    """
    this section reads the body of the mummer4-derived XMFA file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a SimpleSequence object,
    and the sequence lines are appended to that object
    """
    alns = []
    total_len = 0
    with open(fpath) as file:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        indx = 0
        for line in file:
            if line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom
                # print(cur_aln.nseqs)
                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                alns.append(last_aln)
                total_len = total_len + last_aln.ncols
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = SimpleSequence(indx)
                        temp_items = line.rstrip().split(' ')
                        seq.id = temp_items[0][1:]
                        seq.chrom = temp_items[1]
                        cur_aln.seqs.append(seq)
                        indx = indx + 1
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass

    print("total length of alignments: {}".format(total_len))
    return alns

def iter_parse(fpath, max_sample=float('inf')):
    """
    this section reads the body of the mummer4-derived XMFA file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a SimpleSequence object,
    and the sequence lines are appended to that object
    """
    lines = []
    n_align = 0
    with open(fpath) as file:
        for line in file:
            lines.append(line)
            if line.startswith('='):
                n_align = n_align + 1

    if n_align > 1:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        indx = 0
        for line in lines:
            if line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom

                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                yield last_aln
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = SimpleSequence(indx)
                        temp_items = line.rstrip().split(' ')
                        seq.id = temp_items[0][1:]
                        seq.chrom = temp_items[1]
                        cur_aln.seqs.append(seq)
                        indx = indx + 1
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass
    else:
        max_iter_stride = 200*1000

        last_aln = None
        cur_aln = seq_ali.Alignment()
        indx = 0

        for line in lines:
            if line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom

                if cur_aln.ncols <= max_iter_stride:
                    # print(cur_aln.ncols)
                    cur_aln.update()
                    yield cur_aln
                else:
                    for sp in range(0, cur_aln.ncols, max_iter_stride):
                        next_aln = seq_ali.Alignment()
                        for seq in cur_aln.seqs:
                            temp_seq = copy.deepcopy(seq)
                            if sp+max_iter_stride <= cur_aln.ncols+1000:
                                temp_seq.seq = temp_seq.seq[sp:(sp+max_iter_stride)]
                            else:
                                temp_seq.seq = temp_seq.seq[sp:]
                            next_aln.seqs.append(temp_seq)
                        # print(next_aln.seqs[-1].chrom)

                        next_aln.nseqs = len(next_aln.seqs)
                        next_aln.ncols = len(next_aln.seqs[0].seq)
                        next_aln.chrom = next_aln.seqs[0].chrom
                        next_aln.update()
                        yield next_aln
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = SimpleSequence(indx)
                        temp_items = line.rstrip().split(' ')
                        seq.id = temp_items[0][1:]
                        seq.chrom = temp_items[1]
                        cur_aln.seqs.append(seq)
                        indx = indx + 1
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass
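
When the XMFA holds a single giant alignment, iter_parse splits it into windows of at most 200,000 columns so per-column work downstream stays within memory. The window arithmetic it applies (sizes invented):

ncols = 520 * 1000
stride = 200 * 1000  # max_iter_stride above
windows = [(sp, min(sp + stride, ncols)) for sp in range(0, ncols, stride)]
print(windows)  # [(0, 200000), (200000, 400000), (400000, 520000)]
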
--------------------------------------------------------------------------------
/snps_io/vcf_io.py:
--------------------------------------------------------------------------------
class SNP:
    def __init__(self, chrom, variant_id, pos, ref, alt, third=None, forth=None, avail_alleles=None, info=None, fmt=None, sample_ids=None):
        self.chrom = chrom
        self.var_id = variant_id
        self.pos = pos

        self.ref_allele = ref
        self.alt_allele = alt
        self.third_allele = third
        self.forth_allele = forth

        self.avail_alleles = avail_alleles

        if info is None:
            self.info = {}
            self.info['NS'] = -1
            self.info['DP'] = -1
            self.info['AF'] = -1
        else:
            self.info = info

        if fmt is None:
            self.format = {}
            self.format['GP1'] = ""
            self.format['GP2'] = ""
            self.format['GP3'] = ""
            self.format['GP4'] = ""
        else:
            self.format = fmt

        if sample_ids is None:
            self.sample_ids = []
        else:
            self.sample_ids = sample_ids

def format_header(sample_ids, cmdl):
    import time
    header = ""
    header += """##fileformat=VCFv4.1\n"""
    header += """##fileDate=%s\n""" % time.strftime("%Y-%m-%d %H:%M")
    header += """##source=https://github.com/zjshi/Maast\n"""
    header += """##command='%s'\n""" % cmdl
    header += """##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">\n"""
    header += """##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n"""
    header += """##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n"""
    # FORMAT fields GP1-GP4 carry per-sample presence of the four ranked alleles (see align_io/seq_ali.py)
    header += """##FORMAT=<ID=GP1,Number=1,Type=Integer,Description="Presence of the reference allele">\n"""
    header += """##FORMAT=<ID=GP2,Number=1,Type=Integer,Description="Presence of the first alternate allele">\n"""
    header += """##FORMAT=<ID=GP3,Number=1,Type=Integer,Description="Presence of the second alternate allele">\n"""
    header += """##FORMAT=<ID=GP4,Number=1,Type=Integer,Description="Presence of the third alternate allele">\n"""

    col_names = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + sample_ids
    header += """#%s\n""" % "\t".join(col_names)

    return header

def format_snp(snp):
    record = ""
    record += str(snp.chrom) + "\t"                 # CHROM
    record += str(snp.pos) + "\t"                   # POS
    record += str(snp.var_id) + "\t"                # ID
    record += (snp.ref_allele + b"\t").decode()     # REF
    record += (snp.avail_alleles + b"\t").decode()  # ALT
    record += ".\t"                                 # QUAL
    record += "PASS\t"                              # FILTER
    record += "%s\t" % format_info(snp)                     # INFO
    record += "%s\t" % ":".join(sorted(snp.format.keys()))  # FORMAT
    record += "%s\n" % format_samples(snp)                  # GENOTYPES
    return record

def format_info(snp):
    return ";".join([key + "=" + str(value) for key, value in snp.info.items()])

def format_samples(snp):
    formats = sorted(snp.format.keys())
    indexes = range(len(snp.sample_ids))
    return "\t".join([":".join([str(snp.format[f][i]) for f in formats]) for i in indexes])

def write_vcf_header(snps, outdir, cmdl='unspecified'):
    import sys

    path = outdir+'/core_snps.vcf'
    if len(snps) > 0:
        with open(path, 'w') as file:
            file.write(format_header(snps[0].sample_ids, cmdl))

def write_vcf(snps, outdir, single_chrom_rep=False):
    import sys

    path = outdir+'/core_snps.vcf'
    if len(snps) > 0:
        with open(path, 'a') as file:
            for snp in snps:
                if single_chrom_rep is True:
                    t_snp = snp
                    t_snp.pos = t_snp.var_id
                    file.write(format_snp(t_snp))
                else:
                    file.write(format_snp(snp))
    else:
        print("Empty set of SNPs was found for the dataset, the file writing was skipped")

def write_coords_header(coords, out_dir):
    path = out_dir+'/coords.tsv'
    with open(path, 'w') as file:
        file.write('\t'.join(['chrom', 'start', 'end'])+'\n')

def write_coords(coords, out_dir):
    path = out_dir+'/coords.tsv'
    with open(path, 'a') as file:
        for d in coords:
            file.write('\t'.join([d['chrom'], str(d['start']), str(d['end'])])+'\n')

def merge_coords(coords, min_gap=1):
    if len(coords) > 1:
        merged_coords = []

        last_coord = coords[0]
        for i, coord in enumerate(coords[1:]):
            if coord['start'] - last_coord['end'] <= min_gap:
                if coord['chrom'] == last_coord['chrom']:
                    new_coord = {'chrom':coord['chrom'], 'start':last_coord['start'], 'end':coord['end']}
                    last_coord = new_coord
                    continue

            merged_coords.append(last_coord)
            last_coord = coord

        merged_coords.append(last_coord)

        return merged_coords
    else:
        return coords

def write_genome(genome, out_dir):
    path = out_dir+'/consensus.fna'
    with open(path, 'w') as file:
        file.write('>consensus\n'+genome+'\n')

##INFO=
##INFO=
##INFO=
##INFO=
##INFO=
##INFO=
##FILTER=
##FILTER=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
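
write_vcf_header() creates core_snps.vcf with the metadata block and column header, and write_vcf() appends one record per SNP. A minimal sketch (values invented; REF/ALT are bytes to match the .decode() calls in format_snp, and the output directory must already exist):

from snps_io import vcf_io

snp = vcf_io.SNP("chr1", 0, 1042, ref=b"A", alt=b"G", avail_alleles=b"G",
                 info={"NS": 2, "DP": 2, "AF": 0.5},
                 fmt={"GP1": [1, 0], "GP2": [0, 1], "GP3": [0, 0], "GP4": [0, 0]},
                 sample_ids=["s1", "s2"])
vcf_io.write_vcf_header([snp], "outdir", cmdl="maast genotype ...")
vcf_io.write_vcf([snp], "outdir")
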
--------------------------------------------------------------------------------
/align_io/seq_ali.py:
--------------------------------------------------------------------------------
from __future__ import division

import numpy as np

"""
Class for encapsulating a single Multiple Sequence Alignment (MSA)
"""
class Alignment:
    def __init__(self):
        self.desc = ''  # not used for now
        # chrom holds an identifier like 'cluster123', which marks a division of the
        # core-genome, i.e. a conserved region found across all reference genomes
        self.chrom = ''
        self.nseqs = 0
        self.ncols = 0  # number of sites
        self.seqs = []  # actual sequence for each sample
        self.sample_ids = []

        """
        the attributes below are described in update()
        """
        self.char_mat = []
        self.local_pos = []
        self.count_mat = []
        self.freq_mat = []

        self.ref_alleles = []
        self.alt_alleles = []
        self.third_alleles = []
        self.forth_alleles = []

        self.ref_prob_mat = []
        self.alt_prob_mat = []
        self.third_prob_mat = []
        self.forth_prob_mat = []

        self.sample_presence = []
        self.ref_freqs = []
        self.alt_freqs = []
        self.third_freqs = []
        self.forth_freqs = []

        self.prevalence = []
        self.aligned_pctg = []

    def update(self):
        assert len(self.seqs) > 1
        self.nseqs = len(self.seqs)
        self.ncols = len(self.seqs[0].seq)
        self.sample_ids = [seq.id for seq in self.seqs]

        """
        generate the char matrix from the aligned sequences, one row per sample

        example:
        sample1: ATCG
        sample2: ATGG
        sample3: ATGC

        the char matrix is [[A, T, C, G], [A, T, G, G], [A, T, G, C]]
        """
        self.char_mat = np.array([np.frombuffer(seq.seq.encode('ascii'), dtype='S1') for seq in self.seqs])

        #print(self.char_mat.shape)
        #print(self.char_mat.nbytes)

        """
        count A, T, G, C, N and - for each site of the alignment;
        local_pos stores the local position of each site within this core-genome division (or alignment)
        """
        As = np.sum(self.char_mat == b'A', axis=0)
        Ts = np.sum(self.char_mat == b'T', axis=0)
        Gs = np.sum(self.char_mat == b'G', axis=0)
        Cs = np.sum(self.char_mat == b'C', axis=0)
        Ns = np.sum(self.char_mat == b'N', axis=0)
        Gaps = np.sum(self.char_mat == b'-', axis=0)

        self.local_pos = np.arange(len(self.seqs[0].seq))

        self.count_mat = np.array([As, Ts, Gs, Cs, Ns, Gaps])

        #print(self.char_mat)
        #print(self.char_mat == 'A')
        #print(As)
        #print(self.count_mat)
        #print(self.count_mat.shape)

        """
        char_template: the complete set of chars for each site of the alignment,
        from which the ref allele and alt allele will be selected
        """
        char_template = np.array([
            np.repeat(b'A', self.count_mat.shape[1]),
            np.repeat(b'T', self.count_mat.shape[1]),
            np.repeat(b'G', self.count_mat.shape[1]),
            np.repeat(b'C', self.count_mat.shape[1])
            # np.repeat('N', self.count_mat.shape[1])
            # np.repeat('-', self.count_mat.shape[1])
        ])

        """
        rank the four chars at each site by count (ascending), then use the ranked
        indices to select the chars themselves: the char with the highest count at
        each site is taken as the ref allele, the char with
        the second highest count as the alt allele, and the remaining two as the
        third and fourth alleles. at sites with only one observed char, the other
        chars all have zero counts, so any of them may be selected among the
        tied ranks (argsort order decides).
        """
        count_inds_mat = self.count_mat[0:4,:].argsort(axis=0)
        top2_inds = count_inds_mat[-4:,]
        top2_char_mat = np.choose(top2_inds, char_template)
        self.ref_alleles = top2_char_mat[3,:]
        self.alt_alleles = top2_char_mat[2,:]

        self.third_alleles = top2_char_mat[1,:]
        self.forth_alleles = top2_char_mat[0,:]

        """
        the frequency matrix has the same shape as the char matrix;
        it is initialized to -1 everywhere and then filled with the allele rank
        observed for each site across all samples, with the following encoding:
        - ref allele: 0
        - alt allele: 1
        - third allele: 2
        - fourth allele: 3
        - N or - (no ranked allele matched): stays -1
        """
        self.freq_mat = np.repeat(np.int8(-1), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)


        """
        these four masks have the same shape as the frequency matrix
        """
        # ref_mask = ((self.char_mat == self.ref_alleles) & (self.char_mat != '-') & (self.char_mat != 'N'))
        # alt_mask = ((self.char_mat == self.alt_alleles) & (self.char_mat != '-') & (self.char_mat != 'N'))

        ref_mask = (self.char_mat == self.ref_alleles)
        alt_mask = (self.char_mat == self.alt_alleles)
        third_mask = (self.char_mat == self.third_alleles)
        forth_mask = (self.char_mat == self.forth_alleles)
        """
        boolean-mask assignment writes through to the underlying flat buffer,
        so each mask updates the matching cells in place
        """
        self.freq_mat[ref_mask] = 0
        self.freq_mat[alt_mask] = 1
        self.freq_mat[third_mask] = 2
        self.freq_mat[forth_mask] = 3

        self.ref_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)
        self.alt_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)
        self.third_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)
        self.forth_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)



        self.ref_prob_mat[ref_mask] = 1
        self.alt_prob_mat[alt_mask] = 1
        self.third_prob_mat[third_mask] = 1
        self.forth_prob_mat[forth_mask] = 1

        ref_counts = np.sum(ref_mask, axis=0)
        alt_counts = np.sum(alt_mask, axis=0)
        third_counts = np.sum(third_mask, axis=0)
        forth_counts = np.sum(forth_mask, axis=0)
        """
        sample_presence sums only the counts of A, T, G, C for each site, leaving out N and -;
        this facilitates the calculation of the site's prevalence at a given alignment position.
        ref and alt allele frequencies are calculated with sample_presence as the denominator
        rather than the number of samples; the other route would also be possible.
        """
166 |         """
167 |         self.sample_presence = np.sum(self.count_mat[0:4,:], axis=0)
168 |         self.prevalence = self.sample_presence/self.nseqs
169 |
170 |         zero_mask = (self.sample_presence == 0)
171 |         self.sample_presence[zero_mask] = 1
172 |         self.ref_freqs = ref_counts/self.sample_presence
173 |         self.alt_freqs = alt_counts/self.sample_presence
174 |         self.third_freqs = third_counts/self.sample_presence
175 |         self.forth_freqs = forth_counts/self.sample_presence
176 |
177 |         self.ref_freqs[zero_mask] = 0
178 |         self.alt_freqs[zero_mask] = 0
179 |         self.third_freqs[zero_mask] = 0
180 |         self.forth_freqs[zero_mask] = 0
181 |
182 |         self.sample_presence[zero_mask] = 0
183 |
184 |         unaligned_masks = (self.count_mat[4:6,:] != 0)
185 |         self.aligned_pctg = 1 - (np.sum(unaligned_masks, axis=0) / self.nseqs)
186 |
--------------------------------------------------------------------------------
/snps_io/id_genome_clusters.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import sys, os
4 | import argparse, operator
5 |
6 | import numpy as np
7 | import networkx as nx
8 |
9 | from time import time
10 |
11 | class GenomeCluster:
12 |     def __init__(self, max_d):
13 |         self.max_d = max_d
14 |
15 |         self.genomes = dict()
16 |         self.links = dict()
17 |
18 |         self.tag_genome = None
19 |
20 |     def size(self):
21 |         return len(self.genomes)
22 |
23 |     def add(self, genome1, genome2, d, edge_weighted):
24 |         weight = 1
25 |         if edge_weighted is True:
26 |             weight = 1 - d
27 |
28 |         if genome1 not in self.genomes:
29 |             self.genomes[genome1] = weight
30 |         else:
31 |             self.genomes[genome1] = self.genomes[genome1] + weight
32 |
33 |         if genome2 not in self.genomes:
34 |             self.genomes[genome2] = weight
35 |         else:
36 |             self.genomes[genome2] = self.genomes[genome2] + weight
37 |
38 |         link = "{}|{}".format(genome1, genome2)
39 |         if genome1 > genome2:
40 |             link = "{}|{}".format(genome2, genome1)
41 |
42 |         #sys.stderr.write("{} {}\n".format(genome1, genome2))
43 |
44 |         if link not in self.links:
45 |             self.links[link] = d
46 |         else:
47 |             #assert True
48 |             assert self.links[link] == d
49 |             #sys.stderr.write("{} {}\n".format(genome1, genome2))
50 |
51 |
52 |     def merge(self, cluster2, genome1, genome2, d, edge_weighted):
53 |         if self.contains(genome1) and cluster2.contains(genome2):
54 |             self.add(genome1, genome2, d, edge_weighted)
55 |             for link in cluster2.links.keys():
56 |                 genomes = link.split("|")
57 |                 self.add(genomes[0], genomes[1], cluster2.links[link], edge_weighted)
58 |         else:
59 |             sys.exit("{} is not in cluster1 or {} is not in cluster2; there is no basis for merging".format(genome1, genome2))
60 |
61 |     def is_empty(self):
62 |         return len(self.genomes.keys()) == 0
63 |
64 |     def contains(self, genome):
65 |         return genome in self.genomes
66 |
67 |     def id_tag_genome(self, cent_meth):
68 |         # no genomes
69 |         if len(self.genomes) == 0:
70 |             sys.exit("\nError: no genomes on cluster: cannot id tag genome\n")
71 |         # one genome
72 |         elif len(self.genomes) == 1:
73 |             self.tag_genome = list(self.genomes.keys())[0]
74 |         else:
75 |             tmp_max = 0
76 |             tmp_genome = None
77 |             if cent_meth == "degree":
78 |                 for genome in self.genomes.keys():
79 |                     if self.genomes[genome] > tmp_max:
80 |                         tmp_max = self.genomes[genome]
81 |                         tmp_genome = genome
82 |             else:
83 |                 G = nx.Graph()
84 |                 centrality = dict()
85 |                 for link in self.links.keys():
86 |                     genomes = link.split("|")
87 |                     G.add_edge(genomes[0], genomes[1])
88 |                 if cent_meth == "eigenvector":
89 |                     centrality = nx.eigenvector_centrality(G)
90 |                 elif cent_meth ==
"katz": 91 | centrality = nx.katz_centrality(G) 92 | elif cent_meth == "closeness": 93 | centrality = nx.closeness_centrality(G) 94 | elif cent_meth == "information": 95 | centrality = nx.information_centrality(G) 96 | elif cent_meth == "betweenness": 97 | centrality = nx.betweenness_centrality(G) 98 | elif cent_meth == "load": 99 | centrality = nx.load_centrality(G) 100 | else: 101 | sys.exit("Error: centrality method {} is not support for tag genome identification".format(cent_meth)) 102 | tmp_genome = [k for k, v in sorted(centrality.items(), key=lambda x: x[1])][-1] 103 | self.tag_genome = tmp_genome 104 | 105 | return self.tag_genome 106 | 107 | def fmtout(self): 108 | sorted_tuples = sorted(self.genomes.items(), key=operator.itemgetter(1), reverse=True) 109 | sorted_genomes = [genome_tuple[0] for genome_tuple in sorted_tuples] 110 | 111 | return "* {} {}".format(self.tag_genome, " ".join(sorted_genomes)) 112 | 113 | def fmtout_all(self): 114 | fmt_str = "{}\n".format(self.fmtout()) 115 | 116 | for link in self.links.keys(): 117 | fmt_str += "- {} {}\n".format(link, self.links[link]) 118 | 119 | return fmt_str 120 | 121 | def search_genome_clusters(dist_path, max_d, cent_meth, edge_weighted): 122 | sys.stderr.write("[clustering] start\n") 123 | 124 | genome_clusters = [] 125 | genome_lookup = dict() 126 | 127 | with open(dist_path, 'r') as fh: 128 | for line in fh: 129 | items = line.rstrip().split('\t') 130 | genome1, genome2, d = items[0], items[1], float(items[2]) 131 | 132 | if genome1 >= genome2 or d > max_d: 133 | #sys.stderr.write("{} {}\n".format(genome1, genome2)) 134 | continue 135 | # sys.stderr.write("{} {}\n".format(genome1, genome2)) 136 | 137 | if genome1 not in genome_lookup and genome2 not in genome_lookup: 138 | new_cluster = GenomeCluster(max_d) 139 | new_cluster.add(genome1, genome2, d, edge_weighted) 140 | genome_clusters.append(new_cluster) 141 | genome_lookup[genome1] = len(genome_clusters) - 1 142 | genome_lookup[genome2] = len(genome_clusters) - 1 143 | elif genome1 in genome_lookup and genome2 not in genome_lookup: 144 | cluster_indx = genome_lookup[genome1] 145 | genome_lookup[genome2] = cluster_indx 146 | genome_clusters[cluster_indx].add(genome1, genome2, d, edge_weighted) 147 | elif genome1 not in genome_lookup and genome2 in genome_lookup: 148 | cluster_indx = genome_lookup[genome2] 149 | genome_lookup[genome1] = cluster_indx 150 | genome_clusters[cluster_indx].add(genome1, genome2, d, edge_weighted) 151 | else: 152 | if genome_lookup[genome1] == genome_lookup[genome2]: 153 | pass 154 | else: 155 | cluster_indx1 = genome_lookup[genome1] 156 | cluster_indx2 = genome_lookup[genome2] 157 | genome_clusters[cluster_indx1].merge(genome_clusters[cluster_indx2], genome1, genome2, d, edge_weighted) 158 | 159 | for genome in genome_clusters[cluster_indx2].genomes: 160 | genome_lookup[genome] = cluster_indx1 161 | 162 | genome_clusters[cluster_indx2] = None 163 | 164 | sys.stderr.write("[clustering] done\n") 165 | sys.stderr.write("[clustering] {} genomes have been included in clusters\n".format(len(genome_lookup.keys()))) 166 | 167 | good_clusters = verify_clusters(genome_clusters, genome_lookup, cent_meth) 168 | 169 | return good_clusters, len(genome_lookup.keys()) 170 | 171 | def verify_clusters(genome_clusters, genome_lookup, cent_meth): 172 | for genome in genome_lookup.keys(): 173 | assert genome in genome_clusters[genome_lookup[genome]].genomes 174 | 175 | good_clusters = [] 176 | for i, cluster in enumerate(genome_clusters): 177 | if cluster is not None: 
178 |             cluster.id_tag_genome(cent_meth)
179 |             good_clusters.append(cluster)
180 |
181 |             for genome in cluster.genomes:
182 |                 assert genome_lookup[genome] == i
183 |
184 |     return good_clusters
185 |
186 | def output_clusters(good_clusters, output_path="/dev/stdout"):
187 |     if output_path is not None:
188 |         with open(output_path, 'w') as fh:
189 |             for gcluster in good_clusters:
190 |                 fh.write(gcluster.fmtout_all())
191 |
192 | def build_genome_blocks(dist_path, total_n, critical_n=100, max_d=0.01, end_d=0.000001, range_factor=1.2, cent_meth="degree", edge_weighted=False, output_path=None):
193 |     optimal_d = 0
194 |     optimal_n = 0
195 |     optimal_clusters = []
196 |
197 |     upper_cap = critical_n * range_factor
198 |
199 |     genome_clusters, clust_n = search_genome_clusters(dist_path, max_d, cent_meth, edge_weighted)
200 |
201 |     tag_n = total_n - clust_n + len(genome_clusters)
202 |
203 |     firstcut_exit = False
204 |     if tag_n > upper_cap:
205 |         print("Program will continue with a non-optimal number ({}) of genomes. Perhaps try a higher cutoff (current {})".format(str(tag_n), str(max_d)))
206 |         optimal_d = max_d
207 |         optimal_n = tag_n
208 |         optimal_clusters = genome_clusters
209 |         firstcut_exit = True
210 |     elif tag_n >= critical_n and tag_n <= upper_cap:
211 |         # perfect scenario on exit
212 |         optimal_d = max_d
213 |         optimal_n = tag_n
214 |         optimal_clusters = genome_clusters
215 |     else:
216 |         # determine lower bound
217 |         min_d = max_d
218 |
219 |         print("[Searching lower cap]")
220 |         while min_d >= end_d and tag_n < critical_n:
221 |             min_d = min_d / 10
222 |
223 |             genome_clusters, clust_n = search_genome_clusters(dist_path, min_d, cent_meth, edge_weighted)
224 |             tag_n = total_n - clust_n + len(genome_clusters)
225 |
226 |             print("\t{}: {} tag genomes".format(min_d, tag_n))
227 |
228 |         print("[End searching]")
229 |
230 |         # binary search into critical range
231 |         print("[Searching optimal d-cut]")
232 |         if min_d < end_d and tag_n < critical_n:
233 |             print("Program cannot reach the number ({}) of genomes required for core-genome SNP calling.".format(critical_n))
234 |             print("Proceeding with original set of genomes")
235 |
236 |             optimal_d = None
237 |             optimal_n = None
238 |             optimal_clusters = None
239 |         else:
240 |             left_d = max_d
241 |             right_d = min_d
242 |             mid_point = int((upper_cap + critical_n) / 2)
243 |
244 |             delta_d = 1 # arbitrary value; does not matter
245 |
246 |             while delta_d > 0.0000001 and (tag_n > upper_cap or tag_n < critical_n):
247 |                 cur_d = (left_d + right_d) / 2
248 |
249 |                 genome_clusters, clust_n = search_genome_clusters(dist_path, cur_d, cent_meth, edge_weighted)
250 |                 tag_n = total_n - clust_n + len(genome_clusters)
251 |
252 |                 if tag_n > mid_point:
253 |                     right_d = cur_d
254 |                 else:
255 |                     left_d = cur_d
256 |
257 |                 delta_d = abs(left_d - right_d)
258 |
259 |                 print("\tsearching space [ {} , {} ]".format(left_d, right_d))
260 |                 print("\tcurrent d-cut: {}".format(cur_d))
261 |                 print("\tcurrent no of tags: {}".format(tag_n))
262 |
263 |                 if tag_n >= critical_n:
264 |                     delta_1 = abs(tag_n - mid_point)
265 |                     delta_2 = abs(optimal_n - mid_point)
266 |
267 |                     if delta_1 < delta_2:
268 |                         optimal_d = cur_d
269 |                         optimal_n = tag_n
270 |                         optimal_clusters = genome_clusters
271 |                     else:
272 |                         pass
273 |                 else:
274 |                     pass
275 |
276 |
277 |             print("[Searching optimal d-cut]")
278 |
279 |             if tag_n < critical_n and optimal_n < critical_n:
280 |                 print("Program cannot reach the number ({}) of genomes required for core-genome SNP calling.".format(critical_n))
281 |                 print("Proceeding with original set of genomes. Or try a higher MAF")
282 |
283 |                 optimal_d = None
284 |                 optimal_n = None
285 |                 optimal_clusters = None
286 |
287 |     return optimal_clusters, optimal_d, optimal_n, firstcut_exit
--------------------------------------------------------------------------------
/db_io/build_db.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import sys, os, argparse, copy, signal
4 | import numpy as np
5 | import multiprocessing as mp
6 |
7 | from time import time, sleep
8 |
9 | from Bio import SeqIO
10 | from Bio.SeqRecord import SeqRecord
11 |
12 | """
13 | This function fetches all possible kmers of the size specified by the kmer_size argument;
14 | the fetched kmers serve as the database used to verify the uniqueness of the kmers that actually cover SNPs.
15 | Building a dictionary of all possible kmers is quite time consuming; one possible speed-up is to pre-compute it and store it as a directly loadable binary file, avoiding the hash computation.
16 |
17 | Update: 07/01/18
18 | Using the following format for the sake of simplicity
19 | kmer_seq: ATGC
20 | """
21 | def fetch_all_kmers(genome_seq, kmer_size, coords=None):
22 |     kmers = []
23 |
24 |     if len(genome_seq) < kmer_size:
25 |         return kmers
26 |     else:
27 |         for i in range(len(genome_seq)-kmer_size+1):
28 |             kmer = genome_seq[i:(i+kmer_size)]
29 |             kmers.append(kmer)
30 |             rc_kmer = revcomp(kmer) # reverse complement; computed but not stored here
31 |         return kmers
32 |
33 | def build_kmer_db(genome_seq, kmer_size):
34 |     kmers = dict()
35 |
36 |     if len(genome_seq) < kmer_size:
37 |         return kmers
38 |     else:
39 |         for i in range(len(genome_seq)-kmer_size+1):
40 |             kmer = genome_seq[i:(i+kmer_size)]
41 |             if kmer not in kmers:
42 |                 kmers[kmer] = 1
43 |             else:
44 |                 kmers[kmer] = kmers[kmer] + 1
45 |
46 |     #for kmer, count in kmers.iteritems():
47 |     #    sys.stderr.write("{}\t{}\n".format(kmer, count))
48 |
49 |     return kmers
50 |
51 | """
52 | The module takes a kmer size argument and can build kmer databases of different sizes.
53 | It also takes a kmer_type argument that allows two approaches to searching kmers that cover target snps:
54 | 1) fetch all eligible kmers; fetch_all_snp_kmers()
55 | 2) fetch kmers whose target snp sits at the center; fetch_center_snp_kmers().
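For intuition about the k-mer fetching described above, here is a compact sketch of enumerating every window that covers a SNP and emitting ref/alt plus reverse-complement variants, in the spirit of fetch_all_snp_kmers() (toy genome and 4-mers; the real database uses 31-mers, coordinate maps and an MSA consensus):

```python
# Editor's sketch (illustrative, not this module's API): all k-mer
# windows covering one SNP, with ref/alt and reverse-complement forms.
def revcomp(seq):
    comp = {"A": "T", "T": "A", "G": "C", "C": "G"}
    return "".join(comp[c] for c in reversed(seq))

genome = "ACGTACGTACGT"
pos, ref, alt, k = 6, "G", "A", 4   # genome[6] == "G"

records = []
for start in range(max(0, pos - k + 1), min(len(genome) - k, pos) + 1):
    window = genome[start:start + k]
    var_pos = pos - start           # SNP offset inside this window
    kmer = window[:var_pos] + ref + window[var_pos + 1:]
    akmer = window[:var_pos] + alt + window[var_pos + 1:]
    records.append([pos, var_pos, kmer, akmer, revcomp(kmer), revcomp(akmer)])

for rec in records:
    print(rec)
```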
56 | 57 | Upadate: 07/01/18 58 | Using the following format for the sake of simplicity 59 | id/glob_pos: 11111 60 | allele_pos_on_kmer: 3 61 | kmer_seq(REF/+): ATGC 62 | kmer_seq(ALT/+): ATTC 63 | kmer_seq(REF/-): GCAT 64 | kmer_seq(ALT/-): GAAT 65 | """ 66 | def fetch_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, kmer_type, coords=None): 67 | if kmer_type == 'all': 68 | return fetch_all_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords) 69 | elif kmer_type == 'center': 70 | return fetch_center_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords) 71 | else: 72 | sys.exit("the specified kmer_type value was not recognized by the program: {}".format(kmer_type)) 73 | 74 | def fetch_all_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords=None): 75 | print("[searching] start to search {}-mers".format(kmer_size)) 76 | 77 | inds_map = None 78 | if coords is not None: 79 | inds_map = [None for i in range(len(genome_seq))] 80 | 81 | for i, coord in enumerate(coords): 82 | for j in range(int(coord[1]), int(coord[2])+1): 83 | inds_map[j] = i 84 | 85 | kmers = [] 86 | for ri, pos in enumerate(snp_pos): 87 | kmer_start = int(pos)-kmer_size+1 88 | kmer_end = int(pos)+kmer_size-1 89 | 90 | if inds_map is not None: 91 | if inds_map[int(pos)] is None: 92 | continue 93 | 94 | cur_coord = coords[inds_map[int(pos)]] 95 | coord_start, coord_end = int(cur_coord[1]), int(cur_coord[2]) 96 | kmer_start, kmer_end = max(coord_start, kmer_start), min(coord_end, kmer_end) 97 | 98 | if kmer_end - kmer_start + 1 >= kmer_size: 99 | subseq = genome_seq[kmer_start:(kmer_end+1)] 100 | 101 | for i in range(len(subseq)-kmer_size+1): 102 | kmer = subseq[i:(i+kmer_size)] 103 | 104 | var_pos = kmer_size-i-1 105 | 106 | kmer = kmer[:var_pos]+snp_alleles[ri][0]+kmer[var_pos+1:] 107 | akmer = kmer[:var_pos]+snp_alleles[ri][1]+kmer[var_pos+1:] 108 | 109 | rc_kmer = revcomp(kmer) 110 | rc_akmer = revcomp(akmer) 111 | 112 | kmers.append([pos, var_pos, kmer, akmer, rc_kmer, rc_akmer]) 113 | print(" a total of {} kmers was found\n".format(len(kmers))) 114 | return kmers 115 | 116 | def load_msa(msa_path): 117 | genome_msa = dict() 118 | 119 | with open(msa_path, 'r') as fh: 120 | for line in fh: 121 | if line[0] == '>': 122 | working_id = line.split(' ')[0][1:] 123 | elif line[0] == '=': 124 | pass 125 | else: 126 | if working_id not in genome_msa: 127 | genome_msa[working_id] = "" 128 | 129 | genome_msa[working_id] = genome_msa[working_id] + line.rstrip() 130 | 131 | genome_seqs = [genome_msa[key] for key in genome_msa.keys()] 132 | 133 | return genome_seqs 134 | 135 | 136 | def fetch_all_from_msa(genome_seqs, ref_seq, snp_pos, snp_alleles, kmer_size, coords=None): 137 | print("[searching] start to search {}-mers".format(kmer_size)) 138 | 139 | inds_map = None 140 | if coords is not None: 141 | inds_map = [None for i in range(len(ref_seq))] 142 | 143 | for i, coord in enumerate(coords): 144 | for j in range(int(coord[1]), int(coord[2])+1): 145 | inds_map[j] = i 146 | 147 | kmer_records = [] 148 | for ri, pos in enumerate(snp_pos): 149 | kmer_start = int(pos)-kmer_size+1 150 | kmer_end = int(pos)+kmer_size-1 151 | 152 | if inds_map is not None: 153 | if inds_map[int(pos)] is None: 154 | continue 155 | 156 | cur_coord = coords[inds_map[int(pos)]] 157 | coord_start, coord_end = int(cur_coord[1]), int(cur_coord[2]) 158 | kmer_start, kmer_end = max(coord_start, kmer_start), min(coord_end, kmer_end) 159 | 160 | if kmer_end - kmer_start + 1 >= kmer_size: 161 | subseqs = 
[genome_seq[kmer_start:(kmer_end+1)] for genome_seq in genome_seqs]
162 |
163 |             for i in range(len(subseqs[0])-kmer_size+1):
164 |                 raw_kmers = [subseq[i:(i+kmer_size)] for subseq in subseqs]
165 |
166 |                 kmers = []
167 |                 for rk in raw_kmers:
168 |                     if '-' not in rk and 'N' not in rk:
169 |                         kmers.append(rk)
170 |
171 |                 ukmers, counts = np.unique(kmers, return_counts=True)
172 |                 uk_inds = np.argsort(counts)[::-1]
173 |
174 |                 var_pos = kmer_size-i-1
175 |
176 |                 kmer = ""
177 |                 akmer = ""
178 |                 kflag = False
179 |                 akflag = False
180 |                 for ukmer in ukmers[uk_inds]:
181 |                     if kflag is False:
182 |                         if ukmer[var_pos] == snp_alleles[ri][0]:
183 |                             kmer = ukmer
184 |                             kflag = True
185 |
186 |                     if akflag is False:
187 |                         if ukmer[var_pos] == snp_alleles[ri][1]:
188 |                             akmer = ukmer
189 |                             akflag = True
190 |
191 |                     if kflag is True and akflag is True:
192 |                         break
193 |
194 |                 if len(kmer) != kmer_size or len(akmer) != kmer_size:
195 |                     continue
196 |
197 |                 rc_kmer = revcomp(kmer)
198 |                 rc_akmer = revcomp(akmer)
199 |
200 |                 kmer_records.append([pos, var_pos, kmer, akmer, rc_kmer, rc_akmer])
201 |
202 |     print(" a total of {} kmer records were found\n".format(len(kmer_records)))
203 |     return kmer_records
204 |
205 | def fetch_center_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords=None):
206 |     print("[searching] start to search {}-mers\n".format(kmer_size))
207 |
208 |     inds_map = None
209 |     if coords is not None:
210 |         inds_map = [None for i in range(len(genome_seq))]
211 |
212 |         for i, coord in enumerate(coords):
213 |             for j in range(int(coord[1]), int(coord[2])+1):
214 |                 inds_map[j] = i
215 |
216 |     is_even = (kmer_size % 2 == 0)
217 |
218 |     kmers = []
219 |     for ri, pos in enumerate(snp_pos):
220 |         kmer_start, kmer_end, var_pos = 0, 0, 0
221 |
222 |         if is_even:
223 |             var_pos = int(kmer_size/2)
224 |             kmer_start = int(pos)-int(kmer_size/2)+1
225 |             kmer_end = int(pos)+int(kmer_size/2)
226 |         else:
227 |             var_pos = int(kmer_size/2)+1
228 |             kmer_start = int(pos)-int(kmer_size/2)
229 |             kmer_end = int(pos)+int(kmer_size/2)
230 |
231 |         if inds_map is not None:
232 |             if inds_map[int(pos)] is None:
233 |                 continue
234 |
235 |             cur_coord = coords[inds_map[int(pos)]]
236 |
237 |             coord_start = int(cur_coord[1])
238 |             coord_end = int(cur_coord[2])
239 |
240 |             if kmer_start < coord_start or kmer_end > coord_end:
241 |                 continue
242 |
243 |         kmer = genome_seq[kmer_start:(kmer_end+1)]
244 |         kmer = kmer[:var_pos]+snp_alleles[ri][0]+kmer[var_pos+1:]
245 |
246 |         # substitute the alternate allele at the SNP position
247 |         akmer = kmer[:var_pos]+snp_alleles[ri][1]+kmer[var_pos+1:]
248 |
249 |         rc_kmer = revcomp(kmer)
250 |         rc_akmer = revcomp(akmer)
251 |
252 |         kmers.append([pos, var_pos, kmer, akmer, rc_kmer, rc_akmer])
253 |     print(" a total of {} kmers were found\n".format(len(kmers)))
254 |     return kmers
255 |
256 | def revcomp(seq):
257 |     """ Reverse complement sequence
258 |
259 |     Args:
260 |         seq: string from alphabet {A,T,C,G,N}
261 |
262 |     Returns:
263 |         reverse complement of seq
264 |     """
265 |     complement = {
266 |         'A':'T',
267 |         'T':'A',
268 |         'G':'C',
269 |         'C':'G',
270 |         'N':'N',
271 |         'R':'N',
272 |         'Y':'N',
273 |         'K':'N',
274 |         'M':'N',
275 |         'S':'N',
276 |         'W':'N',
277 |         'B':'N',
278 |         'D':'N',
279 |         'H':'N',
280 |         'V':'N'
281 |     }
282 |     return ''.join([complement[_] for _ in seq[::-1]])
283 |
284 | def calc_snp_coverage(kmers):
285 |     return len(set([kmer[0] for kmer in kmers]))
286 |
287 | def dump_tsv(kmers, output):
288 |     with open(output, 'w') as fh:
289 |         for kmer in kmers:
290 |             if len(kmer) == 6:
291 |                 fh.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(*kmer))
292 |             elif
len(kmer) == 9: 293 | fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(*kmer)) 294 | else: 295 | assert False 296 | 297 | # a mini function to load all coordinates in to memory 298 | def read_coords(fpath): 299 | print("[load] loading key coordinates on core-genome from {}".format(fpath)) 300 | 301 | coords = [] 302 | with open(fpath, "r") as fh: 303 | fh.readline() 304 | for line in fh: 305 | coords.append(line.rstrip('\n').split('\t')) 306 | 307 | print(" a total of {} divisions was found\n".format(str(len(coords)))) 308 | return coords 309 | 310 | def open_vcf_file_local(fpath): 311 | """ 312 | * ``Record.CHROM``; string 313 | * ``Record.POS``; int 314 | * ``Record.ID``; None 315 | * ``Record.REF``; string 316 | * ``Record.ALT``; list 317 | * ``Record.QUAL``; None 318 | * ``Record.FILTER``; list 319 | * ``Record.INFO``; dictionary 320 | 321 | additional attributes: 322 | * ``Record.FORMAT``; string 323 | * ``Record.samples``; list 324 | * ``Record.genotype``; object 325 | """ 326 | 327 | print("[load] loading core snps from {}".format(fpath)) 328 | 329 | snp_gb_pos = [] 330 | snp_alleles = [] 331 | with open(fpath, 'r') as fh: 332 | for l in fh: 333 | if l[0] == "#": 334 | continue 335 | else: 336 | values = l.rstrip().split('\t')[:5] 337 | 338 | chrom = values[0] 339 | pos_r = int(values[1]) 340 | gid = values[2] 341 | allele_ma = values[3] 342 | allele_mi = values[4] 343 | 344 | if len(allele_mi) > 1: 345 | continue 346 | snp_gb_pos.append(int(gid)) 347 | snp_alleles.append([allele_ma, allele_mi]) 348 | print(" a total of {} core snps was found\n".format(str(len(snp_gb_pos)))) 349 | 350 | return snp_gb_pos, snp_alleles 351 | 352 | 353 | def open_genome_seq(genome_path): 354 | print("[load] loading core-genome consensus sequence from {}".format(genome_path)) 355 | 356 | records = list(SeqIO.parse(genome_path, "fasta")) 357 | main_genome = "" 358 | for record in records: 359 | main_genome = main_genome + str(record.seq).upper() 360 | 361 | print(" the loaded core-genome has a consensus sequence of {} bases\n".format(str(len(main_genome)))) 362 | 363 | return main_genome 364 | 365 | def read_kmerset(kmer_path): 366 | print("[load] loading kmerset from {}".format(kmer_path)) 367 | kmerset = [] 368 | 369 | with open(kmer_path, "r") as fh: 370 | for line in fh: 371 | items = line.rstrip().split('\t') 372 | items[6] = int(items[6]) 373 | items[7] = int(items[7]) 374 | items[8] = int(items[8]) 375 | kmerset.append(items) 376 | 377 | print(" the loaded kmerset has {} kmer records\n".format(str(len(kmerset)))) 378 | return kmerset 379 | 380 | -------------------------------------------------------------------------------- /snps_io/concat_alleles.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | import operator 3 | from time import time 4 | 5 | def read_input(in_dir, subset_list=None): 6 | fpaths = [] 7 | fnames = [] 8 | 9 | subset_map = dict() 10 | 11 | for f in os.listdir(in_dir): 12 | subset_map[f] = 1 13 | 14 | if subset_list is not None: 15 | subset_map = dict() 16 | with open(subset_list, 'r') as fh: 17 | for ln in fh: 18 | items = ln.rstrip().split('\t') 19 | assert len(items) == 2 20 | fname = items[0].split('/')[-1] 21 | subset_map[fname] = 1 22 | 23 | for f in os.listdir(in_dir): 24 | if f in subset_map: 25 | fpath = in_dir.rstrip('/')+'/'+f 26 | 27 | if os.path.isfile(fpath): 28 | fstats = os.stat(fpath) 29 | if fstats.st_size >= 0: 30 | fpaths.append(fpath) 31 | fnames.append(f) 32 | else: 33 | 
sys.stderr.write("skip {}: empty file\n".format(fpath)) 34 | else: 35 | sys.stderr.write("skip {}: not exist\n".format(fpath)) 36 | 37 | else: 38 | sys.stderr.write("skip {}\n".format(f)) 39 | 40 | return fpaths, fnames 41 | 42 | def read_msa(msa_in): 43 | valid_chars = dict() 44 | valid_chars['A'] = 'A' 45 | valid_chars['a'] = 'A' 46 | valid_chars['C'] = 'C' 47 | valid_chars['c'] = 'C' 48 | valid_chars['G'] = 'G' 49 | valid_chars['g'] = 'G' 50 | valid_chars['T'] = 'T' 51 | valid_chars['t'] = 'T' 52 | valid_chars['-'] = '-' 53 | 54 | alns = dict() 55 | cur_seq = "" 56 | cur_sample = "" 57 | cur_contig = "" 58 | with open(msa_in, 'r') as fh: 59 | for line in fh: 60 | if line[0] == '>': 61 | items = line.rstrip().split(' ') 62 | if items[0] not in alns: 63 | alns[items[0]] = dict() 64 | if items[1] not in alns[items[0]]: 65 | alns[items[0]][items[1]] = "" 66 | cur_sample = items[0] 67 | cur_contig = items[1] 68 | elif line[0] == '=': 69 | pass 70 | else: 71 | elems = [] 72 | for char in line.rstrip().split(): 73 | if char not in valid_chars: 74 | elems.append(char) 75 | else: 76 | elems.append(valid_chars[char]) 77 | alns[cur_sample][cur_contig] = "".join(elems) 78 | 79 | aln_recs = [] 80 | for sample in alns.keys(): 81 | for contig in alns.keys(): 82 | aln_recs.append([sample, contig, alns[cur_sample][cur_contig]]) 83 | 84 | sorted_alns = sorted(aln_recs, key = lambda x: (x[1], x[0])) 85 | concat_alns = dict() 86 | for aln_rec in sorted_alns: 87 | if aln_rec[0] not in concat_alns: 88 | concat_alns[aln_rec[0]] = "" 89 | else: 90 | concat_alns[aln_rec[0]] += aln_rec[2] 91 | 92 | return concat_alns 93 | 94 | def read_aln(aln_in): 95 | valid_chars = dict() 96 | valid_chars['A'] = 'A' 97 | valid_chars['a'] = 'A' 98 | valid_chars['C'] = 'C' 99 | valid_chars['c'] = 'C' 100 | valid_chars['G'] = 'G' 101 | valid_chars['g'] = 'G' 102 | valid_chars['T'] = 'T' 103 | valid_chars['t'] = 'T' 104 | valid_chars['-'] = '-' 105 | 106 | alns = dict() 107 | cur_seq = "" 108 | cur_sample = "" 109 | with open(aln_in, 'r') as fh: 110 | for line in fh: 111 | if line[0] == '>': 112 | cur_sample = line.rstrip() 113 | if cur_sample not in alns: 114 | alns[cur_sample] = "" 115 | else: 116 | elems = [] 117 | for char in line.rstrip().split(): 118 | if char not in valid_chars: 119 | elems.append(char) 120 | else: 121 | elems.append(valid_chars[char]) 122 | alns[cur_sample] += "".join(elems) 123 | 124 | return alns 125 | 126 | def write_aln(alns, out_path, max_gap=0.2): 127 | with open(out_path, 'w') as fh: 128 | for aln_key in alns.keys(): 129 | n_gaps = 0 130 | total_len = len(alns[aln_key]) 131 | for base in alns[aln_key]: 132 | if base == '-': 133 | n_gaps += 1 134 | 135 | if n_gaps/total_len > max_gap: 136 | print("{}: skip {}".format(n_gaps/total_len, aln_key)) 137 | else: 138 | fh.write("{}\n{}\n".format(aln_key, alns[aln_key])) 139 | 140 | def read_gtp(input_path, min_depth): 141 | input_recs = dict() 142 | 143 | with open(input_path, 'r') as fh: 144 | for line in fh: 145 | items = line.rstrip().split("\t") 146 | 147 | contig_id = items[0] 148 | contig_pos = items[1] 149 | snp_key = contig_id + "__" + contig_pos 150 | 151 | cnt_allele_1 = int(items[5]) 152 | cnt_allele_2 = int(items[6]) 153 | if cnt_allele_1 + cnt_allele_2 < min_depth: 154 | continue 155 | 156 | allele = "" 157 | if cnt_allele_1 > cnt_allele_2: 158 | allele = items[3] 159 | else: 160 | allele = items[4] 161 | input_recs[snp_key] = allele 162 | 163 | return input_recs 164 | 165 | def union_inputs(inputs, names): 166 | first_union_in = dict() 167 | 
168 | for input_recs in inputs: 169 | for snp_key in input_recs: 170 | if snp_key not in first_union_in: 171 | first_union_in[snp_key] = 1 172 | print("first_union_in: {}".format(len(first_union_in.keys()))) 173 | 174 | union_in = dict() 175 | n_samples = len(names) 176 | for snp_key in first_union_in.keys(): 177 | n_prev = 0 178 | allele_col = dict() 179 | for input_recs in inputs: 180 | if snp_key in input_recs: 181 | n_prev += 1 182 | allele_col[input_recs[snp_key]] = 1 183 | #if n_prev / n_samples >= 10 and len(allele_col.keys()) > 1: 184 | if len(allele_col.keys()) > 1: 185 | union_in[snp_key] = 1 186 | 187 | print("union_in: {}".format(len(union_in.keys()))) 188 | all_keys = [[key.split('__')[0], int(key.split('__')[1])] for key in union_in.keys()] 189 | sorted_keys = sorted(all_keys, key = lambda x: (x[0], x[1])) 190 | #sorted_keys = sorted(all_keys, key = operator.itemgetter(0, 1)) 191 | 192 | allele_aln = dict() 193 | for i, input_recs in enumerate(inputs): 194 | alleles = [] 195 | for elem in sorted_keys: 196 | snp_key = elem[0] + "__" + str(elem[1]) 197 | if snp_key in input_recs: 198 | alleles.append(input_recs[snp_key]) 199 | else: 200 | alleles.append('-') 201 | allele_aln[names[i]] = alleles 202 | 203 | return allele_aln 204 | 205 | def concat_snps(allele_aln, allele_aln_fasta, max_gap, min_prev, min_maf, min_mac): 206 | with open(allele_aln_fasta, 'w') as fh: 207 | good_names = [] 208 | for name in allele_aln.keys(): 209 | n_gaps = 0 210 | total_len = len(allele_aln[name]) 211 | for base in allele_aln[name]: 212 | if base == '-': 213 | n_gaps += 1 214 | 215 | if n_gaps/total_len > max_gap: 216 | print("{}: skip {}".format(n_gaps/total_len, name)) 217 | else: 218 | good_names.append(name) 219 | 220 | print(good_names) 221 | comm_aln = dict() 222 | comm_inds = [] 223 | n_samples = len(good_names) 224 | 225 | for i, allele in enumerate(allele_aln[good_names[0]]): 226 | n_gaps = 0 227 | alleles = [] 228 | allele_track = dict() 229 | for name in good_names: 230 | alleles.append(allele_aln[name][i]) 231 | if allele_aln[name][i] == '-': 232 | n_gaps += 1 233 | else: 234 | if allele_aln[name][i] not in allele_track: 235 | allele_track[allele_aln[name][i]] = 1 236 | else: 237 | allele_track[allele_aln[name][i]] += 1 238 | 239 | if (1 - n_gaps/n_samples) < min_prev: 240 | print("low prevalence: {}: skip {}".format(1 - n_gaps/n_samples, i)) 241 | continue 242 | elif len(allele_track.keys()) <= 1: 243 | print("not a SNP site: skip {}".format(i)) 244 | continue 245 | else: 246 | sorted_allele_track = sorted(allele_track.items(), key=lambda item: item[1], reverse=True) 247 | major_count = sorted_allele_track[0][1] 248 | minor_count = sorted_allele_track[1][1] 249 | if minor_count/n_samples < min_maf: 250 | print("low min MAF: {}: skip {}".format(minor_count/n_samples, i)) 251 | continue 252 | elif minor_count < min_mac: 253 | print("low min MAC: {}: skip {}".format(minor_count, i)) 254 | continue 255 | else: 256 | comm_inds.append(i) 257 | 258 | print("number of good sites: {}".format(len(comm_inds))) 259 | print("number of good samples: {}".format(len(good_names))) 260 | 261 | for name in good_names: 262 | if name not in comm_aln: 263 | comm_aln[name] = [] 264 | for i in comm_inds: 265 | comm_aln[name].append(allele_aln[name][i]) 266 | fh.write(">{}\n{}\n".format(name, "".join(comm_aln[name]))) 267 | 268 | return allele_aln_fasta 269 | 270 | 271 | def run_command(cmd, env=None): 272 | import subprocess as sp 273 | if env: 274 | p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, 
stderr=sp.PIPE, env=env)
275 |     else:
276 |         p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
277 |     out, err = p.communicate()
278 |     if p.returncode != 0:
279 |         err_msg = "\nError: the following returned non-zero status: '%s':\n" % cmd
280 |         err_msg += "\n%s" % err
281 |         sys.exit(err_msg)
282 |     else:
283 |         return out, err
284 |
285 | def run_fasttree(snp_str_fasta, out_dir):
286 |     sys.stderr.write("[start] inferring max. likelihood tree\n")
287 |     sys.stderr.write("\tsnp string fasta path: {}\n".format(snp_str_fasta))
288 |
289 |     o_mat_path = out_dir + "/concat_allele.aln.mat"
290 |
291 |     command = "FastTreeMP -makematrix -nt -gtr < "
292 |     command += snp_str_fasta
293 |     command += " > "
294 |     command += o_mat_path
295 |
296 |     environ = os.environ.copy()
297 |     run_command(command, environ)
298 |     sys.stderr.write("\tfinishing up, distance matrix is written to {}\n".format(o_mat_path))
299 |
300 |     o_tre_path = out_dir + "/concat_allele.aln.tre"
301 |
302 |     command = "FastTreeMP -nt -gtr < "
303 |     command += snp_str_fasta
304 |     command += " > "
305 |     command += o_tre_path
306 |
307 |     environ = os.environ.copy()
308 |     run_command(command, environ)
309 |
310 |     sys.stderr.write("\tfinishing up, tree is written to {}\n".format(o_tre_path))
311 |     sys.stderr.write("[done] inferring max. likelihood tree\n")
312 |
313 | def concat_allele_tree(args):
314 |     in_dir = args['input_dir']
315 |     in_path = args['input_list']
316 |
317 |     out_dir = args['out_dir'].rstrip('/')
318 |     if not os.path.isdir(out_dir):
319 |         os.makedirs(out_dir)
320 |
321 |     min_sites_per_sample = args['min_sites_per_sample']
322 |     max_gap_ratio = args['max_gap_ratio']
323 |     min_site_prev = args['min_site_prev']
324 |     min_maf = args['min_maf']
325 |     min_mac = args['min_mac']
326 |
327 |     paths, names = read_input(in_dir, in_path)
328 |     if len(names) != len(set(names)):
329 |         sys.stderr.write("\n[error] names of input files are not unique.\n")
330 |         sys.exit()
331 |     input_recs = []
332 |     aln_fasta = out_dir + '/concat_allele.aln.fasta'
333 |     nonempty_names = []
334 |     for i, path in enumerate(paths):
335 |         if not os.path.exists(path):
336 |             print("Skip input: {} does not exist.".format(path))
337 |             continue
338 |         if args["min_depth"] is not None:
339 |             min_depth = args["min_depth"]
340 |             input_rec = read_gtp(path, min_depth)
341 |             if len(input_rec.keys()) < min_sites_per_sample:
342 |                 print("{}: skipped {}".format(len(input_rec.keys()), path))
343 |             else:
344 |                 input_recs.append(input_rec)
345 |                 nonempty_names.append(names[i])
346 |     allele_aln = union_inputs(input_recs, nonempty_names)
347 |     concat_snps(allele_aln, aln_fasta, max_gap_ratio, min_site_prev, min_maf, min_mac)
348 |     run_fasttree(aln_fasta, out_dir)
--------------------------------------------------------------------------------
/snps_io/align_assembly.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import sys  # needed for sys.exit() in id_core_genome()
3 | import numpy as np
4 | from snps_io import vcf_io
5 |
6 | class AlignAssembly:
7 |     def __init__(self, algns=[], max_sites=float('inf'), gpos_offset=0):
8 |         self.alignments = algns
9 |         self.sample_ids = []
10 |
11 |         self.chroms = np.array([])
12 |         # self.char_mat = np.array([])
13 |         self.global_pos = np.array([])
14 |         self.local_pos = np.array([])
15 |         self.ref_alleles = np.array([])
16 |         self.alt_alleles = np.array([])
17 |         self.third_alleles = np.array([])
18 |         self.forth_alleles = np.array([])
19 |
20 |         self.ref_prob_mat = np.array([])
21 |         self.alt_prob_mat = np.array([])
22 |
self.third_prob_mat = np.array([]) 23 | self.forth_prob_mat = np.array([]) 24 | 25 | self.freq_mat = np.array([]) 26 | self.sample_presence = np.array([]) 27 | self.prevalence = np.array([]) 28 | self.ref_freqs = np.array([]) 29 | self.alt_freqs = np.array([]) 30 | self.third_freqs = np.array([]) 31 | self.forth_freqs = np.array([]) 32 | 33 | self.gpos_offset = gpos_offset 34 | self.is_spliced = False 35 | 36 | if self._check_(): 37 | self.splice(max_sites) 38 | 39 | self.snps = [] 40 | self.coords = [] 41 | self.consensus_genome = "" 42 | 43 | def _check_(self): 44 | if len(self.alignments) > 2: 45 | if all([len(align.sample_ids) == len(self.alignments[0].sample_ids) for align in self.alignments]): 46 | return True 47 | else: 48 | return False 49 | else: 50 | return False 51 | 52 | def splice(self, max_sites=float('inf')): 53 | if (len(self.alignments) == 1): 54 | print("warning: cannot splice less than 2 alignments") 55 | 56 | first_align = self.alignments[0] 57 | self.sample_ids = first_align.sample_ids 58 | 59 | # self.char_mat = np.concatenate([algn.char_mat for algn in self.alignments], axis=1) 60 | self.chroms = np.array(np.repeat(first_align.chrom, first_align.ncols)) 61 | self.local_pos = np.array(first_align.local_pos) 62 | 63 | self.ref_alleles = np.array(first_align.ref_alleles) 64 | self.alt_alleles = np.array(first_align.alt_alleles) 65 | self.third_alleles = np.array(first_align.third_alleles) 66 | self.forth_alleles = np.array(first_align.forth_alleles) 67 | 68 | self.ref_prob_mat = np.array(first_align.ref_prob_mat) 69 | self.alt_prob_mat = np.array(first_align.alt_prob_mat) 70 | self.third_prob_mat = np.array(first_align.third_prob_mat) 71 | self.forth_prob_mat = np.array(first_align.forth_prob_mat) 72 | 73 | self.freq_mat = np.array(first_align.freq_mat) # attention here 74 | self.sample_presence = np.array(first_align.sample_presence) 75 | self.prevalence = np.array(first_align.prevalence) 76 | self.ref_freqs = np.array(first_align.ref_freqs) 77 | self.alt_freqs = np.array(first_align.alt_freqs) 78 | self.third_freqs = np.array(first_align.third_freqs) 79 | self.forth_freqs = np.array(first_align.forth_freqs) 80 | 81 | self.global_pos = np.arange(len(self.chroms)) + self.gpos_offset 82 | 83 | self.is_spliced = True 84 | if max_sites < len(self.chroms): 85 | self.cut_short(max_sites) 86 | elif len(self.alignments) > 1 and all([len(align.sample_ids) == len(self.alignments[0].sample_ids) for align in self.alignments]): 87 | first_align = self.alignments[0] 88 | self.sample_ids = first_align.sample_ids 89 | 90 | # self.char_mat = np.concatenate([algn.char_mat for algn in self.alignments], axis=1) 91 | self.chroms = np.concatenate([np.repeat(algn.chrom, algn.ncols) for algn in self.alignments], axis=0) 92 | self.local_pos = np.concatenate([algn.local_pos for algn in self.alignments], axis=0) 93 | 94 | self.ref_alleles = np.concatenate([algn.ref_alleles for algn in self.alignments], axis=0) 95 | self.alt_alleles = np.concatenate([algn.alt_alleles for algn in self.alignments], axis=0) 96 | self.third_alleles = np.concatenate([algn.third_alleles for algn in self.alignments], axis=0) 97 | self.forth_alleles = np.concatenate([algn.forth_alleles for algn in self.alignments], axis=0) 98 | 99 | 100 | self.ref_prob_mat = np.concatenate([algn.ref_prob_mat for algn in self.alignments], axis=1) 101 | self.alt_prob_mat = np.concatenate([algn.alt_prob_mat for algn in self.alignments], axis=1) 102 | self.third_prob_mat = np.concatenate([algn.third_prob_mat for algn in self.alignments], 
axis=1)
103 |             self.forth_prob_mat = np.concatenate([algn.forth_prob_mat for algn in self.alignments], axis=1)
104 |
105 |             self.freq_mat = np.concatenate([algn.freq_mat for algn in self.alignments], axis=1)
106 |
107 |             self.sample_presence = np.concatenate([algn.sample_presence for algn in self.alignments], axis=0)
108 |             self.prevalence = np.concatenate([algn.prevalence for algn in self.alignments], axis=0)
109 |             self.ref_freqs = np.concatenate([algn.ref_freqs for algn in self.alignments], axis=0)
110 |             self.alt_freqs = np.concatenate([algn.alt_freqs for algn in self.alignments], axis=0)
111 |             self.third_freqs = np.concatenate([algn.third_freqs for algn in self.alignments], axis=0)
112 |             self.forth_freqs = np.concatenate([algn.forth_freqs for algn in self.alignments], axis=0)
113 |
114 |             self.global_pos = np.arange(len(self.chroms)) + self.gpos_offset
115 |
116 |             self.is_spliced = True
117 |
118 |             if max_sites < len(self.chroms):
119 |                 self.cut_short(max_sites)
120 |         else:
121 |             print("error: no alignments, or alignments with unequal sample counts")
122 |
123 |         print("total number of sites: {}".format(len(self.chroms)))
124 |
125 |         return self.is_spliced
126 |
127 |     def cut_short(self, _max_sites):
128 |         if self.is_spliced:
129 |             if _max_sites < len(self.chroms):
130 |                 max_sites = int(_max_sites)
131 |
132 |                 self.chroms = self.chroms[:max_sites]
133 |                 self.global_pos = self.global_pos[:max_sites]
134 |                 self.local_pos = self.local_pos[:max_sites]
135 |                 self.ref_alleles = self.ref_alleles[:max_sites]
136 |                 self.alt_alleles = self.alt_alleles[:max_sites]
137 |                 self.freq_mat = self.freq_mat[:,:max_sites]
138 |                 self.third_alleles = self.third_alleles[:max_sites]; self.forth_alleles = self.forth_alleles[:max_sites]  # keep all per-site arrays the same length
139 |                 self.ref_prob_mat = self.ref_prob_mat[:,:max_sites]
140 |                 self.alt_prob_mat = self.alt_prob_mat[:,:max_sites]
141 |                 self.third_prob_mat = self.third_prob_mat[:,:max_sites]
142 |                 self.forth_prob_mat = self.forth_prob_mat[:,:max_sites]
143 |
144 |                 self.sample_presence = self.sample_presence[:max_sites]
145 |                 self.prevalence = self.prevalence[:max_sites]
146 |                 self.ref_freqs = self.ref_freqs[:max_sites]
147 |                 self.alt_freqs = self.alt_freqs[:max_sites]; self.third_freqs = self.third_freqs[:max_sites]; self.forth_freqs = self.forth_freqs[:max_sites]
148 |         else:
149 |             print("warning: cannot cut short unspliced alignments; no changes have been made!")
150 |
151 |     def id_core_genome(self, min_prev, min_alt_freq):
152 |         print("min. prevalence: {}".format(min_prev))
153 |         print("min. alt.
frequency: {}".format(min_alt_freq)) 154 | 155 | if self.is_spliced: 156 | prev_mask = (self.prevalence >= min_prev) 157 | snp_mask = (self.alt_freqs >= min_alt_freq) & (self.ref_alleles != b'N') & (self.ref_alleles != b'-') 158 | wildcard_mask = (self.ref_alleles != b'N') & (self.ref_alleles != b'-') 159 | 160 | # alt_freq_mask = ((1 - self.ref_freqs - self.alt_freqs) <= (min_alt_freq+0.000000001)) 161 | 162 | 163 | # fake_mask = np.logical_not(alt_freq_mask) 164 | # print alt_freq_mask[fake_mask] 165 | # print (1 - self.ref_freqs - self.alt_freqs)[fake_mask] 166 | # print self.ref_freqs[fake_mask] 167 | # print self.alt_freqs[fake_mask] 168 | # print self.third_freqs[fake_mask] 169 | # print self.forth_freqs[fake_mask] 170 | 171 | self.consensus_genome = self.id_consensus_genome() 172 | 173 | shift_chroms = np.append(self.chroms[1:], self.chroms[-1]) 174 | boundary_mask = np.logical_not((shift_chroms == self.chroms)) 175 | #goodness_mask = (prev_mask & alt_freq_mask & wildcard_mask) 176 | goodness_mask = (prev_mask & wildcard_mask) 177 | 178 | self.coords = self.id_coordinates(boundary_mask, goodness_mask) 179 | 180 | print("masked by prev_mask: {}".format(np.sum(prev_mask))) 181 | print("masked by snp_mask: {}".format(np.sum(snp_mask))) 182 | # print "masked by alt_freq_mask: {}".format(np.sum(alt_freq_mask)) 183 | print("masked by wildcard_mask: {}".format(np.sum(wildcard_mask))) 184 | 185 | calling_mask = goodness_mask & snp_mask 186 | self.snps = self.id_snps(calling_mask) 187 | else: 188 | sys.exit("premature call of id_core_genome, the multiple alignments were sliced yet.") 189 | 190 | def id_consensus_genome(self): 191 | if self.is_spliced: 192 | if len(self.ref_alleles) > 0: 193 | return b''.join([ref_allele for ref_allele in self.ref_alleles]) 194 | else: 195 | return b'' 196 | else: 197 | return b'' 198 | 199 | def id_snps(self, calling_mask): 200 | if self.is_spliced: 201 | snps = [] 202 | 203 | snp_chroms = self.chroms[calling_mask] 204 | snp_gb_pos = self.global_pos[calling_mask] 205 | snp_lc_pos = self.local_pos[calling_mask] 206 | snp_refs = self.ref_alleles[calling_mask] 207 | snp_alts = self.alt_alleles[calling_mask] 208 | snp_third = self.third_alleles[calling_mask] 209 | snp_forth = self.forth_alleles[calling_mask] 210 | 211 | snp_ref_prob_mat = self.ref_prob_mat[:,calling_mask] 212 | snp_alt_prob_mat = self.alt_prob_mat[:,calling_mask] 213 | snp_third_prob_mat = self.third_prob_mat[:,calling_mask] 214 | snp_forth_prob_mat = self.forth_prob_mat[:,calling_mask] 215 | 216 | snp_freqs = self.freq_mat[:,calling_mask] 217 | snp_presence = self.sample_presence[calling_mask] 218 | snp_prevs = self.prevalence[calling_mask] 219 | snp_ref_freqs = self.ref_freqs[calling_mask] 220 | snp_alt_freqs = self.alt_freqs[calling_mask] 221 | snp_third_freqs = self.third_freqs[calling_mask] 222 | snp_forth_freqs = self.forth_freqs[calling_mask] 223 | 224 | for i, chrom in enumerate(snp_chroms): 225 | var_id = str(snp_gb_pos[i]) 226 | 227 | freq_row = snp_freqs[:,i] 228 | freq_row[snp_freqs[:,i] == None] = -1 229 | 230 | snp_ref_prob_row = snp_ref_prob_mat[:,i] 231 | snp_ref_prob_row[snp_ref_prob_mat[:,i] == None] = -1 232 | 233 | snp_alt_prob_row = snp_alt_prob_mat[:,i] 234 | snp_alt_prob_row[snp_alt_prob_mat[:,i] == None] = -1 235 | 236 | snp_third_prob_row = snp_third_prob_mat[:,i] 237 | snp_third_prob_row[snp_third_prob_mat[:,i] == None] = -1 238 | 239 | snp_forth_prob_row = snp_forth_prob_mat[:,i] 240 | snp_forth_prob_row[snp_forth_prob_mat[:,i] == None] = -1 241 | 242 | allele_mask = 
(np.array([snp_ref_prob_row.sum(), snp_alt_prob_row.sum(), snp_third_prob_row.sum(), snp_forth_prob_row.sum()]) > 0) 243 | 244 | alleles = np.array([snp_alts[i], snp_third[i], snp_forth[i]]) 245 | alleles = alleles[allele_mask[1:]] 246 | 247 | if len(alleles) == 0: 248 | avail_alleles = b'.' 249 | else: 250 | avail_alleles = b','.join(alleles) 251 | 252 | snp = self._make_snp_( 253 | chrom, var_id, snp_lc_pos[i], 254 | snp_refs[i], snp_alts[i], snp_third[i], snp_forth[i], avail_alleles, 255 | len(self.sample_ids), snp_presence[i], round(snp_alt_freqs[i], 3), 256 | self.sample_ids, snp_ref_prob_row, snp_alt_prob_row, snp_third_prob_row, snp_forth_prob_row 257 | ) 258 | snps.append(snp) 259 | 260 | self.snps = snps 261 | return snps 262 | else: 263 | return [] 264 | 265 | def id_coordinates(self, boundary_mask, goodness_mask): 266 | if self.is_spliced: 267 | end_pos = np.array([]) 268 | start_pos = np.array([]) 269 | 270 | if len(self.alignments) > 1: 271 | end_pos = self.global_pos[boundary_mask] 272 | end_pos = np.append(end_pos, self.global_pos[-1]) 273 | 274 | start_pos = np.array([self.global_pos[0]]) 275 | shift_ends = end_pos[:-1] + 1 276 | start_pos = np.concatenate((start_pos, shift_ends)) 277 | else: 278 | end_pos = np.array([self.global_pos[-1]]) 279 | start_pos = np.array([self.global_pos[0]]) 280 | 281 | bad_pos = self.global_pos[np.logical_not(goodness_mask)] 282 | 283 | rshift_bad_pos = bad_pos + 1 284 | lshift_bad_pos = bad_pos - 1 285 | 286 | start_pos = np.concatenate((start_pos, rshift_bad_pos)) 287 | end_pos = np.concatenate((end_pos, lshift_bad_pos)) 288 | 289 | start_pos = np.sort(start_pos) 290 | end_pos = np.sort(end_pos) 291 | 292 | good_region_mask = (start_pos <= end_pos) 293 | start_pos = start_pos[good_region_mask] 294 | end_pos = end_pos[good_region_mask] 295 | 296 | end_pos = np.sort(end_pos) 297 | 298 | coords = [] 299 | for i, sp in enumerate(start_pos): 300 | coords.append({'chrom':self.chroms[sp-self.gpos_offset], 'start':sp, 'end':end_pos[i]}) 301 | 302 | self.coords = coords 303 | return coords 304 | else: 305 | return [] 306 | 307 | def _make_snp_(self, chrom, var_id, pos, ref, alt, third, forth, avail_alleles, NS, DP, AF, samp_ids, gp1, gp2, gp3, gp4): 308 | """ Format SNP for VCF """ 309 | info = {} 310 | info['NS'] = NS 311 | info['DP'] = DP 312 | info['AF'] = AF 313 | 314 | dat_fmt = {} 315 | dat_fmt['GP1'] = gp1 316 | dat_fmt['GP2'] = gp2 317 | dat_fmt['GP3'] = gp3 318 | dat_fmt['GP4'] = gp4 319 | 320 | snp = vcf_io.SNP(chrom, var_id, pos, ref, alt, third, forth, avail_alleles, info, dat_fmt, samp_ids) 321 | 322 | return snp 323 | 324 | def call_snps(aligns, max_sites, min_prev, snp_freq): 325 | """ 326 | Loop over each genomic site in each contig. 327 | For each site, fetch per-sample info from pileup files. 328 | Initialize GenomicSite object 329 | Determine site prevalence and allele frequency. 330 | Keep track of core-genome coordinates and SNPs in those regions. 
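A hypothetical usage sketch for the entry points of this module (the `aligns` list is assumed to come from one of the align_io readers; parameter values are illustrative):

```python
# Editor's sketch (hypothetical usage, not from this repository's docs).
aa = call_snps(aligns, max_sites=float("inf"), min_prev=0.9, snp_freq=0.01)

print(len(aa.coords))  # core-genome regions identified
print(len(aa.snps))    # SNPs called within those regions
```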
331 |
332 |     Args:
333 |         max_sites: int; max number of sites to process
334 |         min_prev: float; minimum prevalence for calling core sites
335 |         snp_freq: float; minimum minor allele frequency for snp calling
336 |     """
337 |
338 |     aa = AlignAssembly(aligns, max_sites)
339 |
340 |     if not aa.is_spliced:
341 |         aa.splice()
342 |
343 |     if max_sites < len(aa.chroms):
344 |         aa.cut_short(max_sites)
345 |
346 |     aa.id_core_genome(min_prev, snp_freq)
347 |
348 |     return aa
349 |
350 | def call_snps_iter(align_iterator, max_sites, min_prev, snp_freq):
351 |     """
352 |     Loop over each genomic site in each contig.
353 |     For each site, fetch per-sample info from pileup files.
354 |     Initialize GenomicSite object
355 |     Determine site prevalence and allele frequency.
356 |     Keep track of core-genome coordinates and SNPs in those regions.
357 |
358 |     Args:
359 |         max_sites: int; max number of sites to process
360 |         min_prev: float; minimum prevalence for calling core sites
361 |         snp_freq: float; minimum minor allele frequency for snp calling
362 |     """
363 |
364 |     block_size = 100*1000
365 |     counter = 0
366 |     gb_pos = 0
367 |
368 |     aligns = []
369 |     for align in align_iterator:
370 |         aligns.append(align)
371 |         counter = counter + align.ncols
372 |
373 |         if counter > block_size:
374 |             aa = AlignAssembly(aligns, max_sites, gb_pos)
375 |
376 |             if not aa.is_spliced:
377 |                 aa.splice()
378 |
379 |             if max_sites < len(aa.chroms):
380 |                 aa.cut_short(max_sites)
381 |
382 |             aa.id_core_genome(min_prev, snp_freq)
383 |
384 |             for ali in aligns:
385 |                 gb_pos = gb_pos + ali.ncols
386 |
387 |             aligns = []
388 |             counter = 0
389 |
390 |             yield aa
391 |
392 |
393 |     if len(aligns) > 0:
394 |         aa = AlignAssembly(aligns, max_sites, gb_pos)
395 |
396 |         if not aa.is_spliced:
397 |             aa.splice()
398 |
399 |         if max_sites < len(aa.chroms):
400 |             aa.cut_short(max_sites)
401 |
402 |         aa.id_core_genome(min_prev, snp_freq)
403 |
404 |         yield aa
405 |
--------------------------------------------------------------------------------
/src/callm_db_build.cpp:
--------------------------------------------------------------------------------
1 | #if __linux__
2 | #include
3 | #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
4 | #define _MAP_POPULATE_AVAILABLE
5 | #endif
6 | #endif
7 |
8 | #ifdef _MAP_POPULATE_AVAILABLE
9 | #define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
10 | #else
11 | #define MMAP_FLAGS MAP_PRIVATE
12 | #endif
13 |
14 | #include
15 | #include
16 | #include
17 |
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 |
26 | #include
27 | #include
28 | #include
29 |
30 |
31 | using namespace std;
32 |
33 |
34 | // this program loads k-mer/SNP-ID records, sorts them, purges conflicting k-mers, and writes a binary k-mer database
35 |
36 | // usage:
37 | // g++ -O3 --std=c++11 -o callm_db_build callm_db_build.cpp
38 | // ./callm_db_build fpath [fpath ...]
39 | //
40 | // input is tab-separated kmer/snp-id text (.tsv) or a previously built binary database (.bin); other file types are rejected.
41 |
42 | // global variable declaration starts here
43 | constexpr auto k = 31;
44 |
45 | // set operation mode
46 | // valid values: 0, 1, 2
47 | // 0 is set union operation; 1 is set intersection operation; 2 is set difference([set1-set2]);
48 | constexpr auto s_mod = 0;
49 |
50 | // parameters for file read; from the source of GNU coreutils wc
51 | constexpr auto step_size = 256 * 1024 * 1024;
52 | constexpr auto buffer_size = 256 * 1024 * 1024;
53 |
54 | // output file path
55 | constexpr auto out_path = "/dev/stdout";
56 | // get time elapsed since when it all began in milliseconds.
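The helpers below pack each 31-mer into a 64-bit integer at two bits per base (A=0, C=1, G=2, T=3). For reference, the same scheme in Python, kept in Python for consistency with the other sketches in this document (illustrative only):

```python
# Editor's sketch (illustrative): 2-bit packing of a DNA k-mer, mirroring
# the seq_encode()/seq_decode() helpers in this file.
CODE = {"A": 0, "C": 1, "G": 2, "T": 3}

def seq_encode(seq):
    code = 0
    for base in seq:
        code = (code << 2) | CODE[base]   # first base ends up most significant
    return code

def seq_decode(code, length):
    bases = "ACGT"
    return "".join(bases[(code >> (2 * (length - i - 1))) & 0b11]
                   for i in range(length))

kmer = "ACGTACGTACGTACGTACGTACGTACGTACG"  # a 31-mer fits in 62 bits
packed = seq_encode(kmer)
assert seq_decode(packed, len(kmer)) == kmer
```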
57 | long chrono_time() { 58 | using namespace chrono; 59 | return duration_cast(system_clock::now().time_since_epoch()).count(); 60 | } 61 | 62 | // number of bits per single nucleotide base 63 | constexpr int bpb = 2; 64 | 65 | size_t get_fsize(const char* filename) { 66 | struct stat st; 67 | stat(filename, &st); 68 | return st.st_size; 69 | } 70 | 71 | 72 | char* get_ftype(const char* filename) { 73 | int fn_len = strlen(filename); 74 | char *ftype = (char *)malloc(5); 75 | 76 | for(int i = 0; i < 4; ++i) { 77 | ftype[i] = filename[fn_len - 4 + i]; 78 | } 79 | 80 | ftype[4] = '\0'; 81 | 82 | return ftype; 83 | } 84 | 85 | 86 | template 87 | int_type bit_encode(const char c) { 88 | switch (c) { 89 | case 'A': return 0; 90 | case 'C': return 1; 91 | case 'G': return 2; 92 | case 'T': return 3; 93 | } 94 | 95 | assert(false); 96 | } 97 | 98 | 99 | template 100 | char bit_decode(const int_type bit_code) { 101 | switch (bit_code) { 102 | case 0: return 'A'; 103 | case 1: return 'C'; 104 | case 2: return 'G'; 105 | case 3: return 'T'; 106 | } 107 | assert(false); 108 | } 109 | 110 | template 111 | void make_code_dict(int_type* code_dict) { 112 | code_dict['A'] = bit_encode('A'); 113 | code_dict['C'] = bit_encode('C'); 114 | code_dict['G'] = bit_encode('G'); 115 | code_dict['T'] = bit_encode('T'); 116 | } 117 | 118 | template 119 | int_type seq_encode(const char* buf, int len, const int_type* code_dict, const int_type b_mask) { 120 | int_type seq_code = 0; 121 | for (int i=0; i < len; ++i) { 122 | const int_type b_code = code_dict[buf[i]]; 123 | seq_code |= ((b_code & b_mask) << (bpb * (len - i - 1))); 124 | } 125 | return seq_code; 126 | } 127 | 128 | template 129 | void seq_decode(char* buf, const int len, const int_type seq_code, int_type* code_dict, const int_type b_mask) { 130 | for (int i=0; i < len-1; ++i) { 131 | const int_type b_code = (seq_code >> (bpb * (len - i - 2))) & b_mask; 132 | buf[i] = bit_decode(b_code); 133 | } 134 | 135 | buf[len-1] = '\0'; 136 | } 137 | 138 | 139 | template 140 | void bit_load(const char* k_path, vector& buffer, vector>& k_vec, const int_type* code_dict, const int_type b_mask) { 141 | auto t_start = chrono_time(); 142 | 143 | char* window = buffer.data(); 144 | 145 | uintmax_t n_lines = 0; 146 | 147 | int fd; 148 | fd = open(k_path, O_RDONLY); 149 | 150 | int cur_pos = 0; 151 | int snp_pos = 0; 152 | 153 | char seq_buf[k]; 154 | char snp_id[16]; 155 | 156 | //auto fh = fstream(out_path, ios::out | ios::binary); 157 | 158 | bool id_switch = false; 159 | bool has_wildcard = false; 160 | 161 | while (true) { 162 | 163 | const ssize_t bytes_read = read(fd, window, step_size); 164 | 165 | if (bytes_read == 0) 166 | break; 167 | 168 | if (bytes_read == (ssize_t) -1) { 169 | cerr << "unknown fetal error when reading " << k_path << endl; 170 | exit(EXIT_FAILURE); 171 | } 172 | 173 | for (int i = 0; i < bytes_read; ++i) { 174 | char c = toupper(window[i]); 175 | if (c == '\n') { 176 | ++n_lines; 177 | 178 | if (has_wildcard) { 179 | has_wildcard = false; 180 | continue; 181 | } 182 | 183 | auto code = seq_encode(seq_buf, k, code_dict, b_mask); 184 | 185 | snp_id[snp_pos] = '\0'; 186 | int_type id_int = stoull(snp_id); 187 | //int_type id_int = 1; 188 | 189 | k_vec.push_back(tuple(code, id_int)); 190 | 191 | cur_pos = 0; 192 | snp_pos = 0; 193 | 194 | id_switch = false; 195 | } else if (c == '\t'){ 196 | id_switch = true; 197 | } else { 198 | if (c == 'N') { 199 | has_wildcard = true; 200 | } 201 | 202 | if (id_switch) { 203 | snp_id[snp_pos++] = c; 204 | } else 
{ 205 | seq_buf[cur_pos++] = c; 206 | } 207 | } 208 | } 209 | 210 | //fh.write(&kmers[0], kmers.size()); 211 | 212 | // cerr << n_lines << " lines were scanned after " << (chrono_time() - t_start) / 1000 << " seconds" << endl; 213 | } 214 | 215 | auto timeit = chrono_time(); 216 | } 217 | 218 | 219 | template 220 | void bit_load(vector& buffer, vector>& k_vec, const int_type* code_dict, const int_type b_mask) { 221 | auto t_start = chrono_time(); 222 | 223 | char* window = buffer.data(); 224 | 225 | uintmax_t n_lines = 0; 226 | 227 | int cur_pos = 0; 228 | int snp_pos = 0; 229 | 230 | char seq_buf[k]; 231 | char snp_id[16]; 232 | 233 | //auto fh = fstream(out_path, ios::out | ios::binary); 234 | 235 | bool id_switch = false; 236 | bool has_wildcard = false; 237 | 238 | while (true) { 239 | 240 | const ssize_t bytes_read = read(STDIN_FILENO, window, step_size); 241 | 242 | if (bytes_read == 0) 243 | break; 244 | 245 | if (bytes_read == (ssize_t) -1) { 246 | cerr << "unknown fetal error when reading from stdin" << endl; 247 | exit(EXIT_FAILURE); 248 | } 249 | 250 | for (int i = 0; i < bytes_read; ++i) { 251 | char c = toupper(window[i]); 252 | if (c == '\n') { 253 | ++n_lines; 254 | 255 | if (has_wildcard) { 256 | has_wildcard = false; 257 | continue; 258 | } 259 | 260 | auto code = seq_encode(seq_buf, k, code_dict, b_mask); 261 | 262 | snp_id[snp_pos] = '\0'; 263 | int_type id_int = stoull(snp_id); 264 | //int_type id_int = 1; 265 | 266 | k_vec.push_back(tuple(code, id_int)); 267 | 268 | cur_pos = 0; 269 | snp_pos = 0; 270 | id_switch = false; 271 | } else if (c == '\t'){ 272 | id_switch = true; 273 | } else { 274 | if (c == 'N') { 275 | has_wildcard = true; 276 | } 277 | 278 | if (id_switch) { 279 | snp_id[snp_pos++] = c; 280 | } else { 281 | seq_buf[cur_pos++] = c; 282 | } 283 | } 284 | } 285 | 286 | //fh.write(&kmers[0], kmers.size()); 287 | 288 | // cerr << n_lines << " lines were scanned after " << (chrono_time() - t_start) / 1000 << " seconds" << endl; 289 | } 290 | 291 | auto timeit = chrono_time(); 292 | } 293 | 294 | 295 | template 296 | void binary_load(const char* k_path, vector>& k_vec) { 297 | size_t filesize = get_fsize(k_path); 298 | //Open file 299 | int fd = open(k_path, O_RDONLY, 0); 300 | assert(fd != -1); 301 | //Execute mmap 302 | //uint64_t* mmappedData = (uint64_t *) mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0); 303 | int_type* mmappedData = (int_type *) mmap(NULL, filesize, PROT_READ, MMAP_FLAGS, fd, 0); 304 | assert(mmappedData != MAP_FAILED); 305 | //Write the mmapped data to stdout (= FD #1) 306 | 307 | // write(1, mmappedData, filesize); 308 | 309 | // char seq_buf[k+1]; 310 | 311 | auto l_start = chrono_time(); 312 | 313 | for (uint64_t i = 0; i < filesize/8; i=i+2) { 314 | // seq_decode(seq_buf, k, mmappedData[i], b_mask); 315 | 316 | auto kmer_int = mmappedData[i]; 317 | auto snp = mmappedData[i+1]; 318 | 319 | string snp_str = to_string(snp); 320 | 321 | if (snp_str[6] == '2'){ 322 | snp_str[6] = '0'; 323 | } else if (snp_str[6] == '3') { 324 | snp_str[6] = '1'; 325 | } 326 | 327 | auto k_pair = make_tuple(kmer_int, stoull(snp_str)); 328 | 329 | k_vec.push_back(k_pair); 330 | } 331 | 332 | //Cleanup 333 | int rc = munmap(mmappedData, filesize); 334 | assert(rc == 0); 335 | close(fd); 336 | } 337 | 338 | 339 | template 340 | bool cmp_tuple(const tuple &a, const tuple &b){ 341 | return get<0>(a) < get<0>(b); 342 | } 343 | 344 | template 345 | void multi_btc64() { 346 | int_type lsb = 1; 347 | int_type b_mask = (lsb << bpb) - lsb; 348 | 349 
349 |     int_type code_dict[1 << (sizeof(char) * 8)];
350 |     make_code_dict(code_dict);
351 | 
352 |     vector<tuple<int_type, int_type>> kdb;
353 |     vector<char> buffer(buffer_size);
354 | 
355 |     bit_load(buffer, kdb, code_dict, b_mask);
356 | 
357 |     auto timeit = chrono_time();
358 |     sort(kdb.begin(), kdb.end(), cmp_tuple<int_type>);
359 |     // typename vector<tuple<int_type, int_type>>::iterator ip = unique(kdb.begin(), kdb.end());
360 |     // kdb.resize(std::distance(kdb.begin(), ip));
361 |     cerr << "Done!\n" << "It takes " << (chrono_time() - timeit) / 1000 << " secs" << endl;
362 |     cerr << "the kmer list has " << kdb.size() << " kmers" << endl;
363 | 
364 |     // char seq_buf[k+1];
365 | 
366 |     vector<int_type> o_buff;
367 | 
368 |     ofstream fh(out_path, ofstream::out | ofstream::binary);
369 | 
370 |     for (auto it = kdb.begin(); it != kdb.end(); ++it) {
371 |         // seq_decode(seq_buf, k+1, *it, code_dict, b_mask);
372 |         // fh << seq_buf << "\n";
373 |         // fh << *it << "\n";
374 | 
375 |         // cerr << get<0>(*it) << '\t' << get<1>(*it) << '\n';
376 |         o_buff.push_back(get<0>(*it));
377 |         o_buff.push_back(get<1>(*it));
378 |     }
379 | 
380 |     fh.write((char*)&o_buff[0], o_buff.size() * sizeof(int_type));
381 | 
382 |     fh.close();
383 | }
384 | 
385 | template <class int_type>
386 | void multi_btc64(int n_path, char** kpaths) {
387 |     int_type lsb = 1;
388 |     int_type b_mask = (lsb << bpb) - lsb;
389 | 
390 |     int_type code_dict[1 << (sizeof(char) * 8)];
391 |     make_code_dict(code_dict);
392 | 
393 |     vector<tuple<int_type, int_type>> kdb;
394 |     vector<char> buffer(buffer_size);
395 | 
396 |     for (int i = 1; i < n_path; ++i) {
397 |         cerr << kpaths[i] << endl;
398 | 
399 |         char* kp_type = get_ftype(kpaths[i]);
400 | 
401 |         if (strcmp(kp_type, ".tsv") == 0) {
402 |             bit_load(kpaths[i], buffer, kdb, code_dict, b_mask);
403 |         } else if (strcmp(kp_type, ".bin") == 0) {
404 |             binary_load(kpaths[i], kdb);
405 |         } else {
406 |             assert(false);
407 |         }
408 |     }
409 | 
410 |     auto timeit = chrono_time();
411 | 
412 |     sort(kdb.begin(), kdb.end(), cmp_tuple<int_type>);
413 |     // typename vector<tuple<int_type, int_type>>::iterator ip = unique(kdb.begin(), kdb.end());
414 |     // kdb.resize(std::distance(kdb.begin(), ip));
415 |     cerr << "Sorting done!\n" << "It takes " << (chrono_time() - timeit) / 1000 << " secs" << endl;
" << "It takes " << (chrono_time() - timeit) / 1000 << " secs" << endl; 416 | cerr << "the kmer list has " << kdb.size() << " kmers" << endl; 417 | 418 | char seq_buf[k+1]; 419 | // ofstream fh(out_path, ofstream::out | ofstream::binary); 420 | vector o_buff; 421 | 422 | // move onto checkout when checkout_flag is true 423 | bool checkout_flag = true; 424 | vector> auto_queue; 425 | 426 | cerr << "start to check conflicts" << endl; 427 | for (auto it = kdb.begin(); it+1 != kdb.end(); ++it) { 428 | // seq_decode(seq_buf, k+1, get<0>(*it), code_dict, b_mask); 429 | // cerr << seq_buf << '\t' << get<1>(*it) << '\n'; 430 | 431 | if (get<0>(*it) == get<0>(*(it+1))) { 432 | auto spe1 = stoi(to_string(get<1>(*it)).substr(0, 6)); 433 | auto spe2 = stoi(to_string(get<1>(*(it+1))).substr(0, 6)); 434 | 435 | if (spe1 != spe2) { 436 | checkout_flag = false; 437 | } 438 | 439 | auto_queue.push_back(*it); 440 | 441 | continue; 442 | } 443 | 444 | // check out when code(i) != code(i+1) 445 | if (!checkout_flag) { 446 | for(auto iq = auto_queue.begin(); iq != auto_queue.end(); ++iq){ 447 | //cerr << get<0>(*iq) << " - " << get<1>(*iq) << '\n'; 448 | } 449 | auto_queue.clear(); 450 | checkout_flag = true; 451 | } else { 452 | if (auto_queue.size() > 0){ 453 | for(auto iq = auto_queue.begin(); iq != auto_queue.end(); ++iq){ 454 | o_buff.push_back(get<0>(*iq)); 455 | o_buff.push_back(get<1>(*iq)); 456 | } 457 | 458 | auto_queue.clear(); 459 | } 460 | o_buff.push_back(get<0>(*it)); 461 | o_buff.push_back(get<1>(*it)); 462 | } 463 | } 464 | 465 | auto end_ele = kdb.back(); 466 | if (checkout_flag) { 467 | if (auto_queue.size() > 0){ 468 | for(auto iq = auto_queue.begin(); iq != auto_queue.end(); ++iq){ 469 | o_buff.push_back(get<0>(*iq)); 470 | o_buff.push_back(get<1>(*iq)); 471 | } 472 | 473 | auto_queue.clear(); 474 | } 475 | o_buff.push_back(get<0>(end_ele)); 476 | o_buff.push_back(get<1>(end_ele)); 477 | } 478 | 479 | cerr << "the kmer list has " << o_buff.size()/2<< " kmers after purging conflicts" << endl; 480 | 481 | vector>().swap(kdb); 482 | 483 | unordered_map> snp_indx; 484 | 485 | for (auto it = o_buff.begin(); it != o_buff.end(); it=it+2) { 486 | auto snp = stoull(to_string(*(it+1)).substr(0, 6) + to_string(*(it+1)).substr(7)); 487 | auto snp_type = stoi(to_string(*(it+1)).substr(6, 1)); 488 | 489 | if (snp_indx.find(snp) == snp_indx.end()) { 490 | auto type_pair = make_tuple(0, 0); 491 | snp_indx.insert({snp, type_pair}); 492 | } 493 | 494 | assert(snp_type == 0 || snp_type == 1); 495 | 496 | if (snp_type == 0) { 497 | get<0>(snp_indx[snp]) = 1; 498 | } else { 499 | get<1>(snp_indx[snp]) = 1; 500 | } 501 | } 502 | 503 | vector v_buff; 504 | 505 | for (auto it = o_buff.begin(); it != o_buff.end(); it=it+2) { 506 | auto snp = stoull(to_string(*(it+1)).substr(0, 6) + to_string(*(it+1)).substr(7)); 507 | 508 | assert(snp_indx.find(snp) != snp_indx.end()); 509 | 510 | if (get<0>(snp_indx[snp]) + get<1>(snp_indx[snp]) == 2) { 511 | v_buff.push_back(*it); 512 | v_buff.push_back(*(it+1)); 513 | } 514 | } 515 | 516 | cerr << "the kmer list has " << v_buff.size()/2<< " kmers after purging conflicts" << endl; 517 | 518 | ofstream fh(out_path, ofstream::binary); 519 | 520 | fh.write((char*)&v_buff[0], v_buff.size() * sizeof(int_type)); 521 | fh.close(); 522 | } 523 | 524 | void display_usage(char *fname){ 525 | cout << "usage: " << fname << " fpath [fpath ...]\n"; 526 | } 527 | 528 | int main(int argc, char** argv){ 529 | if (argc == 2 && string(argv[1]) == "-h") { 530 | display_usage(argv[0]); 531 | } else 
532 |         multi_btc64<uint64_t>(argc, argv);
533 |     } else if (argc == 1) {
534 |         multi_btc64<uint64_t>();
535 |     } else {
536 |         cerr << argv[0] << " reads from stdin or takes at least one argument!" << endl;
537 |         display_usage(argv[0]);
538 |         exit(EXIT_FAILURE);
539 |     }
540 | 
541 |     return 0;
542 | }
543 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Catalogue
2 | 
3 | * [Maast](https://github.com/zjshi/Maast#maast)
4 | * [What Maast does](https://github.com/zjshi/Maast#what-maast-does)
5 | * [How to cite](https://github.com/zjshi/Maast#how-to-cite)
6 | * [Installation](https://github.com/zjshi/Maast#installation)
7 | * [Conda Installation](https://github.com/zjshi/Maast#conda-installation)
8 | * [How to use](https://github.com/zjshi/Maast#how-to-use)
9 | * * [Type SNPs from a set of whole genome assemblies and sequencing reads from beginning to end in one single command line](https://github.com/zjshi/Maast#type-snps-from-a-set-of-whole-genome-assemblies-and-sequencing-reads-from-beginning-to-end-in-one-single-command-line)
10 | * * [Genotype SNPs step by step](https://github.com/zjshi/Maast#genotype-snps-step-by-step)
11 | * * * [Step 1a: Call SNP with a collection of whole genome assemblies](https://github.com/zjshi/Maast#step-1a-call-snp-with-a-collection-of-whole-genome-assemblies)
12 | * * * [Step 1b: Call SNPs from a set of whole genomes without redundancy reduction](https://github.com/zjshi/Maast#step-1b-call-snps-from-a-set-of-whole-genomes-without-redundancy-reduction)
13 | * * * [Step 1c: Call SNPs with customized minimum prevalence and minor allele frequency (MAF) thresholds](https://github.com/zjshi/Maast#step-1c-call-snps-with-customized-minimum-prevalence-and-minor-allele-frequency-maf-thresholds)
14 | * * * [Step 2: Build SNP covering k-mer database](https://github.com/zjshi/Maast#step-2-build-snp-covering-k-mer-database)
15 | * * * [Step 3: Genotype whole genome assemblies, sequencing reads or both](https://github.com/zjshi/Maast#step-3-genotype-whole-genome-assemblies-sequencing-reads-or-both)
16 | * * * [Construct a SNP tree with Maast genotypes (optional)](https://github.com/zjshi/Maast#construct-a-snp-tree-with-maast-genotypes-optional)
17 | * * * [More helper text and arguments](https://github.com/zjshi/Maast#more-helper-text-and-arguments)
18 | * [Example tutorial](https://github.com/zjshi/Maast#example)
19 | * * [Download and decompress test dataset](https://github.com/zjshi/Maast#download-and-decompress-test-dataset)
20 | * * [Genotype SNPs from begin to end in one single command line with the test dataset](https://github.com/zjshi/Maast#genotype-snps-from-begin-to-end-in-one-single-command-line-with-the-test-dataset)
21 | * * [Genotype SNPs step by step with the test dataset](https://github.com/zjshi/Maast#genotype-snps-step-by-step-with-the-test-dataset)
22 | * * * [Step 1: Call SNPs with whole genome assemblies](https://github.com/zjshi/Maast#step-1-call-snps-with-whole-genome-assemblies)
23 | * * * [Step 2: Build SNP covering k-mer database](https://github.com/zjshi/Maast#step-2-build-snp-covering-k-mer-database-1)
24 | * * * [Step 3: Genotype whole genome assemblies, sequencing reads or both](https://github.com/zjshi/Maast#step-3-genotype-whole-genome-assemblies-sequencing-reads-or-both-1)
25 | * * * [Construct a SNP tree with Maast genotypes (optional)](https://github.com/zjshi/Maast#construct-a-snp-tree-with-maast-genotypes-optional-1)
26 | 
27 | # Maast
28 | 
29 | Maast: microbial agile accurate SNP typing
30 | 
31 | ## What Maast does
32 | 
33 | Recent spikes in available whole-genome sequences have greatly expanded known intra-species diversity, especially for prevalent species. As the number of genomes per species grows, it becomes computationally challenging to perform whole-genome alignment and call single nucleotide polymorphisms (SNPs). Furthermore, the genomes of some species are highly similar and hence redundant for SNP discovery. These trends are irreversible and will only intensify over time. To address the challenge, we present Maast, a tool for discovering core-genome SNPs and genotyping these SNPs in conspecific genomes, contigs, or unassembled reads. Maast runs orders of magnitude faster than existing tools and uses less RAM because it is free of read alignment and assembly. Maast is also comparably accurate and recovers more core-genome SNPs than other state-of-the-art tools.
34 | 
35 | ## How to cite
36 | 
37 | The publication of Maast is in preparation. Please cite this GitHub repo as an alternative for now.
38 | 
39 | ## Installation
40 | 
41 | Python requirement
42 | * Python3 (>=3.9.6)
43 | 
44 | Required Python libraries
45 | * [NumPy](https://numpy.org/install/) (>=1.19.5)
46 | * [SciPy](https://scipy.org/install/) (>=1.5.4)
47 | * [Biopython](https://biopython.org/wiki/Download) (>=1.79)
48 | * [NetworkX](https://pypi.org/project/networkx/) (>=2.5.1)
49 | 
50 | Note: the following installation command line might be helpful
51 | `pip install numpy scipy biopython networkx`
52 | 
53 | Required external programs
54 | * [Mash](https://github.com/marbl/Mash) (>=v2.2)
55 | * [MUMmer4](https://github.com/mummer4/mummer) (>=v4.0.0)
56 | 
57 | Optional installation
58 | * [FastTreeMP](http://www.microbesonline.org/fasttree/FastTreeMP) (>=v2.1.11) (Optional; only required when the tree subcommand is run)
59 | * [pigz](https://zlib.net/pigz/) (Optional; a parallel implementation of gzip for modern multi-processor, multi-core machines)
60 | * [lbzip2](http://lbzip2.org/) (Optional; a free, multi-threaded compression utility with support for the bzip2 compressed file format)
61 | * [lz4](http://www.lz4.org) (Optional; extremely fast compression algorithm)
62 | 
63 | Note: the optional dependencies are not required for the essential features of Maast, but installing them is recommended for better performance or additional features.
64 | 
65 | First, retrieve a copy of Maast to your local computing environment
66 | 
67 | `git clone https://github.com/zjshi/Maast.git`
68 | 
69 | Change your current working directory into where you put Maast
70 | `cd /path/to/Maast/`
71 | 
72 | Type in the command line to compile the source code of Maast
73 | `make`
74 | 
75 | Type in the command line to make Maast ready to execute
76 | `chmod 755 maast`
77 | 
78 | The main program (`maast`) should be found in the same directory as `/path/to/Maast/`. This location can be added to the system path so that the main program can be accessed from anywhere. Referencing it through its full path is also allowed.
79 | 
80 | Type in the command line to display help text
81 | 
82 | `./maast -h`
83 | 
84 | Notes for C++ compiler
85 | 
86 | Maast requires a C++ compiler that is compatible with the C++11 standard to work properly. All the tests have been done and passed with clang-900.0.38, but it should also be compatible with the GNU C++ compiler (newer than 5.4.0). We have not tested Maast with older compilers, but we expect it to run similarly as long as it compiles successfully.
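
If `make` is not available, the two bundled C++ programs can likely be compiled by hand with any C++11 compiler. The exact flags below are an assumption extrapolated from the usage notes inside the sources, not a documented build path; the Makefile remains the authoritative recipe

`g++ -O3 --std=c++11 -o bin/callm_db_build src/callm_db_build.cpp`

`g++ -O3 --std=c++11 -pthread -o bin/callm_db_val src/callm_db_val.cpp`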
87 | 
88 | 
89 | ## Conda Installation
90 | 
91 | Create a new conda environment
92 | `conda create -n maast`
93 | 
94 | Activate the environment just created
95 | `conda activate maast`
96 | 
97 | Conda automatic installation with all dependencies
98 | `conda install -c conda-forge -c bioconda maast`
99 | 
100 | Quick installation verification
101 | `maast -h`
102 | 
103 | ## How to use
104 | 
105 | ### Type SNPs from a set of whole genome assemblies and sequencing reads from beginning to end in one single command line
106 | 
107 | `maast end_to_end --in-dir /path/to/directory/containing/genomes/reads/or/both --out-dir /path/Maast/output/ --min-prev 0.9 --snp-freq 0.01`
108 | 
109 | Note:
110 | 
111 | The input directory must contain a number of whole genome assemblies in FASTA format.
112 | 
113 | Maast can automatically identify file types with the supported file suffixes: whole genome assemblies (.fa, .fsa, .fna and .fasta) and sequencing reads (.fq and .fastq). Files compressed with popular algorithms, including .gz, .lz4 and .bz2, are also supported.
114 | 
115 | Running the end_to_end subcommand is equivalent to running the genomes, db and genotype subcommands with default settings in a row.
116 | 
117 | ### Genotype SNPs step by step
118 | 
119 | #### Step 1a: Call SNP with a collection of whole genome assemblies
120 | `maast genomes --fna-dir /path/to/genomes/ --out-dir /path/Maast/output/`
121 | 
122 | Note:
123 | By default, Maast first collapses redundancy in the input genomes and then calls common SNPs from a subset of tag genomes. It also automatically identifies a centroid genome and uses it as the representative genome.
124 | 
125 | Upon a successful run, this step will produce several important files that are required for downstream steps.
126 | * reference.fna (Reference genome that provides genomic coordinates for SNPs)
127 | * core_snps.vcf (SNP catalog)
128 | * tag_paths.list (Selected tag genomes)
129 | 
130 | #### Step 1b: Call SNPs from a set of whole genomes with a specified reference genome without redundancy reduction
131 | 
132 | `maast genomes --fna-dir /path/to/genomes/ --rep-fna /path/to/rep_genome.fna --out-dir /path/Maast/output/ --skip-centroid --keep-redundancy`
133 | 
134 | #### Step 1c: Call SNPs with customized minimum prevalence and minor allele frequency (MAF) thresholds
135 | 
136 | `maast genomes --fna-dir /path/to/genomes/ --rep-fna /path/to/rep_genome.fna --out-dir /path/Maast/output/ --min-prev 0.95 --snp-freq 0.001`
137 | 
138 | #### Step 2: Build SNP covering k-mer database
139 | 
140 | `maast db --ref-genome /path/to/reference.fna --vcf /path/to/core_snps.vcf --msa /path/to/tag_msa.fna --tag-fna-list /path/to/tag_paths.list --fna-dir /path/to/genomes/ --out-dir /path/Maast/output/`
141 | 
142 | Note:
143 | 
144 | Upon a successful run, this step will produce the SNP covering k-mer database that is required for genotyping sequencing reads.
145 | * kmer_db.bin (SNP covering k-mer database)
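
kmer_db.bin is a flat binary array of integer pairs, one (2-bit-encoded 31-mer, SNP id) record per k-mer, as written by the bundled callm_db_build program. The following minimal Python sketch is not part of Maast; it assumes native 64-bit records, which matches how the sources mmap the file, and it can be handy for sanity-checking the database:

```python
# Sanity-check kmer_db.bin: count records and decode the first k-mer.
# Assumes 64-bit records of (2-bit-encoded 31-mer, SNP id) pairs.
import numpy as np

recs = np.fromfile("kmer_db.bin", dtype=np.uint64).reshape(-1, 2)
print("k-mers in database:", len(recs))

code = int(recs[0, 0])
# the encoder packs the first base into the highest 2 bits (A=0, C=1, G=2, T=3)
kmer = "".join("ACGT"[(code >> (2 * (30 - i))) & 3] for i in range(31))
snp = str(recs[0, 1])
# in the SNP id, the first six digits name the genome and the seventh flags the allele
print(kmer, "genome:", snp[:6], "allele digit:", snp[6], "snp:", snp[7:])
```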
146 | 
147 | #### Step 3: Genotype whole genome assemblies, sequencing reads or both
148 | 
149 | `maast genotype --in-dir /path/to/directory/containing/genomes/reads/or/both --ref-genome /path/to/reference.fna --db /path/to/kmer_db.bin --vcf /path/to/core_snps.vcf --out-dir /path/Maast/output/`
150 | 
151 | #### Construct a SNP tree with Maast genotypes (optional)
152 | 
153 | `maast tree --input-list /path/to/Maast/genotypes.input.tsv --out-dir /path/Maast/output/`
154 | 
155 | #### More helper text and arguments
156 | 
157 | `maast end_to_end|genomes|db|genotype|tree -h`
158 | 
159 | ## Example
160 | 
161 | ### Download and decompress test dataset
162 | 
163 | `wget --content-disposition https://fileshare.czbiohub.org/s/TwGJAsAZ6dQsM49/download`
164 | 
165 | `tar xzvf 101346.tar.gz`
166 | 
167 | Note: after running the two command lines above, one directory named 101346 can be found in the current directory. In the directory 101346, there are 300 whole genome assemblies in FASTA format (.fna) and 8 gzipped files of WGS sequencing reads in FASTQ format (.fastq.gz).
168 | 
169 | ### Genotype SNPs from begin to end in one single command line with the test dataset
170 | 
171 | `maast end_to_end --in-dir ./101346 --out-dir ./101346_out`
172 | 
173 | Note: after running the above command line, one directory named 101346_out can be found in the current directory, which contains all resulting files and directories.
174 | 
175 | The files include
176 | * reference.fna (selected reference genome)
177 | * tag_paths.list (list of selected tag genomes)
178 | * tag_msa.fna (multiple sequence alignment of tag genomes)
179 | * coords.tsv (coordinates of consensus genome)
180 | * core_snps.vcf (called SNPs in VCF format)
181 | * nr_kmer_set.tsv (raw SNP-covering k-mers)
182 | * check_fna_paths.list (a list of genomes used for validating SNP-covering k-mers)
183 | * kmer_prof.tsv (hit profile of SNP-covering k-mers)
184 | * selected_kmers.tsv (validated SNP-covering k-mers)
185 | * kmer_db.bin (optimized database of SNP-covering k-mers)
186 | 
187 | The directories include
188 | * gt_results (SNP genotyping results)
189 | * temp (temporary directory for hosting intermediate files)
190 | 
191 | ### Genotype SNPs step by step with the test dataset
192 | 
193 | #### Step 1: Call SNPs with whole genome assemblies
194 | 
195 | `maast genomes --fna-dir ./101346 --out-dir ./101346_out`
196 | 
197 | Note: upon a successful run of the first step, the output files include
198 | * reference.fna (selected reference genome)
199 | * tag_paths.list (list of selected tag genomes)
200 | * tag_msa.fna (multiple sequence alignment of tag genomes)
201 | * coords.tsv (coordinates of consensus genome)
202 | * core_snps.vcf (called SNPs in VCF format)
203 | 
204 | #### Step 2: Build SNP covering k-mer database
205 | 
206 | `maast db --ref-genome ./101346_out/reference.fna --vcf ./101346_out/core_snps.vcf --msa ./101346_out/tag_msa.fna --tag-fna-list ./101346_out/tag_paths.list --fna-dir ./101346/ --out-dir ./101346_out/`
207 | 
208 | Note: all the required input files can be found among the output files of the first step.
209 | 
210 | Upon a successful run of the second step, the output files include
211 | * nr_kmer_set.tsv (raw SNP-covering k-mers)
212 | * check_fna_paths.list (a list of genomes used for validating SNP-covering k-mers)
213 | * kmer_prof.tsv (hit profile of SNP-covering k-mers)
214 | * selected_kmers.tsv (validated SNP-covering k-mers)
215 | * kmer_db.bin (optimized database of SNP-covering k-mers)
216 | 
217 | Among them, kmer_db.bin is the database file that will be used in the next step along with a few other required files from the first step.
218 | 
219 | #### Step 3: Genotype whole genome assemblies, sequencing reads or both
220 | 
221 | `maast genotype --in-dir ./101346/ --ref-genome ./101346_out/reference.fna --db ./101346_out/kmer_db.bin --vcf ./101346_out/core_snps.vcf --out-dir ./101346_out/`
222 | 
223 | Note: files to genotype should be supplied in a directory with --in-dir. Supported file types include the FASTA and FASTQ formats. Input files can be all FASTAs, all FASTQs or a mixture of both.
224 | 
225 | All other required input files can be found among the output files of the two previous steps.
226 | 
227 | The main output files are the SNP genotypes, which can be found in a directory named "gt_results" in the designated output directory, ./101346_out/ in this case.
228 | 
229 | Each genotype file has seven fields, as follows:
230 | 
231 | 1. Contig: string of arbitrary length which specifies the contig of a representative genome where a SNP is from
232 | 2. Local Pos: up to seven digits which specify the local position of a SNP on a contig
233 | 3. Global Pos: up to seven digits which specify the global position of a SNP in a species, serving as a sort of ID
234 | 4. Allele 1: single character, A, C, G or T, which specifies allele 1 of a SNP
235 | 5. Allele 2: similar to Allele 1, but specifies allele 2 of a SNP
236 | 6. Allele 1 Cnt: an integer specifying the count of detected allele 1 in a metagenome
237 | 7. Allele 2 Cnt: an integer specifying the count of detected allele 2 in a metagenome
238 | 
239 | An example looks like the following:
240 | 
241 | | Contig | Local Pos | Global Pos | Allele 1 | Allele 2 | Allele 1 Cnt | Allele 2 Cnt |
242 | | :--- | :----: | :----: | :----: | :----: | :----: | :----: |
243 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15829 | 349759 | C | T | 65 | 0 |
244 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15863 | 20713 | C | T | 62 | 1 |
245 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15889 | 131457 | C | A | 62 | 0 |
246 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15907 | 4457 | G | A | 59 | 0 |
247 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15910 | 4553 | C | A | 59 | 0 |
248 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15937 | 151893 | C | T | 56 | 0 |
249 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15940 | 101338 | C | T | 55 | 0 |
250 | | ... | ... | ... | ... | ... | ... | ... |
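
Each row is a plain tab-separated record, so the results are easy to post-process. A minimal Python sketch follows (not part of Maast; it assumes the seven columns above with no header row, and the file name is just an example from the test dataset):

```python
# Summarize one Maast genotype file: how many SNP sites were detected,
# and how many of the detected sites favor allele 2.
import csv

n_sites = n_detected = n_alt = 0
with open("./101346_out/gt_results/GUT_GENOME000400.fna.tsv") as fh:
    for contig, local_pos, global_pos, a1, a2, cnt1, cnt2 in csv.reader(fh, delimiter="\t"):
        n_sites += 1
        if int(cnt1) + int(cnt2) > 0:
            n_detected += 1
            if int(cnt2) > int(cnt1):
                n_alt += 1
print(f"{n_detected}/{n_sites} SNP sites detected; {n_alt} favor allele 2")
```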
251 | 
252 | #### Construct a SNP tree with Maast genotypes (optional)
253 | 
254 | This is an optional step that takes advantage of the genotyped SNPs for a quick application - SNP tree building
255 | 
256 | `paste <(find ./101346_out/gt_results/ -name '*tsv' | sort) <(find ./101346_out/gt_results/ -name '*tsv' | sort | cut -d'/' -f4 | cut -d'.' -f1) > 101346_genotypes.input.tsv`
257 | 
258 | Note: the step above generates a list of input pairs. Each pair per row contains a path to a genotype result file generated by the Maast genotype command and a unique name for the file. The path and name are separated by a tab, like the following
259 | /file/path/1 name1
260 | /file/path/2 name2
261 | /file/path/3 name3
262 | ...
263 | 
264 | 
265 | The first three rows of 101346_genotypes.input.tsv in this example look like
266 | ./101346_out/gt_results/GUT_GENOME000400.fna.tsv GUT_GENOME000400
267 | ./101346_out/gt_results/GUT_GENOME000466.fna.tsv GUT_GENOME000466
268 | ./101346_out/gt_results/GUT_GENOME000688.fna.tsv GUT_GENOME000688
269 | 
270 | 
271 | `maast tree --input-list ./101346_genotypes.input.tsv --out-dir ./101346_out/`
272 | 
273 | Note: upon the successful completion of this command, the following three output files can be found:
274 | * concat_allele.aln.fasta (concatenated allele sequences with genotyped SNPs)
275 | * concat_allele.aln.mat (pairwise genomic distances between concatenated allele sequences)
276 | * concat_allele.aln.tre (phylogenetic tree built with concatenated allele sequences)
--------------------------------------------------------------------------------
/src/callm_db_val.cpp:
--------------------------------------------------------------------------------
1 | #if __linux__
2 | #include <linux/version.h>
3 | #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
4 | #define _MAP_POPULATE_AVAILABLE
5 | #endif
6 | #endif
7 | 
8 | #ifdef _MAP_POPULATE_AVAILABLE
9 | #define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
10 | #else
11 | #define MMAP_FLAGS MAP_PRIVATE
12 | #endif
13 | 
14 | #include <sys/mman.h>
15 | #include <sys/stat.h>
16 | #include <fcntl.h>
17 | 
18 | #include <iostream>
19 | #include <fstream>
20 | #include <string>
21 | #include <vector>
22 | #include <unordered_map>
23 | #include <tuple>
24 | #include <chrono>
25 | #include <thread>
26 | 
27 | #include <cstring>
28 | #include <cassert>
29 | #include <unistd.h>
30 | 
31 | using namespace std;
32 | 
33 | 
34 | // this program scans its input (fastq text stream) for forward k-mers,
35 | 
36 | // usage:
37 | //    g++ -O3 --std=c++11 -o vfkmrz_bunion vfkmrz_bunion.cpp
38 | //    ./vfkmrz_bunion -k1 <fpath1> -k2 <fpath2>
39 | //
40 | // standard fastq format only for input, otherwise failure is almost guaranteed.
41 | 
42 | // global variable declaration starts here
43 | constexpr auto k = 31;
44 | 
45 | // set operation mode
46 | // valid values: 0, 1, 2
47 | // 0 is set union operation; 1 is set intersection operation; 2 is set difference([set1-set2]);
48 | constexpr auto s_mod = 0;
49 | 
50 | // parameters for file read; from the source of GNU coreutils wc
51 | constexpr auto step_size = 256 * 1024 * 1024;
52 | constexpr auto buffer_size = 256 * 1024 * 1024;
53 | 
54 | // output file path
55 | constexpr auto out_path = "/dev/stdout";
56 | 
57 | // get time elapsed since when it all began in milliseconds.
58 | long chrono_time() {
59 |     using namespace chrono;
60 |     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
61 | }
62 | 
63 | // number of bits per single nucleotide base
64 | constexpr int bpb = 2;
65 | 
66 | size_t get_fsize(const char* filename) {
67 |     struct stat st;
68 |     stat(filename, &st);
69 |     return st.st_size;
70 | }
71 | 
72 | 
73 | char* get_ftype(const char* filename) {
74 |     int fn_len = strlen(filename);
75 |     char *ftype = (char *)malloc(5);
76 | 
77 |     for(int i = 0; i < 4; ++i) {
78 |         ftype[i] = filename[fn_len - 4 + i];
79 |     }
80 | 
81 |     ftype[4] = '\0';
82 | 
83 |     return ftype;
84 | }
85 | 
86 | void make_comp_map(char* comp_map) {
87 |     comp_map['A'] = 'T';
88 |     comp_map['C'] = 'G';
89 |     comp_map['G'] = 'C';
90 |     comp_map['T'] = 'A';
91 | }
92 | 
93 | template <class int_type>
94 | int_type bit_encode(const char c) {
95 |     switch (c) {
96 |         case 'A': return 0;
97 |         case 'C': return 1;
98 |         case 'G': return 2;
99 |         case 'T': return 3;
100 |     }
101 | 
102 |     assert(false);
103 | }
104 | 
105 | template <class int_type>
106 | char bit_decode(const int_type bit_code) {
107 |     switch (bit_code) {
108 |         case 0: return 'A';
109 |         case 1: return 'C';
110 |         case 2: return 'G';
111 |         case 3: return 'T';
112 |     }
113 |     assert(false);
114 | }
115 | 
116 | template <class int_type>
117 | void make_code_dict(int_type* code_dict) {
118 |     code_dict['A'] = bit_encode<int_type>('A');
119 |     code_dict['C'] = bit_encode<int_type>('C');
120 |     code_dict['G'] = bit_encode<int_type>('G');
121 |     code_dict['T'] = bit_encode<int_type>('T');
122 | }
123 | 
124 | template <class int_type>
125 | int_type seq_encode(const char* buf, int len, const int_type* code_dict, const int_type b_mask) {
126 |     int_type seq_code = 0;
127 |     for (int i=0; i < len; ++i) {
128 |         const int_type b_code = code_dict[buf[i]];
129 |         seq_code |= ((b_code & b_mask) << (bpb * (len - i - 1)));
130 |     }
131 |     return seq_code;
132 | }
133 | 
134 | template <class int_type>
135 | void seq_decode(char* buf, const int len, const int_type seq_code, int_type* code_dict, const int_type b_mask) {
136 |     for (int i=0; i < len-1; ++i) {
137 |         const int_type b_code = (seq_code >> (bpb * (len - i - 2))) & b_mask;
138 |         buf[i] = bit_decode(b_code);
139 |     }
140 | 
141 |     buf[len-1] = '\0';
142 | }
143 | 
144 | 
145 | template <class int_type>
146 | void load_profile(const char* k_path, vector<char>& buffer, vector<int_type>& kv1, vector<int_type>& kv2, vector<int_type>& kv3, vector<int_type>& kv4, vector<tuple<int_type, int_type>>& k_info, int_type* code_dict, const int_type b_mask) {
147 |     auto t_start = chrono_time();
148 | 
149 |     char* window = buffer.data();
150 | 
151 |     uintmax_t n_lines = 0;
152 | 
153 |     int fd;
154 |     fd = open(k_path, O_RDONLY);
155 | 
156 |     int k_cur = 0;
157 |     int snp_cur = 0;
158 |     int pos_cur = 0;
159 | 
160 |     char kbuf1[k];
161 |     char kbuf2[k];
162 |     char kbuf3[k];
163 |     char kbuf4[k];
164 | 
165 |     char snp_pos[16];
166 |     char kmer_pos[4];
167 | 
168 |     //auto fh = fstream(out_path, ios::out | ios::binary);
169 | 
170 |     int cur_field = 0;
171 |     bool has_wildcard = false;
172 | 
173 |     while (true) {
174 | 
175 |         const ssize_t bytes_read = read(fd, window, step_size);
176 | 
177 |         if (bytes_read == 0)
178 |             break;
179 | 
180 |         if (bytes_read == (ssize_t) -1) {
181 |             cerr << "unknown fatal error when reading " << k_path << endl;
182 |             exit(EXIT_FAILURE);
183 |         }
184 | 
185 |         for (int i = 0; i < bytes_read; ++i) {
186 |             char c = window[i];
187 | 
188 |             if (c == '\n') {  // end of record: encode the four k-mer variants collected from this line
189 |                 ++n_lines;
190 | 
191 |                 if (has_wildcard) {
192 |                     has_wildcard = false;
193 |                     continue;
194 |                 }
195 | 
196 |                 auto code1 = seq_encode(kbuf1, k, code_dict, b_mask);
197 |                 auto code2 = seq_encode(kbuf2, k, code_dict, b_mask);
198 |                 auto code3 = seq_encode(kbuf3, k, code_dict, b_mask);
199 |                 auto code4 = seq_encode(kbuf4, k, code_dict, b_mask);
200 | 
201 |                 kv1.push_back(code1);
202 |                 kv2.push_back(code2);
203 |                 kv3.push_back(code3);
204 |                 kv4.push_back(code4);
205 | 
206 |                 snp_pos[snp_cur] = '\0';
207 |                 int_type id_int = stoull(snp_pos);
208 | 
209 |                 kmer_pos[pos_cur] = '\0';
210 |                 int_type k_pos = stoull(kmer_pos);
211 | 
212 |                 k_info.push_back(tuple<int_type, int_type>(id_int, k_pos));
213 | 
214 |                 k_cur = 0;
215 |                 pos_cur = 0;
216 |                 snp_cur = 0;
217 | 
218 |                 cur_field = 0;
219 |             } else if (c == '\t'){
220 |                 ++cur_field;
221 |                 k_cur = 0;
222 |             } else {
223 |                 if (c == 'N') {
224 |                     has_wildcard = true;
225 |                 }
226 | 
227 |                 if (cur_field == 0) {
228 |                     snp_pos[snp_cur++] = c;
229 |                 } else if (cur_field == 1) {
230 |                     kmer_pos[pos_cur++] = c;
231 |                 } else if (cur_field == 2) {
232 |                     kbuf1[k_cur++] = c;
233 |                 } else if (cur_field == 3) {
234 |                     kbuf2[k_cur++] = c;
235 |                 } else if (cur_field == 4) {
236 |                     kbuf3[k_cur++] = c;
237 |                 } else if (cur_field == 5) {
238 |                     kbuf4[k_cur++] = c;
239 |                 } else {
240 |                     //do nothing;
241 |                 }
242 |             }
243 |         }
244 |     }
245 | 
246 | 
247 |     assert(kv1.size() == k_info.size() && kv1.size() == kv2.size());
248 |     assert(kv1.size() == kv3.size() && kv1.size() == kv4.size());
249 | 
250 |     auto timeit = chrono_time();
251 |     close(fd);
252 | }
253 | 
254 | template <class int_type>
255 | void fna_load_pool(const char* fna_path, vector<char>& buffer, unordered_map<int_type, int_type>& k_map, const int_type* code_dict, const int_type b_mask) {
256 |     auto t_start = chrono_time();
257 | 
258 |     char comp_map[1 << (sizeof(char) * 8)];
259 |     make_comp_map(comp_map);
260 | 
261 |     char* window = buffer.data();
262 | 
263 |     uintmax_t n_lines = 0;
264 | 
265 |     int fd;
266 |     fd = open(fna_path, O_RDONLY);
267 | 
268 |     int cur_pos = 0;
269 | 
270 |     vector<char> seq_buf(10*1000*1000);
271 |     char* bases = seq_buf.data();
272 | 
273 |     char kmer_buff[k];
274 |     char rckmer_buff[k];
275 | 
276 |     bool is_base = false;
277 |     bool has_wildcard = false;
278 | 
279 |     while (true) {
280 | 
281 |         const ssize_t bytes_read = read(fd, window, step_size);
282 | 
283 |         if (bytes_read == 0)
284 |             break;
285 | 
286 |         if (bytes_read == (ssize_t) -1) {
287 |             cerr << "unknown fatal error when reading " << fna_path << endl;
288 |             exit(EXIT_FAILURE);
289 |         }
290 | 
291 |         for (int i = 0; i < bytes_read; ++i) {
292 |             char c = window[i];
293 |             if (c == '\n') {
294 |                 if (!is_base) {
295 |                     is_base = true;
296 |                 }
297 |                 continue;
298 |             } else if (c == '>') {  // new FASTA header: k-merize the sequence collected so far
299 |                 for (int j = 0; j < cur_pos-k+1; ++j) {
300 |                     for (int l = 0; l < k; ++l) {
301 |                         kmer_buff[l] = bases[j+l];
302 | 
303 |                         if (kmer_buff[l] == 'N') {
304 |                             has_wildcard = true;
305 |                             break;
306 |                         }
307 |                     }
308 | 
309 | 
310 |                     if (has_wildcard) {
311 |                         has_wildcard = false;
312 |                         continue;
313 |                     }
314 | 
315 |                     auto kmer_int = seq_encode(kmer_buff, k, code_dict, b_mask);
316 | 
317 |                     if (k_map.find(kmer_int) == k_map.end()) {
318 |                         k_map.insert({kmer_int, 1});
319 |                     } else {
320 |                         ++k_map[kmer_int];
321 |                     }
322 | 
323 |                     for (int l = k-1; l >= 0; --l) {
324 |                         rckmer_buff[k-1-l] = comp_map[kmer_buff[l]];
325 |                     }
326 | 
327 |                     /* not really necessary when rc kmers present
328 |                     auto rckmer_int = seq_encode(rckmer_buff, k, code_dict, b_mask);
329 | 
330 |                     if (k_map.find(rckmer_int) == k_map.end()) {
331 |                         k_map.insert({rckmer_int, 1});
332 |                     } else {
333 |                         ++k_map[rckmer_int];
334 |                     }
335 |                     */
336 |                 }
337 | 
338 |                 cur_pos = 0;
339 |                 is_base = false;
340 | 
341 |                 ++n_lines;
342 |             } else {
343 |                 if (is_base) {
344 |                     bases[cur_pos++] = toupper(c);
345 |                 }
346 |             }
347 |         }
348 | 
349 |     }
350 | 
351 |     if (cur_pos >= k) {  // k-merize the last sequence in the file
352 |         for (int j = 0; j < cur_pos-k+1; ++j) {
353 |             for (int l = 0; l < k; ++l) {
354 |                 kmer_buff[l] = bases[j+l];
355 | 
356 |                 if (kmer_buff[l] == 'N') {
357 |                     has_wildcard = true;
358 |                     break;
359 |                 }
360 |             }
361 | 
362 |             if (has_wildcard) {
363 |                 has_wildcard = false;
364 |                 continue;
365 |             }
366 | 
367 |             auto kmer_int = seq_encode(kmer_buff, k, code_dict, b_mask);
368 | 
369 |             if (k_map.find(kmer_int) == k_map.end()) {
370 |                 k_map.insert({kmer_int, 1});
371 |             } else {
372 |                 ++k_map[kmer_int];
373 |             }
374 | 
375 |             /* not really necessary when rc kmers present
376 |             for (int l = k-1; l >= 0; --l) {
377 |                 rckmer_buff[k-1-l] = comp_map[kmer_buff[l]];
378 |             }
379 | 
380 |             auto rckmer_int = seq_encode(rckmer_buff, k, code_dict, b_mask);
381 |             if (k_map.find(rckmer_int) == k_map.end()) {
382 |                 k_map.insert({rckmer_int, 1});
383 |             } else {
384 |                 ++k_map[rckmer_int];
385 |             }
386 |             */
387 |         }
388 | 
389 |         cur_pos = 0;
390 |         ++n_lines;
391 |     }
392 | 
393 |     buffer.clear();
394 |     cerr << fna_path << endl;
395 |     cerr << "number of sequences " << n_lines/2 << endl;
396 |     cerr << "number of unique kmers: "<< k_map.size() << endl << endl;
397 |     auto timeit = chrono_time();
398 |     close(fd);
399 | }
400 | 
401 | 
402 | template <class int_type>
403 | void bit_load_pool(const char* k_path, vector<char>& buffer, unordered_map<int_type, int_type>& k_map, const int_type* code_dict, const int_type b_mask) {
404 |     auto t_start = chrono_time();
405 | 
406 |     char* window = buffer.data();
407 | 
408 |     uintmax_t n_lines = 0;
409 | 
410 |     int fd;
411 |     fd = open(k_path, O_RDONLY);
412 | 
413 |     int cur_pos = 0;
414 |     int snp_pos = 0;
415 | 
416 |     char seq_buf[k];
417 |     char snp_id[16];
418 | 
419 |     //auto fh = fstream(out_path, ios::out | ios::binary);
420 | 
421 |     bool id_switch = false;
422 |     bool has_wildcard = false;
423 | 
424 |     while (true) {
425 | 
426 |         const ssize_t bytes_read = read(fd, window, step_size);
427 | 
428 |         if (bytes_read == 0)
429 |             break;
430 | 
431 |         if (bytes_read == (ssize_t) -1) {
432 |             cerr << "unknown fatal error when reading " << k_path << endl;
433 |             exit(EXIT_FAILURE);
434 |         }
435 | 
436 |         for (int i = 0; i < bytes_read; ++i) {
437 |             char c = toupper(window[i]);
438 |             if (c == '\n') {
439 |                 ++n_lines;
440 | 
441 |                 if (has_wildcard) {
442 |                     has_wildcard = false;
443 |                     continue;
444 |                 }
445 | 
446 |                 auto kmer_int = seq_encode(seq_buf, k, code_dict, b_mask);
447 | 
448 |                 snp_id[snp_pos] = '\0';
449 |                 int_type kcount = stoull(snp_id);
450 | 
451 |                 assert(k_map.find(kmer_int) == k_map.end());
452 |                 k_map.insert({kmer_int, kcount});
453 | 
454 |                 cur_pos = 0;
455 |                 snp_pos = 0;
456 | 
457 |                 id_switch = false;
458 |             } else if (c == '\t'){
459 |                 id_switch = true;
460 |             } else {
461 |                 if (c == 'N') {
462 |                     has_wildcard = true;
463 |                 }
464 | 
465 |                 if (id_switch) {
466 |                     snp_id[snp_pos++] = c;
467 |                 } else {
468 |                     seq_buf[cur_pos++] = c;
469 |                 }
470 |             }
471 |         }
472 | 
473 |         //fh.write(&kmers[0], kmers.size());
474 | 
475 |         // cerr << n_lines << " lines were scanned after " << (chrono_time() - t_start) / 1000 << " seconds" << endl;
476 |     }
477 | 
478 |     auto timeit = chrono_time();
479 |     close(fd);
480 | }
481 | 
482 | 
483 | template <class int_type>
484 | void bin_load_pool(const char* p_path, unordered_map<int_type, int_type>& k_map) {
485 |     size_t filesize = get_fsize(p_path);
486 |     //Open file
487 |     int fd = open(p_path, O_RDONLY, 0);
488 |     assert(fd != -1);
489 |     //Execute mmap
490 |     //uint64_t* mmappedData = (uint64_t *) mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
491 |     int_type* mmappedData = (int_type *) mmap(NULL, filesize, PROT_READ, MMAP_FLAGS, fd, 0);
492 |     assert(mmappedData != MAP_FAILED);
493 |     //Write the mmapped data to stdout (= FD #1)
494 | 
495 |     // write(1, mmappedData, filesize);
496 | 
497 |     // char seq_buf[k+1];
498 | 
499 |     auto l_start = chrono_time();
500 | 
501 |     for (uint64_t i = 0; i < filesize/8; i=i+2) {  // records are (kmer code, count) pairs
502 |         // seq_decode(seq_buf, k, mmappedData[i], b_mask);
503 | 
504 |         auto kmer_int = mmappedData[i];
505 |         auto kcount = mmappedData[i+1];
506 | 
507 |         assert(k_map.find(kmer_int) == k_map.end());
508 |         k_map.insert({kmer_int, kcount});
509 |     }
510 | 
511 |     //Cleanup
512 |     int rc = munmap(mmappedData, filesize);
513 |     assert(rc == 0);
514 |     close(fd);
515 | }
516 | 
517 | 
518 | template <class int_type>
519 | bool cmp_tuple(const tuple<int_type, int_type> &a, const tuple<int_type, int_type> &b){
520 |     return get<0>(a) < get<0>(b);
521 | }
522 | 
523 | template <class int_type>
524 | void set_kvecs(char* db_path, const int kv_n, vector<int_type>* kvecs, vector<tuple<int_type, int_type>>& kinfo) {
525 |     assert(kv_n == 4);
526 | 
527 |     int_type lsb = 1;
528 |     int_type b_mask = (lsb << bpb) - lsb;
529 | 
530 |     int_type code_dict[1 << (sizeof(char) * 8)];
531 |     make_code_dict(code_dict);
532 | 
533 |     vector<char> buffer(buffer_size);
534 | 
535 |     load_profile(db_path, buffer, kvecs[0], kvecs[1], kvecs[2], kvecs[3], kinfo, code_dict, b_mask);
536 | 
537 |     cerr << "DB loading OK!" << endl;
538 | }
539 | 
540 | template <class int_type>
541 | void multi_dbval(int kv_n, vector<int_type>* kvecs, int n_path, vector<string>& kpaths, vector<tuple<int, int, int, int, int>>& prof_vec) {
542 |     assert(kv_n == 4);
543 | 
544 |     int_type lsb = 1;
545 |     int_type b_mask = (lsb << bpb) - lsb;
546 | 
547 |     int_type code_dict[1 << (sizeof(char) * 8)];
548 |     make_code_dict(code_dict);
549 | 
550 |     vector<char> buffer(buffer_size);
551 | 
552 |     int64_t prof_size = kvecs[0].size();
553 | 
554 |     vector<int_type> lc_vecs[4];
555 | 
556 |     for (int i = 0; i < 4; ++i) {
557 |         lc_vecs[i].reserve(prof_size);
558 |     }
559 | 
560 |     prof_vec.resize(prof_size, make_tuple(0,0,0,0,0));
561 | 
562 |     unordered_map<int_type, int_type> kpool;
563 | 
564 |     for (int i = 0; i < n_path; ++i) {
565 |         char* kp_type = get_ftype(kpaths[i].c_str());
566 | 
567 | 
568 |         if (strcmp(kp_type, ".bin") == 0) {
569 |             bin_load_pool(kpaths[i].c_str(), kpool);
570 |         } else {
571 |             fna_load_pool(kpaths[i].c_str(), buffer, kpool, code_dict, b_mask);
572 |         }
573 | 
574 |         //else {
575 |         //    bit_load_pool(kpaths[i].c_str(), buffer, kpool, code_dict, b_mask);
576 |         //}
577 | 
578 |         // split loops
579 |         for (int j = 0; j < 4; ++j) {  // look up each of the four k-mer variants in the pool
580 |             for(auto it = kvecs[j].begin(); it != kvecs[j].end(); ++it) {
581 |                 if(kpool.find(*it) == kpool.end()) {
582 |                     lc_vecs[j].push_back(0);
583 |                 } else {
584 |                     lc_vecs[j].push_back(kpool[*it]);
585 |                 }
586 |             }
587 |         }
588 | 
589 |         assert(lc_vecs[0].size() == lc_vecs[1].size());
590 |         assert(lc_vecs[0].size() == lc_vecs[2].size());
591 |         assert(lc_vecs[0].size() == lc_vecs[3].size());
592 | 
593 |         const int64_t lc_size = lc_vecs[0].size();
594 | 
595 |         for (int64_t j = 0; j < lc_size; ++j) {
596 |             auto lc_sum = lc_vecs[0][j] + lc_vecs[1][j] + lc_vecs[2][j] + lc_vecs[3][j];
597 | 
598 |             auto ref_sum = lc_vecs[0][j] + lc_vecs[2][j];
599 |             auto alt_sum = lc_vecs[1][j] + lc_vecs[3][j];
600 | 
601 |             /*
602 |             if (strcmp(kp_type, ".fna") == 0) {
603 |                 lc_sum = lc_sum / 2;
604 |             }
605 |             */
606 | 
607 |             if (lc_sum == 0) {
608 |                 ++get<0>(prof_vec[j]);
609 |             } else if (lc_sum == 1) {
610 |                 ++get<1>(prof_vec[j]);
611 |             } else {
612 |                 ++get<2>(prof_vec[j]);
613 |             }
614 | 
615 |             if (ref_sum > 0) {
616 |                 ++get<3>(prof_vec[j]);
617 |             }
618 | 
619 |             if (alt_sum > 0) {
620 |                 ++get<4>(prof_vec[j]);
621 |             }
622 |         }
623 | 
624 |         for (int j = 0; j < 4; ++j) {
625 |             lc_vecs[j].clear();
626 |         }
627 | 
628 |         kpool.clear();
629 |     }
630 | 
631 |     auto timeit = chrono_time();
632 | }
633 | 
634 | void display_usage(char *fname){
635 |     cout << "usage: " << fname << " -d profile_path -n identifier [-t n_threads] [-o output_path] [-L path to list of input] inpath1 [ inpath2 ...]\n";
636 | }
637 | 
638 | int main(int argc, char** argv){
639 |     extern char *optarg;
640 |     extern int optind;
641 | 
642 |     bool dbflag = false;
643 |     bool inflag = false;
644 |     bool idflag = false;
645 |     bool list_flag = false;
646 | 
647 | 
648 |     char* fname = argv[0];
649 |     char* db_path = (char *)"";
650 |     char* list_path = (char *)"";
651 |     char* oname = (char *)"/dev/stdout";
652 |     char* spe_id = (char *)"";
653 | 
654 |     int n_threads = 1;
655 | 
656 |     int opt;
657 |     while ((opt = getopt(argc, argv, "d:n:t:L:o:h")) != -1) {
658 |         switch (opt) {
659 |             case 'd':
660 |                 dbflag = true;
661 |                 db_path = optarg;
662 |                 break;
663 |             case 'n':
664 |                 idflag = true;
665 |                 spe_id = optarg;
666 |                 break;
667 |             case 't':
668 |                 n_threads = stoi(optarg);
669 |                 break;
670 |             case 'L':
671 |                 list_flag = true;
672 |                 list_path = optarg;
673 |                 break;
674 |             case 'o':
675 |                 oname = optarg;
676 |                 break;
677 |             case 'h': case '?':
678 |                 display_usage(fname);
679 |                 exit(1);
680 |         }
681 |     }
682 | 
683 |     cerr << fname << '\t' << db_path << '\t' << n_threads << endl;
684 | 
685 |     if (!dbflag) {
686 |         cerr << "missing argument: -d \n";
687 |         display_usage(fname);
688 |         exit(1);
689 |     }
690 | 
691 |     if (!idflag) {
692 |         cerr << "missing argument: -n \n";
693 |         display_usage(fname);
694 |         exit(1);
695 |     }
696 | 
697 | 
698 |     int in_pos = optind;
699 | 
700 |     if (list_flag) {
701 |         cerr << "program reads a list of kmer pools for checking kmer uniqueness: " << list_path << endl;
702 |     } else {
703 |         if (optind == argc) {
704 |             cerr << "missing argument: input (>1)\n";
705 |             display_usage(fname);
706 |             exit(1);
707 |         }
708 |     }
709 | 
710 | 
711 |     vector<uint64_t> kvecs[4];
712 |     const int max_size = 100 * 1000 * 1000;
713 | 
714 |     for (int i = 0; i < 4; ++i) {
715 |         kvecs[i].reserve(max_size);
716 |     }
717 | 
718 |     vector<tuple<uint64_t, uint64_t>> kinfo;
719 | 
720 |     set_kvecs(db_path, 4, kvecs, kinfo);
721 | 
722 |     vector<string> input_array[n_threads];
723 | 
724 |     auto label = 0;
725 |     if (list_flag) {
726 |         ifstream file(list_path);
727 |         string line;
728 |         int l_count = 0;
729 |         while (getline(file, line)) {
730 |             label = l_count % n_threads;  // round-robin assignment of inputs to threads
731 |             string tmp_line = line;
732 |             input_array[label].push_back(tmp_line);
733 |             ++l_count;
734 |         }
735 |     }
736 | 
737 |     if (optind < argc) {
738 |         for(; optind < argc; optind++) {
739 |             auto slabel = (optind - in_pos + label) % n_threads;
740 |             input_array[slabel].push_back(string(argv[optind]));
741 |         }
742 |     }
743 | 
744 |     vector<tuple<int, int, int, int, int>> prof_vecs[n_threads];
745 |     vector<thread> th_array;
746 | 
747 |     for (int i = 0; i < n_threads; ++i) {
748 |         th_array.push_back(thread(multi_dbval<uint64_t>, 4, kvecs, input_array[i].size(), ref(input_array[i]), ref(prof_vecs[i])));
749 |     }
750 | 
751 | 
752 |     for (thread & ith : th_array) {
753 |         ith.join();
754 |     }
755 |     th_array.clear();
756 | 
757 |     vector<tuple<int, int, int, int, int>> reduced_prof;
758 |     reduced_prof.resize(kvecs[0].size(), make_tuple(0,0,0,0,0));
759 | 
760 |     uint64_t lsb = 1;
761 |     uint64_t b_mask = (lsb << bpb) - lsb;
762 | 
763 |     uint64_t code_dict[1 << (sizeof(char) * 8)];
764 |     make_code_dict(code_dict);
765 | 
766 |     char sbuf1[k+1];
767 |     char sbuf2[k+1];
768 |     char sbuf3[k+1];
769 |     char sbuf4[k+1];
770 | 
771 |     ofstream myfile;
772 |     myfile.open(oname);
773 | 
774 |     for (int j = 0; j < kinfo.size(); ++j) {
775 |         for (int i = 0; i < n_threads; ++i) {  // reduce per-thread profiles into one
776 |             get<0>(reduced_prof[j]) += get<0>(prof_vecs[i][j]);
777 |             get<1>(reduced_prof[j]) += get<1>(prof_vecs[i][j]);
778 |             get<2>(reduced_prof[j]) += get<2>(prof_vecs[i][j]);
779 |             get<3>(reduced_prof[j]) += get<3>(prof_vecs[i][j]);
780 |             get<4>(reduced_prof[j]) += get<4>(prof_vecs[i][j]);
781 |         }
782 | 
783 |         auto info_pair = kinfo[j];
784 | 
785 |         seq_decode(sbuf1, k+1, kvecs[0][j], code_dict, b_mask);
786 |         seq_decode(sbuf2, k+1, kvecs[1][j], code_dict, b_mask);
787 |         seq_decode(sbuf3, k+1, kvecs[2][j], code_dict, b_mask);
788 |         seq_decode(sbuf4, k+1, kvecs[3][j], code_dict, b_mask);
789 | 
790 |         myfile << get<0>(info_pair) << '\t' << get<1>(info_pair) << '\t' << sbuf1 << '\t' << sbuf2 << '\t' << sbuf3 << '\t' << sbuf4 << '\t' << get<0>(reduced_prof[j]) << '\t' << get<1>(reduced_prof[j]) << '\t'<< get<2>(reduced_prof[j]) << '\t' << spe_id << '\t' << get<3>(reduced_prof[j]) << '\t'<< get<4>(reduced_prof[j]) << '\n';
791 |     }
792 | 
793 |     return 0;
794 | }
795 | 
--------------------------------------------------------------------------------
/bin/maast.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from __future__ import division
4 | 
5 | import sys, os, time, argparse
6 | import shutil, hashlib, math, multiprocessing
7 | import numpy as np
8 | from operator import itemgetter
9 | 
10 | from Bio import SeqIO
11 | 
12 | from snps_io import id_genome_clusters, id_centroid
13 | from snps_io import vcf_io, concat_alleles, gen_msa, align_assembly
14 | 
15 | from db_io import build_db
16 | 
17 | def get_data_type():
18 |     """ Get program specified by user (species, genes, or snps) """
19 |     import sys
20 |     if len(sys.argv) == 1 or sys.argv[1] in ['-h', '--help']:
21 |         cmd = 'maast '
22 |         print('usage: %s<command> [options]' % cmd)
23 |         print('')
24 |         print("version: 1.0.8")
25 |         print('')
26 |         print('description: identify and genotype core-genome snps from <genome assemblies and/or sequencing reads>')
27 |         print('')
28 |         print('modules:')
29 |         print('    end_to_end    Run full Maast pipeline from beginning to end')
30 |         print('    genomes       Perform multiple alignment of genomes to call core-genome SNPs')
31 |         print('    db            Build kmer database targeting snps')
32 |         print('    genotype      Call core-genome SNPs for single genomes and isolate sequencing data')
33 |         print('    tree          Build SNP tree using identified genotypes')
34 |         print('')
35 |         print("use '%s<command> -h' for usage on a specific command" % cmd)
36 |         print('')
37 |         quit()
38 |     elif sys.argv[1] not in ['end_to_end', 'genomes', 'db', 'genotype', 'tree']:
39 |         sys.exit("\nError: invalid subcommand\n\nSupported subcommands: genomes, db, genotype, end_to_end, tree\n")
40 |     else:
41 |         return sys.argv[1]
42 | 
43 | def parse_args():
44 | 
45 |     data_type = get_data_type()
46 | 
47 |     parser = argparse.ArgumentParser(
48 |         formatter_class=argparse.RawTextHelpFormatter,
49 |         add_help=False,
50 |         usage=argparse.SUPPRESS
51 |     )
52 | 
53 |     parser.add_argument('data_type', help=argparse.SUPPRESS)
54 | 
55 |     if data_type == 'end_to_end':
56 |         end2end_input = parser.add_argument_group('end2end_input')
57 |         end2end_input.add_argument('--in-dir', type=str, metavar='PATH', required=True,
58 |             help = """Path to directory of raw-read-files in FASTQ format (.fastq or .fq; gzipped or not)\nor whole-genome sequences in FASTA format (.fna, .fa, .fsa or .fasta). (Required)""")
59 | 
60 |     io = parser.add_argument_group('input/output')
61 |     io.add_argument('--out-dir', type=str, metavar='PATH', required=True,
62 |         help="""Directory to store output (required)""")
63 | 
64 |     if data_type in ['genomes']:
65 |         io.add_argument('--fna-dir', type=str, metavar='PATH', required=True,
66 |             help = """Path to directory of genomes in FASTA format (required)""")
67 | 
68 |     if data_type in ['genomes', 'end_to_end']:
69 |         io.add_argument('--rep-fna', type=str, metavar='PATH', default=None,
70 |             help = """Path to the reference genome serving as the template for whole genome alignment. \nIf provided, Maast will not identify and use a centroid genome for the reference (default None)""")
71 |         io.add_argument('--skip-align', action='store_true', default=False,
72 |             help = """Skip whole genome sequence or short read alignment, only applicable when alignment \nhas already been done (default False)""")
73 |         io.add_argument('--has-completeness', action='store_true', default=False,
74 |             help = """Toggle for specifying completeness for the supplied genome sequences. If toggled on, \nit requires supplying either --completeness or --completeness-list (default False)""")
75 |         io.add_argument('--completeness', type=float, metavar='FLOAT', default=None,
76 |             help = """Single completeness value for all genome sequences \n(i.e. all genomes have the same completeness) (default None)""")
77 |         io.add_argument('--completeness-list', type=str, metavar='PATH', default=None,
78 |             help = """Path to a list of pairs of genome file name and completeness value, separated by a tab character. \n(note: genome file names should have no duplicates, and should cover all files specified in --fna-dir) (default None)""")
79 |         io.add_argument('--missing-ratio', type=float, metavar='FLOAT', default=0.05,
80 |             help = """Parameter defining the missing ratio of core sites even when completeness is 1 (default 0.05)""")
81 |         io.add_argument('--min-pid', type=float, metavar='FLOAT', default=0,
82 |             help = """Parameter defining the minimal identity for including each aligned block, [0, 100] (default 0)""")
83 |         io.add_argument('--min-aln-len', type=int, metavar='INT', default=10,
84 |             help = """Parameter defining the minimal length for including each aligned block (default 10)""")
85 |         io.add_argument('--max-pid-delta', type=float, metavar='FLOAT', default=0.1,
86 |             help = """Parameter defining the maximum identity gap between the identity of each aligned block and \nwhole-genome ANI; all alignments with identity less than ANI * (1 - delta) will be purged, [0, 1] (default 0.1)""")
87 |         io.add_argument('--mem', action='store_true', default=False,
88 |             help = """Call SNPs by genomic segment, an option for memory saving (default False)""")
89 | 
90 |     if data_type in ['genomes', 'end_to_end']:
91 |         prep = parser.add_argument_group('preprocessing')
92 |         prep.add_argument('--keep-redundancy', action='store_true', default=False,
93 |             help="""If toggled on, Maast will skip redundancy removal and move on with all input genomes (default=False)""")
94 |         prep.add_argument('--skip-centroid', action='store_true', default=False,
95 |             help="""If toggled on, Maast will not attempt to identify and use a centroid genome for the reference (default=False)""")
96 |         prep.add_argument('--sketch-k', type=int, metavar='INT', default=21,
97 |             help="""k-mer size for building Mash sketch (default=21)""")
98 |         prep.add_argument('--sketch-size', type=int, metavar='INT', default=5000,
99 |             help="""The number of k-mers per Mash sketch (default=5000)""")
100 |         prep.add_argument('--precut', type=float, metavar='FLOAT', default=0.05,
101 |             help="""Limit searches to pairs of genomes with a distance smaller than the provided value (default=0.05)""")
102 |         prep.add_argument('--start-cutoff', type=float, metavar='FLOAT', default=0.02,
103 |             help="""The cutoff from which Maast will start to search for a distance cutoff that generates \na good number of genome clusters and tag genomes based on a given MAF (default=0.02)""")
104 |         prep.add_argument('--end-cutoff', type=float, metavar='FLOAT', default=0.0001,
105 |             help="""Similar to --start-cutoff, the cutoff at which Maast will end the search for a distance cutoff. \nThis value should be smaller than --start-cutoff (default=0.0001)""")
106 |         prep.add_argument('--range-factor', type=float, metavar='FLOAT', default=1.2,
107 |             help="""This factor times the minimum number of genomes needed for a given MAF will create \nthe upper bound of a range satisfying the search. It should be larger than 1 (default=1.2)""")
108 |         prep.add_argument('--tag-centrality', dest='centrality_method', default='degree',
109 |             choices=['degree', 'eigenvector', 'closeness', 'information', 'betweenness', 'load'],
110 |             help="""
111 | Choose the method to rank genomes by centrality in a genome cluster
112 | degree: degree centrality, which ranks genomes by the number of links (default)
113 | eigenvector: eigenvector centrality
114 | closeness: closeness centrality
115 | information: information centrality
116 | betweenness: betweenness centrality
117 | load: load centrality
118 | *for method details, see https://networkx.org/documentation/stable/reference/algorithms/centrality.html""")
119 |         prep.add_argument('--edge-weighted', action='store_true', default=False,
120 |             help="""By default, Maast will binarize the edges (convert non-zero distance values to 1) for tag genome picking.\n If toggled on, Maast will use distance values as edge weights for tag genome picking. (default=False)""")
121 |         prep.add_argument('--centroid-distance', dest='cent_dist_type', default='L1',
122 |             choices=['L1', 'L2', 'Linf'],
123 |             help="""
124 | Choose the type of distance that will be used to pick the centroid genome as the one with the least distance to other genomes in the cluster.
125 | L1: sum of distances to all other genomes
126 | L2: square root of the sum of the squared distances to all other genomes
127 | Linf: the maximum distance to all other genomes""")
128 | 
129 |     if data_type in ['genomes', 'end_to_end']:
130 |         snps = parser.add_argument_group('snp-calling')
131 |         snps.add_argument('--max-sites', type=int, metavar='INT', default=float('inf'),
132 |             help="""Maximum genomic sites to parse (use all); useful for testing (default=inf)""")
133 |         snps.add_argument('--min-prev', type=float, metavar='FLOAT', default=1.0,
134 |             help="""Minimum prevalence (default=1.0)""")
135 |         snps.add_argument('--snp-freq', type=float, metavar='FLOAT', default=0.01,
136 |             help="""Minimum minor allele frequency for SNP calling (default=0.01)""")
137 |         snps.add_argument('--max-samples', type=int, metavar='INT', default=float('inf'),
138 |             help="""Only use a subset of genomes or metagenomes for snp calling (default=inf)""")
139 | 
140 |     if data_type in ['db', 'end_to_end']:
141 |         db = parser.add_argument_group('db-building')
142 |         if data_type in ['db']:
143 |             db.add_argument('--ref-genome', type=str, dest='ref_genome', required=True,
144 |                 help="""Path to reference genome sequence file (required)""")
145 |             db.add_argument('--vcf', type=str, dest='vcf', required=True,
146 |                 help="""Path to a vcf file describing core snps/genetic variants called based on \nmultiple sequence alignments (required)""")
147 |             db.add_argument('--msa', type=str, dest='msa', required=True,
148 |                 help="""Path to multiple sequence alignment file (required)""")
149 |             db.add_argument('--tag-fna-list', type=str, dest='tag_list', required=True,
150 |                 help="""Path to a list of paths to the tag genomes (FASTA format) which are included \nin the multiple sequence alignment file (required)""")
151 |             db.add_argument('--fna-dir', type=str, dest='fna_dir', default=None,
152 |                 help="""Path to the directory of genomes in FASTA format used for validating SNP-covering k-mers (default=None)""")
153 |             db.add_argument('--coords', type=str, dest='coords', default=None,
154 |                 help="""Path to core genome block coordinate file (default=None)""")
155 | 
156 |     if data_type in ['db', 'end_to_end']:
157 |         db.add_argument('--genome-name', dest='genome_name', type=str, default='100000',
158 |             help="""Name of the core-genome corresponding to INPUT. Should be six digits \nwith the first digit in [1, 9] (default=100000)""")
159 |         db.add_argument('--kmer-type', dest='kmer_type', default='all',
160 |             choices=['all', 'center'],
161 |             help="""
162 | Choose the type of kmers to be fetched
163 | all: all eligible kmers that
164 | 1) cover a snp at any position
165 | and 2) do not cover any bad sites (e.g. N or -)
166 | and 3) are well contained in their coordinate division (default)
167 | center: all kmers whose target snps are at their centers.""")
168 |         db.add_argument('--snp-cover', dest='snp_type', default='all',
169 |             choices=['all', 'l1-tags', 'l2-tags'],
170 |             help="""
171 | Choose the object to kmerize
172 | all: all snps from the cluster will be attempted for kmer search; most kmers (default)
173 | l1-tags: only representative snps from all snp blocks will be attempted
174 | l2-tags: only representative snps from representative snp blocks will be attempted; fewest kmers
175 | * note: all kmers must uniquely match an allele and intersect >= 1 SNP""")
176 | 
177 |     if data_type in ['genotype', 'end_to_end']:
178 |         genotype_input = parser.add_argument_group('genotype_input')
179 | 
180 |         if data_type in ['genotype']:
181 |             genotype_input.add_argument('--in-dir', type=str, metavar='PATH', required=True,
182 |                 help = """Path to directory of raw-read-files in FASTQ format (.fastq or .fq; gzipped or not) \nor whole-genome sequences in FASTA format (.fna, .fa, .fsa or .fasta) (required)""")
183 |             genotype_input.add_argument('--ref-genome', type=str, dest='ref_genome', required=True,
184 |                 help="""Path to reference genome sequence file (required)""")
185 |             genotype_input.add_argument('--db', type=str, metavar='PATH', dest='kmer_db_path', required=True,
186 |                 help = """Path to the SNP-covering k-mer database file (e.g. kmer_db.bin) built by the db subcommand (required)""")
187 |             genotype_input.add_argument('--vcf', type=str, dest='vcf', required=True,
188 |                 help="""Path to a vcf file describing core snps/genetic variants called based on \nmultiple sequence alignments (required)""")
189 |             single_genome = parser.add_argument_group('genome-genotyping')
190 |             single_genome.add_argument('--min-pid', type=float, metavar='FLOAT', default=0,
191 |                 help = """Parameter defining the minimal identity for including each aligned block, [0, 100] (default=0)""")
192 |             single_genome.add_argument('--min-aln-len', type=int, metavar='INT', default=10,
193 |                 help = """Parameter defining the minimal length for including each aligned block (default=10)""")
194 |             single_genome.add_argument('--max-pid-delta', type=float, metavar='FLOAT', default=0.1,
195 |                 help = """Parameter defining the maximum identity gap between the identity of each aligned block and \nwhole-genome ANI; all alignments with identity less than ANI * (1 - delta) will be purged, [0, 1] (default=0.1)""")
196 | 
197 |     if data_type in ['genotype', 'end_to_end']:
198 |         genotype_input.add_argument('--merge-pairs', action='store_true', default=False,
199 |             help = """Flag to merge paired raw-read files in --in-dir; pairs are indicated by the extensions '_1*' and '_2*'""")
200 | 
201 |         align = parser.add_argument_group('reads-genotyping')
202 |         align.add_argument('--mode', default='very-sensitive',
203 |             choices=['very-fast', 'fast', 'sensitive', 'very-sensitive'],
204 |             help = """Alignment speed/sensitivity (default=very-sensitive)""")
205 |         align.add_argument('--max-reads', type=int, metavar='INT',
206 |             help = """Maximum # reads to use from each FASTQ file (default=None; use all)""")
207 | 
208 |     if data_type in ['genomes', 'genotype', 'end_to_end']:
209 |         io.add_argument('--subset-list', type=str, metavar='PATH', default=None,
210 |             help = """Path to a file containing the names of the full set or a subset of the files in the input directory. \nFiles not in the list will not be included for snp calling (default=None; use all)""")
\nFiles not in the list will be excluded from snp calling (default=None; use all)""") 211 | 212 | if data_type in ['tree']: 213 | tree_io = parser.add_argument_group('tree_io') 214 | tree_io.add_argument('--input-dir', type=str, dest='input_dir', required=True, 215 | help="""Input directory containing genotype result files generated by the Maast genotype command (required)""") 216 | tree_io.add_argument('--input-list', type=str, dest='input_list', default=None, 217 | help="""A list of input pairs, one pair per row; each row contains a path to a genotype result file generated \nby the Maast genotype command and a unique name for the file (default=None). 218 | The path and name must be separated by a tab. 219 | Example 220 | /file/path/1 name1 221 | /file/path/2 name2 222 | /file/path/3 name3 223 | ...""") 224 | tree_io.add_argument('--min-sites', type=int, dest='min_sites_per_sample', default=1000, 225 | help="""Minimum number of SNP sites. Any allele sequence with fewer non-empty sites than \nthis value will not be included (default=1000)""") 226 | tree_io.add_argument('--max-gap-ratio', type=float, dest='max_gap_ratio', default=0.5, 227 | help="""Maximum ratio of gaps. Any allele sequence with a gap ratio higher than this value \nwill not be included (default=0.5)""") 228 | tree_io.add_argument('--min-site-prev', type=float, dest='min_site_prev', default=0.9, 229 | help="""Minimum site prevalence. Any site with an actual allele present in a fraction of sequences \nlower than this value will not be included (default=0.9)""") 230 | tree_io.add_argument('--min-MAF', type=float, dest='min_maf', default=0.01, 231 | help="""Minimum minor allele frequency. Any site with MAF lower than this value will not be included (default=0.01)""") 232 | tree_io.add_argument('--min-MAC', type=float, dest='min_mac', default=1, 233 | help="""Minimum minor allele count. Any site with MAC lower than this value will not be included (default=1)""") 234 | tree_io.add_argument('--min-depth', type=float, dest='min_depth', default=1, 235 | help="""Minimum read depth. Any site supported by fewer reads than this value will not be included. \nThis option only applies to genotypes identified from sequencing reads. \nThe default is 1; any value >1 will effectively exclude all whole-genome assemblies from the analysis. \nCaution is advised (default=1)""")
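# [Editor's note] Interpretation of the tree filters above (inferred from the help texts,
# not verified against concat_alleles): --min-sites and --max-gap-ratio drop whole allele
# sequences (samples), while --min-site-prev, --min-MAF, --min-MAC and --min-depth drop
# individual sites before the concatenated alignment is built.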
236 | 237 | misc = parser.add_argument_group('misc') 238 | misc.add_argument("-h", "--help", action="help", 239 | help="""Show this help message and exit""") 240 | # to use all CPUs, set default=multiprocessing.cpu_count() 241 | misc.add_argument('--threads', type=int, metavar='INT', default=1, 242 | help="""Number of CPUs to use (default=1)""") 243 | misc.add_argument('--overwrite', dest='overwrite', action='store_true', help="""Overwrite existing output files""") 244 | 245 | args = vars(parser.parse_args()) 246 | 247 | args['data_type'] = data_type 248 | 249 | return args 250 | 251 | def run_command(cmd, env=None): 252 | import subprocess as sp 253 | if env: 254 | p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE, env=env) 255 | else: 256 | p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE) 257 | out, err = p.communicate() 258 | if p.returncode != 0: 259 | err_msg = "\nError: the following returned non-zero status: '%s':\n" % cmd 260 | err_msg += "\n%s" % err.decode('utf-8') 261 | sys.exit(err_msg) 262 | else: 263 | return out.decode('utf-8'), err.decode('utf-8') 264 | 265 | def parallel(function, argument_list, threads): 266 | """ Based on: https://gist.github.com/admackin/003dd646e5fadee8b8d6 """ 267 | import multiprocessing as mp 268 | import signal 269 | import time 270 | 271 | def init_worker(): 272 | signal.signal(signal.SIGINT, signal.SIG_IGN) 273 | 274 | pool = mp.Pool(int(threads), init_worker) 275 | 276 | try: 277 | results = [] 278 | for arguments in argument_list: 279 | p = pool.apply_async(function, args=arguments) 280 | results.append(p) 281 | pool.close() 282 | 283 | while True: 284 | if all(r.ready() for r in results): 285 | return [r.get() for r in results] 286 | time.sleep(1) 287 | 288 | except KeyboardInterrupt: 289 | pool.terminate() 290 | pool.join() 291 | sys.exit("\nKeyboardInterrupt") 292 | 293 | def reformat_sequence_headers(args): 294 | """ 295 | Reformat sequence headers in input genomes to prevent parsnp from crashing 296 | """ 297 | import Bio.SeqIO 298 | if 'fna_dir' in args: 299 | try: os.makedirs(args['out_dir']+'/temp/genomes') 300 | except: pass 301 | for file in os.listdir(args['fna_dir']): 302 | infile = open(args['fna_dir']+'/'+file) 303 | outfile = open(args['out_dir']+'/temp/genomes/'+file, 'w') 304 | for seq in Bio.SeqIO.parse(infile, 'fasta'): 305 | seq.id = seq.id.replace('-', '_') 306 | seq.seq = str(seq.seq).upper() 307 | outfile.write('>'+seq.id+'\n'+seq.seq+'\n') 308 | infile.close() 309 | outfile.close() 310 | args['fna_dir'] = args['out_dir']+'/temp/genomes' 311 | 312 | if 'rep_fna' in args and args['rep_fna'] is not None: 313 | infile = open(args['rep_fna']) 314 | outfile = open(args['out_dir']+'/temp/'+os.path.basename(args['rep_fna']), 'w') 315 | for seq in Bio.SeqIO.parse(infile, 'fasta'): 316 | seq.id = seq.id.replace('-', '_') 317 | seq.seq = str(seq.seq).upper() 318 | outfile.write('>'+seq.id+'\n'+seq.seq+'\n') 319 | infile.close() 320 | outfile.close() 321 | args['rep_fna'] = args['out_dir']+'/temp/'+os.path.basename(args['rep_fna']) 322 | 323 | def locate_fpaths(args, in_dir, rep_fna=None, subset_list=None): 324 | subset_map = dict() 325 | 326 | for f in os.listdir(in_dir): 327 | subset_map[f] = 1 328 | 329 | if subset_list is not None: 330 | subset_map = dict() 331 | with open(subset_list, 'r') as fh: 332 | for ln in fh: 333 | subset_map[ln.rstrip()] = 1 334 | 335 | args["subset_map"] = subset_map 336 | 337 | ref_path = "" 338 | fpaths = [] 339 | 340 | # Use the largest genome file in the directory 
as the reference instead of selecting one at random 341 | lg_fpath = "" 342 | cur_size = 0 343 | for f in os.listdir(in_dir): 344 | if f in subset_map: 345 | fpath = in_dir.rstrip('/')+'/'+f 346 | ftype = id_input_type(fpath) 347 | 348 | if os.path.isfile(fpath) and ftype == "fasta": 349 | fstats = os.stat(fpath) 350 | fpaths.append(fpath) 351 | if fstats.st_size >= cur_size: 352 | cur_size = fstats.st_size 353 | lg_fpath = fpath 354 | else: 355 | sys.stderr.write("skip {}: not fasta format\n".format(fpath)) 356 | 357 | else: 358 | sys.stderr.write("skip {}\n".format(f)) 359 | 360 | if rep_fna is not None: # Use the specified reference genome 361 | ref_path = rep_fna 362 | else: 363 | ref_path = lg_fpath 364 | 365 | args['rep_fna'] = ref_path 366 | args['fna_paths'] = fpaths 367 | 368 | def detect_single_chrom(ref_path): 369 | single_chrom = True 370 | chrom_cnt = 0 371 | with open(ref_path, 'r') as fh: 372 | for line in fh: 373 | if line[0] == '>': 374 | chrom_cnt = chrom_cnt + 1 375 | 376 | if chrom_cnt == 1: 377 | pass 378 | else: 379 | single_chrom = False 380 | break 381 | 382 | return single_chrom 383 | 384 | def register_run_id(args, in_dir, single=False): 385 | args['run_id'] = in_dir.rstrip('/').split('/')[-1] 386 | 387 | if single is True: 388 | args['run_id'] = args['run_id'] + "_single" 389 | 390 | return args['run_id'] 391 | 392 | def register_msa_id(args, ref_path, fpaths): 393 | order_names = [] 394 | 395 | for fpath in fpaths: 396 | order_names.append(fpath.rstrip('/').split('/')[-1]) 397 | 398 | order_names.append(ref_path.rstrip('/').split('/')[-1]) 399 | 400 | in_string = "".join(order_names) 401 | args['msa_id'] = hashlib.md5(in_string.encode()).hexdigest() 402 | 403 | return args['msa_id'] 404 | 405 | def auto_min_pid_by_delta(coords_path, idt_delta): 406 | min_pid_by_delta = 0 407 | 408 | # fields = [('s1',int),('e1',int), 409 | # ('s2',int),('e2',int), 410 | # ('len1',int),('len2',int), 411 | # ('pid',float), 412 | # ('c1',str),('c2',str)] 413 | 414 | pids = [] 415 | with open(coords_path) as f: 416 | for i in range(5): 417 | next(f) 418 | for l in f: 419 | values = l.replace(' | ', ' ').split() 420 | pid = float(values[6]) 421 | pids.append(pid) 422 | 423 | avg_pid = 0.7 424 | if len(pids) != 0: 425 | avg_pid = sum(pids)/len(pids) 426 | 427 | min_pid_by_delta = avg_pid * (1 - idt_delta) 428 | 429 | return min_pid_by_delta 430 | 
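# [Editor's note] Sketch of the per-genome alignment routine below, assuming standard
# MUMmer4 tooling (nucmer, delta-filter, show-coords/snps/diff):
#   1) nucmer aligns one query genome against the reference;
#   2) delta-filter keeps one-to-one blocks passing the user's --min-pid / --min-aln-len floor;
#   3) show-coords exports block coordinates, from which auto_min_pid_by_delta() above
#      derives the average block identity (a proxy for whole-genome ANI);
#   4) delta-filter runs again with the tightened cutoff avg_pid * (1 - max_pid_delta);
#   5) show-coords, show-snps and show-diff export the final blocks for downstream use.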
"%s/%s.filter.delta.1 " % (out_dir, genome_id) 461 | command += "> %s/%s" % (out_dir, 'coords.tmp') 462 | out, err = run_command(command) 463 | log.write(str(out)+'\n'+str(err)) 464 | 465 | coords_path = "{}/{}".format(out_dir, 'coords.tmp') 466 | min_pid_by_delta = auto_min_pid_by_delta(coords_path, max_pid_delta) 467 | 468 | command = "delta-filter -q -r " 469 | command += "-i %s " % str(min_pid_by_delta) 470 | command += "-l %s " % str(min_aln_len) 471 | command += "%s/%s.delta " % (out_dir, genome_id) 472 | command += "> %s/%s.filter.delta" % (out_dir, genome_id) 473 | out, err = run_command(command) 474 | 475 | for utility in ['coords', 'snps', 'diff']: 476 | command = "show-%s " % utility 477 | command += "%s/%s.filter.delta " % (out_dir, genome_id) 478 | command += "> %s/%s" % (out_dir, utility) 479 | out, err = run_command(command) 480 | log.write(str(out)+'\n'+str(err)) 481 | 482 | 483 | def run_mummer4(args): 484 | fpaths = args['fna_paths'] 485 | if 'tag_genome_paths' in args: 486 | fpaths = args['tag_genome_paths'] 487 | 488 | ref_fpath = args['rep_fna'] 489 | if 'tag_ref' in args: 490 | ref_fpath = args['tag_ref'] 491 | 492 | register_run_id(args, args['fna_dir']) 493 | register_msa_id(args, ref_fpath, fpaths) 494 | 495 | print("reference genome path: %s" % ref_fpath) 496 | 497 | args['mummer4_dir'] = args['out_dir']+'/temp/mummer4/'+args['run_id'] 498 | try: os.makedirs(args['mummer4_dir']) 499 | except: pass 500 | 501 | shutil.copy(ref_fpath, os.path.join(args['mummer4_dir'], 'reference.fna')) 502 | 503 | arg_list = [] 504 | rep_id = '.'.join(ref_fpath.split('/')[-1].split('.')[:-1]) 505 | 506 | print("[paired alignment]: start") 507 | for fpath in fpaths: 508 | genome_id = '.'.join(fpath.split('/')[-1].split('.')[:-1]) 509 | out_dir = '%s/aln/%s' % (args['mummer4_dir'], genome_id) 510 | arg_list.append([fpath, genome_id, ref_fpath, rep_id, out_dir, args['skip_align'], args['min_pid'], args['min_aln_len'], args['max_pid_delta'], 1]) 511 | 512 | print("[paired alignment]: done") 513 | 514 | parallel(run_mummer4_single, arg_list, args['threads']) 515 | 516 | msa_path = gen_msa.build_msa(indir=args['mummer4_dir'], overwrite=True, subset=args["subset_map"]) 517 | 518 | shutil.copy(os.path.join(args['mummer4_dir'], 'reference.fna'), args['out_dir']) 519 | 520 | args['msa_path'] = args['out_dir'] + '/tag_msa.fna' 521 | shutil.move(msa_path, args['msa_path']) 522 | 523 | args['msa_type'] = 'xmfa-mummer4' 524 | 525 | args['tag_list_path'] = args['out_dir'] + '/tag_paths.list' 526 | 527 | with open(args['tag_list_path'], 'w') as fh: 528 | for fpath in fpaths: 529 | fh.write("{}\n".format(fpath.rstrip())) 530 | 531 | 532 | def run_mash_scketch(args): 533 | ref_fpath = args['rep_fna'] 534 | fpaths = args['fna_paths'] 535 | 536 | register_run_id(args, args['fna_dir']) 537 | register_msa_id(args, ref_fpath, fpaths) 538 | 539 | print("reference genome path: %s" % ref_fpath) 540 | 541 | args['mash_dir'] = args['out_dir']+'/temp/mash/'+args['run_id'] 542 | 543 | try: os.makedirs(args['mash_dir']) 544 | except: pass 545 | 546 | args['fna_list_path'] = args['mash_dir'] + '/in_fna.list' 547 | 548 | with open(args['fna_list_path'], 'w') as fh: 549 | for fpath in fpaths: 550 | fh.write("{}\n".format(fpath)) 551 | 552 | print("[building mash sketch]: start") 553 | 554 | command = "mash sketch " 555 | command += "-k %s " % str(args['sketch_k']) 556 | command += "-s %s " % str(args['sketch_size']) 557 | command += "-p %s " % str(args['threads']) 558 | command += "-o %s " % 
560 | 561 | out, err = run_command(command) 562 | with open(args['logfile'], 'a') as logger: 563 | logger.write(str(out)+'\n'+str(err)) 564 | 565 | args['mash_sketch_path'] = args['mash_dir']+'/mash_sketch.msh' 566 | 567 | def run_mash_dist(args): 568 | sketch_path = args['mash_sketch_path'] 569 | 570 | assert os.path.exists(sketch_path) 571 | 572 | args['mash_dist_path'] = args['mash_dir'] + '/mash_dist.tsv' 573 | 574 | print("[calculating mash distance]: start") 575 | 576 | command = "mash dist " 577 | command += "-p %s " % str(args['threads']) 578 | command += "%s %s " % (sketch_path, sketch_path) 579 | command += "> %s " % args['mash_dist_path'] 580 | 581 | out, err = run_command(command) 582 | with open(args['logfile'], 'a') as logger: 583 | logger.write(str(out)+'\n'+str(err)) 584 | 585 | def do_precut(args): 586 | dist_path = args['mash_dist_path'] 587 | 588 | assert os.path.exists(dist_path) 589 | 590 | args['cut_dist_path'] = args['mash_dir'] + '/mash_dist.cut.tsv' 591 | 592 | print("[cut mash distance: {}]: start".format(str(args['precut']))) 593 | 594 | command = "awk '$3 < %s' " % str(args['precut']) 595 | command += "%s " % dist_path 596 | command += "> %s " % args['cut_dist_path'] 597 | 598 | out, err = run_command(command) 599 | with open(args['logfile'], 'a') as logger: 600 | logger.write(str(out)+'\n'+str(err)) 601 | 602 | def id_clusters(args): 603 | run_mash_sketch(args) 604 | 605 | run_mash_dist(args) 606 | 607 | s_cut = args['start_cutoff'] 608 | e_cut = args['end_cutoff'] 609 | r_fac = args['range_factor'] 610 | 611 | total_n = len(args['fna_paths']) 612 | 613 | maf = args['snp_freq'] 614 | 615 | critical_n = math.ceil(1 / maf) 616 | 617 | do_precut(args) 618 | dist_path = args['cut_dist_path'] 619 | assert os.path.exists(dist_path) 620 | 621 | optimal_clusters, optimal_d, optimal_n = [], None, None 622 | while s_cut <= args['precut']: 623 | optimal_clusters, optimal_d, optimal_n, firstcut_exit = id_genome_clusters.build_genome_blocks(dist_path, total_n, critical_n, s_cut, e_cut, r_fac, args['centrality_method'], args['edge_weighted']) 624 | if firstcut_exit is True: 625 | s_cut = s_cut + 0.01 626 | else: 627 | break 628 | 629 | 630 | clust_genomes = dict() 631 | tag_genomes = [] 632 | for cluster in optimal_clusters: 633 | tag_genomes.append(cluster.tag_genome) 634 | for genome in cluster.genomes: 635 | clust_genomes[genome] = 1 636 | 637 | 638 | for fpath in args['fna_paths']: 639 | if fpath not in clust_genomes: 640 | tag_genomes.append(fpath) 641 | 642 | args['tag_genome_paths'] = tag_genomes 643 | 644 | def id_tag_ref(args): 645 | if 'mash_dist_path' not in args or not os.path.exists(args['mash_dist_path']): 646 | run_mash_sketch(args) 647 | run_mash_dist(args) 648 | 649 | dist_path = args['mash_dist_path'] 650 | 651 | tag_paths = args['fna_paths'] 652 | if 'tag_genome_paths' in args and len(args['tag_genome_paths']) > 1: 653 | tag_paths = args['tag_genome_paths'] 654 | 655 | centroid = id_centroid.identify(tag_paths, dist_path, args['cent_dist_type']) 656 | 657 | print(centroid) 658 | 659 | args['tag_ref'] = centroid 660 | args['rep_fna'] = centroid 661 | 662 | def run_kmerset_validate(args): 663 | assert os.path.exists(args['kmer_set']) 664 | assert os.path.exists(args['tag_list']) 665 | 666 | args['kmer_prof_path'] = args['out_dir']+'/kmer_prof.tsv' 667 | 668 | args['check_fna_paths'] = args['out_dir']+'/check_fna_paths.list' 669 | if 'fna_paths' in args: 670 | with 
open(args['check_fna_paths'], 'w') as fh: 671 | for fpath in args['fna_paths']: 672 | fh.write("{}\n".format(fpath)) 673 | 674 | print("[validating kmer set]: start") 675 | 676 | command = "callm_db_val " 677 | command += "-d %s " % args['kmer_set'] 678 | command += "-n %s " % args['genome_name'] 679 | command += "-t %s " % args['threads'] 680 | #command += "-L %s " % args['tag_list'] 681 | command += "-L %s " % args['check_fna_paths'] 682 | command += "-o %s " % args['kmer_prof_path'] 683 | 684 | out, err = run_command(command) 685 | with open(args['logfile'], 'a') as logger: 686 | logger.write(str(out)+'\n'+str(err)) 687 | 688 | def filter_kmers(args): 689 | assert os.path.exists(args['kmer_prof_path']) 690 | 691 | args['filtered_kmer_path'] = args['out_dir']+'/selected_kmers.tsv' 692 | 693 | with open(args['filtered_kmer_path'], 'w') as fw: 694 | with open(args['kmer_prof_path'], 'r') as fh: 695 | for line in fh: 696 | items = line.rstrip().split('\t') 697 | 698 | nonsingle_hit = int(items[8]) 699 | 700 | null_hit = int(items[6]) 701 | single_hit = int(items[7]) 702 | 703 | ref_hit = int(items[10]) 704 | alt_hit = int(items[11]) 705 | 706 | if nonsingle_hit > 0: 707 | continue 708 | 709 | if single_hit / (single_hit + null_hit) < 0.5: 710 | continue 711 | 712 | if ref_hit == 0 or alt_hit == 0: 713 | continue 714 | 715 | rec1 = "{}\t{}0{}".format(items[2], items[9], items[0]) 716 | rec2 = "{}\t{}1{}".format(items[3], items[9], items[0]) 717 | rec3 = "{}\t{}0{}".format(items[4], items[9], items[0]) 718 | rec4 = "{}\t{}1{}".format(items[5], items[9], items[0]) 719 | 720 | fw.write("{}\n{}\n{}\n{}\n".format(rec1, rec2, rec3, rec4)) 721 | 722 | def run_build_db(args): 723 | assert args['filtered_kmer_path'] 724 | 725 | args['kmer_db_path'] = args['out_dir']+'/kmer_db.bin' 726 | 727 | command = "callm_db_build " 728 | command += "%s " % args['filtered_kmer_path'] 729 | command += "> %s " % args['kmer_db_path'] 730 | 731 | out, err = run_command(command) 732 | with open(args['logfile'], 'a') as logger: 733 | logger.write(str(out)+'\n'+str(err)) 734 | 735 | def read_input_dir(args, in_dir, subset_list=None): 736 | subset_map = dict() 737 | 738 | for f in os.listdir(in_dir): 739 | subset_map[f] = 1 740 | 741 | if subset_list is not None: 742 | subset_map = dict() 743 | with open(subset_list, 'r') as fh: 744 | for ln in fh: 745 | subset_map[ln.rstrip()] = 1 746 | 747 | args["subset_map"] = subset_map 748 | 749 | fna_paths = [] 750 | fq_paths = [] 751 | 752 | for f in os.listdir(in_dir): 753 | if f in subset_map: 754 | fpath = in_dir.rstrip('/')+'/'+f 755 | print(fpath) 756 | 757 | if os.path.isdir(fpath): 758 | continue 759 | 760 | assert os.path.isfile(fpath) 761 | ftype = id_input_type(fpath) 762 | 763 | if ftype == "unknown": 764 | sys.stderr.write("skip {}: unknown input type\n".format(fpath)) 765 | elif ftype == "not_supported": 766 | sys.stderr.write("skip {}: compressed fasta is not supported yet\n".format(fpath)) 767 | elif ftype == "fasta": 768 | fna_paths.append(fpath) 769 | elif ftype in ["fastq", "fastq.gz", "fastq.lz4", "fastq.bz2"]: 770 | fq_paths.append(fpath) 771 | else: 772 | assert False 773 | else: 774 | sys.stderr.write("skip {}\n".format(f)) 775 | 776 | fq_pairs = [] 777 | if len(fq_paths) > 1: 778 | fq_pairs = pair_inputs(fq_paths) 779 | 780 | args['fna_paths'] = fna_paths 781 | args['fq_paths'] = fq_paths 782 | args['fq_pairs'] = fq_pairs 783 | 784 | 785 | def id_input_type(fpath): 786 | in_type = "fastq" #default 787 | 788 | fn_its = fpath.split("/")[-1].split(".") 789 | 
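# [Editor's note] Illustrative examples of the suffix handling below: 'sample.fq.gz'
# splits into ['sample', 'fq', 'gz'], so the compression suffix is peeled off before the
# format suffix is inspected; 'genome.fna' splits into ['genome', 'fna'] and is typed
# directly; a gzipped FASTA such as 'genome.fna.gz' ends up as "not_supported".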
790 | fn_end = "" 791 | if fn_its[-1] in ['gz', 'lz4', 'bz2']: 792 | fn_end = fn_its[-2] 793 | else: 794 | fn_end = fn_its[-1] 795 | 796 | if fn_end in ['fa', 'fsa', 'fna', 'fasta']: 797 | in_type = "fasta" 798 | elif fn_end in ['fq', 'fastq']: 799 | in_type = "fastq" 800 | else: 801 | in_type = "unknown" 802 | 803 | if fn_its[-1] in ['gz', 'lz4', 'bz2']: 804 | if fn_end in ['fa', 'fsa', 'fna', 'fasta']: 805 | in_type = "not_supported" 806 | else: 807 | in_type = in_type + '.' + fn_its[-1] 808 | 809 | return in_type 810 | 811 | def pair_inputs(fq_paths): 812 | pairs = dict() 813 | 814 | for fqpath in fq_paths: 815 | fn_its = fqpath.split("/")[-1].split(".") 816 | fq_name_parts = fn_its[0].split("_") 817 | 818 | if len(fq_name_parts) != 2: 819 | continue 820 | 821 | if fq_name_parts[1] not in ["1", "2"]: 822 | continue 823 | 824 | if fq_name_parts[0] not in pairs: 825 | pairs[fq_name_parts[0]] = dict() 826 | 827 | pairs[fq_name_parts[0]][fq_name_parts[1]] = fqpath 828 | 829 | real_pairs = [] 830 | for name in pairs.keys(): 831 | if "1" in pairs[name] and "2" in pairs[name]: 832 | real_pairs.append([pairs[name]["1"], pairs[name]["2"], name]) 833 | 834 | return real_pairs 835 | 836 | def genotype_single_genomes(args): 837 | ref_fpath = args['ref_genome'] 838 | fpaths = args['fna_paths'] 839 | 840 | print("reference genome path: %s" % ref_fpath) 841 | 842 | args['genotype_dir'] = args['out_dir']+'/temp/genotype' 843 | try: os.makedirs(args['genotype_dir']) 844 | except: pass 845 | 846 | args['gt_results_dir'] = args['out_dir']+'/gt_results' 847 | try: os.makedirs(args['gt_results_dir']) 848 | except: pass 849 | 850 | arg_list = [] 851 | arg_list_gt = [] 852 | rep_id = '.'.join(ref_fpath.split('/')[-1].split('.')[:-1]) 853 | 854 | global ref 855 | ref = read_ref(ref_fpath) 856 | 857 | global genos 858 | genos = extract_genotypes(args['vcf']) 859 | 860 | print("[paired alignment]: start") 861 | for fpath in fpaths: 862 | genome_id = fpath.split('/')[-1] 863 | out_dir = '%s/aln/%s' % (args['genotype_dir'], genome_id) 864 | arg_list.append([fpath, genome_id, ref_fpath, rep_id, out_dir, False, args['min_pid'], args['min_aln_len'], args['max_pid_delta'], 1]) 865 | 866 | coord_path = out_dir + '/coords' 867 | snp_path = out_dir + '/snps' 868 | output = args['gt_results_dir'] + '/' + genome_id + ".tsv" 869 | arg_list_gt.append([genos, ref, coord_path, snp_path, output]) 870 | 871 | parallel(run_mummer4_single, arg_list, args['threads']) 872 | parallel(run_single_fasta_gt, arg_list_gt, args['threads']) 873 | 874 | print("[paired alignment]: done") 875 | 876 | def read_ref(fpath): 877 | seq_recs = list(SeqIO.parse(fpath, "fasta")) 878 | 879 | rec_table = dict() 880 | for rec in seq_recs: 881 | rec_table[rec.id] = str(rec.seq).upper() 882 | 883 | return rec_table 884 | 885 | def extract_genotypes(vcf_path): 886 | genos = [] 887 | with open(vcf_path, 'r') as fh: 888 | for l in fh: 889 | if l[0] == "#": 890 | continue 891 | else: 892 | values = l.rstrip().split('\t')[:5] 893 | 894 | chrom = values[0] 895 | pos_r = int(values[1]) 896 | gid = values[2] 897 | allele_ma = values[3] 898 | allele_mi = values[4] 899 | 900 | if len(allele_mi) > 1: 901 | continue 902 | 903 | genos.append([chrom, str(pos_r), gid, allele_ma, allele_mi]) 904 | 905 | return genos 906 | 907 | def run_single_fasta_gt(genos, ref, coord_path, snp_path, output): 908 | coord_map = dict() 909 | 910 | with open(coord_path, 'r') as fh: 911 | for i in range(5): 912 | next(fh) 913 | for l in fh: 914 | values = l.replace(' | ', ' ').split()
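# [Editor's note] Assuming default show-coords output, stripping the ' | ' separators
# leaves columns [S1, E1, S2, E2, LEN1, LEN2, %IDY, ref_tag, qry_tag], so values[0] and
# values[1] below are the reference start/end and values[7] is the reference sequence name.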
915 | # positions in the coords file are 1-indexed, vs. 0-indexed in the (Maast-generated) vcf 916 | start = int(values[0]) - 1 917 | end = int(values[1]) - 1 918 | chrom = values[7] 919 | 920 | assert end > start 921 | 922 | if chrom not in coord_map: 923 | coord_map[chrom] = [] 924 | 925 | coord_map[chrom].append([start, end]) 926 | 927 | 928 | snp_map = dict() 929 | with open(snp_path) as fh: 930 | for i in range(5): 931 | next(fh) 932 | for l in fh: 933 | values = l.replace(' | ', ' ').split() 934 | # positions in the snps file are 1-indexed, vs. 0-indexed in the vcf 935 | pos_r = int(values[0]) - 1 936 | allele_r = values[1] 937 | allele_a = values[2] 938 | chrom = values[10] 939 | 940 | if allele_r == "." or allele_a == ".": 941 | continue 942 | 943 | if chrom not in snp_map: 944 | snp_map[chrom] = dict() 945 | 946 | snp_map[chrom][pos_r] = [allele_r, allele_a] 947 | 948 | gtypes = [] 949 | for geno in genos: 950 | chrom = geno[0] 951 | pos_r = int(geno[1]) 952 | gid = geno[2] 953 | allele_ma = geno[3] 954 | allele_mi = geno[4] 955 | 956 | if chrom not in coord_map: 957 | continue 958 | 959 | for g_range in coord_map[chrom]: 960 | if pos_r >= g_range[0] and pos_r <= g_range[1]: 961 | if chrom in snp_map and pos_r in snp_map[chrom]: 962 | if allele_mi == snp_map[chrom][pos_r][1]: 963 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '0', '1']) 964 | else: 965 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '1', '0']) 966 | else: 967 | assert chrom in ref 968 | allele_r = ref[chrom][pos_r] 969 | if allele_mi == allele_r: 970 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '0', '1']) 971 | else: 972 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '1', '0']) 973 | 974 | with open(output, 'w') as fw: 975 | for gtype in gtypes: 976 | fw.write("{}\n".format("\t".join(gtype))) 977 | 978 | def genotype_reads(args): 979 | fpaths = args['fq_paths'] 980 | 981 | args['genotype_dir'] = args['out_dir']+'/temp/genotype' 982 | try: os.makedirs(args['genotype_dir']) 983 | except: pass 984 | 985 | args['gt_results_dir'] = args['out_dir']+'/gt_results' 986 | try: os.makedirs(args['gt_results_dir']) 987 | except: pass 988 | 989 | gt_paths = [] 990 | outname = '%s/iso_gt' % args['genotype_dir'] 991 | try: os.makedirs(outname) 992 | except: pass 993 | 994 | mode = 2 995 | if args['mode'] == "very-fast": 996 | mode = 10 997 | elif args['mode'] == "fast": 998 | mode = 5 999 | elif args['mode'] == 'sensitive': 1000 | mode = 2 1001 | elif args['mode'] == 'very-sensitive': 1002 | mode = 1 1003 | else: 1004 | assert False 1005 | 1006 | command = "iso_gt_mtar " 1007 | command += "-d %s " % args['kmer_db_path'] 1008 | command += "-t %s " % args['threads'] 1009 | command += "-j %s " % mode 1010 | command += "-o %s/" % outname 1011 | command += "%{in} " 1012 | command += "-f " 1013 | 1014 | for fpath in fpaths: 1015 | command += "%s " % fpath 1016 | gt_paths.append(outname + '/' + extract_fastq_path_name(fpath) + ".tsv") 1017 | 1018 | out, err = run_command(command) 1019 | with open(args['logfile'], 'a') as logger: 1020 | logger.write(str(out)+'\n'+str(err)) 1021 | 1022 | merge_paths = [] 1023 | if args["merge_pairs"]: 1024 | assert "fq_pairs" in args 1025 | 1026 | for fq_pair in args["fq_pairs"]: 1027 | fq_1 = fq_pair[0] 1028 | fq_2 = fq_pair[1] 1029 | fq_name = fq_pair[2] 1030 | 1031 | fq_gt_1 = outname + '/' + extract_fastq_path_name(fq_1) + ".tsv" 1032 | fq_gt_2 = outname + '/' + extract_fastq_path_name(fq_2) + ".tsv"
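# [Editor's note] Pair merging (sketch): each per-mate genotype file holds rows of
# <SNP-kmer id>\t<hit count>; the loop below sums the two mates' counts per id into
# fq_merge and writes one merged profile per pair. The counts live in items[1]
# (items[0] is the id), which the summation below relies on.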
1033 | 1034 | fq_merge = dict() 1035 | for fq_gt in [fq_gt_1, fq_gt_2]: 1036 | with open(fq_gt, 'r') as fh: 1037 | for line in fh: 1038 | items = line.rstrip().split('\t') 1039 | if items[0] not in fq_merge: 1040 | fq_merge[items[0]] = int(items[1]) 1041 | else: 1042 | fq_merge[items[0]] += int(items[1]) 1043 | 1044 | merge_output = outname + "/" + fq_name + ".merged.tsv" 1045 | with open(merge_output, 'w') as fw: 1046 | for snp in fq_merge.keys(): 1047 | fw.write("{}\t{}\n".format(snp, str(fq_merge[snp]))) 1048 | 1049 | merge_paths.append(merge_output) 1050 | 1051 | arg_list = [] 1052 | for gt_path in gt_paths + merge_paths: 1053 | fq_id = '.'.join(gt_path.split('/')[-1].split('.')[:-1]) 1054 | output = args['gt_results_dir'] + '/' + fq_id + '.reads.tsv' 1055 | arg_list.append([args['vcf'], gt_path, output]) 1056 | 1057 | parallel(run_parse_single, arg_list, args['threads']) 1058 | 1059 | def extract_fastq_path_name(fpath): 1060 | # chop off all leading '.' and '..' path parts 1061 | pparts = [] 1062 | real_idx = 0 1063 | for i, ppart in enumerate(fpath.split('/')): 1064 | if ppart == '.' or ppart == "..": 1065 | continue 1066 | else: 1067 | real_idx = i 1068 | break 1069 | 1070 | vpath = '/'.join(fpath.split('/')[real_idx:]) 1071 | 1072 | path_parts = vpath.split('.') 1073 | real_parts = [] 1074 | if path_parts[-1] in ['gz', 'lz4', 'bz2']: 1075 | real_parts = path_parts[:-2] 1076 | elif path_parts[-1] in ['fq', 'fastq']: 1077 | real_parts = path_parts[:-1] 1078 | else: 1079 | assert False 1080 | 1081 | return ".".join(real_parts).replace('/', '_').replace('.','_') 1082 | 1083 | 1084 | def run_parse_single(vcf_path, gt_path, output): 1085 | snp_map = dict() 1086 | 1087 | with open(gt_path, 'r') as fh: 1088 | for line in fh: 1089 | values = line.rstrip().split('\t') 1090 | snp = values[0] 1091 | count = values[1] 1092 | 1093 | allele_type = int(snp[6]) 1094 | assert allele_type in [0, 1] 1095 | 1096 | gid = snp[7:] 1097 | 1098 | if gid not in snp_map: 1099 | snp_map[gid] = [0, 0] 1100 | 1101 | snp_map[gid][allele_type] = snp_map[gid][allele_type] + int(count) 1102 | 1103 | gtypes = [] 1104 | with open(vcf_path, 'r') as fh: 1105 | for l in fh: 1106 | if l[0] == "#": 1107 | continue 1108 | else: 1109 | values = l.rstrip().split('\t')[:5] 1110 | 1111 | chrom = values[0] 1112 | pos_r = int(values[1]) 1113 | gid = values[2] 1114 | allele_ma = values[3] 1115 | allele_mi = values[4] 1116 | 1117 | if len(allele_mi) > 1: 1118 | continue 1119 | 1120 | if gid in snp_map: 1121 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, str(snp_map[gid][0]), str(snp_map[gid][1])]) 1122 | 1123 | with open(output, 'w') as fw: 1124 | for gtype in gtypes: 1125 | fw.write("{}\n".format("\t".join(gtype))) 1126 | 1127 | def call_snps_main(args): 1128 | cmdl_str = ' '.join(sys.argv[1:]) 1129 | 1130 | if args['data_type'] in ['genomes', 'end_to_end']: 1131 | locate_fpaths(args, args['fna_dir'], args['rep_fna'], args['subset_list']) 1132 | 1133 | 1134 | if args['data_type'] in ['genomes', 'end_to_end']: 1135 | if args["has_completeness"]: 1136 | if args["completeness"]: 1137 | args["min_prev"] = (1 - float(args["missing_ratio"])) * float(args["completeness"]) 1138 | elif args["completeness_list"]: 1139 | completeness_map = {} 1140 | with open(args["completeness_list"], 'r') as fh: 1141 | for line in fh: 1142 | items = line.rstrip().split('\t') 1143 | completeness_map[items[0]] = float(items[1]) 1144 | 1145 | ref_fpath = args['rep_fna'] 1146 | fpaths = args['fna_paths'] 1147 | 1148 | completenesses = [] 1149 | 1150 | for fpath in fpaths: 1151 | fname = fpath.rstrip('/').split('/')[-1] 1152 | if fname in completeness_map: 1153 | completenesses.append(completeness_map[fname]) 1154 | else: 1155 | sys.exit("missing completeness value for: {}".format(fpath)) 1156 | 
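# [Editor's note] Worked example of the adjustment below: with an average completeness
# of 0.9 and a missing ratio of 0.05 (illustrative values), min_prev becomes
# (1 - 0.05) * 0.9 = 0.855, i.e. a core site only needs to appear in ~85.5% of assemblies.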
1157 | avg_completeness = sum(completenesses)/len(completenesses) 1158 | args["min_prev"] = (1 - float(args["missing_ratio"])) * avg_completeness 1159 | else: 1160 | print("[Warning] --has-completeness has no effect without a completeness value or list") 1161 | 1162 | if len(args['fna_paths']) <= 5: 1163 | sys.exit("Too few input genomes ({}); more than 5 are required".format(len(args['fna_paths']))) 1164 | 1165 | if len(args['fna_paths']) <= math.ceil(1 / args['snp_freq']): 1166 | print("[Warning] Total number of genomes ({}) < min. number of genomes required for effective SNP calling with MAF {} ({})".format(len(args['fna_paths']), args['snp_freq'], math.ceil(1 / args['snp_freq']))) 1167 | print("[Warning] Skipping tag genome selection; all genomes will be used") 1168 | args['keep_redundancy'] = True 1169 | 1170 | if args['data_type'] in ['genomes', 'end_to_end']: 1171 | if not args['keep_redundancy']: 1172 | id_clusters(args) 1173 | 1174 | if args['skip_centroid']: 1175 | assert args['rep_fna'] is not None 1176 | assert os.path.exists(args['rep_fna']) 1177 | else: 1178 | id_tag_ref(args) 1179 | 1180 | 1181 | # >>> 1. Generate multiple-genome-alignment or pileups 1182 | 1183 | # data type is genomes: use MUMmer4 to perform multiple genome alignment 1184 | start = time.time() 1185 | if args['data_type'] in ['genomes', 'end_to_end']: 1186 | print("Running mummer4; start") 1187 | run_mummer4(args) 1188 | #args['mummer4_dir'] = '/Users/jasonshi/Documents/zjshi_github/snpMLST/unit_test_raw/snps_from_genomes/Borrelia_burgdorferi_56121/temp/mummer4/54d64396-732c-42b0-8e88-3de63e8a665e/msa.fna' 1189 | # msa_path = gen_msa.build_msa(indir=args['mummer4_dir'], max_genomes=1280) 1190 | # args['msa_path'] = '/Users/jasonshi/Documents/zjshi_github/snpMLST/unit_test_raw/snps_from_genomes/Borrelia_burgdorferi_56121/temp/mummer4/54d64396-732c-42b0-8e88-3de63e8a665e/msa.fa' 1191 | # args['msa_type'] = 'xmfa-mummer4' 1192 | print("Running mummer4; done!") 1193 | print("Elapsed time: {}".format(time.time()-start)) 1194 | 1195 | 1196 | # >>> 2. 
Parse multiple-genome-alignment or pileup and call SNPs 1197 | 1198 | # fetch generator to parse msa columns or mpileup sites 1199 | start = time.time() 1200 | print("Fetching file-type-specific parser; start") 1201 | if args['data_type'] in ['genomes', 'end_to_end', 'msa']: 1202 | from align_io import msa 1203 | if args['mem']: 1204 | site_assembly = msa.iter_parse(args['msa_path'], args['msa_type'], args['max_samples']) 1205 | else: 1206 | site_assembly = msa.monolithic_parse(args['msa_path'], args['msa_type'], args['max_samples']) 1207 | 1208 | print("Fetching file-type-specific parser; done") 1209 | print("Elapsed time: {}".format(time.time()-start)) 1210 | 1211 | 1212 | # id core-genome coords and snps 1213 | start = time.time() 1214 | print("Identifying core-snps; start") 1215 | print("max sites: {}".format(args['max_sites'])) 1216 | print("min prevalence: {}".format(args['min_prev'])) 1217 | print("min MAF: {}".format(args['snp_freq'])) 1218 | 1219 | if args['mem']: 1220 | align_assembs = align_assembly.call_snps_iter(site_assembly, args['max_sites'], args['min_prev'], args['snp_freq']) 1221 | else: 1222 | align_assembs = align_assembly.call_snps(site_assembly, args['max_sites'], args['min_prev'], args['snp_freq']) 1223 | print("Identifying core-snps; done") 1224 | print("Elapsed time: {}".format(time.time()-start)) 1225 | 1226 | # sys.exit() 1227 | 1228 | single_chrom_rep = False 1229 | 1230 | if args['mem'] is True and args['rep_fna'] is not None: 1231 | single_chrom_rep = detect_single_chrom(args['rep_fna']) 1232 | 1233 | # write output files 1234 | start = time.time() 1235 | print("Writing snps to VCF; start") 1236 | if args['mem']: 1237 | header_ready = False 1238 | coords_buffer = [] 1239 | for align_assemb in align_assembs: 1240 | if len(align_assemb.snps) > 0: 1241 | if not header_ready: 1242 | vcf_io.write_coords_header(coords_buffer, args['out_dir']) 1243 | vcf_io.write_vcf_header(align_assemb.snps, args['out_dir'], cmdl_str) 1244 | header_ready = True 1245 | 1246 | # vcf_io.write_genome(core_genome.consensus_genome, args['out_dir']) 1247 | coords_buffer = coords_buffer + align_assemb.coords 1248 | vcf_io.write_vcf(align_assemb.snps, args['out_dir'], single_chrom_rep) 1249 | 1250 | vcf_io.write_coords(vcf_io.merge_coords(coords_buffer), args['out_dir']) 1251 | # vcf_io.write_coords(coords_buffer, args['out_dir']) 1252 | else: 1253 | vcf_io.write_coords_header(align_assembs.coords, args['out_dir']) 1254 | vcf_io.write_vcf_header(align_assembs.snps, args['out_dir'], cmdl_str) 1255 | vcf_io.write_coords(align_assembs.coords, args['out_dir']) 1256 | # vcf_io.write_genome(core_genome.consensus_genome, args['out_dir']) 1257 | vcf_io.write_vcf(align_assembs.snps, args['out_dir']) 1258 | print("Writing snps to VCF; done!") 1259 | print("Elapsed time: {}".format(time.time()-start)) 1260 | 1261 | 1262 | def build_db_main(args): 1263 | print("Database building; start") 1264 | args['kmer_size'] = 31 1265 | 1266 | genome_path, vcf_path, coords_path, tag_list_path = args['ref_genome'], args['vcf'], args['coords'], args['tag_list'] 1267 | k_size, k_type = args['kmer_size'], args['kmer_type'] 1268 | 1269 | if args['fna_dir'] is not None: 1270 | locate_fpaths(args, args['fna_dir']) 1271 | 1272 | genome_seq = build_db.open_genome_seq(genome_path) 1273 | #snps = build_db.open_vcf_file(vcf_path) 1274 | 1275 | coords = None 1276 | if coords_path is not None: 1277 | coords = build_db.read_coords(coords_path) 1278 | 1279 | snp_gb_pos, snp_alleles = build_db.open_vcf_file_local(vcf_path) 1280 | 
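# [Editor's note] Judging from the commented-out extraction kept below, snp_gb_pos and
# snp_alleles are presumed to hold, for each SNP, the genome-wide position (the VCF ID
# field) and the [REF, ALT] allele pair consumed by fetch_all_from_msa.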
#snp_gb_pos = [int(snp.ID) for snp in snps] 1281 | #snp_alleles = [[str(snp.REF), str(snp.ALT[0])] for snp in snps] 1282 | #snp_kmers = fetch_snp_kmers(genome_seq, snp_gb_pos, snp_alleles, k_size, k_type, coords) 1283 | 1284 | genome_seqs = build_db.load_msa(args['msa']) 1285 | snp_kmers = build_db.fetch_all_from_msa(genome_seqs, genome_seq, snp_gb_pos, snp_alleles, k_size, coords) 1286 | 1287 | args['kmer_set'] = args['out_dir'] + '/nr_kmer_set.tsv' 1288 | 1289 | build_db.dump_tsv(snp_kmers, args['kmer_set']) 1290 | 1291 | run_kmerset_validate(args) 1292 | 1293 | filter_kmers(args) 1294 | 1295 | run_build_db(args) 1296 | 1297 | print("Database building; finished") 1298 | 1299 | 1300 | def genotype_main(args): 1301 | print("Genotyping; start") 1302 | read_input_dir(args, args['in_dir'], args['subset_list']) 1303 | 1304 | try: os.makedirs(args['out_dir']) 1305 | except: pass 1306 | 1307 | if len(args["fna_paths"]) > 0: 1308 | print("Genomes found; start") 1309 | genotype_single_genomes(args) 1310 | print("Genomes found; done") 1311 | 1312 | if len(args["fq_paths"]) > 0: 1313 | print("Reads found; start") 1314 | genotype_reads(args) 1315 | print("Reads found; done") 1316 | 1317 | print("Genotyping; finished") 1318 | 1319 | def tree_main(args): 1320 | print("SNP tree building; start") 1321 | concat_alleles.concat_allele_tree(args) 1322 | print("SNP tree building; finished") 1323 | 1324 | def end2end_main(args): 1325 | try: os.makedirs(args['out_dir']) 1326 | except: pass 1327 | 1328 | args['fna_dir'] = args['in_dir'] 1329 | locate_fpaths(args, args['in_dir'], args['rep_fna'], args['subset_list']) 1330 | call_snps_main(args) 1331 | 1332 | args['kmer_size'] = 31 1333 | args['ref_genome'] = args['rep_fna'] 1334 | args['vcf'] = args['out_dir'].rstrip('/') + '/core_snps.vcf' 1335 | args['coords'] = args['out_dir'].rstrip('/') + '/coords.tsv' 1336 | args['tag_list'] = args['out_dir'].rstrip('/') + '/tag_paths.list' 1337 | args['msa'] = args['out_dir'].rstrip('/') + '/tag_msa.fna' 1338 | 1339 | build_db_main(args) 1340 | 1341 | print("Genotyping; start") 1342 | read_input_dir(args, args['in_dir'], args['subset_list']) 1343 | if len(args["fna_paths"]) > 0: 1344 | print("Genomes found; start") 1345 | genotype_single_genomes(args) 1346 | print("Genomes found; done") 1347 | 1348 | if len(args["fq_paths"]) > 0: 1349 | print("Reads found; start") 1350 | genotype_reads(args) 1351 | print("Reads found; done") 1352 | print("Genotyping; finished") 1353 | print("All output files are in {}".format(args['out_dir'])) 1354 | print("The output files include the following") 1355 | print(" reference.fna (selected reference genome)") 1356 | print(" tag_paths.list (list of selected tag genomes)") 1357 | print(" tag_msa.fna (multiple sequence alignment of tag genomes)") 1358 | print(" coords.tsv (coordinates of consensus genome)") 1359 | print(" core_snps.vcf (called SNPs in VCF format)") 1360 | print(" nr_kmer_set.tsv (raw SNP-covering k-mers)") 1361 | print(" check_fna_paths.list (a list of genomes used for validating SNP-covering k-mers)") 1362 | print(" kmer_prof.tsv (hit profile of SNP-covering k-mers)") 1363 | print(" selected_kmers.tsv (validated SNP-covering k-mers)") 1364 | print(" kmer_db.bin (optimized database of SNP-covering k-mers)") 1365 | print("The directories include") 1366 | print(" gt_results (SNP genotyping results)") 1367 | print(" temp (temporary directory hosting intermediate files)") 1368 | 1369 | def main(): 1370 | args = parse_args() 1371 | 1372 | if args['overwrite'] is True:
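# [Editor's note] os.rmdir, used here originally, only removes *empty* directories, so
# --overwrite silently did nothing for populated output directories; shutil.rmtree
# (assuming the module-level shutil import used by the copy/move calls above) removes the
# whole tree, and the bare except still keeps a missing directory from being fatal.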
1373 | try: shutil.rmtree(args['out_dir']) 1374 | except: pass 1375 | 1376 | try: os.makedirs(args['out_dir']) 1377 | except: pass 1378 | 1379 | args['logfile'] = "{}/logfile".format(args['out_dir'].rstrip('/')) 1380 | 1381 | if args['data_type'] == 'genomes': 1382 | call_snps_main(args) 1383 | elif args['data_type'] == 'db': 1384 | build_db_main(args) 1385 | elif args['data_type'] == 'genotype': 1386 | genotype_main(args) 1387 | elif args['data_type'] == 'tree': 1388 | tree_main(args) 1389 | elif args['data_type'] == 'end_to_end': 1390 | end2end_main(args) 1391 | else: 1392 | sys.exit("\nError: invalid subcommand\nSupported subcommands: genomes, db, genotype, tree, end_to_end\n") 1393 | 1394 | if __name__ == "__main__": 1395 | main() 1396 | --------------------------------------------------------------------------------