├── align_io ├── __init__.py ├── __pycache__ │ ├── msa.cpython-36.pyc │ ├── msa.cpython-37.pyc │ ├── seq_ali.cpython-36.pyc │ ├── seq_ali.cpython-37.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── xmfa_mummer4_io.cpython-36.pyc │ └── xmfa_mummer4_io.cpython-37.pyc ├── maf_io.py ├── msa.py ├── xmfa_parsnp_io.py ├── xmfa_io.py ├── xmfa_mummer4_io.py └── seq_ali.py ├── snps_io ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── gen_msa.cpython-310.pyc │ ├── gen_msa.cpython-36.pyc │ ├── gen_msa.cpython-37.pyc │ ├── vcf_io.cpython-310.pyc │ ├── vcf_io.cpython-36.pyc │ ├── vcf_io.cpython-37.pyc │ ├── __init__.cpython-310.pyc │ ├── id_centroid.cpython-310.pyc │ ├── id_centroid.cpython-36.pyc │ ├── id_centroid.cpython-37.pyc │ ├── align_assembly.cpython-310.pyc │ ├── align_assembly.cpython-36.pyc │ ├── align_assembly.cpython-37.pyc │ ├── concat_alleles.cpython-310.pyc │ ├── concat_alleles.cpython-36.pyc │ ├── id_genome_clusters.cpython-36.pyc │ ├── id_genome_clusters.cpython-37.pyc │ └── id_genome_clusters.cpython-310.pyc ├── id_centroid.py ├── vcf_var_io.py ├── gen_msa_single.py ├── gen_msa.py ├── vcf_io.py ├── id_genome_clusters.py ├── concat_alleles.py └── align_assembly.py ├── bin ├── iso_gt_mtar ├── callm_db_build ├── callm_db_val └── maast.py ├── db_io ├── __pycache__ │ ├── build_db.cpython-310.pyc │ ├── build_db.cpython-36.pyc │ └── build_db.cpython-37.pyc ├── example.sh └── build_db.py ├── conda_recipe ├── build.sh └── meta.yaml ├── Makefile ├── maast ├── LICENSE ├── src ├── callm_db_build.cpp └── callm_db_val.cpp └── README.md /align_io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /snps_io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bin/iso_gt_mtar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/bin/iso_gt_mtar -------------------------------------------------------------------------------- /bin/callm_db_build: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/bin/callm_db_build -------------------------------------------------------------------------------- /bin/callm_db_val: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/bin/callm_db_val -------------------------------------------------------------------------------- /align_io/__pycache__/msa.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/msa.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/msa.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/msa.cpython-37.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/seq_ali.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/seq_ali.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/seq_ali.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/seq_ali.cpython-37.pyc -------------------------------------------------------------------------------- /db_io/__pycache__/build_db.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/db_io/__pycache__/build_db.cpython-310.pyc -------------------------------------------------------------------------------- /db_io/__pycache__/build_db.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/db_io/__pycache__/build_db.cpython-36.pyc -------------------------------------------------------------------------------- /db_io/__pycache__/build_db.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/db_io/__pycache__/build_db.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/gen_msa.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/gen_msa.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/gen_msa.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/gen_msa.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/gen_msa.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/gen_msa.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/vcf_io.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/vcf_io.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/vcf_io.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/vcf_io.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/vcf_io.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/vcf_io.cpython-37.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_centroid.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_centroid.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_centroid.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_centroid.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_centroid.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_centroid.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/align_assembly.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/align_assembly.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/align_assembly.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/align_assembly.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/align_assembly.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/align_assembly.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/concat_alleles.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/concat_alleles.cpython-310.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/concat_alleles.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/concat_alleles.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/xmfa_mummer4_io.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/xmfa_mummer4_io.cpython-36.pyc -------------------------------------------------------------------------------- /align_io/__pycache__/xmfa_mummer4_io.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/align_io/__pycache__/xmfa_mummer4_io.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_genome_clusters.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_genome_clusters.cpython-36.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_genome_clusters.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_genome_clusters.cpython-37.pyc -------------------------------------------------------------------------------- /snps_io/__pycache__/id_genome_clusters.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjshi/Maast/HEAD/snps_io/__pycache__/id_genome_clusters.cpython-310.pyc -------------------------------------------------------------------------------- /conda_recipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## change to source dir 4 | cd ${SRC_DIR} 5 | 6 | ## compile 7 | make 8 | 9 | ## install 10 | mkdir -p $PREFIX/bin 11 | cp -r ${SRC_DIR}/* ${PREFIX}/bin/ 12 | -------------------------------------------------------------------------------- /db_io/example.sh: -------------------------------------------------------------------------------- 1 | find -maxdepth 2 -name '*nr*' | cut -d'/' -f2 | awk '{printf "igvam_dbval -d %s/nr-snp-kmer.tsv -n %s -t 2 -L <(cut -f1 ../snp_calling_hqsubsets/%s.path.list) 1> %s/kmer_profiles.tsv 2> %s/kmer_profiles.log\n", $1, $1, $1, $1, $1}' | head -n 1 2 | 3 | cat missing_species.list | xargs -I[] -n1 -P7 bash -c "python /home/ubuntu/proj/snpMLST/snp_mlst/build_db_new.py tt extract --ref-genome ./[]/reference.fna --vcf ./[]/core_snps.vcf --msa ./[]/temp/mummer4/[]/msa.fa --out ./[]/nr --kmer-type all --coords ./[]/coords.tsv --no-reduction &> ./[]/kmer_xtract.log" & 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: callm_db_build callm_db_val iso_gt_mtar 2 | @echo "Maast build completed." 

callm_db_build: ./src/callm_db_build.cpp Makefile
	g++ -std=c++11 ./src/callm_db_build.cpp -o ./bin/callm_db_build -O3 -lpthread

callm_db_val: ./src/callm_db_val.cpp Makefile
	g++ -std=c++11 ./src/callm_db_val.cpp -o ./bin/callm_db_val -O3 -lpthread

iso_gt_mtar: ./src/iso_gt_mtar.cpp Makefile
	g++ -std=c++11 ./src/iso_gt_mtar.cpp -o ./bin/iso_gt_mtar -O3 -lpthread

clean:
	rm ./bin/callm_db_build ./bin/callm_db_val ./bin/iso_gt_mtar
--------------------------------------------------------------------------------
/maast:
--------------------------------------------------------------------------------
#!/bin/bash

Version()
{
	# Display version
	echo "Maast version 1.0.8"
}

# automatic path exporting
REALME=`realpath $0`
MAAST=`dirname ${REALME}`
EBSROOT=`dirname ${MAAST}`
export PATH=$PATH:${MAAST}/bin/
export PYTHONPATH=$PYTHONPATH:${MAAST}

# controller calling functional modules
if [ "$1" = "-v" ] || [ "$1" = "--version" ] || [ "$1" = "version" ] || [ "$1" = "-version" ]; then
	Version
elif [ "$1" = "-h" ] || [ "$1" = "-help" ] || [ "$1" = "help" ] || [ "$1" = "--help" ]; then
	maast.py -h
else
	maast.py "$@"
fi
--------------------------------------------------------------------------------
/conda_recipe/meta.yaml:
--------------------------------------------------------------------------------
package:
  name: maast
  version: 1.0.8

source:
  git_url: https://github.com/zjshi/Maast.git

requirements:
  build:
    - python ==3.9.6
    - {{ compiler('cxx') }}
    - setuptools
    - pip
  host:
    - python ==3.9.6
    - setuptools
    - pip
    - numpy
    - scipy
    - biopython
    - networkx
  run:
    - python ==3.9.6
    - pip
    - numpy
    - scipy
    - networkx
    - biopython
    - fasttree
    - mash
    - mummer4
    - pigz
    - lz4
    - lbzip2

test:
  imports:
    - numpy
    - scipy
    - Bio
    - networkx

about:
  home: https://github.com/zjshi/Maast
  license: MIT License
  license_file: LICENSE
  summary: Maast for efficient genotyping of microbial SNPs
  doc_url: https://github.com/zjshi/Maast
  dev_url: https://github.com/zjshi/Maast
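
The recipe's test block smoke-tests only the Python-side runtime dependencies; conda expects importable module names there, which is why the biopython package is checked as Bio. A minimal equivalent check, runnable inside the built environment:

# mirrors the recipe's test.imports section; these are module names, not package names
import numpy, scipy, networkx, Bio

print("conda runtime imports ok:", numpy.__version__, scipy.__version__)
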
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Zhou (Jason) Shi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/align_io/maf_io.py:
--------------------------------------------------------------------------------


class Alignment:
    def __init__(self, line):
        self.desc = line
        self.seqs = []

class Sequence:
    def __init__(self, line):
        # MAF 's' line fields: s, src, start, size, strand, srcSize, text
        values = line.rstrip().split()
        self.chrom = values[1]
        self.start = int(values[2])
        self.end = int(values[5])
        self.length = int(values[3])
        self.strand = values[4]

        self.seq = values[6].upper()

def parse(fpath, max_sample=float('inf')):
    # max_sample is accepted for interface parity with the XMFA parsers
    with open(fpath) as file:
        for line in file:
            if line[0] == '#': continue
            else: break
        alignment = Alignment(line)
        for line in file:
            if line[0] == 'a':
                yield alignment
                alignment = Alignment(line)
            elif line[0] == 's':
                sequence = Sequence(line)
                alignment.seqs.append(sequence)
        yield alignment

# parse is already a lazy generator; expose it under both names for API compatibility
iter_parse = parse
--------------------------------------------------------------------------------
/align_io/msa.py:
--------------------------------------------------------------------------------

def parse_control(msa_path, msa_type, max_sample=float('inf')):
    if msa_type == 'xmfa-parsnp':
        from align_io.xmfa_parsnp_io import parse
    elif msa_type == 'xmfa-mummer4':
        from align_io.xmfa_mummer4_io import parse
    elif msa_type == 'xmfa-mauve':
        from align_io.xmfa_mauve_io import parse
    elif msa_type == 'maf-mugsy':
        from align_io.maf_io import parse
    else:
        import sys
        sys.exit("\nError: invalid value for --msa-format: %s\n" % msa_type)
    return parse(msa_path, max_sample)

def monolithic_parse(msa_path, msa_type, max_sample=float('inf')):
    return parse_control(msa_path, msa_type, max_sample)

def iter_parse_control(msa_path, msa_type, max_sample=float('inf')):
    if msa_type == 'xmfa-parsnp':
        from align_io.xmfa_parsnp_io import iter_parse
    elif msa_type == 'xmfa-mummer4':
        from align_io.xmfa_mummer4_io import iter_parse
    elif msa_type == 'xmfa-mauve':
        from align_io.xmfa_mauve_io import iter_parse
    elif msa_type == 'maf-mugsy':
        from align_io.maf_io import iter_parse
    else:
        import sys
        sys.exit("\nError: invalid value for --msa-format: %s\n" % msa_type)
    return iter_parse(msa_path, max_sample)

def iter_parse(msa_path, msa_type, max_sample=float('inf')):
    for align in iter_parse_control(msa_path, msa_type, max_sample):
        yield align

def iterate_cols(msa_path, msa_type):
    for align in iter_parse(msa_path, msa_type):
        for column in align.fetch_columns():
            yield column
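
msa.py is a thin dispatcher: it resolves the parser module named by --msa-format and returns that module's generator, so callers can stream alignment blocks without knowing the underlying format. A minimal consumption sketch (the path and format value are hypothetical):

from align_io import msa

# iter_parse lazily yields one seq_ali.Alignment per alignment block
for aln in msa.iter_parse("tmp/mummer4/msa.fa", "xmfa-mummer4"):
    print(aln.chrom, aln.nseqs, aln.ncols)
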
--------------------------------------------------------------------------------
/snps_io/id_centroid.py:
--------------------------------------------------------------------------------
from __future__ import division

import sys, os
import argparse, operator

import numpy as np

from time import time

def read_tags(tag_paths):
    tag_map = dict()

    for tag_genome in tag_paths:
        tag_map[tag_genome] = 0

    return tag_map

def calc_tag_weights(tag_map, dist_path, dist_type):
    sys.stderr.write("[clustering] start\n")

    with open(dist_path, 'r') as fh:
        for line in fh:
            items = line.rstrip().split('\t')
            genome1, genome2, d = items[0], items[1], float(items[2])

            if genome1 >= genome2:
                #sys.stderr.write("{} {}\n".format(genome1, genome2))
                continue
            # sys.stderr.write("{} {}\n".format(genome1, genome2))

            if genome1 in tag_map and genome2 in tag_map:
                if dist_type == "L1":
                    tag_map[genome1] += d
                    tag_map[genome2] += d
                elif dist_type == "L2":
                    tag_map[genome1] = (tag_map[genome1] ** 2 + d ** 2) ** 0.5
                    tag_map[genome2] = (tag_map[genome2] ** 2 + d ** 2) ** 0.5
                elif dist_type == "Linf":
                    if tag_map[genome1] < d:
                        tag_map[genome1] = d
                    if tag_map[genome2] < d:
                        tag_map[genome2] = d
                else:
                    sys.exit("Error: distance type {} is not supported for centroid genome picking".format(dist_type))

    sys.stderr.write("[clustering] done\n")

    return tag_map

def centroid_from_map(tag_map):
    centroid = None

    for tag in tag_map.keys():
        if centroid is None:
            centroid = tag
        else:
            if tag_map[tag] < tag_map[centroid]:
                centroid = tag

    return centroid

def identify(tag_paths, dist_path, dist_type="L1"):
    tag_map = read_tags(tag_paths)
    tag_map = calc_tag_weights(tag_map, dist_path, dist_type)

    centroid = centroid_from_map(tag_map)

    return centroid
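
identify() streams a tab-separated file of pairwise genome distances and keeps one running aggregate per candidate under the chosen norm (L1 sum, L2 root-of-squares, or Linf max); the candidate with the smallest aggregate is returned as the centroid. A minimal sketch (file names invented):

from snps_io import id_centroid

# each line of dist.tsv: genomeA<TAB>genomeB<TAB>0.0123
candidates = ["g1.fna", "g2.fna", "g3.fna"]
centroid = id_centroid.identify(candidates, "dist.tsv", dist_type="L1")
print("centroid genome:", centroid)
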
--------------------------------------------------------------------------------
/snps_io/vcf_var_io.py:
--------------------------------------------------------------------------------
class SNP:
    def __init__(self, chrom, variant_id, pos, ref="", alt="", info=None, fmt=None, sample_ids=None):
        self.chrom = chrom
        self.var_id = variant_id
        self.pos = pos

        self.ref_allele = ref
        self.alt_allele = alt

        if info is None:
            self.info = {}
            self.info['NS'] = -1
            self.info['DP'] = -1
            self.info['AF'] = -1
        else:
            self.info = info

        if fmt is None:
            self.format = {}
            self.format['AF'] = ""
        else:
            self.format = fmt

        if sample_ids is None:
            self.sample_ids = []
        else:
            self.sample_ids = sample_ids


def format_header(sample_ids):
    import time
    header = ""
    header += """##fileformat=VCFv4.1\n"""
    header += """##fileDate=%s\n""" % time.strftime("%Y-%m-%d %H:%M")
    header += """##source=https://github.com/snayfach/snpMLST\n"""
    header += """##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">\n"""
    header += """##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n"""
    header += """##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n"""
    header += """##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n"""
    header += """##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n"""
    header += """#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n""" % "\t".join(sample_ids)
    return header

def format_snp(snp):
    record = ""
    record += str(snp.chrom) + "\t"   # CHROM
    record += str(snp.pos) + "\t"     # POS
    record += str(snp.var_id) + "\t"  # ID
    record += snp.ref_allele + "\t"   # REF
    record += snp.alt_allele + "\t"   # ALT
    record += ".\t"                   # QUAL
    record += "PASS\t"                # FILTER
    record += "%s\t" % format_info(snp)             # INFO
    record += "%s\t" % ":".join(snp.format.keys())  # FORMAT
    record += "%s\n" % format_samples(snp)          # GENOTYPES
    return record

def format_info(snp):
    return ";".join([key + "=" + str(value) for key, value in snp.info.items()])

def format_samples(snp):
    formats = snp.format.keys()
    indexes = range(len(snp.sample_ids))
    return "\t".join([":".join([str(snp.format[f][i]) for f in formats]) for i in indexes])

def write_vcf(snps, outdir):
    path = outdir+'/core_snps.vcf'
    if len(snps) > 0:
        with open(path, 'w') as file:
            file.write(format_header(snps[0].sample_ids))
            for snp in snps:
                file.write(format_snp(snp))
    else:
        print("Empty set of SNPs was found for the dataset, the file writing was skipped")
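
format_snp() serializes one SNP per VCF data line; per-sample values are drawn from parallel lists stored under each FORMAT key. A minimal sketch (values invented):

from snps_io import vcf_var_io

snp = vcf_var_io.SNP("chr1", 1, 42, ref="A", alt="G", sample_ids=["s1", "s2"])
snp.format["AF"] = [0.0, 1.0]  # one entry per sample, parallel to sample_ids
print(vcf_var_io.format_snp(snp), end="")  # a single tab-separated VCF record
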
--------------------------------------------------------------------------------
/snps_io/gen_msa_single.py:
--------------------------------------------------------------------------------
import sys, argparse

from Bio import SeqIO

def parse_args():
    """ Return dictionary of command line arguments
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        usage=argparse.SUPPRESS)

    parser.add_argument('--delta', type=str, dest='delta_path', required=True,
        help="""Path to the delta file describing the whole genome alignment; this is the output of mummer4""")
    parser.add_argument('--ref-seq', type=str, dest='ref_path', required=True,
        help="""Path to reference genome file as input in multiple fasta format""")
    parser.add_argument('--qry-seq', type=str, dest='qry_path', required=True,
        help="""Path to query genome file as input in multiple fasta format""")
    parser.add_argument('--ref-name', type=str, dest='ref_name', required=True,
        help="""Specify reference genome name""")
    parser.add_argument('--qry-name', type=str, dest='qry_name', required=True,
        help="""Specify query genome name""")
    parser.add_argument('--out', type=str, dest='out', default="/dev/stdout",
        help="""Path to which the output (pairwise genome alignment) will be written in MSA format""")

    return vars(parser.parse_args())


def rc(seq):
    # complement map; lowercase input is normalized to an uppercase complement
    base_map = {
        'A': 'T', 'a': 'T',
        'C': 'G', 'c': 'G',
        'G': 'C', 'g': 'C',
        'T': 'A', 't': 'A',
        'N': 'N', 'n': 'N'
    }

    return ''.join([base_map[c] for c in seq[::-1]])

def read_genome(genome_path):
    ordered_chroms = []
    genome_seqs = dict()

    for seq in SeqIO.parse(genome_path, "fasta"):
        ordered_chroms.append(seq.id)
        genome_seqs[seq.id] = seq.seq

    return genome_seqs, ordered_chroms

def parse_delta(delta_path):
    align_blocs = []

    with open(delta_path, 'r') as fh:
        fh.readline()
        fh.readline()

        r_tag = ""
        q_tag = ""

        r_len = ""
        q_len = ""

        bloc = []

        for line in fh:
            if line[0] == '>':
                items = line[1:].rstrip().split(' ')
                r_tag = items[0]
                q_tag = items[1]

                r_len = int(items[2])
                q_len = int(items[3])
            else:
                if ' ' in line:
                    items = line.rstrip().split(' ')
                    bloc = [ int(item) for item in items[:4] ]
                else:
                    diff = line.rstrip()

                    if diff == '0':
                        align_blocs.append([r_tag, q_tag, r_len, q_len] + bloc)
                        bloc = []
                    else:
                        bloc.append(int(diff))

    return align_blocs

def main():
    args = parse_args()

    ref_genome, ref_chroms = read_genome(args['ref_path'])
    qry_genome, qry_chroms = read_genome(args['qry_path'])

    align_blocs = parse_delta(args['delta_path'])

    aligned_qry = dict()

    for chrom_id in ref_genome.keys():
        aligned_qry[chrom_id] = '-' * len(ref_genome[chrom_id])

    for bloc in align_blocs:
        r_tag = bloc[0]
        q_tag = bloc[1]

        r_len = bloc[2]
        q_len = bloc[3]

        r_start = bloc[4]
        r_end = bloc[5]

        assert r_end > r_start

        q_start = bloc[6]
        q_end = bloc[7]

        q_seq = ""
        if q_end < q_start:
            q_start = q_len - q_start + 1
            q_end = q_len - q_end + 1

            q_seq = rc(qry_genome[q_tag])[q_start-1:q_end]
        else:
            q_seq = qry_genome[q_tag][q_start-1:q_end]


        pos = 0
        for diff in bloc[8:]:
            if diff < 0:
                pos = pos+abs(diff)
                q_seq = q_seq[:pos-1] + q_seq[pos:]
                pos = pos - 1
            else:
                pos = pos + diff
                q_seq = q_seq[:pos-1] + '-' + q_seq[pos-1:]

        # print([r_start, r_end, r_end - r_start, q_start, q_end, q_end - q_start, len(q_seq)])

        # stitch the decoded block into the query track at reference coordinates
        aligned_qry[r_tag] = aligned_qry[r_tag][:r_start-1] + str(q_seq) + aligned_qry[r_tag][r_end:]

    # write the pairwise alignment; the ">name chrom" records closed by '=' are
    # assumed to mirror the MSA layout produced by snps_io/gen_msa.py
    with open(args['out'], 'w') as fh:
        for chrom_id in ref_chroms:
            fh.write('>{} {}\n{}\n'.format(args['ref_name'], chrom_id, str(ref_genome[chrom_id])))
            fh.write('>{} {}\n{}\n'.format(args['qry_name'], chrom_id, aligned_qry[chrom_id]))
            fh.write('=\n')

if __name__ == "__main__":
    main()
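
The signed offsets in a nucmer delta block encode indels relative to the reference: a positive value means the reference carries an extra base at that offset, so a gap is opened in the query; a negative value means the query carries an extra base, which is dropped when projecting onto reference coordinates; 0 terminates the block. A toy run of the same decoding loop used in main() above (sequence invented):

# query block whose reference partner carries one extra base at offset 3
q_seq = "ACGTAC"
pos = 0
for diff in [3]:  # the block's offsets with the trailing 0 already stripped
    if diff < 0:
        pos = pos + abs(diff)
        q_seq = q_seq[:pos-1] + q_seq[pos:]  # drop the query insertion
        pos = pos - 1
    else:
        pos = pos + diff
        q_seq = q_seq[:pos-1] + '-' + q_seq[pos-1:]  # gap opposite the ref base

print(q_seq)  # AC-GTAC: now colinear with the reference block
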
--------------------------------------------------------------------------------
/align_io/xmfa_parsnp_io.py:
--------------------------------------------------------------------------------
from __future__ import division
import sys
import numpy as np
from align_io import seq_ali

class Sequence:
    def __init__(self, line):
        values = line.rstrip().lstrip('>').lstrip().split()
        self.index = values[0].split(':')[0]
        self.start = int(values[0].split(':')[1].split('-')[0])
        self.end = int(values[0].split(':')[1].split('-')[1])
        self.length = self.end - self.start + 1
        self.strand = values[1]
        self.chrom = values[2]
        self.seq = ''

def extract_genome_info(fpath):
    # check version
    with open(fpath) as file:
        version = next(file).rstrip().split(' ', 1)[-1]
        if version != 'Parsnp v1.1':
            sys.exit("\nError: expected XMFA version 'Parsnp v1.1' but got '%s'\n" % version)

    """
    this section reads the header of the parsnp xmfa file;
    header lines begin with ## and #,
    and map each genome index to its info,
    keys (field in code) = ['SequenceLength', 'SequenceFile', 'SequenceHeader']
    """
    genome_info = {}
    with open(fpath) as file:
        header = ''
        for line in file:
            if line.startswith('##'): header += line.lstrip('#')
            elif line.startswith('#'): continue
            else: break

        for h in header.split('SequenceIndex ')[1:]:
            genome_index, info_string = h.rstrip('\n').split('\n', 1)
            genome_info[genome_index] = {}
            for info_record in info_string.split('\n'):
                field, value = info_record.split(' ', 1)
                genome_info[genome_index][field] = value

    return genome_info

def parse(fpath, max_sample=float('inf')):

    genome_info = extract_genome_info(fpath)

    """
    this section reads the body of the parsnp xmfa file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a Sequence object,
    and the sequence lines are appended to that object
    """
    alns = []
    with open(fpath) as file:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        for line in file:
            if line.startswith('#'):
                continue
            elif line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom
                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                alns.append(last_aln)
            elif len(cur_aln.seqs) <= max_sample:
                if line.startswith('>'):
                    seq = Sequence(line)
                    seq.id = genome_info[seq.index]['SequenceFile']
                    cur_aln.seqs.append(seq)
                else:
                    cur_aln.seqs[-1].seq += line.rstrip().upper()
            else:
                if len(cur_aln.seqs) > max_sample:
                    cur_aln.seqs.pop()
                pass
    return alns

def iter_parse(fpath, max_sample=float('inf')):
    genome_info = extract_genome_info(fpath)

    """
    this section reads the body of the parsnp xmfa file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a Sequence object,
    and the sequence lines are appended to that object
    """

    with open(fpath) as file:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        for line in file:
            if line.startswith('#'):
                continue
            elif line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom
                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                yield last_aln
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = Sequence(line)
                        seq.id = genome_info[seq.index]['SequenceFile']
                        cur_aln.seqs.append(seq)
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass
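
Downstream code only needs the per-block Alignment objects, so a caller typically streams them. A usage sketch (the XMFA path is hypothetical; each record header looks like '>1:100-250 + cluster7' and indices are mapped to genome files via the '##SequenceIndex' header):

from align_io import xmfa_parsnp_io

for aln in xmfa_parsnp_io.iter_parse("parsnp.xmfa"):
    for seq in aln.seqs:
        print(seq.id, seq.chrom, seq.start, seq.end, seq.strand)
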
--------------------------------------------------------------------------------
/align_io/xmfa_io.py:
--------------------------------------------------------------------------------
import sys
from operator import itemgetter

class Alignment:
    def __init__(self):
        self.desc = ''
        self.seqs = []

    def fetch_columns(self):
        # ncols is the length of the first seq in the Alignment;
        # all seqs in an Alignment are assumed to have the same length
        for col_index in range(self.ncols):
            pos = col_index + 1
            chars = [seq.seq[col_index] for seq in self.seqs]
            sample_ids = [seq.id for seq in self.seqs]
            yield AlignColumn(pos, chars, sample_ids)

class AlignColumn:
    def __init__(self, pos, chars, sample_ids):
        self.chrom = None
        self.pos = pos
        self.chars = chars
        self.sample_ids = sample_ids
        self.total_samples = len(chars)
        self.pooled_counts = {'A':0, 'T':0, 'G':0, 'C':0}
        self.pool_counts()
        self.pooled_depth = sum(self.pooled_counts.values())
        self.present_samples = sum(self.pooled_counts.values())
        self.call_snp()
        self.prev = self.present_samples / float(self.total_samples)
        self.allele_freqs = self.genotype()

    def pool_counts(self):
        """ Tally the column's characters across the 4 nucleotides """
        for char in self.chars:
            if char in self.pooled_counts:
                self.pooled_counts[char] += 1

    def call_snp(self):
        """ Identify major and minor alleles """
        counts = sorted(list(self.pooled_counts.items()), key=itemgetter(1), reverse=True)
        if self.present_samples == 0:
            self.cons_allele = 'N'
            self.alt_allele = None
            self.cons_count = 0
            self.cons_freq = 0.0
            self.alt_freq = 0.0
        else:
            self.cons_allele, self.cons_count = counts.pop(0)
            self.cons_freq = self.cons_count/float(self.present_samples)
            if len(counts) > 0:
                self.alt_allele, self.alt_count = counts[0]
                self.alt_freq = self.alt_count/float(self.present_samples)
            else:
                self.alt_freq = 0.0

    def consensus(self):
        from collections import Counter
        return Counter(self.chars).most_common(1)[0][0]

    def percent_aligned(self, nseqs):
        """ Compute the % of genomes with observed data """
        gaps = self.chars.count('-')
        missing = self.chars.count('N')
        return (nseqs - gaps - missing) / float(nseqs)

    def genotype(self):
        genotypes = []
        for char in self.chars:
            if char == self.cons_allele:
                genotypes.append(0)
            elif char == self.alt_allele:
                genotypes.append(1)
            else:
                genotypes.append(None)
        return genotypes


class Sequence:
    def __init__(self, line):
        values = line.rstrip().lstrip('>').lstrip().split()
        self.index = values[0].split(':')[0]
        self.start = int(values[0].split(':')[1].split('-')[0])
        self.end = int(values[0].split(':')[1].split('-')[1])
        self.length = self.end - self.start + 1
        self.strand = values[1]
        self.seq = ''


def parse(fpath):

    # check version
    with open(fpath) as file:
        version = next(file).rstrip().split(' ', 1)[-1]
        if version != 'Parsnp v1.1':
            sys.exit("\nError: expected XMFA version 'Parsnp v1.1' but got '%s'\n" % version)

    # map genome index to info
    # keys = ['SequenceLength', 'SequenceFile', 'SequenceHeader']
    with open(fpath) as file:
        genome_info = {}
        header = ''
        for line in file:
            if line.startswith('##'): header += line.lstrip('#')
            elif line.startswith('#'): continue
            else: break
        for h in header.split('SequenceIndex ')[1:]:
            genome_index, info_string = h.rstrip('\n').split('\n', 1)
            genome_info[genome_index] = {}
            for info_record in info_string.split('\n'):
                field, value = info_record.split(' ', 1)
                genome_info[genome_index][field] = value

    # yield alignment blocks
    with open(fpath) as file:
        last = None
        current = Alignment()
        for line in file:
            if line.startswith('#'):
                continue
            elif line.startswith('='):
                current.nseqs = len(current.seqs)
                current.ncols = len(current.seqs[0].seq)
                last = current
                current = Alignment()
                yield last
            elif line.startswith('>'):
                seq = Sequence(line)
                seq.id = genome_info[seq.index]['SequenceFile']
                current.seqs.append(seq)
            else:
                current.seqs[-1].seq += line.rstrip().upper()
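
fetch_columns() yields one AlignColumn per alignment site, and genotype() encodes every sample as 0 (consensus allele), 1 (top alternate allele), or None (gap, N, or a minor allele). A toy column (values invented):

from align_io.xmfa_io import AlignColumn

col = AlignColumn(pos=1, chars=['A', 'A', 'G', '-'], sample_ids=['s1', 's2', 's3', 's4'])
print(col.cons_allele, col.alt_allele)  # A G
print(col.allele_freqs)                 # [0, 0, 1, None]
print(col.prev)                         # 0.75: three of four samples have a called base
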
--------------------------------------------------------------------------------
/snps_io/gen_msa.py:
--------------------------------------------------------------------------------
import os, time, sys
import numpy as np

def parse_seqs(path):
    with open(path) as file:
        try: id = next(file).split()[0].lstrip('>')
        except (StopIteration, IndexError): return
        seq = ''
        for line in file:
            if line[0]=='>':
                yield id, seq
                try: id = line.split()[0].lstrip('>')
                except (StopIteration, IndexError): return
                seq = ''
            else:
                seq += line.rstrip()
        yield id, seq

def parse_coords(fpath):
    fields = [('s1',int),('e1',int),
              ('s2',int),('e2',int),
              ('len1',int),('len2',int),
              ('pid',float),
              ('c1',str),('c2',str)]
    with open(fpath) as f:
        for i in range(5):
            next(f)
        for l in f:
            values = l.replace(' | ', ' ').split()
            yield dict([(f[0],f[1](v)) for f,v in zip(fields, values)])

def parse_snps(fpath):
    fields = [('p1',int),('b1',str),('b2',str),('p2',int),
              ('buf',int),('dist',int),
              ('r',int),('q',int),
              ('s1',int),('s2',int),
              ('c1',str),('c2',str)]
    with open(fpath) as f:
        for i in range(5):
            next(f)
        for l in f:
            values = l.replace(' | ', ' ').split()
            yield dict([(f[0],f[1](v)) for f,v in zip(fields, values)])

def build_msa(indir, overwrite=True, max_genomes=None, max_sites=None, msa_id=None, subset=None):
    start = time.time()

    aln_dir = os.path.join(indir, 'aln')

    if not os.path.exists(indir):
        sys.exit("Error: dir does not exist: %s" % indir)

    print("Reading reference genome")
    ref = {}
    chroms = []
    local_pos = np.array([])
    for id, seq in parse_seqs(os.path.join(indir, 'reference.fna')):
        chroms.append(id)
        ref[id] = np.array(list(seq.upper()))
        if len(local_pos) == 0:
            local_pos = np.arange(len(seq))
        else:
            local_pos = np.concatenate([local_pos, np.arange(len(seq))])
    print(" count contigs: %s" % len(ref))
    print(" count sites: %s" % sum([len(_) for _ in ref.values()]))

    print("Initializing alignments")
    genome_ids = os.listdir(aln_dir)

    if max_genomes is not None:
        genome_ids = genome_ids[:max_genomes]

    if subset:
        # restrict to genomes whose fasta file name appears in the subset
        genome_ids = []
        for genome_id in os.listdir(aln_dir):
            if "{}.fna".format(genome_id) in subset or \
                "{}.fasta".format(genome_id) in subset or \
                "{}.fsa".format(genome_id) in subset or \
                "{}.fa".format(genome_id) in subset:
                genome_ids.append(genome_id)

    print(" count genomes: %s" % len(genome_ids))
    genomes = {}
    for genome_id in genome_ids:
        genomes[genome_id] = {}
        for id, seq in ref.items():
            genomes[genome_id][id] = np.array(['-']*len(seq))

    print("Reading alignment blocks")
    for genome_id in genome_ids:
        fpath = '%s/%s/coords' % (aln_dir, genome_id)
        aln_length = 0
        for r in parse_coords(fpath):
            aln_length += (r['e1'] - r['s1'])
            genomes[genome_id][r['c1']][r['s1']-1:r['e1']] = ref[r['c1']][r['s1']-1:r['e1']]

    print("Reading SNPs")
    for genome_id in genome_ids:
        fpath = '%s/%s/snps' % (aln_dir, genome_id)
        for r in parse_snps(fpath):
            if r['b1'] == '.':
                continue
            elif r['b2'] == '.':
                genomes[genome_id][r['c1']][r['p1']-1] = '-'
            else:
                genomes[genome_id][r['c1']][r['p1']-1] = r['b2']

    chrom_aligns = {}
    for chrom in chroms:
        chrom_aligns[chrom] = ''
        for genome_id in genomes:
            chrom_aligns[chrom] = chrom_aligns[chrom] + '>{} {}\n{}\n'.format(genome_id, chrom, ''.join(genomes[genome_id][chrom]))

        chrom_aligns[chrom] = chrom_aligns[chrom] + '=\n'

    print("Writing fasta")

    fname = "msa.fa"

    if msa_id is not None:
        fname = "{}.fa".format(msa_id)

    msa_path = os.path.join(indir, fname)

    if not overwrite:
        # find the first unused msa.<n>.fa name instead of clobbering an existing file
        indx = 1
        while os.path.isfile(msa_path):
            fname = "msa.{}.fa".format(indx)
            msa_path = os.path.join(indir, fname)
            indx += 1

    print(" path: %s" % msa_path)
    with open(msa_path, 'w') as f:
        for chrom in chroms:
            f.write(chrom_aligns[chrom])

    print("\nDone!")
    print("Time (s):", round(time.time()-start,2))

    return msa_path
    """
    print("Writing fasta")
    msa_path = os.path.join(indir, 'msa.fa')
    print(" path: %s" % msa_path)
    with open(msa_path, 'w') as f:
        for chrom in chroms:
            for genome_id in genomes:
                f.write('>%s %s\n' % (genome_id, chrom))
                f.write(''.join(genomes[genome_id][chrom])+'\n')
            f.write("=\n")

    print("\nDone!")
    print("Time (s):", round(time.time()-start,2))
    """
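
parse_coords() and parse_snps() read the pipe-and-whitespace tables written by show-coords and show-snps, turning each row into a dict keyed by the field tuples declared at the top of each function. A toy show-coords row (values invented):

# start1 end1 | start2 end2 | len1 len2 | %identity | ref-tag qry-tag
line = "1 4641 | 1 4644 | 4641 4644 | 99.98 | chr1 chr1"
fields = [('s1',int),('e1',int),('s2',int),('e2',int),
          ('len1',int),('len2',int),('pid',float),('c1',str),('c2',str)]
values = line.replace(' | ', ' ').split()
row = dict([(f[0], f[1](v)) for f, v in zip(fields, values)])
print(row['s1'], row['e1'], row['pid'])  # 1 4641 99.98
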
--------------------------------------------------------------------------------
/align_io/xmfa_mummer4_io.py:
--------------------------------------------------------------------------------
from __future__ import division
import numpy as np
import copy
from align_io import seq_ali

class SimpleSequence:
    def __init__(self, indx, id="", seq=""):
        self.index = indx
        self.id = id
        self.seq = seq
        self.chrom = ""

def parse(fpath, max_sample=float('inf')):
    """
    this section reads the body of the mummer4-derived XMFA file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a SimpleSequence object,
    and the sequence lines are appended to that object
    """
    alns = []
    total_len = 0
    with open(fpath) as file:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        indx = 0
        for line in file:
            if line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom
                # print(cur_aln.nseqs)
                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                alns.append(last_aln)
                total_len = total_len + last_aln.ncols
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = SimpleSequence(indx)
                        temp_items = line.rstrip().split(' ')
                        seq.id = temp_items[0][1:]
                        seq.chrom = temp_items[1]
                        cur_aln.seqs.append(seq)
                        indx = indx + 1
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass

    print("total length of alignments: {}".format(total_len))
    return alns

def iter_parse(fpath, max_sample=float('inf')):
    """
    this section reads the body of the mummer4-derived XMFA file;
    a '=' line marks the end of a multiple sequence alignment,
    and a multiple sequence alignment contains one fasta-format sequence per sample.
    each fasta header is split into attributes stored on a SimpleSequence object,
    and the sequence lines are appended to that object
    """
    lines = []
    n_align = 0
    with open(fpath) as file:
        for line in file:
            lines.append(line)
            if line.startswith('='):
                n_align = n_align + 1

    if n_align > 1:
        last_aln = None
        cur_aln = seq_ali.Alignment()
        indx = 0
        for line in lines:
            if line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom

                cur_aln.update()
                last_aln = cur_aln
                cur_aln = seq_ali.Alignment()
                yield last_aln
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = SimpleSequence(indx)
                        temp_items = line.rstrip().split(' ')
                        seq.id = temp_items[0][1:]
                        seq.chrom = temp_items[1]
                        cur_aln.seqs.append(seq)
                        indx = indx + 1
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass
    else:
        max_iter_stride = 200*1000

        last_aln = None
        cur_aln = seq_ali.Alignment()
        indx = 0

        for line in lines:
            if line.startswith('='):
                cur_aln.nseqs = len(cur_aln.seqs)
                cur_aln.ncols = len(cur_aln.seqs[0].seq)
                cur_aln.chrom = cur_aln.seqs[0].chrom

                if cur_aln.ncols <= max_iter_stride:
                    # print(cur_aln.ncols)
                    cur_aln.update()
                    yield cur_aln
                else:
                    for sp in range(0, cur_aln.ncols, max_iter_stride):
                        next_aln = seq_ali.Alignment()
                        for seq in cur_aln.seqs:
                            temp_seq = copy.deepcopy(seq)
                            if sp+max_iter_stride <= cur_aln.ncols+1000:
                                temp_seq.seq = temp_seq.seq[sp:(sp+max_iter_stride)]
                            else:
                                temp_seq.seq = temp_seq.seq[sp:]
                            next_aln.seqs.append(temp_seq)
                        # print(next_aln.seqs[-1].chrom)

                        next_aln.nseqs = len(next_aln.seqs)
                        next_aln.ncols = len(next_aln.seqs[0].seq)
                        next_aln.chrom = next_aln.seqs[0].chrom
                        next_aln.update()
                        yield next_aln
            else:
                if len(cur_aln.seqs) <= max_sample:
                    if line.startswith('>'):
                        seq = SimpleSequence(indx)
                        temp_items = line.rstrip().split(' ')
                        seq.id = temp_items[0][1:]
                        seq.chrom = temp_items[1]
                        cur_aln.seqs.append(seq)
                        indx = indx + 1
                    else:
                        cur_aln.seqs[-1].seq += line.rstrip().upper()
                else:
                    if len(cur_aln.seqs) > max_sample:
                        cur_aln.seqs.pop()
                    pass
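
When the XMFA holds a single giant alignment, iter_parse splits it into windows of at most 200,000 columns so per-column work downstream stays within memory. The window arithmetic it applies (sizes invented):

ncols = 520 * 1000
stride = 200 * 1000  # max_iter_stride above
windows = [(sp, min(sp + stride, ncols)) for sp in range(0, ncols, stride)]
print(windows)  # [(0, 200000), (200000, 400000), (400000, 520000)]
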
--------------------------------------------------------------------------------
/snps_io/vcf_io.py:
--------------------------------------------------------------------------------
class SNP:
    def __init__(self, chrom, variant_id, pos, ref, alt, third=None, forth=None, avail_alleles=None, info=None, fmt=None, sample_ids=None):
        self.chrom = chrom
        self.var_id = variant_id
        self.pos = pos

        self.ref_allele = ref
        self.alt_allele = alt
        self.third_allele = third
        self.forth_allele = forth

        self.avail_alleles = avail_alleles

        if info is None:
            self.info = {}
            self.info['NS'] = -1
            self.info['DP'] = -1
            self.info['AF'] = -1
        else:
            self.info = info

        if fmt is None:
            self.format = {}
            self.format['GP1'] = ""
            self.format['GP2'] = ""
            self.format['GP3'] = ""
            self.format['GP4'] = ""
        else:
            self.format = fmt

        if sample_ids is None:
            self.sample_ids = []
        else:
            self.sample_ids = sample_ids

def format_header(sample_ids, cmdl):
    import time
    header = ""
    header += """##fileformat=VCFv4.1\n"""
    header += """##fileDate=%s\n""" % time.strftime("%Y-%m-%d %H:%M")
    header += """##source=https://github.com/zjshi/Maast\n"""
    header += """##command='%s'\n""" % cmdl
    header += """##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">\n"""
    header += """##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n"""
    header += """##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n"""
    # FORMAT fields GP1-GP4 carry per-sample presence of the four ranked alleles (see align_io/seq_ali.py)
    header += """##FORMAT=<ID=GP1,Number=1,Type=Integer,Description="Presence of the reference allele">\n"""
    header += """##FORMAT=<ID=GP2,Number=1,Type=Integer,Description="Presence of the first alternate allele">\n"""
    header += """##FORMAT=<ID=GP3,Number=1,Type=Integer,Description="Presence of the second alternate allele">\n"""
    header += """##FORMAT=<ID=GP4,Number=1,Type=Integer,Description="Presence of the third alternate allele">\n"""

    col_names = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + sample_ids
    header += """#%s\n""" % "\t".join(col_names)

    return header

def format_snp(snp):
    record = ""
    record += str(snp.chrom) + "\t"                 # CHROM
    record += str(snp.pos) + "\t"                   # POS
    record += str(snp.var_id) + "\t"                # ID
    record += (snp.ref_allele + b"\t").decode()     # REF
    record += (snp.avail_alleles + b"\t").decode()  # ALT
    record += ".\t"                                 # QUAL
    record += "PASS\t"                              # FILTER
    record += "%s\t" % format_info(snp)                     # INFO
    record += "%s\t" % ":".join(sorted(snp.format.keys()))  # FORMAT
    record += "%s\n" % format_samples(snp)                  # GENOTYPES
    return record

def format_info(snp):
    return ";".join([key + "=" + str(value) for key, value in snp.info.items()])

def format_samples(snp):
    formats = sorted(snp.format.keys())
    indexes = range(len(snp.sample_ids))
    return "\t".join([":".join([str(snp.format[f][i]) for f in formats]) for i in indexes])

def write_vcf_header(snps, outdir, cmdl='unspecified'):
    import sys

    path = outdir+'/core_snps.vcf'
    if len(snps) > 0:
        with open(path, 'w') as file:
            file.write(format_header(snps[0].sample_ids, cmdl))

def write_vcf(snps, outdir, single_chrom_rep=False):
    import sys

    path = outdir+'/core_snps.vcf'
    if len(snps) > 0:
        with open(path, 'a') as file:
            for snp in snps:
                if single_chrom_rep is True:
                    t_snp = snp
                    t_snp.pos = t_snp.var_id
                    file.write(format_snp(t_snp))
                else:
                    file.write(format_snp(snp))
    else:
        print("Empty set of SNPs was found for the dataset, the file writing was skipped")

def write_coords_header(coords, out_dir):
    path = out_dir+'/coords.tsv'
    with open(path, 'w') as file:
        file.write('\t'.join(['chrom', 'start', 'end'])+'\n')

def write_coords(coords, out_dir):
    path = out_dir+'/coords.tsv'
    with open(path, 'a') as file:
        for d in coords:
            file.write('\t'.join([d['chrom'], str(d['start']), str(d['end'])])+'\n')

def merge_coords(coords, min_gap=1):
    if len(coords) > 1:
        merged_coords = []

        last_coord = coords[0]
        for i, coord in enumerate(coords[1:]):
            if coord['start'] - last_coord['end'] <= min_gap:
                if coord['chrom'] == last_coord['chrom']:
                    new_coord = {'chrom':coord['chrom'], 'start':last_coord['start'], 'end':coord['end']}
                    last_coord = new_coord
                    continue

            merged_coords.append(last_coord)
            last_coord = coord

        merged_coords.append(last_coord)

        return merged_coords
    else:
        return coords

def write_genome(genome, out_dir):
    path = out_dir+'/consensus.fna'
    with open(path, 'w') as file:
        file.write('>consensus\n'+genome+'\n')

##INFO=
##INFO=
##INFO=
##INFO=
##INFO=
##INFO=
##FILTER=
##FILTER=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
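
write_vcf_header() creates core_snps.vcf with the metadata block and column header, and write_vcf() appends one record per SNP. A minimal sketch (values invented; REF/ALT are bytes to match the .decode() calls in format_snp, and the output directory must already exist):

from snps_io import vcf_io

snp = vcf_io.SNP("chr1", 0, 1042, ref=b"A", alt=b"G", avail_alleles=b"G",
                 info={"NS": 2, "DP": 2, "AF": 0.5},
                 fmt={"GP1": [1, 0], "GP2": [0, 1], "GP3": [0, 0], "GP4": [0, 0]},
                 sample_ids=["s1", "s2"])
vcf_io.write_vcf_header([snp], "outdir", cmdl="maast genotype ...")
vcf_io.write_vcf([snp], "outdir")
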
--------------------------------------------------------------------------------
/align_io/seq_ali.py:
--------------------------------------------------------------------------------
from __future__ import division

import numpy as np

"""
Class for encapsulating a single Multiple Sequence Alignment (MSA)
"""
class Alignment:
    def __init__(self):
        self.desc = ''  # not used for now
        # chrom holds an identifier like 'cluster123', which marks a division of the
        # core-genome, i.e. a conserved region found across all reference genomes
        self.chrom = ''
        self.nseqs = 0
        self.ncols = 0  # number of sites
        self.seqs = []  # actual sequence for each sample
        self.sample_ids = []

        """
        the attributes below are described in update()
        """
        self.char_mat = []
        self.local_pos = []
        self.count_mat = []
        self.freq_mat = []

        self.ref_alleles = []
        self.alt_alleles = []
        self.third_alleles = []
        self.forth_alleles = []

        self.ref_prob_mat = []
        self.alt_prob_mat = []
        self.third_prob_mat = []
        self.forth_prob_mat = []

        self.sample_presence = []
        self.ref_freqs = []
        self.alt_freqs = []
        self.third_freqs = []
        self.forth_freqs = []

        self.prevalence = []
        self.aligned_pctg = []

    def update(self):
        assert len(self.seqs) > 1
        self.nseqs = len(self.seqs)
        self.ncols = len(self.seqs[0].seq)
        self.sample_ids = [seq.id for seq in self.seqs]

        """
        generate the char matrix from the aligned sequences, one row per sample

        example:
        sample1: ATCG
        sample2: ATGG
        sample3: ATGC

        the char matrix is [[A, T, C, G], [A, T, G, G], [A, T, G, C]]
        """
        self.char_mat = np.array([np.frombuffer(seq.seq.encode('ascii'), dtype='S1') for seq in self.seqs])

        #print(self.char_mat.shape)
        #print(self.char_mat.nbytes)

        """
        count A, T, G, C, N and - for each site of the alignment;
        local_pos stores the local position of each site within this core-genome division (or alignment)
        """
        As = np.sum(self.char_mat == b'A', axis=0)
        Ts = np.sum(self.char_mat == b'T', axis=0)
        Gs = np.sum(self.char_mat == b'G', axis=0)
        Cs = np.sum(self.char_mat == b'C', axis=0)
        Ns = np.sum(self.char_mat == b'N', axis=0)
        Gaps = np.sum(self.char_mat == b'-', axis=0)

        self.local_pos = np.arange(len(self.seqs[0].seq))

        self.count_mat = np.array([As, Ts, Gs, Cs, Ns, Gaps])

        #print(self.char_mat)
        #print(self.char_mat == 'A')
        #print(As)
        #print(self.count_mat)
        #print(self.count_mat.shape)

        """
        char_template: the complete set of chars for each site of the alignment,
        from which the ref allele and alt allele will be selected
        """
        char_template = np.array([
            np.repeat(b'A', self.count_mat.shape[1]),
            np.repeat(b'T', self.count_mat.shape[1]),
            np.repeat(b'G', self.count_mat.shape[1]),
            np.repeat(b'C', self.count_mat.shape[1])
            # np.repeat('N', self.count_mat.shape[1])
            # np.repeat('-', self.count_mat.shape[1])
        ])

        """
        rank the four chars at each site by count (ascending), then use the ranked
        indices to select the chars themselves: the char with the highest count at
        each site is taken as the ref allele, the char with
        the second highest count as the alt allele, and the remaining two as the
        third and fourth alleles. at sites with only one observed char, the other
        chars all have zero counts, so any of them may be selected among the
        tied ranks (argsort order decides).
        """
        count_inds_mat = self.count_mat[0:4,:].argsort(axis=0)
        top2_inds = count_inds_mat[-4:,]
        top2_char_mat = np.choose(top2_inds, char_template)
        self.ref_alleles = top2_char_mat[3,:]
        self.alt_alleles = top2_char_mat[2,:]

        self.third_alleles = top2_char_mat[1,:]
        self.forth_alleles = top2_char_mat[0,:]

        """
        the frequency matrix has the same shape as the char matrix;
        it is initialized to -1 everywhere and then filled with the allele rank
        observed for each site across all samples, with the following encoding:
        - ref allele: 0
        - alt allele: 1
        - third allele: 2
        - fourth allele: 3
        - N or - (no ranked allele matched): stays -1
        """
        self.freq_mat = np.repeat(np.int8(-1), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)


        """
        these four masks have the same shape as the frequency matrix
        """
        # ref_mask = ((self.char_mat == self.ref_alleles) & (self.char_mat != '-') & (self.char_mat != 'N'))
        # alt_mask = ((self.char_mat == self.alt_alleles) & (self.char_mat != '-') & (self.char_mat != 'N'))

        ref_mask = (self.char_mat == self.ref_alleles)
        alt_mask = (self.char_mat == self.alt_alleles)
        third_mask = (self.char_mat == self.third_alleles)
        forth_mask = (self.char_mat == self.forth_alleles)
        """
        boolean-mask assignment writes through to the underlying flat buffer,
        so each mask updates the matching cells in place
        """
        self.freq_mat[ref_mask] = 0
        self.freq_mat[alt_mask] = 1
        self.freq_mat[third_mask] = 2
        self.freq_mat[forth_mask] = 3

        self.ref_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)
        self.alt_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)
        self.third_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)
        self.forth_prob_mat = np.repeat(np.int8(0), self.char_mat.shape[0]*self.char_mat.shape[1]).reshape(self.char_mat.shape)



        self.ref_prob_mat[ref_mask] = 1
        self.alt_prob_mat[alt_mask] = 1
        self.third_prob_mat[third_mask] = 1
        self.forth_prob_mat[forth_mask] = 1

        ref_counts = np.sum(ref_mask, axis=0)
        alt_counts = np.sum(alt_mask, axis=0)
        third_counts = np.sum(third_mask, axis=0)
        forth_counts = np.sum(forth_mask, axis=0)
        """
        sample_presence sums only the counts of A, T, G, C for each site, leaving out N and -;
        this facilitates the calculation of the site's prevalence at a given alignment position.
        ref and alt allele frequencies are calculated with sample_presence as the denominator
        rather than the number of samples; the other route would also be possible.
        """
166 |         """
167 |         self.sample_presence = np.sum(self.count_mat[0:4,:], axis=0)
168 |         self.prevalence = self.sample_presence/self.nseqs
169 |
170 |         zero_mask = (self.sample_presence == 0)
171 |         self.sample_presence[zero_mask] = 1
172 |         self.ref_freqs = ref_counts/self.sample_presence
173 |         self.alt_freqs = alt_counts/self.sample_presence
174 |         self.third_freqs = third_counts/self.sample_presence
175 |         self.forth_freqs = forth_counts/self.sample_presence
176 |
177 |         self.ref_freqs[zero_mask] = 0
178 |         self.alt_freqs[zero_mask] = 0
179 |         self.third_freqs[zero_mask] = 0
180 |         self.forth_freqs[zero_mask] = 0
181 |
182 |         self.sample_presence[zero_mask] = 0
183 |
184 |         unaligned_masks = (self.count_mat[4:6,:] != 0)
185 |         self.aligned_pctg = 1 - (np.sum(unaligned_masks, axis=0) / self.nseqs)
186 |
--------------------------------------------------------------------------------
/snps_io/id_genome_clusters.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import sys, os
4 | import argparse, operator
5 |
6 | import numpy as np
7 | import networkx as nx
8 |
9 | from time import time
10 |
11 | class GenomeCluster:
12 |     def __init__(self, max_d):
13 |         self.max_d = max_d
14 |
15 |         self.genomes = dict()
16 |         self.links = dict()
17 |
18 |         self.tag_genome = None
19 |
20 |     def size(self):
21 |         return len(self.genomes)
22 |
23 |     def add(self, genome1, genome2, d, edge_weighted):
24 |         weight = 1
25 |         if edge_weighted is True:
26 |             weight = 1 - d
27 |
28 |         if genome1 not in self.genomes:
29 |             self.genomes[genome1] = weight
30 |         else:
31 |             self.genomes[genome1] = self.genomes[genome1] + weight
32 |
33 |         if genome2 not in self.genomes:
34 |             self.genomes[genome2] = weight
35 |         else:
36 |             self.genomes[genome2] = self.genomes[genome2] + weight
37 |
38 |         link = "{}|{}".format(genome1, genome2)
39 |         if genome1 > genome2:
40 |             link = "{}|{}".format(genome2, genome1)
41 |
42 |         #sys.stderr.write("{} {}\n".format(genome1, genome2))
43 |
44 |         if link not in self.links:
45 |             self.links[link] = d
46 |         else:
47 |             #assert True
48 |             assert self.links[link] == d
49 |             #sys.stderr.write("{} {}\n".format(genome1, genome2))
50 |
51 |
52 |     def merge(self, cluster2, genome1, genome2, d, edge_weighted):
53 |         if self.contains(genome1) and cluster2.contains(genome2):
54 |             self.add(genome1, genome2, d, edge_weighted)
55 |             for link in cluster2.links.keys():
56 |                 genomes = link.split("|")
57 |                 self.add(genomes[0], genomes[1], cluster2.links[link], edge_weighted)
58 |         else:
59 |             sys.exit("{} is not in cluster1 or {} is not in cluster2; there is no basis for merging".format(genome1, genome2))
60 |
61 |     def is_empty(self):
62 |         return len(self.genomes.keys()) == 0
63 |
64 |     def contains(self, genome):
65 |         return genome in self.genomes
66 |
67 |     def id_tag_genome(self, cent_meth):
68 |         # no genomes
69 |         if len(self.genomes) == 0:
70 |             sys.exit("\nError: no genomes on cluster: cannot id tag genome\n")
71 |         # one genome
72 |         elif len(self.genomes) == 1:
73 |             self.tag_genome = list(self.genomes.keys())[0]
74 |         else:
75 |             tmp_max = 0
76 |             tmp_genome = None
77 |             if cent_meth == "degree":
78 |                 for genome in self.genomes.keys():
79 |                     if self.genomes[genome] > tmp_max:
80 |                         tmp_max = self.genomes[genome]
81 |                         tmp_genome = genome
82 |             else:
83 |                 G = nx.Graph()
84 |                 centrality = dict()
85 |                 for link in self.links.keys():
86 |                     genomes = link.split("|")
87 |                     G.add_edge(genomes[0], genomes[1])
88 |                 if cent_meth == "eigenvector":
89 |                     centrality = nx.eigenvector_centrality(G)
90 |                 elif cent_meth ==
"katz": 91 | centrality = nx.katz_centrality(G) 92 | elif cent_meth == "closeness": 93 | centrality = nx.closeness_centrality(G) 94 | elif cent_meth == "information": 95 | centrality = nx.information_centrality(G) 96 | elif cent_meth == "betweenness": 97 | centrality = nx.betweenness_centrality(G) 98 | elif cent_meth == "load": 99 | centrality = nx.load_centrality(G) 100 | else: 101 | sys.exit("Error: centrality method {} is not support for tag genome identification".format(cent_meth)) 102 | tmp_genome = [k for k, v in sorted(centrality.items(), key=lambda x: x[1])][-1] 103 | self.tag_genome = tmp_genome 104 | 105 | return self.tag_genome 106 | 107 | def fmtout(self): 108 | sorted_tuples = sorted(self.genomes.items(), key=operator.itemgetter(1), reverse=True) 109 | sorted_genomes = [genome_tuple[0] for genome_tuple in sorted_tuples] 110 | 111 | return "* {} {}".format(self.tag_genome, " ".join(sorted_genomes)) 112 | 113 | def fmtout_all(self): 114 | fmt_str = "{}\n".format(self.fmtout()) 115 | 116 | for link in self.links.keys(): 117 | fmt_str += "- {} {}\n".format(link, self.links[link]) 118 | 119 | return fmt_str 120 | 121 | def search_genome_clusters(dist_path, max_d, cent_meth, edge_weighted): 122 | sys.stderr.write("[clustering] start\n") 123 | 124 | genome_clusters = [] 125 | genome_lookup = dict() 126 | 127 | with open(dist_path, 'r') as fh: 128 | for line in fh: 129 | items = line.rstrip().split('\t') 130 | genome1, genome2, d = items[0], items[1], float(items[2]) 131 | 132 | if genome1 >= genome2 or d > max_d: 133 | #sys.stderr.write("{} {}\n".format(genome1, genome2)) 134 | continue 135 | # sys.stderr.write("{} {}\n".format(genome1, genome2)) 136 | 137 | if genome1 not in genome_lookup and genome2 not in genome_lookup: 138 | new_cluster = GenomeCluster(max_d) 139 | new_cluster.add(genome1, genome2, d, edge_weighted) 140 | genome_clusters.append(new_cluster) 141 | genome_lookup[genome1] = len(genome_clusters) - 1 142 | genome_lookup[genome2] = len(genome_clusters) - 1 143 | elif genome1 in genome_lookup and genome2 not in genome_lookup: 144 | cluster_indx = genome_lookup[genome1] 145 | genome_lookup[genome2] = cluster_indx 146 | genome_clusters[cluster_indx].add(genome1, genome2, d, edge_weighted) 147 | elif genome1 not in genome_lookup and genome2 in genome_lookup: 148 | cluster_indx = genome_lookup[genome2] 149 | genome_lookup[genome1] = cluster_indx 150 | genome_clusters[cluster_indx].add(genome1, genome2, d, edge_weighted) 151 | else: 152 | if genome_lookup[genome1] == genome_lookup[genome2]: 153 | pass 154 | else: 155 | cluster_indx1 = genome_lookup[genome1] 156 | cluster_indx2 = genome_lookup[genome2] 157 | genome_clusters[cluster_indx1].merge(genome_clusters[cluster_indx2], genome1, genome2, d, edge_weighted) 158 | 159 | for genome in genome_clusters[cluster_indx2].genomes: 160 | genome_lookup[genome] = cluster_indx1 161 | 162 | genome_clusters[cluster_indx2] = None 163 | 164 | sys.stderr.write("[clustering] done\n") 165 | sys.stderr.write("[clustering] {} genomes have been included in clusters\n".format(len(genome_lookup.keys()))) 166 | 167 | good_clusters = verify_clusters(genome_clusters, genome_lookup, cent_meth) 168 | 169 | return good_clusters, len(genome_lookup.keys()) 170 | 171 | def verify_clusters(genome_clusters, genome_lookup, cent_meth): 172 | for genome in genome_lookup.keys(): 173 | assert genome in genome_clusters[genome_lookup[genome]].genomes 174 | 175 | good_clusters = [] 176 | for i, cluster in enumerate(genome_clusters): 177 | if cluster is not None: 
178 |             cluster.id_tag_genome(cent_meth)
179 |             good_clusters.append(cluster)
180 |
181 |             for genome in cluster.genomes:
182 |                 assert genome_lookup[genome] == i
183 |
184 |     return good_clusters
185 |
186 | def output_clusters(good_clusters, output_path="/dev/stdout"):
187 |     if output_path is not None:
188 |         with open(output_path, 'w') as fh:
189 |             for gcluster in good_clusters:
190 |                 fh.write(gcluster.fmtout_all())
191 |
192 | def build_genome_blocks(dist_path, total_n, critical_n=100, max_d=0.01, end_d=0.000001, range_factor=1.2, cent_meth="degree", edge_weighted=False, output_path=None):
193 |     optimal_d = 0
194 |     optimal_n = 0
195 |     optimal_clusters = []
196 |
197 |     upper_cap = critical_n * range_factor
198 |
199 |     genome_clusters, clust_n = search_genome_clusters(dist_path, max_d, cent_meth, edge_weighted)
200 |
201 |     tag_n = total_n - clust_n + len(genome_clusters)
202 |
203 |     firstcut_exit = False
204 |     if tag_n > upper_cap:
205 |         print("Program will continue with a non-optimal number ({}) of genomes. Perhaps try a higher cutoff (current {})".format(str(tag_n), str(max_d)))
206 |         optimal_d = max_d
207 |         optimal_n = tag_n
208 |         optimal_clusters = genome_clusters
209 |         firstcut_exit = True
210 |     elif tag_n >= critical_n and tag_n <= upper_cap:
211 |         # perfect scenario on exit
212 |         optimal_d = max_d
213 |         optimal_n = tag_n
214 |         optimal_clusters = genome_clusters
215 |     else:
216 |         # determine lower bound
217 |         min_d = max_d
218 |
219 |         print("[Searching lower cap]")
220 |         while min_d >= end_d and tag_n < critical_n:
221 |             min_d = min_d / 10
222 |
223 |             genome_clusters, clust_n = search_genome_clusters(dist_path, min_d, cent_meth, edge_weighted)
224 |             tag_n = total_n - clust_n + len(genome_clusters)
225 |
226 |             print("\t{}: {} tag genomes".format(min_d, tag_n))
227 |
228 |         print("[End searching]")
229 |
230 |         # binary search into critical range
231 |         print("[Searching optimal d-cut]")
232 |         if min_d < end_d and tag_n < critical_n:
233 |             print("Program cannot reach the number ({}) of genomes required for core-genome SNP calling.".format(critical_n))
234 |             print("Proceeding with original set of genomes")
235 |
236 |             optimal_d = None
237 |             optimal_n = None
238 |             optimal_clusters = None
239 |         else:
240 |             left_d = max_d
241 |             right_d = min_d
242 |             mid_point = int((upper_cap + critical_n) / 2)
243 |
244 |             delta_d = 1 # arbitrary value; does not matter
245 |
246 |             while delta_d > 0.0000001 and (tag_n > upper_cap or tag_n < critical_n):
247 |                 cur_d = (left_d + right_d) / 2
248 |
249 |                 genome_clusters, clust_n = search_genome_clusters(dist_path, cur_d, cent_meth, edge_weighted)
250 |                 tag_n = total_n - clust_n + len(genome_clusters)
251 |
252 |                 if tag_n > mid_point:
253 |                     right_d = cur_d
254 |                 else:
255 |                     left_d = cur_d
256 |
257 |                 delta_d = abs(left_d - right_d)
258 |
259 |                 print("\tsearching space [ {} , {} ]".format(left_d, right_d))
260 |                 print("\tcurrent d-cut: {}".format(cur_d))
261 |                 print("\tcurrent no of tags: {}".format(tag_n))
262 |
263 |                 if tag_n >= critical_n:
264 |                     delta_1 = abs(tag_n - mid_point)
265 |                     delta_2 = abs(optimal_n - mid_point)
266 |
267 |                     if delta_1 < delta_2:
268 |                         optimal_d = cur_d
269 |                         optimal_n = tag_n
270 |                         optimal_clusters = genome_clusters
271 |                     else:
272 |                         pass
273 |                 else:
274 |                     pass
275 |
276 |
277 |             print("[Searching optimal d-cut]")
278 |
279 |             if tag_n < critical_n and optimal_n < critical_n:
280 |                 print("Program cannot reach the number ({}) of genomes required for core-genome SNP calling.".format(critical_n))
281 |                 print("Proceeding with original set of genomes. Or try a higher MAF")
282 |
283 |                 optimal_d = None
284 |                 optimal_n = None
285 |                 optimal_clusters = None
286 |
287 |     return optimal_clusters, optimal_d, optimal_n, firstcut_exit
--------------------------------------------------------------------------------
/db_io/build_db.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import sys, os, argparse, copy, signal
4 | import numpy as np
5 | import multiprocessing as mp
6 |
7 | from time import time, sleep
8 |
9 | from Bio import SeqIO
10 | from Bio.SeqRecord import SeqRecord
11 |
12 | """
13 | This function fetches all possible kmers of the size specified by the kmer_size argument;
14 | the fetched kmers serve as the database used to verify the uniqueness of the kmers that actually cover SNPs.
15 | Building a dictionary of all possible kmers is quite time consuming; one possible speed-up is to pre-compute it and store it as a directly loadable binary file, avoiding the hash computation.
16 |
17 | Update: 07/01/18
18 | Using the following format for the sake of simplicity
19 | kmer_seq: ATGC
20 | """
21 | def fetch_all_kmers(genome_seq, kmer_size, coords=None):
22 |     kmers = []
23 |
24 |     if len(genome_seq) < kmer_size:
25 |         return kmers
26 |     else:
27 |         for i in range(len(genome_seq)-kmer_size+1):
28 |             kmer = genome_seq[i:(i+kmer_size)]
29 |             kmers.append(kmer)
30 |             rc_kmer = revcomp(kmer) # reverse complement; computed but not stored here
31 |         return kmers
32 |
33 | def build_kmer_db(genome_seq, kmer_size):
34 |     kmers = dict()
35 |
36 |     if len(genome_seq) < kmer_size:
37 |         return kmers
38 |     else:
39 |         for i in range(len(genome_seq)-kmer_size+1):
40 |             kmer = genome_seq[i:(i+kmer_size)]
41 |             if kmer not in kmers:
42 |                 kmers[kmer] = 1
43 |             else:
44 |                 kmers[kmer] = kmers[kmer] + 1
45 |
46 |     #for kmer, count in kmers.iteritems():
47 |     #    sys.stderr.write("{}\t{}\n".format(kmer, count))
48 |
49 |     return kmers
50 |
51 | """
52 | The module takes a kmer size argument and can build kmer databases of different sizes.
53 | It also takes a kmer_type argument that allows two approaches to searching kmers that cover target snps:
54 | 1) fetch all eligible kmers; fetch_all_snp_kmers()
55 | 2) fetch kmers whose target snp sits at the center; fetch_center_snp_kmers().
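For intuition about the k-mer fetching described above, here is a compact sketch of enumerating every window that covers a SNP and emitting ref/alt plus reverse-complement variants, in the spirit of fetch_all_snp_kmers() (toy genome and 4-mers; the real database uses 31-mers, coordinate maps and an MSA consensus):

```python
# Editor's sketch (illustrative, not this module's API): all k-mer
# windows covering one SNP, with ref/alt and reverse-complement forms.
def revcomp(seq):
    comp = {"A": "T", "T": "A", "G": "C", "C": "G"}
    return "".join(comp[c] for c in reversed(seq))

genome = "ACGTACGTACGT"
pos, ref, alt, k = 6, "G", "A", 4   # genome[6] == "G"

records = []
for start in range(max(0, pos - k + 1), min(len(genome) - k, pos) + 1):
    window = genome[start:start + k]
    var_pos = pos - start           # SNP offset inside this window
    kmer = window[:var_pos] + ref + window[var_pos + 1:]
    akmer = window[:var_pos] + alt + window[var_pos + 1:]
    records.append([pos, var_pos, kmer, akmer, revcomp(kmer), revcomp(akmer)])

for rec in records:
    print(rec)
```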
56 | 57 | Upadate: 07/01/18 58 | Using the following format for the sake of simplicity 59 | id/glob_pos: 11111 60 | allele_pos_on_kmer: 3 61 | kmer_seq(REF/+): ATGC 62 | kmer_seq(ALT/+): ATTC 63 | kmer_seq(REF/-): GCAT 64 | kmer_seq(ALT/-): GAAT 65 | """ 66 | def fetch_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, kmer_type, coords=None): 67 | if kmer_type == 'all': 68 | return fetch_all_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords) 69 | elif kmer_type == 'center': 70 | return fetch_center_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords) 71 | else: 72 | sys.exit("the specified kmer_type value was not recognized by the program: {}".format(kmer_type)) 73 | 74 | def fetch_all_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords=None): 75 | print("[searching] start to search {}-mers".format(kmer_size)) 76 | 77 | inds_map = None 78 | if coords is not None: 79 | inds_map = [None for i in range(len(genome_seq))] 80 | 81 | for i, coord in enumerate(coords): 82 | for j in range(int(coord[1]), int(coord[2])+1): 83 | inds_map[j] = i 84 | 85 | kmers = [] 86 | for ri, pos in enumerate(snp_pos): 87 | kmer_start = int(pos)-kmer_size+1 88 | kmer_end = int(pos)+kmer_size-1 89 | 90 | if inds_map is not None: 91 | if inds_map[int(pos)] is None: 92 | continue 93 | 94 | cur_coord = coords[inds_map[int(pos)]] 95 | coord_start, coord_end = int(cur_coord[1]), int(cur_coord[2]) 96 | kmer_start, kmer_end = max(coord_start, kmer_start), min(coord_end, kmer_end) 97 | 98 | if kmer_end - kmer_start + 1 >= kmer_size: 99 | subseq = genome_seq[kmer_start:(kmer_end+1)] 100 | 101 | for i in range(len(subseq)-kmer_size+1): 102 | kmer = subseq[i:(i+kmer_size)] 103 | 104 | var_pos = kmer_size-i-1 105 | 106 | kmer = kmer[:var_pos]+snp_alleles[ri][0]+kmer[var_pos+1:] 107 | akmer = kmer[:var_pos]+snp_alleles[ri][1]+kmer[var_pos+1:] 108 | 109 | rc_kmer = revcomp(kmer) 110 | rc_akmer = revcomp(akmer) 111 | 112 | kmers.append([pos, var_pos, kmer, akmer, rc_kmer, rc_akmer]) 113 | print(" a total of {} kmers was found\n".format(len(kmers))) 114 | return kmers 115 | 116 | def load_msa(msa_path): 117 | genome_msa = dict() 118 | 119 | with open(msa_path, 'r') as fh: 120 | for line in fh: 121 | if line[0] == '>': 122 | working_id = line.split(' ')[0][1:] 123 | elif line[0] == '=': 124 | pass 125 | else: 126 | if working_id not in genome_msa: 127 | genome_msa[working_id] = "" 128 | 129 | genome_msa[working_id] = genome_msa[working_id] + line.rstrip() 130 | 131 | genome_seqs = [genome_msa[key] for key in genome_msa.keys()] 132 | 133 | return genome_seqs 134 | 135 | 136 | def fetch_all_from_msa(genome_seqs, ref_seq, snp_pos, snp_alleles, kmer_size, coords=None): 137 | print("[searching] start to search {}-mers".format(kmer_size)) 138 | 139 | inds_map = None 140 | if coords is not None: 141 | inds_map = [None for i in range(len(ref_seq))] 142 | 143 | for i, coord in enumerate(coords): 144 | for j in range(int(coord[1]), int(coord[2])+1): 145 | inds_map[j] = i 146 | 147 | kmer_records = [] 148 | for ri, pos in enumerate(snp_pos): 149 | kmer_start = int(pos)-kmer_size+1 150 | kmer_end = int(pos)+kmer_size-1 151 | 152 | if inds_map is not None: 153 | if inds_map[int(pos)] is None: 154 | continue 155 | 156 | cur_coord = coords[inds_map[int(pos)]] 157 | coord_start, coord_end = int(cur_coord[1]), int(cur_coord[2]) 158 | kmer_start, kmer_end = max(coord_start, kmer_start), min(coord_end, kmer_end) 159 | 160 | if kmer_end - kmer_start + 1 >= kmer_size: 161 | subseqs = 
[genome_seq[kmer_start:(kmer_end+1)] for genome_seq in genome_seqs]
162 |
163 |             for i in range(len(subseqs[0])-kmer_size+1):
164 |                 raw_kmers = [subseq[i:(i+kmer_size)] for subseq in subseqs]
165 |
166 |                 kmers = []
167 |                 for rk in raw_kmers:
168 |                     if '-' not in rk and 'N' not in rk:
169 |                         kmers.append(rk)
170 |
171 |                 ukmers, counts = np.unique(kmers, return_counts=True)
172 |                 uk_inds = np.argsort(counts)[::-1]
173 |
174 |                 var_pos = kmer_size-i-1
175 |
176 |                 kmer = ""
177 |                 akmer = ""
178 |                 kflag = False
179 |                 akflag = False
180 |                 for ukmer in ukmers[uk_inds]:
181 |                     if kflag is False:
182 |                         if ukmer[var_pos] == snp_alleles[ri][0]:
183 |                             kmer = ukmer
184 |                             kflag = True
185 |
186 |                     if akflag is False:
187 |                         if ukmer[var_pos] == snp_alleles[ri][1]:
188 |                             akmer = ukmer
189 |                             akflag = True
190 |
191 |                     if kflag is True and akflag is True:
192 |                         break
193 |
194 |                 if len(kmer) != kmer_size or len(akmer) != kmer_size:
195 |                     continue
196 |
197 |                 rc_kmer = revcomp(kmer)
198 |                 rc_akmer = revcomp(akmer)
199 |
200 |                 kmer_records.append([pos, var_pos, kmer, akmer, rc_kmer, rc_akmer])
201 |
202 |     print(" a total of {} kmer records were found\n".format(len(kmer_records)))
203 |     return kmer_records
204 |
205 | def fetch_center_snp_kmers(genome_seq, snp_pos, snp_alleles, kmer_size, coords=None):
206 |     print("[searching] start to search {}-mers\n".format(kmer_size))
207 |
208 |     inds_map = None
209 |     if coords is not None:
210 |         inds_map = [None for i in range(len(genome_seq))]
211 |
212 |         for i, coord in enumerate(coords):
213 |             for j in range(int(coord[1]), int(coord[2])+1):
214 |                 inds_map[j] = i
215 |
216 |     is_even = (kmer_size % 2 == 0)
217 |
218 |     kmers = []
219 |     for ri, pos in enumerate(snp_pos):
220 |         kmer_start, kmer_end, var_pos = 0, 0, 0
221 |
222 |         if is_even:
223 |             var_pos = int(kmer_size/2)
224 |             kmer_start = int(pos)-int(kmer_size/2)+1
225 |             kmer_end = int(pos)+int(kmer_size/2)
226 |         else:
227 |             var_pos = int(kmer_size/2)+1
228 |             kmer_start = int(pos)-int(kmer_size/2)
229 |             kmer_end = int(pos)+int(kmer_size/2)
230 |
231 |         if inds_map is not None:
232 |             if inds_map[int(pos)] is None:
233 |                 continue
234 |
235 |             cur_coord = coords[inds_map[int(pos)]]
236 |
237 |             coord_start = int(cur_coord[1])
238 |             coord_end = int(cur_coord[2])
239 |
240 |             if kmer_start < coord_start or kmer_end > coord_end:
241 |                 continue
242 |
243 |         kmer = genome_seq[kmer_start:(kmer_end+1)]
244 |         kmer = kmer[:var_pos]+snp_alleles[ri][0]+kmer[var_pos+1:]
245 |
246 |         # substitute the alternate allele at the SNP position
247 |         akmer = kmer[:var_pos]+snp_alleles[ri][1]+kmer[var_pos+1:]
248 |
249 |         rc_kmer = revcomp(kmer)
250 |         rc_akmer = revcomp(akmer)
251 |
252 |         kmers.append([pos, var_pos, kmer, akmer, rc_kmer, rc_akmer])
253 |     print(" a total of {} kmers were found\n".format(len(kmers)))
254 |     return kmers
255 |
256 | def revcomp(seq):
257 |     """ Reverse complement sequence
258 |
259 |     Args:
260 |         seq: string from alphabet {A,T,C,G,N}
261 |
262 |     Returns:
263 |         reverse complement of seq
264 |     """
265 |     complement = {
266 |         'A':'T',
267 |         'T':'A',
268 |         'G':'C',
269 |         'C':'G',
270 |         'N':'N',
271 |         'R':'N',
272 |         'Y':'N',
273 |         'K':'N',
274 |         'M':'N',
275 |         'S':'N',
276 |         'W':'N',
277 |         'B':'N',
278 |         'D':'N',
279 |         'H':'N',
280 |         'V':'N'
281 |     }
282 |     return ''.join([complement[_] for _ in seq[::-1]])
283 |
284 | def calc_snp_coverage(kmers):
285 |     return len(set([kmer[0] for kmer in kmers]))
286 |
287 | def dump_tsv(kmers, output):
288 |     with open(output, 'w') as fh:
289 |         for kmer in kmers:
290 |             if len(kmer) == 6:
291 |                 fh.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(*kmer))
292 |             elif
len(kmer) == 9: 293 | fh.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(*kmer)) 294 | else: 295 | assert False 296 | 297 | # a mini function to load all coordinates in to memory 298 | def read_coords(fpath): 299 | print("[load] loading key coordinates on core-genome from {}".format(fpath)) 300 | 301 | coords = [] 302 | with open(fpath, "r") as fh: 303 | fh.readline() 304 | for line in fh: 305 | coords.append(line.rstrip('\n').split('\t')) 306 | 307 | print(" a total of {} divisions was found\n".format(str(len(coords)))) 308 | return coords 309 | 310 | def open_vcf_file_local(fpath): 311 | """ 312 | * ``Record.CHROM``; string 313 | * ``Record.POS``; int 314 | * ``Record.ID``; None 315 | * ``Record.REF``; string 316 | * ``Record.ALT``; list 317 | * ``Record.QUAL``; None 318 | * ``Record.FILTER``; list 319 | * ``Record.INFO``; dictionary 320 | 321 | additional attributes: 322 | * ``Record.FORMAT``; string 323 | * ``Record.samples``; list 324 | * ``Record.genotype``; object 325 | """ 326 | 327 | print("[load] loading core snps from {}".format(fpath)) 328 | 329 | snp_gb_pos = [] 330 | snp_alleles = [] 331 | with open(fpath, 'r') as fh: 332 | for l in fh: 333 | if l[0] == "#": 334 | continue 335 | else: 336 | values = l.rstrip().split('\t')[:5] 337 | 338 | chrom = values[0] 339 | pos_r = int(values[1]) 340 | gid = values[2] 341 | allele_ma = values[3] 342 | allele_mi = values[4] 343 | 344 | if len(allele_mi) > 1: 345 | continue 346 | snp_gb_pos.append(int(gid)) 347 | snp_alleles.append([allele_ma, allele_mi]) 348 | print(" a total of {} core snps was found\n".format(str(len(snp_gb_pos)))) 349 | 350 | return snp_gb_pos, snp_alleles 351 | 352 | 353 | def open_genome_seq(genome_path): 354 | print("[load] loading core-genome consensus sequence from {}".format(genome_path)) 355 | 356 | records = list(SeqIO.parse(genome_path, "fasta")) 357 | main_genome = "" 358 | for record in records: 359 | main_genome = main_genome + str(record.seq).upper() 360 | 361 | print(" the loaded core-genome has a consensus sequence of {} bases\n".format(str(len(main_genome)))) 362 | 363 | return main_genome 364 | 365 | def read_kmerset(kmer_path): 366 | print("[load] loading kmerset from {}".format(kmer_path)) 367 | kmerset = [] 368 | 369 | with open(kmer_path, "r") as fh: 370 | for line in fh: 371 | items = line.rstrip().split('\t') 372 | items[6] = int(items[6]) 373 | items[7] = int(items[7]) 374 | items[8] = int(items[8]) 375 | kmerset.append(items) 376 | 377 | print(" the loaded kmerset has {} kmer records\n".format(str(len(kmerset)))) 378 | return kmerset 379 | 380 | -------------------------------------------------------------------------------- /snps_io/concat_alleles.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | import operator 3 | from time import time 4 | 5 | def read_input(in_dir, subset_list=None): 6 | fpaths = [] 7 | fnames = [] 8 | 9 | subset_map = dict() 10 | 11 | for f in os.listdir(in_dir): 12 | subset_map[f] = 1 13 | 14 | if subset_list is not None: 15 | subset_map = dict() 16 | with open(subset_list, 'r') as fh: 17 | for ln in fh: 18 | items = ln.rstrip().split('\t') 19 | assert len(items) == 2 20 | fname = items[0].split('/')[-1] 21 | subset_map[fname] = 1 22 | 23 | for f in os.listdir(in_dir): 24 | if f in subset_map: 25 | fpath = in_dir.rstrip('/')+'/'+f 26 | 27 | if os.path.isfile(fpath): 28 | fstats = os.stat(fpath) 29 | if fstats.st_size >= 0: 30 | fpaths.append(fpath) 31 | fnames.append(f) 32 | else: 33 | 
sys.stderr.write("skip {}: empty file\n".format(fpath)) 34 | else: 35 | sys.stderr.write("skip {}: not exist\n".format(fpath)) 36 | 37 | else: 38 | sys.stderr.write("skip {}\n".format(f)) 39 | 40 | return fpaths, fnames 41 | 42 | def read_msa(msa_in): 43 | valid_chars = dict() 44 | valid_chars['A'] = 'A' 45 | valid_chars['a'] = 'A' 46 | valid_chars['C'] = 'C' 47 | valid_chars['c'] = 'C' 48 | valid_chars['G'] = 'G' 49 | valid_chars['g'] = 'G' 50 | valid_chars['T'] = 'T' 51 | valid_chars['t'] = 'T' 52 | valid_chars['-'] = '-' 53 | 54 | alns = dict() 55 | cur_seq = "" 56 | cur_sample = "" 57 | cur_contig = "" 58 | with open(msa_in, 'r') as fh: 59 | for line in fh: 60 | if line[0] == '>': 61 | items = line.rstrip().split(' ') 62 | if items[0] not in alns: 63 | alns[items[0]] = dict() 64 | if items[1] not in alns[items[0]]: 65 | alns[items[0]][items[1]] = "" 66 | cur_sample = items[0] 67 | cur_contig = items[1] 68 | elif line[0] == '=': 69 | pass 70 | else: 71 | elems = [] 72 | for char in line.rstrip().split(): 73 | if char not in valid_chars: 74 | elems.append(char) 75 | else: 76 | elems.append(valid_chars[char]) 77 | alns[cur_sample][cur_contig] = "".join(elems) 78 | 79 | aln_recs = [] 80 | for sample in alns.keys(): 81 | for contig in alns.keys(): 82 | aln_recs.append([sample, contig, alns[cur_sample][cur_contig]]) 83 | 84 | sorted_alns = sorted(aln_recs, key = lambda x: (x[1], x[0])) 85 | concat_alns = dict() 86 | for aln_rec in sorted_alns: 87 | if aln_rec[0] not in concat_alns: 88 | concat_alns[aln_rec[0]] = "" 89 | else: 90 | concat_alns[aln_rec[0]] += aln_rec[2] 91 | 92 | return concat_alns 93 | 94 | def read_aln(aln_in): 95 | valid_chars = dict() 96 | valid_chars['A'] = 'A' 97 | valid_chars['a'] = 'A' 98 | valid_chars['C'] = 'C' 99 | valid_chars['c'] = 'C' 100 | valid_chars['G'] = 'G' 101 | valid_chars['g'] = 'G' 102 | valid_chars['T'] = 'T' 103 | valid_chars['t'] = 'T' 104 | valid_chars['-'] = '-' 105 | 106 | alns = dict() 107 | cur_seq = "" 108 | cur_sample = "" 109 | with open(aln_in, 'r') as fh: 110 | for line in fh: 111 | if line[0] == '>': 112 | cur_sample = line.rstrip() 113 | if cur_sample not in alns: 114 | alns[cur_sample] = "" 115 | else: 116 | elems = [] 117 | for char in line.rstrip().split(): 118 | if char not in valid_chars: 119 | elems.append(char) 120 | else: 121 | elems.append(valid_chars[char]) 122 | alns[cur_sample] += "".join(elems) 123 | 124 | return alns 125 | 126 | def write_aln(alns, out_path, max_gap=0.2): 127 | with open(out_path, 'w') as fh: 128 | for aln_key in alns.keys(): 129 | n_gaps = 0 130 | total_len = len(alns[aln_key]) 131 | for base in alns[aln_key]: 132 | if base == '-': 133 | n_gaps += 1 134 | 135 | if n_gaps/total_len > max_gap: 136 | print("{}: skip {}".format(n_gaps/total_len, aln_key)) 137 | else: 138 | fh.write("{}\n{}\n".format(aln_key, alns[aln_key])) 139 | 140 | def read_gtp(input_path, min_depth): 141 | input_recs = dict() 142 | 143 | with open(input_path, 'r') as fh: 144 | for line in fh: 145 | items = line.rstrip().split("\t") 146 | 147 | contig_id = items[0] 148 | contig_pos = items[1] 149 | snp_key = contig_id + "__" + contig_pos 150 | 151 | cnt_allele_1 = int(items[5]) 152 | cnt_allele_2 = int(items[6]) 153 | if cnt_allele_1 + cnt_allele_2 < min_depth: 154 | continue 155 | 156 | allele = "" 157 | if cnt_allele_1 > cnt_allele_2: 158 | allele = items[3] 159 | else: 160 | allele = items[4] 161 | input_recs[snp_key] = allele 162 | 163 | return input_recs 164 | 165 | def union_inputs(inputs, names): 166 | first_union_in = dict() 167 | 
168 | for input_recs in inputs: 169 | for snp_key in input_recs: 170 | if snp_key not in first_union_in: 171 | first_union_in[snp_key] = 1 172 | print("first_union_in: {}".format(len(first_union_in.keys()))) 173 | 174 | union_in = dict() 175 | n_samples = len(names) 176 | for snp_key in first_union_in.keys(): 177 | n_prev = 0 178 | allele_col = dict() 179 | for input_recs in inputs: 180 | if snp_key in input_recs: 181 | n_prev += 1 182 | allele_col[input_recs[snp_key]] = 1 183 | #if n_prev / n_samples >= 10 and len(allele_col.keys()) > 1: 184 | if len(allele_col.keys()) > 1: 185 | union_in[snp_key] = 1 186 | 187 | print("union_in: {}".format(len(union_in.keys()))) 188 | all_keys = [[key.split('__')[0], int(key.split('__')[1])] for key in union_in.keys()] 189 | sorted_keys = sorted(all_keys, key = lambda x: (x[0], x[1])) 190 | #sorted_keys = sorted(all_keys, key = operator.itemgetter(0, 1)) 191 | 192 | allele_aln = dict() 193 | for i, input_recs in enumerate(inputs): 194 | alleles = [] 195 | for elem in sorted_keys: 196 | snp_key = elem[0] + "__" + str(elem[1]) 197 | if snp_key in input_recs: 198 | alleles.append(input_recs[snp_key]) 199 | else: 200 | alleles.append('-') 201 | allele_aln[names[i]] = alleles 202 | 203 | return allele_aln 204 | 205 | def concat_snps(allele_aln, allele_aln_fasta, max_gap, min_prev, min_maf, min_mac): 206 | with open(allele_aln_fasta, 'w') as fh: 207 | good_names = [] 208 | for name in allele_aln.keys(): 209 | n_gaps = 0 210 | total_len = len(allele_aln[name]) 211 | for base in allele_aln[name]: 212 | if base == '-': 213 | n_gaps += 1 214 | 215 | if n_gaps/total_len > max_gap: 216 | print("{}: skip {}".format(n_gaps/total_len, name)) 217 | else: 218 | good_names.append(name) 219 | 220 | print(good_names) 221 | comm_aln = dict() 222 | comm_inds = [] 223 | n_samples = len(good_names) 224 | 225 | for i, allele in enumerate(allele_aln[good_names[0]]): 226 | n_gaps = 0 227 | alleles = [] 228 | allele_track = dict() 229 | for name in good_names: 230 | alleles.append(allele_aln[name][i]) 231 | if allele_aln[name][i] == '-': 232 | n_gaps += 1 233 | else: 234 | if allele_aln[name][i] not in allele_track: 235 | allele_track[allele_aln[name][i]] = 1 236 | else: 237 | allele_track[allele_aln[name][i]] += 1 238 | 239 | if (1 - n_gaps/n_samples) < min_prev: 240 | print("low prevalence: {}: skip {}".format(1 - n_gaps/n_samples, i)) 241 | continue 242 | elif len(allele_track.keys()) <= 1: 243 | print("not a SNP site: skip {}".format(i)) 244 | continue 245 | else: 246 | sorted_allele_track = sorted(allele_track.items(), key=lambda item: item[1], reverse=True) 247 | major_count = sorted_allele_track[0][1] 248 | minor_count = sorted_allele_track[1][1] 249 | if minor_count/n_samples < min_maf: 250 | print("low min MAF: {}: skip {}".format(minor_count/n_samples, i)) 251 | continue 252 | elif minor_count < min_mac: 253 | print("low min MAC: {}: skip {}".format(minor_count, i)) 254 | continue 255 | else: 256 | comm_inds.append(i) 257 | 258 | print("number of good sites: {}".format(len(comm_inds))) 259 | print("number of good samples: {}".format(len(good_names))) 260 | 261 | for name in good_names: 262 | if name not in comm_aln: 263 | comm_aln[name] = [] 264 | for i in comm_inds: 265 | comm_aln[name].append(allele_aln[name][i]) 266 | fh.write(">{}\n{}\n".format(name, "".join(comm_aln[name]))) 267 | 268 | return allele_aln_fasta 269 | 270 | 271 | def run_command(cmd, env=None): 272 | import subprocess as sp 273 | if env: 274 | p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, 
stderr=sp.PIPE, env=env)
275 |     else:
276 |         p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
277 |     out, err = p.communicate()
278 |     if p.returncode != 0:
279 |         err_msg = "\nError: the following returned non-zero status: '%s':\n" % cmd
280 |         err_msg += "\n%s" % err
281 |         sys.exit(err_msg)
282 |     else:
283 |         return out, err
284 |
285 | def run_fasttree(snp_str_fasta, out_dir):
286 |     sys.stderr.write("[start] inferring max. likelihood tree\n")
287 |     sys.stderr.write("\tsnp string fasta path: {}\n".format(snp_str_fasta))
288 |
289 |     o_mat_path = out_dir + "/concat_allele.aln.mat"
290 |
291 |     command = "FastTreeMP -makematrix -nt -gtr < "
292 |     command += snp_str_fasta
293 |     command += " > "
294 |     command += o_mat_path
295 |
296 |     environ = os.environ.copy()
297 |     run_command(command, environ)
298 |     sys.stderr.write("\tfinishing up, distance matrix is written to {}\n".format(o_mat_path))
299 |
300 |     o_tre_path = out_dir + "/concat_allele.aln.tre"
301 |
302 |     command = "FastTreeMP -nt -gtr < "
303 |     command += snp_str_fasta
304 |     command += " > "
305 |     command += o_tre_path
306 |
307 |     environ = os.environ.copy()
308 |     run_command(command, environ)
309 |
310 |     sys.stderr.write("\tfinishing up, tree is written to {}\n".format(o_tre_path))
311 |     sys.stderr.write("[done] inferring max. likelihood tree\n")
312 |
313 | def concat_allele_tree(args):
314 |     in_dir = args['input_dir']
315 |     in_path = args['input_list']
316 |
317 |     out_dir = args['out_dir'].rstrip('/')
318 |     if not os.path.isdir(out_dir):
319 |         os.makedirs(out_dir)
320 |
321 |     min_sites_per_sample = args['min_sites_per_sample']
322 |     max_gap_ratio = args['max_gap_ratio']
323 |     min_site_prev = args['min_site_prev']
324 |     min_maf = args['min_maf']
325 |     min_mac = args['min_mac']
326 |
327 |     paths, names = read_input(in_dir, in_path)
328 |     if len(names) != len(set(names)):
329 |         sys.stderr.write("\n[error] names of input files are not unique.\n")
330 |         sys.exit()
331 |     input_recs = []
332 |     aln_fasta = out_dir + '/concat_allele.aln.fasta'
333 |     nonempty_names = []
334 |     for i, path in enumerate(paths):
335 |         if not os.path.exists(path):
336 |             print("Skip input: {} does not exist.".format(path))
337 |             continue
338 |         if args["min_depth"] is not None:
339 |             min_depth = args["min_depth"]
340 |             input_rec = read_gtp(path, min_depth)
341 |             if len(input_rec.keys()) < min_sites_per_sample:
342 |                 print("{}: skipped {}".format(len(input_rec.keys()), path))
343 |             else:
344 |                 input_recs.append(input_rec)
345 |                 nonempty_names.append(names[i])
346 |     allele_aln = union_inputs(input_recs, nonempty_names)
347 |     concat_snps(allele_aln, aln_fasta, max_gap_ratio, min_site_prev, min_maf, min_mac)
348 |     run_fasttree(aln_fasta, out_dir)
--------------------------------------------------------------------------------
/snps_io/align_assembly.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import sys  # needed for sys.exit() in id_core_genome()
3 | import numpy as np
4 | from snps_io import vcf_io
5 |
6 | class AlignAssembly:
7 |     def __init__(self, algns=[], max_sites=float('inf'), gpos_offset=0):
8 |         self.alignments = algns
9 |         self.sample_ids = []
10 |
11 |         self.chroms = np.array([])
12 |         # self.char_mat = np.array([])
13 |         self.global_pos = np.array([])
14 |         self.local_pos = np.array([])
15 |         self.ref_alleles = np.array([])
16 |         self.alt_alleles = np.array([])
17 |         self.third_alleles = np.array([])
18 |         self.forth_alleles = np.array([])
19 |
20 |         self.ref_prob_mat = np.array([])
21 |         self.alt_prob_mat = np.array([])
22 |
self.third_prob_mat = np.array([]) 23 | self.forth_prob_mat = np.array([]) 24 | 25 | self.freq_mat = np.array([]) 26 | self.sample_presence = np.array([]) 27 | self.prevalence = np.array([]) 28 | self.ref_freqs = np.array([]) 29 | self.alt_freqs = np.array([]) 30 | self.third_freqs = np.array([]) 31 | self.forth_freqs = np.array([]) 32 | 33 | self.gpos_offset = gpos_offset 34 | self.is_spliced = False 35 | 36 | if self._check_(): 37 | self.splice(max_sites) 38 | 39 | self.snps = [] 40 | self.coords = [] 41 | self.consensus_genome = "" 42 | 43 | def _check_(self): 44 | if len(self.alignments) > 2: 45 | if all([len(align.sample_ids) == len(self.alignments[0].sample_ids) for align in self.alignments]): 46 | return True 47 | else: 48 | return False 49 | else: 50 | return False 51 | 52 | def splice(self, max_sites=float('inf')): 53 | if (len(self.alignments) == 1): 54 | print("warning: cannot splice less than 2 alignments") 55 | 56 | first_align = self.alignments[0] 57 | self.sample_ids = first_align.sample_ids 58 | 59 | # self.char_mat = np.concatenate([algn.char_mat for algn in self.alignments], axis=1) 60 | self.chroms = np.array(np.repeat(first_align.chrom, first_align.ncols)) 61 | self.local_pos = np.array(first_align.local_pos) 62 | 63 | self.ref_alleles = np.array(first_align.ref_alleles) 64 | self.alt_alleles = np.array(first_align.alt_alleles) 65 | self.third_alleles = np.array(first_align.third_alleles) 66 | self.forth_alleles = np.array(first_align.forth_alleles) 67 | 68 | self.ref_prob_mat = np.array(first_align.ref_prob_mat) 69 | self.alt_prob_mat = np.array(first_align.alt_prob_mat) 70 | self.third_prob_mat = np.array(first_align.third_prob_mat) 71 | self.forth_prob_mat = np.array(first_align.forth_prob_mat) 72 | 73 | self.freq_mat = np.array(first_align.freq_mat) # attention here 74 | self.sample_presence = np.array(first_align.sample_presence) 75 | self.prevalence = np.array(first_align.prevalence) 76 | self.ref_freqs = np.array(first_align.ref_freqs) 77 | self.alt_freqs = np.array(first_align.alt_freqs) 78 | self.third_freqs = np.array(first_align.third_freqs) 79 | self.forth_freqs = np.array(first_align.forth_freqs) 80 | 81 | self.global_pos = np.arange(len(self.chroms)) + self.gpos_offset 82 | 83 | self.is_spliced = True 84 | if max_sites < len(self.chroms): 85 | self.cut_short(max_sites) 86 | elif len(self.alignments) > 1 and all([len(align.sample_ids) == len(self.alignments[0].sample_ids) for align in self.alignments]): 87 | first_align = self.alignments[0] 88 | self.sample_ids = first_align.sample_ids 89 | 90 | # self.char_mat = np.concatenate([algn.char_mat for algn in self.alignments], axis=1) 91 | self.chroms = np.concatenate([np.repeat(algn.chrom, algn.ncols) for algn in self.alignments], axis=0) 92 | self.local_pos = np.concatenate([algn.local_pos for algn in self.alignments], axis=0) 93 | 94 | self.ref_alleles = np.concatenate([algn.ref_alleles for algn in self.alignments], axis=0) 95 | self.alt_alleles = np.concatenate([algn.alt_alleles for algn in self.alignments], axis=0) 96 | self.third_alleles = np.concatenate([algn.third_alleles for algn in self.alignments], axis=0) 97 | self.forth_alleles = np.concatenate([algn.forth_alleles for algn in self.alignments], axis=0) 98 | 99 | 100 | self.ref_prob_mat = np.concatenate([algn.ref_prob_mat for algn in self.alignments], axis=1) 101 | self.alt_prob_mat = np.concatenate([algn.alt_prob_mat for algn in self.alignments], axis=1) 102 | self.third_prob_mat = np.concatenate([algn.third_prob_mat for algn in self.alignments], 
axis=1)
103 |             self.forth_prob_mat = np.concatenate([algn.forth_prob_mat for algn in self.alignments], axis=1)
104 |
105 |             self.freq_mat = np.concatenate([algn.freq_mat for algn in self.alignments], axis=1)
106 |
107 |             self.sample_presence = np.concatenate([algn.sample_presence for algn in self.alignments], axis=0)
108 |             self.prevalence = np.concatenate([algn.prevalence for algn in self.alignments], axis=0)
109 |             self.ref_freqs = np.concatenate([algn.ref_freqs for algn in self.alignments], axis=0)
110 |             self.alt_freqs = np.concatenate([algn.alt_freqs for algn in self.alignments], axis=0)
111 |             self.third_freqs = np.concatenate([algn.third_freqs for algn in self.alignments], axis=0)
112 |             self.forth_freqs = np.concatenate([algn.forth_freqs for algn in self.alignments], axis=0)
113 |
114 |             self.global_pos = np.arange(len(self.chroms)) + self.gpos_offset
115 |
116 |             self.is_spliced = True
117 |
118 |             if max_sites < len(self.chroms):
119 |                 self.cut_short(max_sites)
120 |         else:
121 |             print("error: no alignments, or alignments with unequal sample counts")
122 |
123 |         print("total number of sites: {}".format(len(self.chroms)))
124 |
125 |         return self.is_spliced
126 |
127 |     def cut_short(self, _max_sites):
128 |         if self.is_spliced:
129 |             if _max_sites < len(self.chroms):
130 |                 max_sites = int(_max_sites)
131 |
132 |                 self.chroms = self.chroms[:max_sites]
133 |                 self.global_pos = self.global_pos[:max_sites]
134 |                 self.local_pos = self.local_pos[:max_sites]
135 |                 self.ref_alleles = self.ref_alleles[:max_sites]
136 |                 self.alt_alleles = self.alt_alleles[:max_sites]
137 |                 self.freq_mat = self.freq_mat[:,:max_sites]
138 |                 self.third_alleles = self.third_alleles[:max_sites]; self.forth_alleles = self.forth_alleles[:max_sites]  # keep all per-site arrays the same length
139 |                 self.ref_prob_mat = self.ref_prob_mat[:,:max_sites]
140 |                 self.alt_prob_mat = self.alt_prob_mat[:,:max_sites]
141 |                 self.third_prob_mat = self.third_prob_mat[:,:max_sites]
142 |                 self.forth_prob_mat = self.forth_prob_mat[:,:max_sites]
143 |
144 |                 self.sample_presence = self.sample_presence[:max_sites]
145 |                 self.prevalence = self.prevalence[:max_sites]
146 |                 self.ref_freqs = self.ref_freqs[:max_sites]
147 |                 self.alt_freqs = self.alt_freqs[:max_sites]; self.third_freqs = self.third_freqs[:max_sites]; self.forth_freqs = self.forth_freqs[:max_sites]
148 |         else:
149 |             print("warning: cannot cut short unspliced alignments; no changes have been made!")
150 |
151 |     def id_core_genome(self, min_prev, min_alt_freq):
152 |         print("min. prevalence: {}".format(min_prev))
153 |         print("min. alt.
frequency: {}".format(min_alt_freq)) 154 | 155 | if self.is_spliced: 156 | prev_mask = (self.prevalence >= min_prev) 157 | snp_mask = (self.alt_freqs >= min_alt_freq) & (self.ref_alleles != b'N') & (self.ref_alleles != b'-') 158 | wildcard_mask = (self.ref_alleles != b'N') & (self.ref_alleles != b'-') 159 | 160 | # alt_freq_mask = ((1 - self.ref_freqs - self.alt_freqs) <= (min_alt_freq+0.000000001)) 161 | 162 | 163 | # fake_mask = np.logical_not(alt_freq_mask) 164 | # print alt_freq_mask[fake_mask] 165 | # print (1 - self.ref_freqs - self.alt_freqs)[fake_mask] 166 | # print self.ref_freqs[fake_mask] 167 | # print self.alt_freqs[fake_mask] 168 | # print self.third_freqs[fake_mask] 169 | # print self.forth_freqs[fake_mask] 170 | 171 | self.consensus_genome = self.id_consensus_genome() 172 | 173 | shift_chroms = np.append(self.chroms[1:], self.chroms[-1]) 174 | boundary_mask = np.logical_not((shift_chroms == self.chroms)) 175 | #goodness_mask = (prev_mask & alt_freq_mask & wildcard_mask) 176 | goodness_mask = (prev_mask & wildcard_mask) 177 | 178 | self.coords = self.id_coordinates(boundary_mask, goodness_mask) 179 | 180 | print("masked by prev_mask: {}".format(np.sum(prev_mask))) 181 | print("masked by snp_mask: {}".format(np.sum(snp_mask))) 182 | # print "masked by alt_freq_mask: {}".format(np.sum(alt_freq_mask)) 183 | print("masked by wildcard_mask: {}".format(np.sum(wildcard_mask))) 184 | 185 | calling_mask = goodness_mask & snp_mask 186 | self.snps = self.id_snps(calling_mask) 187 | else: 188 | sys.exit("premature call of id_core_genome, the multiple alignments were sliced yet.") 189 | 190 | def id_consensus_genome(self): 191 | if self.is_spliced: 192 | if len(self.ref_alleles) > 0: 193 | return b''.join([ref_allele for ref_allele in self.ref_alleles]) 194 | else: 195 | return b'' 196 | else: 197 | return b'' 198 | 199 | def id_snps(self, calling_mask): 200 | if self.is_spliced: 201 | snps = [] 202 | 203 | snp_chroms = self.chroms[calling_mask] 204 | snp_gb_pos = self.global_pos[calling_mask] 205 | snp_lc_pos = self.local_pos[calling_mask] 206 | snp_refs = self.ref_alleles[calling_mask] 207 | snp_alts = self.alt_alleles[calling_mask] 208 | snp_third = self.third_alleles[calling_mask] 209 | snp_forth = self.forth_alleles[calling_mask] 210 | 211 | snp_ref_prob_mat = self.ref_prob_mat[:,calling_mask] 212 | snp_alt_prob_mat = self.alt_prob_mat[:,calling_mask] 213 | snp_third_prob_mat = self.third_prob_mat[:,calling_mask] 214 | snp_forth_prob_mat = self.forth_prob_mat[:,calling_mask] 215 | 216 | snp_freqs = self.freq_mat[:,calling_mask] 217 | snp_presence = self.sample_presence[calling_mask] 218 | snp_prevs = self.prevalence[calling_mask] 219 | snp_ref_freqs = self.ref_freqs[calling_mask] 220 | snp_alt_freqs = self.alt_freqs[calling_mask] 221 | snp_third_freqs = self.third_freqs[calling_mask] 222 | snp_forth_freqs = self.forth_freqs[calling_mask] 223 | 224 | for i, chrom in enumerate(snp_chroms): 225 | var_id = str(snp_gb_pos[i]) 226 | 227 | freq_row = snp_freqs[:,i] 228 | freq_row[snp_freqs[:,i] == None] = -1 229 | 230 | snp_ref_prob_row = snp_ref_prob_mat[:,i] 231 | snp_ref_prob_row[snp_ref_prob_mat[:,i] == None] = -1 232 | 233 | snp_alt_prob_row = snp_alt_prob_mat[:,i] 234 | snp_alt_prob_row[snp_alt_prob_mat[:,i] == None] = -1 235 | 236 | snp_third_prob_row = snp_third_prob_mat[:,i] 237 | snp_third_prob_row[snp_third_prob_mat[:,i] == None] = -1 238 | 239 | snp_forth_prob_row = snp_forth_prob_mat[:,i] 240 | snp_forth_prob_row[snp_forth_prob_mat[:,i] == None] = -1 241 | 242 | allele_mask = 
(np.array([snp_ref_prob_row.sum(), snp_alt_prob_row.sum(), snp_third_prob_row.sum(), snp_forth_prob_row.sum()]) > 0) 243 | 244 | alleles = np.array([snp_alts[i], snp_third[i], snp_forth[i]]) 245 | alleles = alleles[allele_mask[1:]] 246 | 247 | if len(alleles) == 0: 248 | avail_alleles = b'.' 249 | else: 250 | avail_alleles = b','.join(alleles) 251 | 252 | snp = self._make_snp_( 253 | chrom, var_id, snp_lc_pos[i], 254 | snp_refs[i], snp_alts[i], snp_third[i], snp_forth[i], avail_alleles, 255 | len(self.sample_ids), snp_presence[i], round(snp_alt_freqs[i], 3), 256 | self.sample_ids, snp_ref_prob_row, snp_alt_prob_row, snp_third_prob_row, snp_forth_prob_row 257 | ) 258 | snps.append(snp) 259 | 260 | self.snps = snps 261 | return snps 262 | else: 263 | return [] 264 | 265 | def id_coordinates(self, boundary_mask, goodness_mask): 266 | if self.is_spliced: 267 | end_pos = np.array([]) 268 | start_pos = np.array([]) 269 | 270 | if len(self.alignments) > 1: 271 | end_pos = self.global_pos[boundary_mask] 272 | end_pos = np.append(end_pos, self.global_pos[-1]) 273 | 274 | start_pos = np.array([self.global_pos[0]]) 275 | shift_ends = end_pos[:-1] + 1 276 | start_pos = np.concatenate((start_pos, shift_ends)) 277 | else: 278 | end_pos = np.array([self.global_pos[-1]]) 279 | start_pos = np.array([self.global_pos[0]]) 280 | 281 | bad_pos = self.global_pos[np.logical_not(goodness_mask)] 282 | 283 | rshift_bad_pos = bad_pos + 1 284 | lshift_bad_pos = bad_pos - 1 285 | 286 | start_pos = np.concatenate((start_pos, rshift_bad_pos)) 287 | end_pos = np.concatenate((end_pos, lshift_bad_pos)) 288 | 289 | start_pos = np.sort(start_pos) 290 | end_pos = np.sort(end_pos) 291 | 292 | good_region_mask = (start_pos <= end_pos) 293 | start_pos = start_pos[good_region_mask] 294 | end_pos = end_pos[good_region_mask] 295 | 296 | end_pos = np.sort(end_pos) 297 | 298 | coords = [] 299 | for i, sp in enumerate(start_pos): 300 | coords.append({'chrom':self.chroms[sp-self.gpos_offset], 'start':sp, 'end':end_pos[i]}) 301 | 302 | self.coords = coords 303 | return coords 304 | else: 305 | return [] 306 | 307 | def _make_snp_(self, chrom, var_id, pos, ref, alt, third, forth, avail_alleles, NS, DP, AF, samp_ids, gp1, gp2, gp3, gp4): 308 | """ Format SNP for VCF """ 309 | info = {} 310 | info['NS'] = NS 311 | info['DP'] = DP 312 | info['AF'] = AF 313 | 314 | dat_fmt = {} 315 | dat_fmt['GP1'] = gp1 316 | dat_fmt['GP2'] = gp2 317 | dat_fmt['GP3'] = gp3 318 | dat_fmt['GP4'] = gp4 319 | 320 | snp = vcf_io.SNP(chrom, var_id, pos, ref, alt, third, forth, avail_alleles, info, dat_fmt, samp_ids) 321 | 322 | return snp 323 | 324 | def call_snps(aligns, max_sites, min_prev, snp_freq): 325 | """ 326 | Loop over each genomic site in each contig. 327 | For each site, fetch per-sample info from pileup files. 328 | Initialize GenomicSite object 329 | Determine site prevalence and allele frequency. 330 | Keep track of core-genome coordinates and SNPs in those regions. 
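A hypothetical usage sketch for the entry points of this module (the `aligns` list is assumed to come from one of the align_io readers; parameter values are illustrative):

```python
# Editor's sketch (hypothetical usage, not from this repository's docs).
aa = call_snps(aligns, max_sites=float("inf"), min_prev=0.9, snp_freq=0.01)

print(len(aa.coords))  # core-genome regions identified
print(len(aa.snps))    # SNPs called within those regions
```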
331 |
332 |     Args:
333 |         max_sites: int; max number of sites to process
334 |         min_prev: float; minimum prevalence for calling core sites
335 |         snp_freq: float; minimum minor allele frequency for snp calling
336 |     """
337 |
338 |     aa = AlignAssembly(aligns, max_sites)
339 |
340 |     if not aa.is_spliced:
341 |         aa.splice()
342 |
343 |     if max_sites < len(aa.chroms):
344 |         aa.cut_short(max_sites)
345 |
346 |     aa.id_core_genome(min_prev, snp_freq)
347 |
348 |     return aa
349 |
350 | def call_snps_iter(align_iterator, max_sites, min_prev, snp_freq):
351 |     """
352 |     Loop over each genomic site in each contig.
353 |     For each site, fetch per-sample info from pileup files.
354 |     Initialize GenomicSite object
355 |     Determine site prevalence and allele frequency.
356 |     Keep track of core-genome coordinates and SNPs in those regions.
357 |
358 |     Args:
359 |         max_sites: int; max number of sites to process
360 |         min_prev: float; minimum prevalence for calling core sites
361 |         snp_freq: float; minimum minor allele frequency for snp calling
362 |     """
363 |
364 |     block_size = 100*1000
365 |     counter = 0
366 |     gb_pos = 0
367 |
368 |     aligns = []
369 |     for align in align_iterator:
370 |         aligns.append(align)
371 |         counter = counter + align.ncols
372 |
373 |         if counter > block_size:
374 |             aa = AlignAssembly(aligns, max_sites, gb_pos)
375 |
376 |             if not aa.is_spliced:
377 |                 aa.splice()
378 |
379 |             if max_sites < len(aa.chroms):
380 |                 aa.cut_short(max_sites)
381 |
382 |             aa.id_core_genome(min_prev, snp_freq)
383 |
384 |             for ali in aligns:
385 |                 gb_pos = gb_pos + ali.ncols
386 |
387 |             aligns = []
388 |             counter = 0
389 |
390 |             yield aa
391 |
392 |
393 |     if len(aligns) > 0:
394 |         aa = AlignAssembly(aligns, max_sites, gb_pos)
395 |
396 |         if not aa.is_spliced:
397 |             aa.splice()
398 |
399 |         if max_sites < len(aa.chroms):
400 |             aa.cut_short(max_sites)
401 |
402 |         aa.id_core_genome(min_prev, snp_freq)
403 |
404 |         yield aa
405 |
--------------------------------------------------------------------------------
/src/callm_db_build.cpp:
--------------------------------------------------------------------------------
1 | #if __linux__
2 | #include
3 | #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
4 | #define _MAP_POPULATE_AVAILABLE
5 | #endif
6 | #endif
7 |
8 | #ifdef _MAP_POPULATE_AVAILABLE
9 | #define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
10 | #else
11 | #define MMAP_FLAGS MAP_PRIVATE
12 | #endif
13 |
14 | #include
15 | #include
16 | #include
17 |
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 |
26 | #include
27 | #include
28 | #include
29 |
30 |
31 | using namespace std;
32 |
33 |
34 | // this program loads k-mer/SNP-ID records, sorts them, purges conflicting k-mers, and writes a binary k-mer database
35 |
36 | // usage:
37 | // g++ -O3 --std=c++11 -o callm_db_build callm_db_build.cpp
38 | // ./callm_db_build fpath [fpath ...]
39 | //
40 | // input is tab-separated kmer/snp-id text (.tsv) or a previously built binary database (.bin); other file types are rejected.
41 |
42 | // global variable declaration starts here
43 | constexpr auto k = 31;
44 |
45 | // set operation mode
46 | // valid values: 0, 1, 2
47 | // 0 is set union operation; 1 is set intersection operation; 2 is set difference([set1-set2]);
48 | constexpr auto s_mod = 0;
49 |
50 | // parameters for file read; from the source of GNU coreutils wc
51 | constexpr auto step_size = 256 * 1024 * 1024;
52 | constexpr auto buffer_size = 256 * 1024 * 1024;
53 |
54 | // output file path
55 | constexpr auto out_path = "/dev/stdout";
56 | // get time elapsed since when it all began in milliseconds.
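The helpers below pack each 31-mer into a 64-bit integer at two bits per base (A=0, C=1, G=2, T=3). For reference, the same scheme in Python, kept in Python for consistency with the other sketches in this document (illustrative only):

```python
# Editor's sketch (illustrative): 2-bit packing of a DNA k-mer, mirroring
# the seq_encode()/seq_decode() helpers in this file.
CODE = {"A": 0, "C": 1, "G": 2, "T": 3}

def seq_encode(seq):
    code = 0
    for base in seq:
        code = (code << 2) | CODE[base]   # first base ends up most significant
    return code

def seq_decode(code, length):
    bases = "ACGT"
    return "".join(bases[(code >> (2 * (length - i - 1))) & 0b11]
                   for i in range(length))

kmer = "ACGTACGTACGTACGTACGTACGTACGTACG"  # a 31-mer fits in 62 bits
packed = seq_encode(kmer)
assert seq_decode(packed, len(kmer)) == kmer
```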
57 | long chrono_time() { 58 | using namespace chrono; 59 | return duration_cast(system_clock::now().time_since_epoch()).count(); 60 | } 61 | 62 | // number of bits per single nucleotide base 63 | constexpr int bpb = 2; 64 | 65 | size_t get_fsize(const char* filename) { 66 | struct stat st; 67 | stat(filename, &st); 68 | return st.st_size; 69 | } 70 | 71 | 72 | char* get_ftype(const char* filename) { 73 | int fn_len = strlen(filename); 74 | char *ftype = (char *)malloc(5); 75 | 76 | for(int i = 0; i < 4; ++i) { 77 | ftype[i] = filename[fn_len - 4 + i]; 78 | } 79 | 80 | ftype[4] = '\0'; 81 | 82 | return ftype; 83 | } 84 | 85 | 86 | template 87 | int_type bit_encode(const char c) { 88 | switch (c) { 89 | case 'A': return 0; 90 | case 'C': return 1; 91 | case 'G': return 2; 92 | case 'T': return 3; 93 | } 94 | 95 | assert(false); 96 | } 97 | 98 | 99 | template 100 | char bit_decode(const int_type bit_code) { 101 | switch (bit_code) { 102 | case 0: return 'A'; 103 | case 1: return 'C'; 104 | case 2: return 'G'; 105 | case 3: return 'T'; 106 | } 107 | assert(false); 108 | } 109 | 110 | template 111 | void make_code_dict(int_type* code_dict) { 112 | code_dict['A'] = bit_encode('A'); 113 | code_dict['C'] = bit_encode('C'); 114 | code_dict['G'] = bit_encode('G'); 115 | code_dict['T'] = bit_encode('T'); 116 | } 117 | 118 | template 119 | int_type seq_encode(const char* buf, int len, const int_type* code_dict, const int_type b_mask) { 120 | int_type seq_code = 0; 121 | for (int i=0; i < len; ++i) { 122 | const int_type b_code = code_dict[buf[i]]; 123 | seq_code |= ((b_code & b_mask) << (bpb * (len - i - 1))); 124 | } 125 | return seq_code; 126 | } 127 | 128 | template 129 | void seq_decode(char* buf, const int len, const int_type seq_code, int_type* code_dict, const int_type b_mask) { 130 | for (int i=0; i < len-1; ++i) { 131 | const int_type b_code = (seq_code >> (bpb * (len - i - 2))) & b_mask; 132 | buf[i] = bit_decode(b_code); 133 | } 134 | 135 | buf[len-1] = '\0'; 136 | } 137 | 138 | 139 | template 140 | void bit_load(const char* k_path, vector& buffer, vector>& k_vec, const int_type* code_dict, const int_type b_mask) { 141 | auto t_start = chrono_time(); 142 | 143 | char* window = buffer.data(); 144 | 145 | uintmax_t n_lines = 0; 146 | 147 | int fd; 148 | fd = open(k_path, O_RDONLY); 149 | 150 | int cur_pos = 0; 151 | int snp_pos = 0; 152 | 153 | char seq_buf[k]; 154 | char snp_id[16]; 155 | 156 | //auto fh = fstream(out_path, ios::out | ios::binary); 157 | 158 | bool id_switch = false; 159 | bool has_wildcard = false; 160 | 161 | while (true) { 162 | 163 | const ssize_t bytes_read = read(fd, window, step_size); 164 | 165 | if (bytes_read == 0) 166 | break; 167 | 168 | if (bytes_read == (ssize_t) -1) { 169 | cerr << "unknown fetal error when reading " << k_path << endl; 170 | exit(EXIT_FAILURE); 171 | } 172 | 173 | for (int i = 0; i < bytes_read; ++i) { 174 | char c = toupper(window[i]); 175 | if (c == '\n') { 176 | ++n_lines; 177 | 178 | if (has_wildcard) { 179 | has_wildcard = false; 180 | continue; 181 | } 182 | 183 | auto code = seq_encode(seq_buf, k, code_dict, b_mask); 184 | 185 | snp_id[snp_pos] = '\0'; 186 | int_type id_int = stoull(snp_id); 187 | //int_type id_int = 1; 188 | 189 | k_vec.push_back(tuple(code, id_int)); 190 | 191 | cur_pos = 0; 192 | snp_pos = 0; 193 | 194 | id_switch = false; 195 | } else if (c == '\t'){ 196 | id_switch = true; 197 | } else { 198 | if (c == 'N') { 199 | has_wildcard = true; 200 | } 201 | 202 | if (id_switch) { 203 | snp_id[snp_pos++] = c; 204 | } else 
{ 205 | seq_buf[cur_pos++] = c; 206 | } 207 | } 208 | } 209 | 210 | //fh.write(&kmers[0], kmers.size()); 211 | 212 | // cerr << n_lines << " lines were scanned after " << (chrono_time() - t_start) / 1000 << " seconds" << endl; 213 | } 214 | 215 | auto timeit = chrono_time(); 216 | } 217 | 218 | 219 | template 220 | void bit_load(vector& buffer, vector>& k_vec, const int_type* code_dict, const int_type b_mask) { 221 | auto t_start = chrono_time(); 222 | 223 | char* window = buffer.data(); 224 | 225 | uintmax_t n_lines = 0; 226 | 227 | int cur_pos = 0; 228 | int snp_pos = 0; 229 | 230 | char seq_buf[k]; 231 | char snp_id[16]; 232 | 233 | //auto fh = fstream(out_path, ios::out | ios::binary); 234 | 235 | bool id_switch = false; 236 | bool has_wildcard = false; 237 | 238 | while (true) { 239 | 240 | const ssize_t bytes_read = read(STDIN_FILENO, window, step_size); 241 | 242 | if (bytes_read == 0) 243 | break; 244 | 245 | if (bytes_read == (ssize_t) -1) { 246 | cerr << "unknown fetal error when reading from stdin" << endl; 247 | exit(EXIT_FAILURE); 248 | } 249 | 250 | for (int i = 0; i < bytes_read; ++i) { 251 | char c = toupper(window[i]); 252 | if (c == '\n') { 253 | ++n_lines; 254 | 255 | if (has_wildcard) { 256 | has_wildcard = false; 257 | continue; 258 | } 259 | 260 | auto code = seq_encode(seq_buf, k, code_dict, b_mask); 261 | 262 | snp_id[snp_pos] = '\0'; 263 | int_type id_int = stoull(snp_id); 264 | //int_type id_int = 1; 265 | 266 | k_vec.push_back(tuple(code, id_int)); 267 | 268 | cur_pos = 0; 269 | snp_pos = 0; 270 | id_switch = false; 271 | } else if (c == '\t'){ 272 | id_switch = true; 273 | } else { 274 | if (c == 'N') { 275 | has_wildcard = true; 276 | } 277 | 278 | if (id_switch) { 279 | snp_id[snp_pos++] = c; 280 | } else { 281 | seq_buf[cur_pos++] = c; 282 | } 283 | } 284 | } 285 | 286 | //fh.write(&kmers[0], kmers.size()); 287 | 288 | // cerr << n_lines << " lines were scanned after " << (chrono_time() - t_start) / 1000 << " seconds" << endl; 289 | } 290 | 291 | auto timeit = chrono_time(); 292 | } 293 | 294 | 295 | template 296 | void binary_load(const char* k_path, vector>& k_vec) { 297 | size_t filesize = get_fsize(k_path); 298 | //Open file 299 | int fd = open(k_path, O_RDONLY, 0); 300 | assert(fd != -1); 301 | //Execute mmap 302 | //uint64_t* mmappedData = (uint64_t *) mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0); 303 | int_type* mmappedData = (int_type *) mmap(NULL, filesize, PROT_READ, MMAP_FLAGS, fd, 0); 304 | assert(mmappedData != MAP_FAILED); 305 | //Write the mmapped data to stdout (= FD #1) 306 | 307 | // write(1, mmappedData, filesize); 308 | 309 | // char seq_buf[k+1]; 310 | 311 | auto l_start = chrono_time(); 312 | 313 | for (uint64_t i = 0; i < filesize/8; i=i+2) { 314 | // seq_decode(seq_buf, k, mmappedData[i], b_mask); 315 | 316 | auto kmer_int = mmappedData[i]; 317 | auto snp = mmappedData[i+1]; 318 | 319 | string snp_str = to_string(snp); 320 | 321 | if (snp_str[6] == '2'){ 322 | snp_str[6] = '0'; 323 | } else if (snp_str[6] == '3') { 324 | snp_str[6] = '1'; 325 | } 326 | 327 | auto k_pair = make_tuple(kmer_int, stoull(snp_str)); 328 | 329 | k_vec.push_back(k_pair); 330 | } 331 | 332 | //Cleanup 333 | int rc = munmap(mmappedData, filesize); 334 | assert(rc == 0); 335 | close(fd); 336 | } 337 | 338 | 339 | template 340 | bool cmp_tuple(const tuple &a, const tuple &b){ 341 | return get<0>(a) < get<0>(b); 342 | } 343 | 344 | template 345 | void multi_btc64() { 346 | int_type lsb = 1; 347 | int_type b_mask = (lsb << bpb) - lsb; 348 | 349 
349 |     int_type code_dict[1 << (sizeof(char) * 8)];
350 |     make_code_dict(code_dict);
351 | 
352 |     vector<tuple<int_type, int_type>> kdb;
353 |     vector<char> buffer(buffer_size);
354 | 
355 |     bit_load(buffer, kdb, code_dict, b_mask);
356 | 
357 |     auto timeit = chrono_time();
358 |     sort(kdb.begin(), kdb.end(), cmp_tuple<int_type>);
359 |     // typename vector<tuple<int_type, int_type>>::iterator ip = unique(kdb.begin(), kdb.end());
360 |     // kdb.resize(std::distance(kdb.begin(), ip));
361 |     cerr << "Done!\n" << "It takes " << (chrono_time() - timeit) / 1000 << " secs" << endl;
362 |     cerr << "the kmer list has " << kdb.size() << " kmers" << endl;
363 | 
364 |     // char seq_buf[k+1];
365 | 
366 |     vector<int_type> o_buff;
367 | 
368 |     ofstream fh(out_path, ofstream::out | ofstream::binary);
369 | 
370 |     for (auto it = kdb.begin(); it != kdb.end(); ++it) {
371 |         // seq_decode(seq_buf, k+1, *it, code_dict, b_mask);
372 |         // fh << seq_buf << "\n";
373 |         // fh << *it << "\n";
374 | 
375 |         // cerr << get<0>(*it) << '\t' << get<1>(*it) << '\n';
376 |         o_buff.push_back(get<0>(*it));
377 |         o_buff.push_back(get<1>(*it));
378 |     }
379 | 
380 |     fh.write((char*)&o_buff[0], o_buff.size() * sizeof(int_type));
381 | 
382 |     fh.close();
383 | }
384 | 
385 | template <class int_type>
386 | void multi_btc64(int n_path, char** kpaths) {
387 |     int_type lsb = 1;
388 |     int_type b_mask = (lsb << bpb) - lsb;
389 | 
390 |     int_type code_dict[1 << (sizeof(char) * 8)];
391 |     make_code_dict(code_dict);
392 | 
393 |     vector<tuple<int_type, int_type>> kdb;
394 |     vector<char> buffer(buffer_size);
395 | 
396 |     for (int i = 1; i < n_path; ++i) {
397 |         cerr << kpaths[i] << endl;
398 | 
399 |         char* kp_type = get_ftype(kpaths[i]);
400 | 
401 |         if (strcmp(kp_type, ".tsv") == 0) {
402 |             bit_load(kpaths[i], buffer, kdb, code_dict, b_mask);
403 |         } else if (strcmp(kp_type, ".bin") == 0) {
404 |             binary_load(kpaths[i], kdb);
405 |         } else {
406 |             assert(false);
407 |         }
408 |     }
409 | 
410 |     auto timeit = chrono_time();
411 | 
412 |     sort(kdb.begin(), kdb.end(), cmp_tuple<int_type>);
413 |     // typename vector<tuple<int_type, int_type>>::iterator ip = unique(kdb.begin(), kdb.end());
414 |     // kdb.resize(std::distance(kdb.begin(), ip));
415 |     cerr << "Sorting done!\n" << "It takes " << (chrono_time() - timeit) / 1000 << " secs" << endl;
" << "It takes " << (chrono_time() - timeit) / 1000 << " secs" << endl; 416 | cerr << "the kmer list has " << kdb.size() << " kmers" << endl; 417 | 418 | char seq_buf[k+1]; 419 | // ofstream fh(out_path, ofstream::out | ofstream::binary); 420 | vector o_buff; 421 | 422 | // move onto checkout when checkout_flag is true 423 | bool checkout_flag = true; 424 | vector> auto_queue; 425 | 426 | cerr << "start to check conflicts" << endl; 427 | for (auto it = kdb.begin(); it+1 != kdb.end(); ++it) { 428 | // seq_decode(seq_buf, k+1, get<0>(*it), code_dict, b_mask); 429 | // cerr << seq_buf << '\t' << get<1>(*it) << '\n'; 430 | 431 | if (get<0>(*it) == get<0>(*(it+1))) { 432 | auto spe1 = stoi(to_string(get<1>(*it)).substr(0, 6)); 433 | auto spe2 = stoi(to_string(get<1>(*(it+1))).substr(0, 6)); 434 | 435 | if (spe1 != spe2) { 436 | checkout_flag = false; 437 | } 438 | 439 | auto_queue.push_back(*it); 440 | 441 | continue; 442 | } 443 | 444 | // check out when code(i) != code(i+1) 445 | if (!checkout_flag) { 446 | for(auto iq = auto_queue.begin(); iq != auto_queue.end(); ++iq){ 447 | //cerr << get<0>(*iq) << " - " << get<1>(*iq) << '\n'; 448 | } 449 | auto_queue.clear(); 450 | checkout_flag = true; 451 | } else { 452 | if (auto_queue.size() > 0){ 453 | for(auto iq = auto_queue.begin(); iq != auto_queue.end(); ++iq){ 454 | o_buff.push_back(get<0>(*iq)); 455 | o_buff.push_back(get<1>(*iq)); 456 | } 457 | 458 | auto_queue.clear(); 459 | } 460 | o_buff.push_back(get<0>(*it)); 461 | o_buff.push_back(get<1>(*it)); 462 | } 463 | } 464 | 465 | auto end_ele = kdb.back(); 466 | if (checkout_flag) { 467 | if (auto_queue.size() > 0){ 468 | for(auto iq = auto_queue.begin(); iq != auto_queue.end(); ++iq){ 469 | o_buff.push_back(get<0>(*iq)); 470 | o_buff.push_back(get<1>(*iq)); 471 | } 472 | 473 | auto_queue.clear(); 474 | } 475 | o_buff.push_back(get<0>(end_ele)); 476 | o_buff.push_back(get<1>(end_ele)); 477 | } 478 | 479 | cerr << "the kmer list has " << o_buff.size()/2<< " kmers after purging conflicts" << endl; 480 | 481 | vector>().swap(kdb); 482 | 483 | unordered_map> snp_indx; 484 | 485 | for (auto it = o_buff.begin(); it != o_buff.end(); it=it+2) { 486 | auto snp = stoull(to_string(*(it+1)).substr(0, 6) + to_string(*(it+1)).substr(7)); 487 | auto snp_type = stoi(to_string(*(it+1)).substr(6, 1)); 488 | 489 | if (snp_indx.find(snp) == snp_indx.end()) { 490 | auto type_pair = make_tuple(0, 0); 491 | snp_indx.insert({snp, type_pair}); 492 | } 493 | 494 | assert(snp_type == 0 || snp_type == 1); 495 | 496 | if (snp_type == 0) { 497 | get<0>(snp_indx[snp]) = 1; 498 | } else { 499 | get<1>(snp_indx[snp]) = 1; 500 | } 501 | } 502 | 503 | vector v_buff; 504 | 505 | for (auto it = o_buff.begin(); it != o_buff.end(); it=it+2) { 506 | auto snp = stoull(to_string(*(it+1)).substr(0, 6) + to_string(*(it+1)).substr(7)); 507 | 508 | assert(snp_indx.find(snp) != snp_indx.end()); 509 | 510 | if (get<0>(snp_indx[snp]) + get<1>(snp_indx[snp]) == 2) { 511 | v_buff.push_back(*it); 512 | v_buff.push_back(*(it+1)); 513 | } 514 | } 515 | 516 | cerr << "the kmer list has " << v_buff.size()/2<< " kmers after purging conflicts" << endl; 517 | 518 | ofstream fh(out_path, ofstream::binary); 519 | 520 | fh.write((char*)&v_buff[0], v_buff.size() * sizeof(int_type)); 521 | fh.close(); 522 | } 523 | 524 | void display_usage(char *fname){ 525 | cout << "usage: " << fname << " fpath [fpath ...]\n"; 526 | } 527 | 528 | int main(int argc, char** argv){ 529 | if (argc == 2 && string(argv[1]) == "-h") { 530 | display_usage(argv[0]); 531 | } else 
532 |         multi_btc64<uint64_t>(argc, argv);
533 |     } else if (argc == 1) {
534 |         multi_btc64<uint64_t>();
535 |     } else {
536 |         cerr << argv[0] << " reads from stdin or takes at least one argument!" << endl;
537 |         display_usage(argv[0]);
538 |         exit(EXIT_FAILURE);
539 |     }
540 | 
541 |     return 0;
542 | }
543 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Catalogue
2 | 
3 | * [Maast](https://github.com/zjshi/Maast#maast)
4 | * [What Maast does](https://github.com/zjshi/Maast#what-maast-does)
5 | * [How to cite](https://github.com/zjshi/Maast#how-to-cite)
6 | * [Installation](https://github.com/zjshi/Maast#installation)
7 | * [Conda Installation](https://github.com/zjshi/Maast#conda-installation)
8 | * [How to use](https://github.com/zjshi/Maast#how-to-use)
9 | * * [Type SNPs from a set of whole genome assemblies and sequencing reads from beginning to end in one single command line](https://github.com/zjshi/Maast#type-snps-from-a-set-of-whole-genome-assemblies-and-sequencing-reads-from-beginning-to-end-in-one-single-command-line)
10 | * * [Genotype SNPs step by step](https://github.com/zjshi/Maast#genotype-snps-step-by-step)
11 | * * * [Step 1a: Call SNP with a collection of whole genome assemblies](https://github.com/zjshi/Maast#step-1a-call-snp-with-a-collection-of-whole-genome-assemblies)
12 | * * * [Step 1b: Call SNPs from a set of whole genomes without redundancy reduction](https://github.com/zjshi/Maast#step-1b-call-snps-from-a-set-of-whole-genomes-without-redundancy-reduction)
13 | * * * [Step 1c: Call SNPs with customized minimum prevalence and minor allele frequency (MAF) thresholds](https://github.com/zjshi/Maast#step-1c-call-snps-with-customized-minimum-prevalence-and-minor-allele-frequency-maf-thresholds)
14 | * * * [Step 2: Build SNP covering k-mer database](https://github.com/zjshi/Maast#step-2-build-snp-covering-k-mer-database)
15 | * * * [Step 3: Genotype whole genome assemblies, sequencing reads or both](https://github.com/zjshi/Maast#step-3-genotype-whole-genome-assemblies-sequencing-reads-or-both)
16 | * * * [Construct a SNP tree with Maast genotypes (optional)](https://github.com/zjshi/Maast#construct-a-snp-tree-with-maast-genotypes-optional)
17 | * * * [More helper text and arguments](https://github.com/zjshi/Maast#more-helper-text-and-arguments)
18 | * [Example tutorial](https://github.com/zjshi/Maast#example)
19 | * * [Download and decompress test dataset](https://github.com/zjshi/Maast#download-and-decompress-test-dataset)
20 | * * [Genotype SNPs from begin to end in one single command line with the test dataset](https://github.com/zjshi/Maast#genotype-snps-from-begin-to-end-in-one-single-command-line-with-the-test-dataset)
21 | * * [Genotype SNPs step by step with the test dataset](https://github.com/zjshi/Maast#genotype-snps-step-by-step-with-the-test-dataset)
22 | * * * [Step 1: Call SNPs with whole genome assemblies](https://github.com/zjshi/Maast#step-1-call-snps-with-whole-genome-assemblies)
23 | * * * [Step 2: Build SNP covering k-mer database](https://github.com/zjshi/Maast#step-2-build-snp-covering-k-mer-database-1)
24 | * * * [Step 3: Genotype whole genome assemblies, sequencing reads or both](https://github.com/zjshi/Maast#step-3-genotype-whole-genome-assemblies-sequencing-reads-or-both-1)
25 | * * * [Construct a SNP tree with Maast genotypes (optional)](https://github.com/zjshi/Maast#construct-a-snp-tree-with-maast-genotypes-optional-1)
26 | 
27 | # Maast
28 | 
29 | Maast: microbial agile accurate SNP typing
30 | 
31 | ## What Maast does
32 | 
33 | Recent spikes in available whole-genome sequences have greatly expanded known intra-species diversity, especially for prevalent species. As the number of genomes per species grows, it becomes computationally challenging to perform whole-genome alignment and call single nucleotide polymorphisms (SNPs). Furthermore, the genomes of some species are highly similar and hence redundant for SNP discovery. These trends are irreversible and will only intensify over time. To address the challenge, we present Maast, a tool for discovering core-genome SNPs and genotyping these SNPs in conspecific genomes, contigs, or unassembled reads. Maast runs orders of magnitude faster than existing tools and uses less RAM because it is free of read alignment and assembly. Maast is also comparably accurate and recovers more core-genome SNPs than other state-of-the-art tools.
34 | 
35 | ## How to cite
36 | 
37 | The publication of Maast is in preparation. Please cite this GitHub repo as an alternative for now.
38 | 
39 | ## Installation
40 | 
41 | Python requirement
42 | * Python3 (>=3.9.6)
43 | 
44 | Required Python libraries
45 | * [NumPy](https://numpy.org/install/) (>=1.19.5)
46 | * [SciPy](https://scipy.org/install/) (>=1.5.4)
47 | * [Biopython](https://biopython.org/wiki/Download) (>=1.79)
48 | * [NetworkX](https://pypi.org/project/networkx/) (>=2.5.1)
49 | 
50 | Note: the following installation command line might be helpful
51 | `pip install numpy scipy biopython networkx`
52 | 
53 | Required external programs
54 | * [Mash](https://github.com/marbl/Mash) (>=v2.2)
55 | * [MUMmer4](https://github.com/mummer4/mummer) (>=v4.0.0)
56 | 
57 | Optional installation
58 | * [FastTreeMP](http://www.microbesonline.org/fasttree/FastTreeMP) (>=v2.1.11) (Optional; only required when the tree subcommand is run)
59 | * [pigz](https://zlib.net/pigz/) (Optional; a parallel implementation of gzip for modern multi-processor, multi-core machines)
60 | * [lbzip2](http://lbzip2.org/) (Optional; a free, multi-threaded compression utility with support for the bzip2 compressed file format)
61 | * [lz4](http://www.lz4.org) (Optional; extremely fast compression algorithm)
62 | 
63 | Note: the optional dependencies are not required for the essential features of Maast, but installing them is recommended for better performance or additional features.
64 | 
65 | First, retrieve a copy of Maast to your local computing environment
66 | 
67 | `git clone https://github.com/zjshi/Maast.git`
68 | 
69 | Change your current working directory into where you put Maast
70 | `cd /path/to/Maast/`
71 | 
72 | Type in the command line to compile the source code of Maast
73 | `make`
74 | 
75 | Type in the command line to make Maast ready to execute
76 | `chmod 755 maast`
77 | 
78 | The main program (`maast`) should be found in the same directory as `/path/to/Maast/`. This location can be added to the system path so that the main program can be accessed from anywhere. Referencing it through its full path is also allowed.
79 | 
80 | Type in the command line to display help text
81 | 
82 | `./maast -h`
83 | 
84 | Notes for C++ compiler
85 | 
86 | Maast requires a C++ compiler that is compatible with the C++11 standard to work properly. All the tests have been done and passed with clang-900.0.38, but it should also be compatible with the GNU C++ compiler (newer than 5.4.0). We have not tested Maast with older compilers, but we expect it to run similarly as long as it compiles successfully.
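
If `make` is not available, the two bundled C++ programs can likely be compiled by hand with any C++11 compiler. The exact flags below are an assumption extrapolated from the usage notes inside the sources, not a documented build path; the Makefile remains the authoritative recipe

`g++ -O3 --std=c++11 -o bin/callm_db_build src/callm_db_build.cpp`

`g++ -O3 --std=c++11 -pthread -o bin/callm_db_val src/callm_db_val.cpp`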
87 | 
88 | 
89 | ## Conda Installation
90 | 
91 | Create a new conda environment
92 | `conda create -n maast`
93 | 
94 | Activate the environment just created
95 | `conda activate maast`
96 | 
97 | Conda automatic installation with all dependencies
98 | `conda install -c conda-forge -c bioconda maast`
99 | 
100 | Quick installation verification
101 | `maast -h`
102 | 
103 | ## How to use
104 | 
105 | ### Type SNPs from a set of whole genome assemblies and sequencing reads from beginning to end in one single command line
106 | 
107 | `maast end_to_end --in-dir /path/to/directory/containing/genomes/reads/or/both --out-dir /path/Maast/output/ --min-prev 0.9 --snp-freq 0.01`
108 | 
109 | Note:
110 | 
111 | The input directory must contain a number of whole genome assemblies in FASTA format.
112 | 
113 | Maast can automatically identify file types with the supported file suffixes: whole genome assemblies (.fa, .fsa, .fna and .fasta) and sequencing reads (.fq and .fastq). Files compressed with popular algorithms, including .gz, .lz4 and .bz2, are also supported.
114 | 
115 | Running the end_to_end subcommand is equivalent to running the genomes, db and genotype subcommands with default settings in a row.
116 | 
117 | ### Genotype SNPs step by step
118 | 
119 | #### Step 1a: Call SNP with a collection of whole genome assemblies
120 | `maast genomes --fna-dir /path/to/genomes/ --out-dir /path/Maast/output/`
121 | 
122 | Note:
123 | By default, Maast first collapses redundancy in the input genomes and then calls common SNPs from a subset of tag genomes. It also automatically identifies a centroid genome and uses it as the representative genome.
124 | 
125 | Upon a successful run, this step will produce several important files that are required for downstream steps.
126 | * reference.fna (Reference genome that provides genomic coordinates for SNPs)
127 | * core_snps.vcf (SNP catalog)
128 | * tag_paths.list (Selected tag genomes)
129 | 
130 | #### Step 1b: Call SNPs from a set of whole genomes with a specified reference genome without redundancy reduction
131 | 
132 | `maast genomes --fna-dir /path/to/genomes/ --rep-fna /path/to/rep_genome.fna --out-dir /path/Maast/output/ --skip-centroid --keep-redundancy`
133 | 
134 | #### Step 1c: Call SNPs with customized minimum prevalence and minor allele frequency (MAF) thresholds
135 | 
136 | `maast genomes --fna-dir /path/to/genomes/ --rep-fna /path/to/rep_genome.fna --out-dir /path/Maast/output/ --min-prev 0.95 --snp-freq 0.001`
137 | 
138 | #### Step 2: Build SNP covering k-mer database
139 | 
140 | `maast db --ref-genome /path/to/reference.fna --vcf /path/to/core_snps.vcf --msa /path/to/tag_msa.fna --tag-fna-list /path/to/tag_paths.list --fna-dir /path/to/genomes/ --out-dir /path/Maast/output/`
141 | 
142 | Note:
143 | 
144 | Upon a successful run, this step will produce the SNP covering k-mer database that is required for genotyping sequencing reads.
145 | * kmer_db.bin (SNP covering k-mer database)
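
kmer_db.bin is a flat binary array of integer pairs, one (2-bit-encoded 31-mer, SNP id) record per k-mer, as written by the bundled callm_db_build program. The following minimal Python sketch is not part of Maast; it assumes native 64-bit records, which matches how the sources mmap the file, and it can be handy for sanity-checking the database:

```python
# Sanity-check kmer_db.bin: count records and decode the first k-mer.
# Assumes 64-bit records of (2-bit-encoded 31-mer, SNP id) pairs.
import numpy as np

recs = np.fromfile("kmer_db.bin", dtype=np.uint64).reshape(-1, 2)
print("k-mers in database:", len(recs))

code = int(recs[0, 0])
# the encoder packs the first base into the highest 2 bits (A=0, C=1, G=2, T=3)
kmer = "".join("ACGT"[(code >> (2 * (30 - i))) & 3] for i in range(31))
snp = str(recs[0, 1])
# in the SNP id, the first six digits name the genome and the seventh flags the allele
print(kmer, "genome:", snp[:6], "allele digit:", snp[6], "snp:", snp[7:])
```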
146 | 
147 | #### Step 3: Genotype whole genome assemblies, sequencing reads or both
148 | 
149 | `maast genotype --in-dir /path/to/directory/containing/genomes/reads/or/both --ref-genome /path/to/reference.fna --db /path/to/kmer_db.bin --vcf /path/to/core_snps.vcf --out-dir /path/Maast/output/`
150 | 
151 | #### Construct a SNP tree with Maast genotypes (optional)
152 | 
153 | `maast tree --input-list /path/to/Maast/genotypes.input.tsv --out-dir /path/Maast/output/`
154 | 
155 | #### More helper text and arguments
156 | 
157 | `maast end_to_end|genomes|db|genotype|tree -h`
158 | 
159 | ## Example
160 | 
161 | ### Download and decompress test dataset
162 | 
163 | `wget --content-disposition https://fileshare.czbiohub.org/s/TwGJAsAZ6dQsM49/download`
164 | 
165 | `tar xzvf 101346.tar.gz`
166 | 
167 | Note: after running the two command lines above, one directory named 101346 can be found in the current directory. In the directory 101346, there are 300 whole genome assemblies in FASTA format (.fna) and 8 gzipped files of WGS sequencing reads in FASTQ format (.fastq.gz).
168 | 
169 | ### Genotype SNPs from begin to end in one single command line with the test dataset
170 | 
171 | `maast end_to_end --in-dir ./101346 --out-dir ./101346_out`
172 | 
173 | Note: after running the above command line, one directory named 101346_out can be found in the current directory, which contains all resulting files and directories.
174 | 
175 | The files include
176 | * reference.fna (selected reference genome)
177 | * tag_paths.list (list of selected tag genomes)
178 | * tag_msa.fna (multiple sequence alignment of tag genomes)
179 | * coords.tsv (coordinates of consensus genome)
180 | * core_snps.vcf (called SNPs in VCF format)
181 | * nr_kmer_set.tsv (raw SNP-covering k-mers)
182 | * check_fna_paths.list (a list of genomes used for validating SNP-covering k-mers)
183 | * kmer_prof.tsv (hit profile of SNP-covering k-mers)
184 | * selected_kmers.tsv (validated SNP-covering k-mers)
185 | * kmer_db.bin (optimized database of SNP-covering k-mers)
186 | 
187 | The directories include
188 | * gt_results (SNP genotyping results)
189 | * temp (temporary directory for hosting intermediate files)
190 | 
191 | ### Genotype SNPs step by step with the test dataset
192 | 
193 | #### Step 1: Call SNPs with whole genome assemblies
194 | 
195 | `maast genomes --fna-dir ./101346 --out-dir ./101346_out`
196 | 
197 | Note: upon a successful run of the first step, the output files include
198 | * reference.fna (selected reference genome)
199 | * tag_paths.list (list of selected tag genomes)
200 | * tag_msa.fna (multiple sequence alignment of tag genomes)
201 | * coords.tsv (coordinates of consensus genome)
202 | * core_snps.vcf (called SNPs in VCF format)
203 | 
204 | #### Step 2: Build SNP covering k-mer database
205 | 
206 | `maast db --ref-genome ./101346_out/reference.fna --vcf ./101346_out/core_snps.vcf --msa ./101346_out/tag_msa.fna --tag-fna-list ./101346_out/tag_paths.list --fna-dir ./101346/ --out-dir ./101346_out/`
207 | 
208 | Note: all the required input files can be found among the output files of the first step.
209 | 
210 | Upon a successful run of the second step, the output files include
211 | * nr_kmer_set.tsv (raw SNP-covering k-mers)
212 | * check_fna_paths.list (a list of genomes used for validating SNP-covering k-mers)
213 | * kmer_prof.tsv (hit profile of SNP-covering k-mers)
214 | * selected_kmers.tsv (validated SNP-covering k-mers)
215 | * kmer_db.bin (optimized database of SNP-covering k-mers)
216 | 
217 | Among them, kmer_db.bin is the database file that will be used in the next step along with a few other required files from the first step.
218 | 
219 | #### Step 3: Genotype whole genome assemblies, sequencing reads or both
220 | 
221 | `maast genotype --in-dir ./101346/ --ref-genome ./101346_out/reference.fna --db ./101346_out/kmer_db.bin --vcf ./101346_out/core_snps.vcf --out-dir ./101346_out/`
222 | 
223 | Note: files to genotype should be supplied in a directory with --in-dir. Supported file types include the FASTA and FASTQ formats. Input files can be all FASTAs, all FASTQs or a mixture of both.
224 | 
225 | All other required input files can be found among the output files of the two previous steps.
226 | 
227 | The main output files are the SNP genotypes, which can be found in a directory named "gt_results" in the designated output directory, ./101346_out/ in this case.
228 | 
229 | Each genotype file has seven fields, as follows:
230 | 
231 | 1. Contig: string of arbitrary length which specifies the contig of a representative genome where a SNP is from
232 | 2. Local Pos: up to seven digits which specify the local position of a SNP on a contig
233 | 3. Global Pos: up to seven digits which specify the global position of a SNP in a species, serving as a sort of ID
234 | 4. Allele 1: single character, A, C, G or T, which specifies allele 1 of a SNP
235 | 5. Allele 2: similar to Allele 1, but specifies allele 2 of a SNP
236 | 6. Allele 1 Cnt: an integer specifying the count of detected allele 1 in a metagenome
237 | 7. Allele 2 Cnt: an integer specifying the count of detected allele 2 in a metagenome
238 | 
239 | An example looks like the following:
240 | 
241 | | Contig | Local Pos | Global Pos | Allele 1 | Allele 2 | Allele 1 Cnt | Allele 2 Cnt |
242 | | :--- | :----: | :----: | :----: | :----: | :----: | :----: |
243 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15829 | 349759 | C | T | 65 | 0 |
244 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15863 | 20713 | C | T | 62 | 1 |
245 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15889 | 131457 | C | A | 62 | 0 |
246 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15907 | 4457 | G | A | 59 | 0 |
247 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15910 | 4553 | C | A | 59 | 0 |
248 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15937 | 151893 | C | T | 56 | 0 |
249 | | NODE_10_length_179788_cov_11.0000_ID_43085 | 15940 | 101338 | C | T | 55 | 0 |
250 | | ... | ... | ... | ... | ... | ... | ... |
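
Each row is a plain tab-separated record, so the results are easy to post-process. A minimal Python sketch follows (not part of Maast; it assumes the seven columns above with no header row, and the file name is just an example from the test dataset):

```python
# Summarize one Maast genotype file: how many SNP sites were detected,
# and how many of the detected sites favor allele 2.
import csv

n_sites = n_detected = n_alt = 0
with open("./101346_out/gt_results/GUT_GENOME000400.fna.tsv") as fh:
    for contig, local_pos, global_pos, a1, a2, cnt1, cnt2 in csv.reader(fh, delimiter="\t"):
        n_sites += 1
        if int(cnt1) + int(cnt2) > 0:
            n_detected += 1
            if int(cnt2) > int(cnt1):
                n_alt += 1
print(f"{n_detected}/{n_sites} SNP sites detected; {n_alt} favor allele 2")
```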
251 | 
252 | #### Construct a SNP tree with Maast genotypes (optional)
253 | 
254 | This is an optional step that takes advantage of the genotyped SNPs for a quick application - SNP tree building
255 | 
256 | `paste <(find ./101346_out/gt_results/ -name '*tsv' | sort) <(find ./101346_out/gt_results/ -name '*tsv' | sort | cut -d'/' -f4 | cut -d'.' -f1) > 101346_genotypes.input.tsv`
257 | 
258 | Note: the step above generates a list of input pairs. Each pair per row contains a path to a genotype result file generated by the Maast genotype command and a unique name for the file. The path and name are separated by a tab, like the following
259 | /file/path/1 name1
260 | /file/path/2 name2
261 | /file/path/3 name3
262 | ...
263 | 
264 | 
265 | The first three rows of 101346_genotypes.input.tsv in this example look like
266 | ./101346_out/gt_results/GUT_GENOME000400.fna.tsv GUT_GENOME000400
267 | ./101346_out/gt_results/GUT_GENOME000466.fna.tsv GUT_GENOME000466
268 | ./101346_out/gt_results/GUT_GENOME000688.fna.tsv GUT_GENOME000688
269 | 
270 | 
271 | `maast tree --input-list ./101346_genotypes.input.tsv --out-dir ./101346_out/`
272 | 
273 | Note: upon the successful completion of this command, the following three output files can be found:
274 | * concat_allele.aln.fasta (concatenated allele sequences with genotyped SNPs)
275 | * concat_allele.aln.mat (pairwise genomic distances between concatenated allele sequences)
276 | * concat_allele.aln.tre (phylogenetic tree built with concatenated allele sequences)
--------------------------------------------------------------------------------
/src/callm_db_val.cpp:
--------------------------------------------------------------------------------
1 | #if __linux__
2 | #include <linux/version.h>
3 | #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22)
4 | #define _MAP_POPULATE_AVAILABLE
5 | #endif
6 | #endif
7 | 
8 | #ifdef _MAP_POPULATE_AVAILABLE
9 | #define MMAP_FLAGS (MAP_PRIVATE | MAP_POPULATE)
10 | #else
11 | #define MMAP_FLAGS MAP_PRIVATE
12 | #endif
13 | 
14 | #include <sys/mman.h>
15 | #include <sys/stat.h>
16 | #include <fcntl.h>
17 | 
18 | #include <iostream>
19 | #include <fstream>
20 | #include <string>
21 | #include <vector>
22 | #include <unordered_map>
23 | #include <tuple>
24 | #include <chrono>
25 | #include <thread>
26 | 
27 | #include <cstring>
28 | #include <cassert>
29 | #include <unistd.h>
30 | 
31 | using namespace std;
32 | 
33 | 
34 | // this program scans its input (fastq text stream) for forward k-mers,
35 | 
36 | // usage:
37 | //    g++ -O3 --std=c++11 -o vfkmrz_bunion vfkmrz_bunion.cpp
38 | //    ./vfkmrz_bunion -k1 <fpath1> -k2 <fpath2>
39 | //
40 | // standard fastq format only for input, otherwise failure is almost guaranteed.
41 | 
42 | // global variable declaration starts here
43 | constexpr auto k = 31;
44 | 
45 | // set operation mode
46 | // valid values: 0, 1, 2
47 | // 0 is set union operation; 1 is set intersection operation; 2 is set difference([set1-set2]);
48 | constexpr auto s_mod = 0;
49 | 
50 | // parameters for file read; from the source of GNU coreutils wc
51 | constexpr auto step_size = 256 * 1024 * 1024;
52 | constexpr auto buffer_size = 256 * 1024 * 1024;
53 | 
54 | // output file path
55 | constexpr auto out_path = "/dev/stdout";
56 | 
57 | // get time elapsed since when it all began in milliseconds.
58 | long chrono_time() {
59 |     using namespace chrono;
60 |     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
61 | }
62 | 
63 | // number of bits per single nucleotide base
64 | constexpr int bpb = 2;
65 | 
66 | size_t get_fsize(const char* filename) {
67 |     struct stat st;
68 |     stat(filename, &st);
69 |     return st.st_size;
70 | }
71 | 
72 | 
73 | char* get_ftype(const char* filename) {
74 |     int fn_len = strlen(filename);
75 |     char *ftype = (char *)malloc(5);
76 | 
77 |     for(int i = 0; i < 4; ++i) {
78 |         ftype[i] = filename[fn_len - 4 + i];
79 |     }
80 | 
81 |     ftype[4] = '\0';
82 | 
83 |     return ftype;
84 | }
85 | 
86 | void make_comp_map(char* comp_map) {
87 |     comp_map['A'] = 'T';
88 |     comp_map['C'] = 'G';
89 |     comp_map['G'] = 'C';
90 |     comp_map['T'] = 'A';
91 | }
92 | 
93 | template <class int_type>
94 | int_type bit_encode(const char c) {
95 |     switch (c) {
96 |         case 'A': return 0;
97 |         case 'C': return 1;
98 |         case 'G': return 2;
99 |         case 'T': return 3;
100 |     }
101 | 
102 |     assert(false);
103 | }
104 | 
105 | template <class int_type>
106 | char bit_decode(const int_type bit_code) {
107 |     switch (bit_code) {
108 |         case 0: return 'A';
109 |         case 1: return 'C';
110 |         case 2: return 'G';
111 |         case 3: return 'T';
112 |     }
113 |     assert(false);
114 | }
115 | 
116 | template <class int_type>
117 | void make_code_dict(int_type* code_dict) {
118 |     code_dict['A'] = bit_encode<int_type>('A');
119 |     code_dict['C'] = bit_encode<int_type>('C');
120 |     code_dict['G'] = bit_encode<int_type>('G');
121 |     code_dict['T'] = bit_encode<int_type>('T');
122 | }
123 | 
124 | template <class int_type>
125 | int_type seq_encode(const char* buf, int len, const int_type* code_dict, const int_type b_mask) {
126 |     int_type seq_code = 0;
127 |     for (int i=0; i < len; ++i) {
128 |         const int_type b_code = code_dict[buf[i]];
129 |         seq_code |= ((b_code & b_mask) << (bpb * (len - i - 1)));
130 |     }
131 |     return seq_code;
132 | }
133 | 
134 | template <class int_type>
135 | void seq_decode(char* buf, const int len, const int_type seq_code, int_type* code_dict, const int_type b_mask) {
136 |     for (int i=0; i < len-1; ++i) {
137 |         const int_type b_code = (seq_code >> (bpb * (len - i - 2))) & b_mask;
138 |         buf[i] = bit_decode(b_code);
139 |     }
140 | 
141 |     buf[len-1] = '\0';
142 | }
143 | 
144 | 
145 | template <class int_type>
146 | void load_profile(const char* k_path, vector<char>& buffer, vector<int_type>& kv1, vector<int_type>& kv2, vector<int_type>& kv3, vector<int_type>& kv4, vector<tuple<int_type, int_type>>& k_info, int_type* code_dict, const int_type b_mask) {
147 |     auto t_start = chrono_time();
148 | 
149 |     char* window = buffer.data();
150 | 
151 |     uintmax_t n_lines = 0;
152 | 
153 |     int fd;
154 |     fd = open(k_path, O_RDONLY);
155 | 
156 |     int k_cur = 0;
157 |     int snp_cur = 0;
158 |     int pos_cur = 0;
159 | 
160 |     char kbuf1[k];
161 |     char kbuf2[k];
162 |     char kbuf3[k];
163 |     char kbuf4[k];
164 | 
165 |     char snp_pos[16];
166 |     char kmer_pos[4];
167 | 
168 |     //auto fh = fstream(out_path, ios::out | ios::binary);
169 | 
170 |     int cur_field = 0;
171 |     bool has_wildcard = false;
172 | 
173 |     while (true) {
174 | 
175 |         const ssize_t bytes_read = read(fd, window, step_size);
176 | 
177 |         if (bytes_read == 0)
178 |             break;
179 | 
180 |         if (bytes_read == (ssize_t) -1) {
181 |             cerr << "unknown fatal error when reading " << k_path << endl;
182 |             exit(EXIT_FAILURE);
183 |         }
184 | 
185 |         for (int i = 0; i < bytes_read; ++i) {
186 |             char c = window[i];
187 | 
188 |             if (c == '\n') {  // end of record: encode the four k-mer variants collected from this line
189 |                 ++n_lines;
190 | 
191 |                 if (has_wildcard) {
192 |                     has_wildcard = false;
193 |                     continue;
194 |                 }
195 | 
196 |                 auto code1 = seq_encode(kbuf1, k, code_dict, b_mask);
197 |                 auto code2 = seq_encode(kbuf2, k, code_dict, b_mask);
198 |                 auto code3 = seq_encode(kbuf3, k, code_dict, b_mask);
199 |                 auto code4 = seq_encode(kbuf4, k, code_dict, b_mask);
200 | 
201 |                 kv1.push_back(code1);
202 |                 kv2.push_back(code2);
203 |                 kv3.push_back(code3);
204 |                 kv4.push_back(code4);
205 | 
206 |                 snp_pos[snp_cur] = '\0';
207 |                 int_type id_int = stoull(snp_pos);
208 | 
209 |                 kmer_pos[pos_cur] = '\0';
210 |                 int_type k_pos = stoull(kmer_pos);
211 | 
212 |                 k_info.push_back(tuple<int_type, int_type>(id_int, k_pos));
213 | 
214 |                 k_cur = 0;
215 |                 pos_cur = 0;
216 |                 snp_cur = 0;
217 | 
218 |                 cur_field = 0;
219 |             } else if (c == '\t'){
220 |                 ++cur_field;
221 |                 k_cur = 0;
222 |             } else {
223 |                 if (c == 'N') {
224 |                     has_wildcard = true;
225 |                 }
226 | 
227 |                 if (cur_field == 0) {
228 |                     snp_pos[snp_cur++] = c;
229 |                 } else if (cur_field == 1) {
230 |                     kmer_pos[pos_cur++] = c;
231 |                 } else if (cur_field == 2) {
232 |                     kbuf1[k_cur++] = c;
233 |                 } else if (cur_field == 3) {
234 |                     kbuf2[k_cur++] = c;
235 |                 } else if (cur_field == 4) {
236 |                     kbuf3[k_cur++] = c;
237 |                 } else if (cur_field == 5) {
238 |                     kbuf4[k_cur++] = c;
239 |                 } else {
240 |                     //do nothing;
241 |                 }
242 |             }
243 |         }
244 |     }
245 | 
246 | 
247 |     assert(kv1.size() == k_info.size() && kv1.size() == kv2.size());
248 |     assert(kv1.size() == kv3.size() && kv1.size() == kv4.size());
249 | 
250 |     auto timeit = chrono_time();
251 |     close(fd);
252 | }
253 | 
254 | template <class int_type>
255 | void fna_load_pool(const char* fna_path, vector<char>& buffer, unordered_map<int_type, int_type>& k_map, const int_type* code_dict, const int_type b_mask) {
256 |     auto t_start = chrono_time();
257 | 
258 |     char comp_map[1 << (sizeof(char) * 8)];
259 |     make_comp_map(comp_map);
260 | 
261 |     char* window = buffer.data();
262 | 
263 |     uintmax_t n_lines = 0;
264 | 
265 |     int fd;
266 |     fd = open(fna_path, O_RDONLY);
267 | 
268 |     int cur_pos = 0;
269 | 
270 |     vector<char> seq_buf(10*1000*1000);
271 |     char* bases = seq_buf.data();
272 | 
273 |     char kmer_buff[k];
274 |     char rckmer_buff[k];
275 | 
276 |     bool is_base = false;
277 |     bool has_wildcard = false;
278 | 
279 |     while (true) {
280 | 
281 |         const ssize_t bytes_read = read(fd, window, step_size);
282 | 
283 |         if (bytes_read == 0)
284 |             break;
285 | 
286 |         if (bytes_read == (ssize_t) -1) {
287 |             cerr << "unknown fatal error when reading " << fna_path << endl;
288 |             exit(EXIT_FAILURE);
289 |         }
290 | 
291 |         for (int i = 0; i < bytes_read; ++i) {
292 |             char c = window[i];
293 |             if (c == '\n') {
294 |                 if (!is_base) {
295 |                     is_base = true;
296 |                 }
297 |                 continue;
298 |             } else if (c == '>') {  // new FASTA header: k-merize the sequence collected so far
299 |                 for (int j = 0; j < cur_pos-k+1; ++j) {
300 |                     for (int l = 0; l < k; ++l) {
301 |                         kmer_buff[l] = bases[j+l];
302 | 
303 |                         if (kmer_buff[l] == 'N') {
304 |                             has_wildcard = true;
305 |                             break;
306 |                         }
307 |                     }
308 | 
309 | 
310 |                     if (has_wildcard) {
311 |                         has_wildcard = false;
312 |                         continue;
313 |                     }
314 | 
315 |                     auto kmer_int = seq_encode(kmer_buff, k, code_dict, b_mask);
316 | 
317 |                     if (k_map.find(kmer_int) == k_map.end()) {
318 |                         k_map.insert({kmer_int, 1});
319 |                     } else {
320 |                         ++k_map[kmer_int];
321 |                     }
322 | 
323 |                     for (int l = k-1; l >= 0; --l) {
324 |                         rckmer_buff[k-1-l] = comp_map[kmer_buff[l]];
325 |                     }
326 | 
327 |                     /* not really necessary when rc kmers present
328 |                     auto rckmer_int = seq_encode(rckmer_buff, k, code_dict, b_mask);
329 | 
330 |                     if (k_map.find(rckmer_int) == k_map.end()) {
331 |                         k_map.insert({rckmer_int, 1});
332 |                     } else {
333 |                         ++k_map[rckmer_int];
334 |                     }
335 |                     */
336 |                 }
337 | 
338 |                 cur_pos = 0;
339 |                 is_base = false;
340 | 
341 |                 ++n_lines;
342 |             } else {
343 |                 if (is_base) {
344 |                     bases[cur_pos++] = toupper(c);
345 |                 }
346 |             }
347 |         }
348 | 
349 |     }
350 | 
351 |     if (cur_pos >= k) {  // k-merize the last sequence in the file
352 |         for (int j = 0; j < cur_pos-k+1; ++j) {
353 |             for (int l = 0; l < k; ++l) {
354 |                 kmer_buff[l] = bases[j+l];
355 | 
356 |                 if (kmer_buff[l] == 'N') {
357 |                     has_wildcard = true;
358 |                     break;
359 |                 }
360 |             }
361 | 
362 |             if (has_wildcard) {
363 |                 has_wildcard = false;
364 |                 continue;
365 |             }
366 | 
367 |             auto kmer_int = seq_encode(kmer_buff, k, code_dict, b_mask);
368 | 
369 |             if (k_map.find(kmer_int) == k_map.end()) {
370 |                 k_map.insert({kmer_int, 1});
371 |             } else {
372 |                 ++k_map[kmer_int];
373 |             }
374 | 
375 |             /* not really necessary when rc kmers present
376 |             for (int l = k-1; l >= 0; --l) {
377 |                 rckmer_buff[k-1-l] = comp_map[kmer_buff[l]];
378 |             }
379 | 
380 |             auto rckmer_int = seq_encode(rckmer_buff, k, code_dict, b_mask);
381 |             if (k_map.find(rckmer_int) == k_map.end()) {
382 |                 k_map.insert({rckmer_int, 1});
383 |             } else {
384 |                 ++k_map[rckmer_int];
385 |             }
386 |             */
387 |         }
388 | 
389 |         cur_pos = 0;
390 |         ++n_lines;
391 |     }
392 | 
393 |     buffer.clear();
394 |     cerr << fna_path << endl;
395 |     cerr << "number of sequences " << n_lines/2 << endl;
396 |     cerr << "number of unique kmers: "<< k_map.size() << endl << endl;
397 |     auto timeit = chrono_time();
398 |     close(fd);
399 | }
400 | 
401 | 
402 | template <class int_type>
403 | void bit_load_pool(const char* k_path, vector<char>& buffer, unordered_map<int_type, int_type>& k_map, const int_type* code_dict, const int_type b_mask) {
404 |     auto t_start = chrono_time();
405 | 
406 |     char* window = buffer.data();
407 | 
408 |     uintmax_t n_lines = 0;
409 | 
410 |     int fd;
411 |     fd = open(k_path, O_RDONLY);
412 | 
413 |     int cur_pos = 0;
414 |     int snp_pos = 0;
415 | 
416 |     char seq_buf[k];
417 |     char snp_id[16];
418 | 
419 |     //auto fh = fstream(out_path, ios::out | ios::binary);
420 | 
421 |     bool id_switch = false;
422 |     bool has_wildcard = false;
423 | 
424 |     while (true) {
425 | 
426 |         const ssize_t bytes_read = read(fd, window, step_size);
427 | 
428 |         if (bytes_read == 0)
429 |             break;
430 | 
431 |         if (bytes_read == (ssize_t) -1) {
432 |             cerr << "unknown fatal error when reading " << k_path << endl;
433 |             exit(EXIT_FAILURE);
434 |         }
435 | 
436 |         for (int i = 0; i < bytes_read; ++i) {
437 |             char c = toupper(window[i]);
438 |             if (c == '\n') {
439 |                 ++n_lines;
440 | 
441 |                 if (has_wildcard) {
442 |                     has_wildcard = false;
443 |                     continue;
444 |                 }
445 | 
446 |                 auto kmer_int = seq_encode(seq_buf, k, code_dict, b_mask);
447 | 
448 |                 snp_id[snp_pos] = '\0';
449 |                 int_type kcount = stoull(snp_id);
450 | 
451 |                 assert(k_map.find(kmer_int) == k_map.end());
452 |                 k_map.insert({kmer_int, kcount});
453 | 
454 |                 cur_pos = 0;
455 |                 snp_pos = 0;
456 | 
457 |                 id_switch = false;
458 |             } else if (c == '\t'){
459 |                 id_switch = true;
460 |             } else {
461 |                 if (c == 'N') {
462 |                     has_wildcard = true;
463 |                 }
464 | 
465 |                 if (id_switch) {
466 |                     snp_id[snp_pos++] = c;
467 |                 } else {
468 |                     seq_buf[cur_pos++] = c;
469 |                 }
470 |             }
471 |         }
472 | 
473 |         //fh.write(&kmers[0], kmers.size());
474 | 
475 |         // cerr << n_lines << " lines were scanned after " << (chrono_time() - t_start) / 1000 << " seconds" << endl;
476 |     }
477 | 
478 |     auto timeit = chrono_time();
479 |     close(fd);
480 | }
481 | 
482 | 
483 | template <class int_type>
484 | void bin_load_pool(const char* p_path, unordered_map<int_type, int_type>& k_map) {
485 |     size_t filesize = get_fsize(p_path);
486 |     //Open file
487 |     int fd = open(p_path, O_RDONLY, 0);
488 |     assert(fd != -1);
489 |     //Execute mmap
490 |     //uint64_t* mmappedData = (uint64_t *) mmap(NULL, filesize, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
491 |     int_type* mmappedData = (int_type *) mmap(NULL, filesize, PROT_READ, MMAP_FLAGS, fd, 0);
492 |     assert(mmappedData != MAP_FAILED);
493 |     //Write the mmapped data to stdout (= FD #1)
494 | 
495 |     // write(1, mmappedData, filesize);
496 | 
497 |     // char seq_buf[k+1];
498 | 
499 |     auto l_start = chrono_time();
500 | 
501 |     for (uint64_t i = 0; i < filesize/8; i=i+2) {  // records are (kmer code, count) pairs
502 |         // seq_decode(seq_buf, k, mmappedData[i], b_mask);
503 | 
504 |         auto kmer_int = mmappedData[i];
505 |         auto kcount = mmappedData[i+1];
506 | 
507 |         assert(k_map.find(kmer_int) == k_map.end());
508 |         k_map.insert({kmer_int, kcount});
509 |     }
510 | 
511 |     //Cleanup
512 |     int rc = munmap(mmappedData, filesize);
513 |     assert(rc == 0);
514 |     close(fd);
515 | }
516 | 
517 | 
518 | template <class int_type>
519 | bool cmp_tuple(const tuple<int_type, int_type> &a, const tuple<int_type, int_type> &b){
520 |     return get<0>(a) < get<0>(b);
521 | }
522 | 
523 | template <class int_type>
524 | void set_kvecs(char* db_path, const int kv_n, vector<int_type>* kvecs, vector<tuple<int_type, int_type>>& kinfo) {
525 |     assert(kv_n == 4);
526 | 
527 |     int_type lsb = 1;
528 |     int_type b_mask = (lsb << bpb) - lsb;
529 | 
530 |     int_type code_dict[1 << (sizeof(char) * 8)];
531 |     make_code_dict(code_dict);
532 | 
533 |     vector<char> buffer(buffer_size);
534 | 
535 |     load_profile(db_path, buffer, kvecs[0], kvecs[1], kvecs[2], kvecs[3], kinfo, code_dict, b_mask);
536 | 
537 |     cerr << "DB loading OK!" << endl;
538 | }
539 | 
540 | template <class int_type>
541 | void multi_dbval(int kv_n, vector<int_type>* kvecs, int n_path, vector<string>& kpaths, vector<tuple<int, int, int, int, int>>& prof_vec) {
542 |     assert(kv_n == 4);
543 | 
544 |     int_type lsb = 1;
545 |     int_type b_mask = (lsb << bpb) - lsb;
546 | 
547 |     int_type code_dict[1 << (sizeof(char) * 8)];
548 |     make_code_dict(code_dict);
549 | 
550 |     vector<char> buffer(buffer_size);
551 | 
552 |     int64_t prof_size = kvecs[0].size();
553 | 
554 |     vector<int_type> lc_vecs[4];
555 | 
556 |     for (int i = 0; i < 4; ++i) {
557 |         lc_vecs[i].reserve(prof_size);
558 |     }
559 | 
560 |     prof_vec.resize(prof_size, make_tuple(0,0,0,0,0));
561 | 
562 |     unordered_map<int_type, int_type> kpool;
563 | 
564 |     for (int i = 0; i < n_path; ++i) {
565 |         char* kp_type = get_ftype(kpaths[i].c_str());
566 | 
567 | 
568 |         if (strcmp(kp_type, ".bin") == 0) {
569 |             bin_load_pool(kpaths[i].c_str(), kpool);
570 |         } else {
571 |             fna_load_pool(kpaths[i].c_str(), buffer, kpool, code_dict, b_mask);
572 |         }
573 | 
574 |         //else {
575 |         //    bit_load_pool(kpaths[i].c_str(), buffer, kpool, code_dict, b_mask);
576 |         //}
577 | 
578 |         // split loops
579 |         for (int j = 0; j < 4; ++j) {  // look up each of the four k-mer variants in the pool
580 |             for(auto it = kvecs[j].begin(); it != kvecs[j].end(); ++it) {
581 |                 if(kpool.find(*it) == kpool.end()) {
582 |                     lc_vecs[j].push_back(0);
583 |                 } else {
584 |                     lc_vecs[j].push_back(kpool[*it]);
585 |                 }
586 |             }
587 |         }
588 | 
589 |         assert(lc_vecs[0].size() == lc_vecs[1].size());
590 |         assert(lc_vecs[0].size() == lc_vecs[2].size());
591 |         assert(lc_vecs[0].size() == lc_vecs[3].size());
592 | 
593 |         const int64_t lc_size = lc_vecs[0].size();
594 | 
595 |         for (int64_t j = 0; j < lc_size; ++j) {
596 |             auto lc_sum = lc_vecs[0][j] + lc_vecs[1][j] + lc_vecs[2][j] + lc_vecs[3][j];
597 | 
598 |             auto ref_sum = lc_vecs[0][j] + lc_vecs[2][j];
599 |             auto alt_sum = lc_vecs[1][j] + lc_vecs[3][j];
600 | 
601 |             /*
602 |             if (strcmp(kp_type, ".fna") == 0) {
603 |                 lc_sum = lc_sum / 2;
604 |             }
605 |             */
606 | 
607 |             if (lc_sum == 0) {
608 |                 ++get<0>(prof_vec[j]);
609 |             } else if (lc_sum == 1) {
610 |                 ++get<1>(prof_vec[j]);
611 |             } else {
612 |                 ++get<2>(prof_vec[j]);
613 |             }
614 | 
615 |             if (ref_sum > 0) {
616 |                 ++get<3>(prof_vec[j]);
617 |             }
618 | 
619 |             if (alt_sum > 0) {
620 |                 ++get<4>(prof_vec[j]);
621 |             }
622 |         }
623 | 
624 |         for (int j = 0; j < 4; ++j) {
625 |             lc_vecs[j].clear();
626 |         }
627 | 
628 |         kpool.clear();
629 |     }
630 | 
631 |     auto timeit = chrono_time();
632 | }
633 | 
634 | void display_usage(char *fname){
635 |     cout << "usage: " << fname << " -d profile_path -n identifier [-t n_threads] [-o output_path] [-L path to list of input] inpath1 [ inpath2 ...]\n";
636 | }
637 | 
638 | int main(int argc, char** argv){
639 |     extern char *optarg;
640 |     extern int optind;
641 | 
642 |     bool dbflag = false;
643 |     bool inflag = false;
644 |     bool idflag = false;
645 |     bool list_flag = false;
646 | 
647 | 
648 |     char* fname = argv[0];
649 |     char* db_path = (char *)"";
650 |     char* list_path = (char *)"";
651 |     char* oname = (char *)"/dev/stdout";
652 |     char* spe_id = (char *)"";
653 | 
654 |     int n_threads = 1;
655 | 
656 |     int opt;
657 |     while ((opt = getopt(argc, argv, "d:n:t:L:o:h")) != -1) {
658 |         switch (opt) {
659 |             case 'd':
660 |                 dbflag = true;
661 |                 db_path = optarg;
662 |                 break;
663 |             case 'n':
664 |                 idflag = true;
665 |                 spe_id = optarg;
666 |                 break;
667 |             case 't':
668 |                 n_threads = stoi(optarg);
669 |                 break;
670 |             case 'L':
671 |                 list_flag = true;
672 |                 list_path = optarg;
673 |                 break;
674 |             case 'o':
675 |                 oname = optarg;
676 |                 break;
677 |             case 'h': case '?':
678 |                 display_usage(fname);
679 |                 exit(1);
680 |         }
681 |     }
682 | 
683 |     cerr << fname << '\t' << db_path << '\t' << n_threads << endl;
684 | 
685 |     if (!dbflag) {
686 |         cerr << "missing argument: -d \n";
687 |         display_usage(fname);
688 |         exit(1);
689 |     }
690 | 
691 |     if (!idflag) {
692 |         cerr << "missing argument: -n \n";
693 |         display_usage(fname);
694 |         exit(1);
695 |     }
696 | 
697 | 
698 |     int in_pos = optind;
699 | 
700 |     if (list_flag) {
701 |         cerr << "program reads a list of kmer pools for checking kmer uniqueness: " << list_path << endl;
702 |     } else {
703 |         if (optind == argc) {
704 |             cerr << "missing argument: input (>1)\n";
705 |             display_usage(fname);
706 |             exit(1);
707 |         }
708 |     }
709 | 
710 | 
711 |     vector<uint64_t> kvecs[4];
712 |     const int max_size = 100 * 1000 * 1000;
713 | 
714 |     for (int i = 0; i < 4; ++i) {
715 |         kvecs[i].reserve(max_size);
716 |     }
717 | 
718 |     vector<tuple<uint64_t, uint64_t>> kinfo;
719 | 
720 |     set_kvecs(db_path, 4, kvecs, kinfo);
721 | 
722 |     vector<string> input_array[n_threads];
723 | 
724 |     auto label = 0;
725 |     if (list_flag) {
726 |         ifstream file(list_path);
727 |         string line;
728 |         int l_count = 0;
729 |         while (getline(file, line)) {
730 |             label = l_count % n_threads;  // round-robin assignment of inputs to threads
731 |             string tmp_line = line;
732 |             input_array[label].push_back(tmp_line);
733 |             ++l_count;
734 |         }
735 |     }
736 | 
737 |     if (optind < argc) {
738 |         for(; optind < argc; optind++) {
739 |             auto slabel = (optind - in_pos + label) % n_threads;
740 |             input_array[slabel].push_back(string(argv[optind]));
741 |         }
742 |     }
743 | 
744 |     vector<tuple<int, int, int, int, int>> prof_vecs[n_threads];
745 |     vector<thread> th_array;
746 | 
747 |     for (int i = 0; i < n_threads; ++i) {
748 |         th_array.push_back(thread(multi_dbval<uint64_t>, 4, kvecs, input_array[i].size(), ref(input_array[i]), ref(prof_vecs[i])));
749 |     }
750 | 
751 | 
752 |     for (thread & ith : th_array) {
753 |         ith.join();
754 |     }
755 |     th_array.clear();
756 | 
757 |     vector<tuple<int, int, int, int, int>> reduced_prof;
758 |     reduced_prof.resize(kvecs[0].size(), make_tuple(0,0,0,0,0));
759 | 
760 |     uint64_t lsb = 1;
761 |     uint64_t b_mask = (lsb << bpb) - lsb;
762 | 
763 |     uint64_t code_dict[1 << (sizeof(char) * 8)];
764 |     make_code_dict(code_dict);
765 | 
766 |     char sbuf1[k+1];
767 |     char sbuf2[k+1];
768 |     char sbuf3[k+1];
769 |     char sbuf4[k+1];
770 | 
771 |     ofstream myfile;
772 |     myfile.open(oname);
773 | 
774 |     for (int j = 0; j < kinfo.size(); ++j) {
775 |         for (int i = 0; i < n_threads; ++i) {  // reduce per-thread profiles into one
776 |             get<0>(reduced_prof[j]) += get<0>(prof_vecs[i][j]);
777 |             get<1>(reduced_prof[j]) += get<1>(prof_vecs[i][j]);
778 |             get<2>(reduced_prof[j]) += get<2>(prof_vecs[i][j]);
779 |             get<3>(reduced_prof[j]) += get<3>(prof_vecs[i][j]);
780 |             get<4>(reduced_prof[j]) += get<4>(prof_vecs[i][j]);
781 |         }
782 | 
783 |         auto info_pair = kinfo[j];
784 | 
785 |         seq_decode(sbuf1, k+1, kvecs[0][j], code_dict, b_mask);
786 |         seq_decode(sbuf2, k+1, kvecs[1][j], code_dict, b_mask);
787 |         seq_decode(sbuf3, k+1, kvecs[2][j], code_dict, b_mask);
788 |         seq_decode(sbuf4, k+1, kvecs[3][j], code_dict, b_mask);
789 | 
790 |         myfile << get<0>(info_pair) << '\t' << get<1>(info_pair) << '\t' << sbuf1 << '\t' << sbuf2 << '\t' << sbuf3 << '\t' << sbuf4 << '\t' << get<0>(reduced_prof[j]) << '\t' << get<1>(reduced_prof[j]) << '\t'<< get<2>(reduced_prof[j]) << '\t' << spe_id << '\t' << get<3>(reduced_prof[j]) << '\t'<< get<4>(reduced_prof[j]) << '\n';
791 |     }
792 | 
793 |     return 0;
794 | }
795 | 
--------------------------------------------------------------------------------
/bin/maast.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | from __future__ import division
4 | 
5 | import sys, os, time, argparse
6 | import shutil, hashlib, math, multiprocessing
7 | import numpy as np
8 | from operator import itemgetter
9 | 
10 | from Bio import SeqIO
11 | 
12 | from snps_io import id_genome_clusters, id_centroid
13 | from snps_io import vcf_io, concat_alleles, gen_msa, align_assembly
14 | 
15 | from db_io import build_db
16 | 
17 | def get_data_type():
18 |     """ Get program specified by user (species, genes, or snps) """
19 |     import sys
20 |     if len(sys.argv) == 1 or sys.argv[1] in ['-h', '--help']:
21 |         cmd = 'maast '
22 |         print('usage: %s<command> [options]' % cmd)
23 |         print('')
24 |         print("version: 1.0.8")
25 |         print('')
26 |         print('description: identify and genotype core-genome snps from <genome assemblies and/or sequencing reads>')
27 |         print('')
28 |         print('modules:')
29 |         print('    end_to_end    Run full Maast pipeline from beginning to end')
30 |         print('    genomes       Perform multiple alignment of genomes to call core-genome SNPs')
31 |         print('    db            Build kmer database targeting snps')
32 |         print('    genotype      Call core-genome SNPs for single genomes and isolate sequencing data')
33 |         print('    tree          Build SNP tree using identified genotypes')
34 |         print('')
35 |         print("use '%s<command> -h' for usage on a specific command" % cmd)
36 |         print('')
37 |         quit()
38 |     elif sys.argv[1] not in ['end_to_end', 'genomes', 'db', 'genotype', 'tree']:
39 |         sys.exit("\nError: invalid subcommand\n\nSupported subcommands: genomes, db, genotype, end_to_end, tree\n")
40 |     else:
41 |         return sys.argv[1]
42 | 
43 | def parse_args():
44 | 
45 |     data_type = get_data_type()
46 | 
47 |     parser = argparse.ArgumentParser(
48 |         formatter_class=argparse.RawTextHelpFormatter,
49 |         add_help=False,
50 |         usage=argparse.SUPPRESS
51 |     )
52 | 
53 |     parser.add_argument('data_type', help=argparse.SUPPRESS)
54 | 
55 |     if data_type == 'end_to_end':
56 |         end2end_input = parser.add_argument_group('end2end_input')
57 |         end2end_input.add_argument('--in-dir', type=str, metavar='PATH', required=True,
58 |             help = """Path to directory of raw-read-files in FASTQ format (.fastq or .fq; gzipped or not)\nor whole-genome sequences in FASTA format (.fna, .fa, .fsa or .fasta). (Required)""")
59 | 
60 |     io = parser.add_argument_group('input/output')
61 |     io.add_argument('--out-dir', type=str, metavar='PATH', required=True,
62 |         help="""Directory to store output (required)""")
63 | 
64 |     if data_type in ['genomes']:
65 |         io.add_argument('--fna-dir', type=str, metavar='PATH', required=True,
66 |             help = """Path to directory of genomes in FASTA format (required)""")
67 | 
68 |     if data_type in ['genomes', 'end_to_end']:
69 |         io.add_argument('--rep-fna', type=str, metavar='PATH', default=None,
70 |             help = """Path to the reference genome serving as the template for whole genome alignment. \nIf provided, Maast will not identify and use a centroid genome for the reference (default None)""")
71 |         io.add_argument('--skip-align', action='store_true', default=False,
72 |             help = """Skip whole genome sequence or short read alignment, only applicable when alignment \nhas already been done (default False)""")
73 |         io.add_argument('--has-completeness', action='store_true', default=False,
74 |             help = """Toggle for specifying completeness for the supplied genome sequences. If toggled on, \nit requires supplying either --completeness or --completeness-list (default False)""")
75 |         io.add_argument('--completeness', type=float, metavar='FLOAT', default=None,
76 |             help = """Single completeness value for all genome sequences \n(i.e. all genomes have the same completeness) (default None)""")
77 |         io.add_argument('--completeness-list', type=str, metavar='PATH', default=None,
78 |             help = """Path to a list of pairs of genome file name and completeness value, separated by a tab character. \n(note: genome file names should have no duplicates, and should cover all files specified in --fna-dir) (default None)""")
79 |         io.add_argument('--missing-ratio', type=float, metavar='FLOAT', default=0.05,
80 |             help = """Parameter defining the missing ratio of core sites even when completeness is 1 (default 0.05)""")
81 |         io.add_argument('--min-pid', type=float, metavar='FLOAT', default=0,
82 |             help = """Parameter defining the minimal identity for including each aligned block, [0, 100] (default 0)""")
83 |         io.add_argument('--min-aln-len', type=int, metavar='INT', default=10,
84 |             help = """Parameter defining the minimal length for including each aligned block (default 10)""")
85 |         io.add_argument('--max-pid-delta', type=float, metavar='FLOAT', default=0.1,
86 |             help = """Parameter defining the maximum identity gap between the identity of each aligned block and \nwhole-genome ANI; all alignments with identity less than ANI * (1 - delta) will be purged, [0, 1] (default 0.1)""")
87 |         io.add_argument('--mem', action='store_true', default=False,
88 |             help = """Call SNPs by genomic segment, an option for memory saving (default False)""")
89 | 
90 |     if data_type in ['genomes', 'end_to_end']:
91 |         prep = parser.add_argument_group('preprocessing')
92 |         prep.add_argument('--keep-redundancy', action='store_true', default=False,
93 |             help="""If toggled on, Maast will skip redundancy removal and move on with all input genomes (default=False)""")
94 |         prep.add_argument('--skip-centroid', action='store_true', default=False,
95 |             help="""If toggled on, Maast will not attempt to identify and use a centroid genome for the reference (default=False)""")
96 |         prep.add_argument('--sketch-k', type=int, metavar='INT', default=21,
97 |             help="""k-mer size for building Mash sketch (default=21)""")
98 |         prep.add_argument('--sketch-size', type=int, metavar='INT', default=5000,
99 |             help="""The number of k-mers per Mash sketch (default=5000)""")
100 |         prep.add_argument('--precut', type=float, metavar='FLOAT', default=0.05,
101 |             help="""Limit searches to pairs of genomes with a distance smaller than the provided value (default=0.05)""")
102 |         prep.add_argument('--start-cutoff', type=float, metavar='FLOAT', default=0.02,
103 |             help="""The cutoff from which Maast will start to search for a distance cutoff that generates \na good number of genome clusters and tag genomes based on a given MAF (default=0.02)""")
104 |         prep.add_argument('--end-cutoff', type=float, metavar='FLOAT', default=0.0001,
105 |             help="""Similar to --start-cutoff, the cutoff at which Maast will end the search for a distance cutoff. \nThis value should be smaller than --start-cutoff (default=0.0001)""")
106 |         prep.add_argument('--range-factor', type=float, metavar='FLOAT', default=1.2,
107 |             help="""This factor times the minimum number of genomes needed for a given MAF will create \nthe upper bound of a range satisfying the search. It should be larger than 1 (default=1.2)""")
108 |         prep.add_argument('--tag-centrality', dest='centrality_method', default='degree',
109 |             choices=['degree', 'eigenvector', 'closeness', 'information', 'betweenness', 'load'],
110 |             help="""
111 | Choose the method to rank genomes by centrality in a genome cluster
112 | degree: degree centrality, which ranks genomes by the number of links (default)
113 | eigenvector: eigenvector centrality
114 | closeness: closeness centrality
115 | information: information centrality
116 | betweenness: betweenness centrality
117 | load: load centrality
118 | *for method details, see https://networkx.org/documentation/stable/reference/algorithms/centrality.html""")
119 |         prep.add_argument('--edge-weighted', action='store_true', default=False,
120 |             help="""By default, Maast will binarize the edges (convert non-zero distance values to 1) for tag genome picking.\n If toggled on, Maast will use distance values as edge weights for tag genome picking. (default=False)""")
121 |         prep.add_argument('--centroid-distance', dest='cent_dist_type', default='L1',
122 |             choices=['L1', 'L2', 'Linf'],
123 |             help="""
124 | Choose the type of distance that will be used to pick the centroid genome as the one with the least distance to other genomes in the cluster.
125 | L1: sum of distances to all other genomes
126 | L2: square root of the sum of the squared distances to all other genomes
127 | Linf: the maximum distance to all other genomes""")
128 | 
129 |     if data_type in ['genomes', 'end_to_end']:
130 |         snps = parser.add_argument_group('snp-calling')
131 |         snps.add_argument('--max-sites', type=int, metavar='INT', default=float('inf'),
132 |             help="""Maximum genomic sites to parse (use all); useful for testing (default=inf)""")
133 |         snps.add_argument('--min-prev', type=float, metavar='FLOAT', default=1.0,
134 |             help="""Minimum prevalence (default=1.0)""")
135 |         snps.add_argument('--snp-freq', type=float, metavar='FLOAT', default=0.01,
136 |             help="""Minimum minor allele frequency for SNP calling (default=0.01)""")
137 |         snps.add_argument('--max-samples', type=int, metavar='INT', default=float('inf'),
138 |             help="""Only use a subset of genomes or metagenomes for snp calling (default=inf)""")
139 | 
140 |     if data_type in ['db', 'end_to_end']:
141 |         db = parser.add_argument_group('db-building')
142 |         if data_type in ['db']:
143 |             db.add_argument('--ref-genome', type=str, dest='ref_genome', required=True,
144 |                 help="""Path to reference genome sequence file (required)""")
145 |             db.add_argument('--vcf', type=str, dest='vcf', required=True,
146 |                 help="""Path to a vcf file describing core snps/genetic variants called based on \nmultiple sequence alignments (required)""")
147 |             db.add_argument('--msa', type=str, dest='msa', required=True,
148 |                 help="""Path to multiple sequence alignment file (required)""")
149 |             db.add_argument('--tag-fna-list', type=str, dest='tag_list', required=True,
150 |                 help="""Path to a list of paths to the tag genomes (FASTA format) which are included \nin the multiple sequence alignment file (required)""")
151 |             db.add_argument('--fna-dir', type=str, dest='fna_dir', default=None,
152 |                 help="""Path to the directory of genomes in FASTA format used for validating SNP-covering k-mers (default=None)""")
153 |             db.add_argument('--coords', type=str, dest='coords', default=None,
154 |                 help="""Path to core genome block coordinate file (default=None)""")
155 | 
156 |     if data_type in ['db', 'end_to_end']:
157 |         db.add_argument('--genome-name', dest='genome_name', type=str, default='100000',
158 |             help="""Name of the core-genome corresponding to INPUT. Should be six digits \nwith the first digit in [1, 9] (default=100000)""")
159 |         db.add_argument('--kmer-type', dest='kmer_type', default='all',
160 |             choices=['all', 'center'],
161 |             help="""
162 | Choose the type of kmers to be fetched
163 | all: all eligible kmers that
164 | 1) cover a snp at any position
165 | and 2) do not cover any bad sites (e.g. N or -)
166 | and 3) are well contained in their coordinate division (default)
167 | center: all kmers whose target snps are at their centers.""")
168 |         db.add_argument('--snp-cover', dest='snp_type', default='all',
169 |             choices=['all', 'l1-tags', 'l2-tags'],
170 |             help="""
171 | Choose the object to kmerize
172 | all: all snps from the cluster will be attempted for kmer search; most kmers (default)
173 | l1-tags: only representative snps from all snp blocks will be attempted
174 | l2-tags: only representative snps from representative snp blocks will be attempted; fewest kmers
175 | * note: all kmers must uniquely match an allele and intersect >= 1 SNP""")
176 | 
177 |     if data_type in ['genotype', 'end_to_end']:
178 |         genotype_input = parser.add_argument_group('genotype_input')
179 | 
180 |         if data_type in ['genotype']:
181 |             genotype_input.add_argument('--in-dir', type=str, metavar='PATH', required=True,
182 |                 help = """Path to directory of raw-read-files in FASTQ format (.fastq or .fq; gzipped or not) \nor whole-genome sequences in FASTA format (.fna, .fa, .fsa or .fasta) (required)""")
183 |             genotype_input.add_argument('--ref-genome', type=str, dest='ref_genome', required=True,
184 |                 help="""Path to reference genome sequence file (required)""")
185 |             genotype_input.add_argument('--db', type=str, metavar='PATH', dest='kmer_db_path', required=True,
186 |                 help = """Path to the SNP-covering k-mer database file (e.g. kmer_db.bin) built by the db subcommand (required)""")
187 |             genotype_input.add_argument('--vcf', type=str, dest='vcf', required=True,
188 |                 help="""Path to a vcf file describing core snps/genetic variants called based on \nmultiple sequence alignments (required)""")
189 |             single_genome = parser.add_argument_group('genome-genotyping')
190 |             single_genome.add_argument('--min-pid', type=float, metavar='FLOAT', default=0,
191 |                 help = """Parameter defining the minimal identity for including each aligned block, [0, 100] (default=0)""")
192 |             single_genome.add_argument('--min-aln-len', type=int, metavar='INT', default=10,
193 |                 help = """Parameter defining the minimal length for including each aligned block (default=10)""")
194 |             single_genome.add_argument('--max-pid-delta', type=float, metavar='FLOAT', default=0.1,
195 |                 help = """Parameter defining the maximum identity gap between the identity of each aligned block and \nwhole-genome ANI; all alignments with identity less than ANI * (1 - delta) will be purged, [0, 1] (default=0.1)""")
196 | 
197 |     if data_type in ['genotype', 'end_to_end']:
198 |         genotype_input.add_argument('--merge-pairs', action='store_true', default=False,
199 |             help = """Flag to merge paired raw-read files in --in-dir; pairs are indicated by the extensions '_1*' and '_2*'""")
200 | 
201 |         align = parser.add_argument_group('reads-genotyping')
202 |         align.add_argument('--mode', default='very-sensitive',
203 |             choices=['very-fast', 'fast', 'sensitive', 'very-sensitive'],
204 |             help = """Alignment speed/sensitivity (default=very-sensitive)""")
205 |         align.add_argument('--max-reads', type=int, metavar='INT',
206 |             help = """Maximum # reads to use from each FASTQ file (default=None; use all)""")
207 | 
208 |     if data_type in ['genomes', 'genotype', 'end_to_end']:
209 |         io.add_argument('--subset-list', type=str, metavar='PATH', default=None,
210 |             help = """Path to a file containing the names of the full set or a subset of the files in the input directory. \nFiles not in the list will not be included for snp calling (default=None; use all)""")
\nFiles not in the list will be excluded from snp calling (default=None; use all)""") 211 | 212 | if data_type in ['tree']: 213 | tree_io = parser.add_argument_group('tree_io') 214 | tree_io.add_argument('--input-dir', type=str, dest='input_dir', required=True, 215 | help="""Input directory containing genotype result files generated by the Maast genotype command (required)""") 216 | tree_io.add_argument('--input-list', type=str, dest='input_list', default=None, 217 | help="""A list of input pairs, one pair per row; each row contains a path to a genotype result file generated \nby the Maast genotype command and a unique name for the file (default=None). 218 | The path and name must be separated by a tab. 219 | Example 220 | /file/path/1 name1 221 | /file/path/2 name2 222 | /file/path/3 name3 223 | ...""") 224 | tree_io.add_argument('--min-sites', type=int, dest='min_sites_per_sample', default=1000, 225 | help="""Minimum number of SNP sites. Any allele sequence with fewer non-empty sites than \nthis value will not be included (default=1000)""") 226 | tree_io.add_argument('--max-gap-ratio', type=float, dest='max_gap_ratio', default=0.5, 227 | help="""Maximum ratio of gaps. Any allele sequence with a gap ratio higher than this value \nwill not be included (default=0.5)""") 228 | tree_io.add_argument('--min-site-prev', type=float, dest='min_site_prev', default=0.9, 229 | help="""Minimum site prevalence. Any site with an actual allele present in a fraction of sequences \nlower than this value will not be included (default=0.9)""") 230 | tree_io.add_argument('--min-MAF', type=float, dest='min_maf', default=0.01, 231 | help="""Minimum minor allele frequency. Any site with MAF lower than this value will not be included (default=0.01)""") 232 | tree_io.add_argument('--min-MAC', type=float, dest='min_mac', default=1, 233 | help="""Minimum minor allele count. Any site with MAC lower than this value will not be included (default=1)""") 234 | tree_io.add_argument('--min-depth', type=float, dest='min_depth', default=1, 235 | help="""Minimum read depth. Any site supported by fewer reads than this value will not be included. \nThis option only applies to genotypes identified from sequencing reads. \nThe default is 1; any value >1 will effectively exclude all whole-genome assemblies from the analysis. \nCaution is advised (default=1)""")
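# [Editor's note] Interpretation of the tree filters above (inferred from the help texts,
# not verified against concat_alleles): --min-sites and --max-gap-ratio drop whole allele
# sequences (samples), while --min-site-prev, --min-MAF, --min-MAC and --min-depth drop
# individual sites before the concatenated alignment is built.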
236 | 237 | misc = parser.add_argument_group('misc') 238 | misc.add_argument("-h", "--help", action="help", 239 | help="""Show this help message and exit""") 240 | # to use all CPUs, set default=multiprocessing.cpu_count() 241 | misc.add_argument('--threads', type=int, metavar='INT', default=1, 242 | help="""Number of CPUs to use (default=1)""") 243 | misc.add_argument('--overwrite', dest='overwrite', action='store_true', help="""Overwrite existing output files""") 244 | 245 | args = vars(parser.parse_args()) 246 | 247 | args['data_type'] = data_type 248 | 249 | return args 250 | 251 | def run_command(cmd, env=None): 252 | import subprocess as sp 253 | if env: 254 | p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE, env=env) 255 | else: 256 | p = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE) 257 | out, err = p.communicate() 258 | if p.returncode != 0: 259 | err_msg = "\nError: the following returned non-zero status: '%s':\n" % cmd 260 | err_msg += "\n%s" % err.decode('utf-8') 261 | sys.exit(err_msg) 262 | else: 263 | return out.decode('utf-8'), err.decode('utf-8') 264 | 265 | def parallel(function, argument_list, threads): 266 | """ Based on: https://gist.github.com/admackin/003dd646e5fadee8b8d6 """ 267 | import multiprocessing as mp 268 | import signal 269 | import time 270 | 271 | def init_worker(): 272 | signal.signal(signal.SIGINT, signal.SIG_IGN) 273 | 274 | pool = mp.Pool(int(threads), init_worker) 275 | 276 | try: 277 | results = [] 278 | for arguments in argument_list: 279 | p = pool.apply_async(function, args=arguments) 280 | results.append(p) 281 | pool.close() 282 | 283 | while True: 284 | if all(r.ready() for r in results): 285 | return [r.get() for r in results] 286 | time.sleep(1) 287 | 288 | except KeyboardInterrupt: 289 | pool.terminate() 290 | pool.join() 291 | sys.exit("\nKeyboardInterrupt") 292 | 293 | def reformat_sequence_headers(args): 294 | """ 295 | Reformat sequence headers in input genomes to prevent parsnp from crashing 296 | """ 297 | import Bio.SeqIO 298 | if 'fna_dir' in args: 299 | try: os.makedirs(args['out_dir']+'/temp/genomes') 300 | except: pass 301 | for file in os.listdir(args['fna_dir']): 302 | infile = open(args['fna_dir']+'/'+file) 303 | outfile = open(args['out_dir']+'/temp/genomes/'+file, 'w') 304 | for seq in Bio.SeqIO.parse(infile, 'fasta'): 305 | seq.id = seq.id.replace('-', '_') 306 | seq.seq = str(seq.seq).upper() 307 | outfile.write('>'+seq.id+'\n'+seq.seq+'\n') 308 | infile.close() 309 | outfile.close() 310 | args['fna_dir'] = args['out_dir']+'/temp/genomes' 311 | 312 | if 'rep_fna' in args and args['rep_fna'] is not None: 313 | infile = open(args['rep_fna']) 314 | outfile = open(args['out_dir']+'/temp/'+os.path.basename(args['rep_fna']), 'w') 315 | for seq in Bio.SeqIO.parse(infile, 'fasta'): 316 | seq.id = seq.id.replace('-', '_') 317 | seq.seq = str(seq.seq).upper() 318 | outfile.write('>'+seq.id+'\n'+seq.seq+'\n') 319 | infile.close() 320 | outfile.close() 321 | args['rep_fna'] = args['out_dir']+'/temp/'+os.path.basename(args['rep_fna']) 322 | 323 | def locate_fpaths(args, in_dir, rep_fna=None, subset_list=None): 324 | subset_map = dict() 325 | 326 | for f in os.listdir(in_dir): 327 | subset_map[f] = 1 328 | 329 | if subset_list is not None: 330 | subset_map = dict() 331 | with open(subset_list, 'r') as fh: 332 | for ln in fh: 333 | subset_map[ln.rstrip()] = 1 334 | 335 | args["subset_map"] = subset_map 336 | 337 | ref_path = "" 338 | fpaths = [] 339 | 340 | # Use the largest genome file in the directory 
as the reference instead of selecting one at random 341 | lg_fpath = "" 342 | cur_size = 0 343 | for f in os.listdir(in_dir): 344 | if f in subset_map: 345 | fpath = in_dir.rstrip('/')+'/'+f 346 | ftype = id_input_type(fpath) 347 | 348 | if os.path.isfile(fpath) and ftype == "fasta": 349 | fstats = os.stat(fpath) 350 | fpaths.append(fpath) 351 | if fstats.st_size >= cur_size: 352 | cur_size = fstats.st_size 353 | lg_fpath = fpath 354 | else: 355 | sys.stderr.write("skip {}: not fasta format\n".format(fpath)) 356 | 357 | else: 358 | sys.stderr.write("skip {}\n".format(f)) 359 | 360 | if rep_fna is not None: # Use the specified reference genome 361 | ref_path = rep_fna 362 | else: 363 | ref_path = lg_fpath 364 | 365 | args['rep_fna'] = ref_path 366 | args['fna_paths'] = fpaths 367 | 368 | def detect_single_chrom(ref_path): 369 | single_chrom = True 370 | chrom_cnt = 0 371 | with open(ref_path, 'r') as fh: 372 | for line in fh: 373 | if line[0] == '>': 374 | chrom_cnt = chrom_cnt + 1 375 | 376 | if chrom_cnt == 1: 377 | pass 378 | else: 379 | single_chrom = False 380 | break 381 | 382 | return single_chrom 383 | 384 | def register_run_id(args, in_dir, single=False): 385 | args['run_id'] = in_dir.rstrip('/').split('/')[-1] 386 | 387 | if single is True: 388 | args['run_id'] = args['run_id'] + "_single" 389 | 390 | return args['run_id'] 391 | 392 | def register_msa_id(args, ref_path, fpaths): 393 | order_names = [] 394 | 395 | for fpath in fpaths: 396 | order_names.append(fpath.rstrip('/').split('/')[-1]) 397 | 398 | order_names.append(ref_path.rstrip('/').split('/')[-1]) 399 | 400 | in_string = "".join(order_names) 401 | args['msa_id'] = hashlib.md5(in_string.encode()).hexdigest() 402 | 403 | return args['msa_id'] 404 | 405 | def auto_min_pid_by_delta(coords_path, idt_delta): 406 | min_pid_by_delta = 0 407 | 408 | # fields = [('s1',int),('e1',int), 409 | # ('s2',int),('e2',int), 410 | # ('len1',int),('len2',int), 411 | # ('pid',float), 412 | # ('c1',str),('c2',str)] 413 | 414 | pids = [] 415 | with open(coords_path) as f: 416 | for i in range(5): 417 | next(f) 418 | for l in f: 419 | values = l.replace(' | ', ' ').split() 420 | pid = float(values[6]) 421 | pids.append(pid) 422 | 423 | avg_pid = 0.7 424 | if len(pids) != 0: 425 | avg_pid = sum(pids)/len(pids) 426 | 427 | min_pid_by_delta = avg_pid * (1 - idt_delta) 428 | 429 | return min_pid_by_delta 430 | 
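# [Editor's note] Sketch of the per-genome alignment routine below, assuming standard
# MUMmer4 tooling (nucmer, delta-filter, show-coords/snps/diff):
#   1) nucmer aligns one query genome against the reference;
#   2) delta-filter keeps one-to-one blocks passing the user's --min-pid / --min-aln-len floor;
#   3) show-coords exports block coordinates, from which auto_min_pid_by_delta() above
#      derives the average block identity (a proxy for whole-genome ANI);
#   4) delta-filter runs again with the tightened cutoff avg_pid * (1 - max_pid_delta);
#   5) show-coords, show-snps and show-diff export the final blocks for downstream use.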
"%s/%s.filter.delta.1 " % (out_dir, genome_id) 461 | command += "> %s/%s" % (out_dir, 'coords.tmp') 462 | out, err = run_command(command) 463 | log.write(str(out)+'\n'+str(err)) 464 | 465 | coords_path = "{}/{}".format(out_dir, 'coords.tmp') 466 | min_pid_by_delta = auto_min_pid_by_delta(coords_path, max_pid_delta) 467 | 468 | command = "delta-filter -q -r " 469 | command += "-i %s " % str(min_pid_by_delta) 470 | command += "-l %s " % str(min_aln_len) 471 | command += "%s/%s.delta " % (out_dir, genome_id) 472 | command += "> %s/%s.filter.delta" % (out_dir, genome_id) 473 | out, err = run_command(command) 474 | 475 | for utility in ['coords', 'snps', 'diff']: 476 | command = "show-%s " % utility 477 | command += "%s/%s.filter.delta " % (out_dir, genome_id) 478 | command += "> %s/%s" % (out_dir, utility) 479 | out, err = run_command(command) 480 | log.write(str(out)+'\n'+str(err)) 481 | 482 | 483 | def run_mummer4(args): 484 | fpaths = args['fna_paths'] 485 | if 'tag_genome_paths' in args: 486 | fpaths = args['tag_genome_paths'] 487 | 488 | ref_fpath = args['rep_fna'] 489 | if 'tag_ref' in args: 490 | ref_fpath = args['tag_ref'] 491 | 492 | register_run_id(args, args['fna_dir']) 493 | register_msa_id(args, ref_fpath, fpaths) 494 | 495 | print("reference genome path: %s" % ref_fpath) 496 | 497 | args['mummer4_dir'] = args['out_dir']+'/temp/mummer4/'+args['run_id'] 498 | try: os.makedirs(args['mummer4_dir']) 499 | except: pass 500 | 501 | shutil.copy(ref_fpath, os.path.join(args['mummer4_dir'], 'reference.fna')) 502 | 503 | arg_list = [] 504 | rep_id = '.'.join(ref_fpath.split('/')[-1].split('.')[:-1]) 505 | 506 | print("[paired alignment]: start") 507 | for fpath in fpaths: 508 | genome_id = '.'.join(fpath.split('/')[-1].split('.')[:-1]) 509 | out_dir = '%s/aln/%s' % (args['mummer4_dir'], genome_id) 510 | arg_list.append([fpath, genome_id, ref_fpath, rep_id, out_dir, args['skip_align'], args['min_pid'], args['min_aln_len'], args['max_pid_delta'], 1]) 511 | 512 | print("[paired alignment]: done") 513 | 514 | parallel(run_mummer4_single, arg_list, args['threads']) 515 | 516 | msa_path = gen_msa.build_msa(indir=args['mummer4_dir'], overwrite=True, subset=args["subset_map"]) 517 | 518 | shutil.copy(os.path.join(args['mummer4_dir'], 'reference.fna'), args['out_dir']) 519 | 520 | args['msa_path'] = args['out_dir'] + '/tag_msa.fna' 521 | shutil.move(msa_path, args['msa_path']) 522 | 523 | args['msa_type'] = 'xmfa-mummer4' 524 | 525 | args['tag_list_path'] = args['out_dir'] + '/tag_paths.list' 526 | 527 | with open(args['tag_list_path'], 'w') as fh: 528 | for fpath in fpaths: 529 | fh.write("{}\n".format(fpath.rstrip())) 530 | 531 | 532 | def run_mash_scketch(args): 533 | ref_fpath = args['rep_fna'] 534 | fpaths = args['fna_paths'] 535 | 536 | register_run_id(args, args['fna_dir']) 537 | register_msa_id(args, ref_fpath, fpaths) 538 | 539 | print("reference genome path: %s" % ref_fpath) 540 | 541 | args['mash_dir'] = args['out_dir']+'/temp/mash/'+args['run_id'] 542 | 543 | try: os.makedirs(args['mash_dir']) 544 | except: pass 545 | 546 | args['fna_list_path'] = args['mash_dir'] + '/in_fna.list' 547 | 548 | with open(args['fna_list_path'], 'w') as fh: 549 | for fpath in fpaths: 550 | fh.write("{}\n".format(fpath)) 551 | 552 | print("[building mash sketch]: start") 553 | 554 | command = "mash sketch " 555 | command += "-k %s " % str(args['sketch_k']) 556 | command += "-s %s " % str(args['sketch_size']) 557 | command += "-p %s " % str(args['threads']) 558 | command += "-o %s " % 
560 | 561 | out, err = run_command(command) 562 | with open(args['logfile'], 'a') as logger: 563 | logger.write(str(out)+'\n'+str(err)) 564 | 565 | args['mash_sketch_path'] = args['mash_dir']+'/mash_sketch.msh' 566 | 567 | def run_mash_dist(args): 568 | sketch_path = args['mash_sketch_path'] 569 | 570 | assert os.path.exists(sketch_path) 571 | 572 | args['mash_dist_path'] = args['mash_dir'] + '/mash_dist.tsv' 573 | 574 | print("[calculating mash distance]: start") 575 | 576 | command = "mash dist " 577 | command += "-p %s " % str(args['threads']) 578 | command += "%s %s " % (sketch_path, sketch_path) 579 | command += "> %s " % args['mash_dist_path'] 580 | 581 | out, err = run_command(command) 582 | with open(args['logfile'], 'a') as logger: 583 | logger.write(str(out)+'\n'+str(err)) 584 | 585 | def do_precut(args): 586 | dist_path = args['mash_dist_path'] 587 | 588 | assert os.path.exists(dist_path) 589 | 590 | args['cut_dist_path'] = args['mash_dir'] + '/mash_dist.cut.tsv' 591 | 592 | print("[cut mash distance: {}]: start".format(str(args['precut']))) 593 | 594 | command = "awk '$3 < %s' " % str(args['precut']) 595 | command += "%s " % dist_path 596 | command += "> %s " % args['cut_dist_path'] 597 | 598 | out, err = run_command(command) 599 | with open(args['logfile'], 'a') as logger: 600 | logger.write(str(out)+'\n'+str(err)) 601 | 602 | def id_clusters(args): 603 | run_mash_sketch(args) 604 | 605 | run_mash_dist(args) 606 | 607 | s_cut = args['start_cutoff'] 608 | e_cut = args['end_cutoff'] 609 | r_fac = args['range_factor'] 610 | 611 | total_n = len(args['fna_paths']) 612 | 613 | maf = args['snp_freq'] 614 | 615 | critical_n = math.ceil(1 / maf) 616 | 617 | do_precut(args) 618 | dist_path = args['cut_dist_path'] 619 | assert os.path.exists(dist_path) 620 | 621 | optimal_clusters, optimal_d, optimal_n = [], None, None 622 | while s_cut <= args['precut']: 623 | optimal_clusters, optimal_d, optimal_n, firstcut_exit = id_genome_clusters.build_genome_blocks(dist_path, total_n, critical_n, s_cut, e_cut, r_fac, args['centrality_method'], args['edge_weighted']) 624 | if firstcut_exit is True: 625 | s_cut = s_cut + 0.01 626 | else: 627 | break 628 | 629 | 630 | clust_genomes = dict() 631 | tag_genomes = [] 632 | for cluster in optimal_clusters: 633 | tag_genomes.append(cluster.tag_genome) 634 | for genome in cluster.genomes: 635 | clust_genomes[genome] = 1 636 | 637 | 638 | for fpath in args['fna_paths']: 639 | if fpath not in clust_genomes: 640 | tag_genomes.append(fpath) 641 | 642 | args['tag_genome_paths'] = tag_genomes 643 | 644 | def id_tag_ref(args): 645 | if 'mash_dist_path' not in args or not os.path.exists(args['mash_dist_path']): 646 | run_mash_sketch(args) 647 | run_mash_dist(args) 648 | 649 | dist_path = args['mash_dist_path'] 650 | 651 | tag_paths = args['fna_paths'] 652 | if 'tag_genome_paths' in args and len(args['tag_genome_paths']) > 1: 653 | tag_paths = args['tag_genome_paths'] 654 | 655 | centroid = id_centroid.identify(tag_paths, dist_path, args['cent_dist_type']) 656 | 657 | print(centroid) 658 | 659 | args['tag_ref'] = centroid 660 | args['rep_fna'] = centroid 661 | 662 | def run_kmerset_validate(args): 663 | assert os.path.exists(args['kmer_set']) 664 | assert os.path.exists(args['tag_list']) 665 | 666 | args['kmer_prof_path'] = args['out_dir']+'/kmer_prof.tsv' 667 | 668 | args['check_fna_paths'] = args['out_dir']+'/check_fna_paths.list' 669 | if 'fna_paths' in args: 670 | with 
open(args['check_fna_paths'], 'w') as fh: 671 | for fpath in args['fna_paths']: 672 | fh.write("{}\n".format(fpath)) 673 | 674 | print("[validating kmer set]: start") 675 | 676 | command = "callm_db_val " 677 | command += "-d %s " % args['kmer_set'] 678 | command += "-n %s " % args['genome_name'] 679 | command += "-t %s " % args['threads'] 680 | #command += "-L %s " % args['tag_list'] 681 | command += "-L %s " % args['check_fna_paths'] 682 | command += "-o %s " % args['kmer_prof_path'] 683 | 684 | out, err = run_command(command) 685 | with open(args['logfile'], 'a') as logger: 686 | logger.write(str(out)+'\n'+str(err)) 687 | 688 | def filter_kmers(args): 689 | assert os.path.exists(args['kmer_prof_path']) 690 | 691 | args['filtered_kmer_path'] = args['out_dir']+'/selected_kmers.tsv' 692 | 693 | with open(args['filtered_kmer_path'], 'w') as fw: 694 | with open(args['kmer_prof_path'], 'r') as fh: 695 | for line in fh: 696 | items = line.rstrip().split('\t') 697 | 698 | nonsingle_hit = int(items[8]) 699 | 700 | null_hit = int(items[6]) 701 | single_hit = int(items[7]) 702 | 703 | ref_hit = int(items[10]) 704 | alt_hit = int(items[11]) 705 | 706 | if nonsingle_hit > 0: 707 | continue 708 | 709 | if single_hit / (single_hit + null_hit) < 0.5: 710 | continue 711 | 712 | if ref_hit == 0 or alt_hit == 0: 713 | continue 714 | 715 | rec1 = "{}\t{}0{}".format(items[2], items[9], items[0]) 716 | rec2 = "{}\t{}1{}".format(items[3], items[9], items[0]) 717 | rec3 = "{}\t{}0{}".format(items[4], items[9], items[0]) 718 | rec4 = "{}\t{}1{}".format(items[5], items[9], items[0]) 719 | 720 | fw.write("{}\n{}\n{}\n{}\n".format(rec1, rec2, rec3, rec4)) 721 | 722 | def run_build_db(args): 723 | assert args['filtered_kmer_path'] 724 | 725 | args['kmer_db_path'] = args['out_dir']+'/kmer_db.bin' 726 | 727 | command = "callm_db_build " 728 | command += "%s " % args['filtered_kmer_path'] 729 | command += "> %s " % args['kmer_db_path'] 730 | 731 | out, err = run_command(command) 732 | with open(args['logfile'], 'a') as logger: 733 | logger.write(str(out)+'\n'+str(err)) 734 | 735 | def read_input_dir(args, in_dir, subset_list=None): 736 | subset_map = dict() 737 | 738 | for f in os.listdir(in_dir): 739 | subset_map[f] = 1 740 | 741 | if subset_list is not None: 742 | subset_map = dict() 743 | with open(subset_list, 'r') as fh: 744 | for ln in fh: 745 | subset_map[ln.rstrip()] = 1 746 | 747 | args["subset_map"] = subset_map 748 | 749 | fna_paths = [] 750 | fq_paths = [] 751 | 752 | for f in os.listdir(in_dir): 753 | if f in subset_map: 754 | fpath = in_dir.rstrip('/')+'/'+f 755 | print(fpath) 756 | 757 | if os.path.isdir(fpath): 758 | continue 759 | 760 | assert os.path.isfile(fpath) 761 | ftype = id_input_type(fpath) 762 | 763 | if ftype == "unknown": 764 | sys.stderr.write("skip {}: unknown input type\n".format(fpath)) 765 | elif ftype == "not_supported": 766 | sys.stderr.write("skip {}: compressed fasta is not supported yet\n".format(fpath)) 767 | elif ftype == "fasta": 768 | fna_paths.append(fpath) 769 | elif ftype in ["fastq", "fastq.gz", "fastq.lz4", "fastq.bz2"]: 770 | fq_paths.append(fpath) 771 | else: 772 | assert False 773 | else: 774 | sys.stderr.write("skip {}\n".format(f)) 775 | 776 | fq_pairs = [] 777 | if len(fq_paths) > 1: 778 | fq_pairs = pair_inputs(fq_paths) 779 | 780 | args['fna_paths'] = fna_paths 781 | args['fq_paths'] = fq_paths 782 | args['fq_pairs'] = fq_pairs 783 | 784 | 785 | def id_input_type(fpath): 786 | in_type = "fastq" #default 787 | 788 | fn_its = fpath.split("/")[-1].split(".") 789 | 
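# [Editor's note] Illustrative examples of the suffix handling below: 'sample.fq.gz'
# splits into ['sample', 'fq', 'gz'], so the compression suffix is peeled off before the
# format suffix is inspected; 'genome.fna' splits into ['genome', 'fna'] and is typed
# directly; a gzipped FASTA such as 'genome.fna.gz' ends up as "not_supported".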
790 | fn_end = "" 791 | if fn_its[-1] in ['gz', 'lz4', 'bz2']: 792 | fn_end = fn_its[-2] 793 | else: 794 | fn_end = fn_its[-1] 795 | 796 | if fn_end in ['fa', 'fsa', 'fna', 'fasta']: 797 | in_type = "fasta" 798 | elif fn_end in ['fq', 'fastq']: 799 | in_type = "fastq" 800 | else: 801 | in_type = "unknown" 802 | 803 | if fn_its[-1] in ['gz', 'lz4', 'bz2']: 804 | if fn_end in ['fa', 'fsa', 'fna', 'fasta']: 805 | in_type = "not_supported" 806 | else: 807 | in_type = in_type + '.' + fn_its[-1] 808 | 809 | return in_type 810 | 811 | def pair_inputs(fq_paths): 812 | pairs = dict() 813 | 814 | for fqpath in fq_paths: 815 | fn_its = fqpath.split("/")[-1].split(".") 816 | fq_name_parts = fn_its[0].split("_") 817 | 818 | if len(fq_name_parts) != 2: 819 | continue 820 | 821 | if fq_name_parts[1] not in ["1", "2"]: 822 | continue 823 | 824 | if fq_name_parts[0] not in pairs: 825 | pairs[fq_name_parts[0]] = dict() 826 | 827 | pairs[fq_name_parts[0]][fq_name_parts[1]] = fqpath 828 | 829 | real_pairs = [] 830 | for name in pairs.keys(): 831 | if "1" in pairs[name] and "2" in pairs[name]: 832 | real_pairs.append([pairs[name]["1"], pairs[name]["2"], name]) 833 | 834 | return real_pairs 835 | 836 | def genotype_single_genomes(args): 837 | ref_fpath = args['ref_genome'] 838 | fpaths = args['fna_paths'] 839 | 840 | print("reference genome path: %s" % ref_fpath) 841 | 842 | args['genotype_dir'] = args['out_dir']+'/temp/genotype' 843 | try: os.makedirs(args['genotype_dir']) 844 | except: pass 845 | 846 | args['gt_results_dir'] = args['out_dir']+'/gt_results' 847 | try: os.makedirs(args['gt_results_dir']) 848 | except: pass 849 | 850 | arg_list = [] 851 | arg_list_gt = [] 852 | rep_id = '.'.join(ref_fpath.split('/')[-1].split('.')[:-1]) 853 | 854 | global ref 855 | ref = read_ref(ref_fpath) 856 | 857 | global genos 858 | genos = extract_genotypes(args['vcf']) 859 | 860 | print("[paired alignment]: start") 861 | for fpath in fpaths: 862 | genome_id = fpath.split('/')[-1] 863 | out_dir = '%s/aln/%s' % (args['genotype_dir'], genome_id) 864 | arg_list.append([fpath, genome_id, ref_fpath, rep_id, out_dir, False, args['min_pid'], args['min_aln_len'], args['max_pid_delta'], 1]) 865 | 866 | coord_path = out_dir + '/coords' 867 | snp_path = out_dir + '/snps' 868 | output = args['gt_results_dir'] + '/' + genome_id + ".tsv" 869 | arg_list_gt.append([genos, ref, coord_path, snp_path, output]) 870 | 871 | parallel(run_mummer4_single, arg_list, args['threads']) 872 | parallel(run_single_fasta_gt, arg_list_gt, args['threads']) 873 | 874 | print("[paired alignment]: done") 875 | 876 | def read_ref(fpath): 877 | seq_recs = list(SeqIO.parse(fpath, "fasta")) 878 | 879 | rec_table = dict() 880 | for rec in seq_recs: 881 | rec_table[rec.id] = str(rec.seq).upper() 882 | 883 | return rec_table 884 | 885 | def extract_genotypes(vcf_path): 886 | genos = [] 887 | with open(vcf_path, 'r') as fh: 888 | for l in fh: 889 | if l[0] == "#": 890 | continue 891 | else: 892 | values = l.rstrip().split('\t')[:5] 893 | 894 | chrom = values[0] 895 | pos_r = int(values[1]) 896 | gid = values[2] 897 | allele_ma = values[3] 898 | allele_mi = values[4] 899 | 900 | if len(allele_mi) > 1: 901 | continue 902 | 903 | genos.append([chrom, str(pos_r), gid, allele_ma, allele_mi]) 904 | 905 | return genos 906 | 907 | def run_single_fasta_gt(genos, ref, coord_path, snp_path, output): 908 | coord_map = dict() 909 | 910 | with open(coord_path, 'r') as fh: 911 | for i in range(5): 912 | next(fh) 913 | for l in fh: 914 | values = l.replace(' | ', ' ').split()
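# [Editor's note] Assuming default show-coords output, stripping the ' | ' separators
# leaves columns [S1, E1, S2, E2, LEN1, LEN2, %IDY, ref_tag, qry_tag], so values[0] and
# values[1] below are the reference start/end and values[7] is the reference sequence name.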
915 | # positions in the coords file are 1-indexed, vs. 0-indexed in the (Maast-generated) vcf 916 | start = int(values[0]) - 1 917 | end = int(values[1]) - 1 918 | chrom = values[7] 919 | 920 | assert end > start 921 | 922 | if chrom not in coord_map: 923 | coord_map[chrom] = [] 924 | 925 | coord_map[chrom].append([start, end]) 926 | 927 | 928 | snp_map = dict() 929 | with open(snp_path) as fh: 930 | for i in range(5): 931 | next(fh) 932 | for l in fh: 933 | values = l.replace(' | ', ' ').split() 934 | # positions in the snps file are 1-indexed, vs. 0-indexed in the vcf 935 | pos_r = int(values[0]) - 1 936 | allele_r = values[1] 937 | allele_a = values[2] 938 | chrom = values[10] 939 | 940 | if allele_r == "." or allele_a == ".": 941 | continue 942 | 943 | if chrom not in snp_map: 944 | snp_map[chrom] = dict() 945 | 946 | snp_map[chrom][pos_r] = [allele_r, allele_a] 947 | 948 | gtypes = [] 949 | for geno in genos: 950 | chrom = geno[0] 951 | pos_r = int(geno[1]) 952 | gid = geno[2] 953 | allele_ma = geno[3] 954 | allele_mi = geno[4] 955 | 956 | if chrom not in coord_map: 957 | continue 958 | 959 | for g_range in coord_map[chrom]: 960 | if pos_r >= g_range[0] and pos_r <= g_range[1]: 961 | if chrom in snp_map and pos_r in snp_map[chrom]: 962 | if allele_mi == snp_map[chrom][pos_r][1]: 963 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '0', '1']) 964 | else: 965 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '1', '0']) 966 | else: 967 | assert chrom in ref 968 | allele_r = ref[chrom][pos_r] 969 | if allele_mi == allele_r: 970 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '0', '1']) 971 | else: 972 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, '1', '0']) 973 | 974 | with open(output, 'w') as fw: 975 | for gtype in gtypes: 976 | fw.write("{}\n".format("\t".join(gtype))) 977 | 978 | def genotype_reads(args): 979 | fpaths = args['fq_paths'] 980 | 981 | args['genotype_dir'] = args['out_dir']+'/temp/genotype' 982 | try: os.makedirs(args['genotype_dir']) 983 | except: pass 984 | 985 | args['gt_results_dir'] = args['out_dir']+'/gt_results' 986 | try: os.makedirs(args['gt_results_dir']) 987 | except: pass 988 | 989 | gt_paths = [] 990 | outname = '%s/iso_gt' % args['genotype_dir'] 991 | try: os.makedirs(outname) 992 | except: pass 993 | 994 | mode = 2 995 | if args['mode'] == "very-fast": 996 | mode = 10 997 | elif args['mode'] == "fast": 998 | mode = 5 999 | elif args['mode'] == 'sensitive': 1000 | mode = 2 1001 | elif args['mode'] == 'very-sensitive': 1002 | mode = 1 1003 | else: 1004 | assert False 1005 | 1006 | command = "iso_gt_mtar " 1007 | command += "-d %s " % args['kmer_db_path'] 1008 | command += "-t %s " % args['threads'] 1009 | command += "-j %s " % mode 1010 | command += "-o %s/" % outname 1011 | command += "%{in} " 1012 | command += "-f " 1013 | 1014 | for fpath in fpaths: 1015 | command += "%s " % fpath 1016 | gt_paths.append(outname + '/' + extract_fastq_path_name(fpath) + ".tsv") 1017 | 1018 | out, err = run_command(command) 1019 | with open(args['logfile'], 'a') as logger: 1020 | logger.write(str(out)+'\n'+str(err)) 1021 | 1022 | merge_paths = [] 1023 | if args["merge_pairs"]: 1024 | assert "fq_pairs" in args 1025 | 1026 | for fq_pair in args["fq_pairs"]: 1027 | fq_1 = fq_pair[0] 1028 | fq_2 = fq_pair[1] 1029 | fq_name = fq_pair[2] 1030 | 1031 | fq_gt_1 = outname + '/' + extract_fastq_path_name(fq_1) + ".tsv" 1032 | fq_gt_2 = outname + '/' + extract_fastq_path_name(fq_2) + ".tsv"
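# [Editor's note] Pair merging (sketch): each per-mate genotype file holds rows of
# <SNP-kmer id>\t<hit count>; the loop below sums the two mates' counts per id into
# fq_merge and writes one merged profile per pair. The counts live in items[1]
# (items[0] is the id), which the summation below relies on.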
1033 | 1034 | fq_merge = dict() 1035 | for fq_gt in [fq_gt_1, fq_gt_2]: 1036 | with open(fq_gt, 'r') as fh: 1037 | for line in fh: 1038 | items = line.rstrip().split('\t') 1039 | if items[0] not in fq_merge: 1040 | fq_merge[items[0]] = int(items[1]) 1041 | else: 1042 | fq_merge[items[0]] += int(items[1]) 1043 | 1044 | merge_output = outname + "/" + fq_name + ".merged.tsv" 1045 | with open(merge_output, 'w') as fw: 1046 | for snp in fq_merge.keys(): 1047 | fw.write("{}\t{}\n".format(snp, str(fq_merge[snp]))) 1048 | 1049 | merge_paths.append(merge_output) 1050 | 1051 | arg_list = [] 1052 | for gt_path in gt_paths + merge_paths: 1053 | fq_id = '.'.join(gt_path.split('/')[-1].split('.')[:-1]) 1054 | output = args['gt_results_dir'] + '/' + fq_id + '.reads.tsv' 1055 | arg_list.append([args['vcf'], gt_path, output]) 1056 | 1057 | parallel(run_parse_single, arg_list, args['threads']) 1058 | 1059 | def extract_fastq_path_name(fpath): 1060 | # chop off all leading '.' and '..' path parts 1061 | pparts = [] 1062 | real_idx = 0 1063 | for i, ppart in enumerate(fpath.split('/')): 1064 | if ppart == '.' or ppart == "..": 1065 | continue 1066 | else: 1067 | real_idx = i 1068 | break 1069 | 1070 | vpath = '/'.join(fpath.split('/')[real_idx:]) 1071 | 1072 | path_parts = vpath.split('.') 1073 | real_parts = [] 1074 | if path_parts[-1] in ['gz', 'lz4', 'bz2']: 1075 | real_parts = path_parts[:-2] 1076 | elif path_parts[-1] in ['fq', 'fastq']: 1077 | real_parts = path_parts[:-1] 1078 | else: 1079 | assert False 1080 | 1081 | return ".".join(real_parts).replace('/', '_').replace('.','_') 1082 | 1083 | 1084 | def run_parse_single(vcf_path, gt_path, output): 1085 | snp_map = dict() 1086 | 1087 | with open(gt_path, 'r') as fh: 1088 | for line in fh: 1089 | values = line.rstrip().split('\t') 1090 | snp = values[0] 1091 | count = values[1] 1092 | 1093 | allele_type = int(snp[6]) 1094 | assert allele_type in [0, 1] 1095 | 1096 | gid = snp[7:] 1097 | 1098 | if gid not in snp_map: 1099 | snp_map[gid] = [0, 0] 1100 | 1101 | snp_map[gid][allele_type] = snp_map[gid][allele_type] + int(count) 1102 | 1103 | gtypes = [] 1104 | with open(vcf_path, 'r') as fh: 1105 | for l in fh: 1106 | if l[0] == "#": 1107 | continue 1108 | else: 1109 | values = l.rstrip().split('\t')[:5] 1110 | 1111 | chrom = values[0] 1112 | pos_r = int(values[1]) 1113 | gid = values[2] 1114 | allele_ma = values[3] 1115 | allele_mi = values[4] 1116 | 1117 | if len(allele_mi) > 1: 1118 | continue 1119 | 1120 | if gid in snp_map: 1121 | gtypes.append([chrom, str(pos_r), gid, allele_ma, allele_mi, str(snp_map[gid][0]), str(snp_map[gid][1])]) 1122 | 1123 | with open(output, 'w') as fw: 1124 | for gtype in gtypes: 1125 | fw.write("{}\n".format("\t".join(gtype))) 1126 | 1127 | def call_snps_main(args): 1128 | cmdl_str = ' '.join(sys.argv[1:]) 1129 | 1130 | if args['data_type'] in ['genomes', 'end_to_end']: 1131 | locate_fpaths(args, args['fna_dir'], args['rep_fna'], args['subset_list']) 1132 | 1133 | 1134 | if args['data_type'] in ['genomes', 'end_to_end']: 1135 | if args["has_completeness"]: 1136 | if args["completeness"]: 1137 | args["min_prev"] = (1 - float(args["missing_ratio"])) * float(args["completeness"]) 1138 | elif args["completeness_list"]: 1139 | completeness_map = {} 1140 | with open(args["completeness_list"], 'r') as fh: 1141 | for line in fh: 1142 | items = line.rstrip().split('\t') 1143 | completeness_map[items[0]] = float(items[1]) 1144 | 1145 | ref_fpath = args['rep_fna'] 1146 | fpaths = args['fna_paths'] 1147 | 1148 | completenesses = [] 1149 | 1150 | for fpath in fpaths: 1151 | fname = fpath.rstrip('/').split('/')[-1] 1152 | if fname in completeness_map: 1153 | completenesses.append(completeness_map[fname]) 1154 | else: 1155 | sys.exit("missing completeness value for: {}".format(fpath)) 1156 | 
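# [Editor's note] Worked example of the adjustment below: with an average completeness
# of 0.9 and a missing ratio of 0.05 (illustrative values), min_prev becomes
# (1 - 0.05) * 0.9 = 0.855, i.e. a core site only needs to appear in ~85.5% of assemblies.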
1157 | avg_completeness = sum(completenesses)/len(completenesses) 1158 | args["min_prev"] = (1 - float(args["missing_ratio"])) * avg_completeness 1159 | else: 1160 | print("[Warning] --has-completeness has no effect without a completeness value or list") 1161 | 1162 | if len(args['fna_paths']) <= 5: 1163 | sys.exit("Too few input genomes ({}); more than 5 are required".format(len(args['fna_paths']))) 1164 | 1165 | if len(args['fna_paths']) <= math.ceil(1 / args['snp_freq']): 1166 | print("[Warning] Total number of genomes ({}) < min. number of genomes required for effective SNP calling with MAF {} ({})".format(len(args['fna_paths']), args['snp_freq'], math.ceil(1 / args['snp_freq']))) 1167 | print("[Warning] Skipping tag genome selection; all genomes will be used") 1168 | args['keep_redundancy'] = True 1169 | 1170 | if args['data_type'] in ['genomes', 'end_to_end']: 1171 | if not args['keep_redundancy']: 1172 | id_clusters(args) 1173 | 1174 | if args['skip_centroid']: 1175 | assert args['rep_fna'] is not None 1176 | assert os.path.exists(args['rep_fna']) 1177 | else: 1178 | id_tag_ref(args) 1179 | 1180 | 1181 | # >>> 1. Generate multiple-genome-alignment or pileups 1182 | 1183 | # data type is genomes: use MUMmer4 to perform multiple genome alignment 1184 | start = time.time() 1185 | if args['data_type'] in ['genomes', 'end_to_end']: 1186 | print("Running mummer4; start") 1187 | run_mummer4(args) 1188 | #args['mummer4_dir'] = '/Users/jasonshi/Documents/zjshi_github/snpMLST/unit_test_raw/snps_from_genomes/Borrelia_burgdorferi_56121/temp/mummer4/54d64396-732c-42b0-8e88-3de63e8a665e/msa.fna' 1189 | # msa_path = gen_msa.build_msa(indir=args['mummer4_dir'], max_genomes=1280) 1190 | # args['msa_path'] = '/Users/jasonshi/Documents/zjshi_github/snpMLST/unit_test_raw/snps_from_genomes/Borrelia_burgdorferi_56121/temp/mummer4/54d64396-732c-42b0-8e88-3de63e8a665e/msa.fa' 1191 | # args['msa_type'] = 'xmfa-mummer4' 1192 | print("Running mummer4; done!") 1193 | print("Elapsed time: {}".format(time.time()-start)) 1194 | 1195 | 1196 | # >>> 2. 
Parse multiple-genome-alignment or pileup and call SNPs 1197 | 1198 | # fetch generator to parse msa columns or mpileup sites 1199 | start = time.time() 1200 | print("Fetching file-type-specific parser; start") 1201 | if args['data_type'] in ['genomes', 'end_to_end', 'msa']: 1202 | from align_io import msa 1203 | if args['mem']: 1204 | site_assembly = msa.iter_parse(args['msa_path'], args['msa_type'], args['max_samples']) 1205 | else: 1206 | site_assembly = msa.monolithic_parse(args['msa_path'], args['msa_type'], args['max_samples']) 1207 | 1208 | print("Fetching file-type-specific parser; done") 1209 | print("Elapsed time: {}".format(time.time()-start)) 1210 | 1211 | 1212 | # id core-genome coords and snps 1213 | start = time.time() 1214 | print("Identifying core-snps; start") 1215 | print("max sites: {}".format(args['max_sites'])) 1216 | print("min prevalence: {}".format(args['min_prev'])) 1217 | print("min MAF: {}".format(args['snp_freq'])) 1218 | 1219 | if args['mem']: 1220 | align_assembs = align_assembly.call_snps_iter(site_assembly, args['max_sites'], args['min_prev'], args['snp_freq']) 1221 | else: 1222 | align_assembs = align_assembly.call_snps(site_assembly, args['max_sites'], args['min_prev'], args['snp_freq']) 1223 | print("Identifying core-snps; done") 1224 | print("Elapsed time: {}".format(time.time()-start)) 1225 | 1226 | # sys.exit() 1227 | 1228 | single_chrom_rep = False 1229 | 1230 | if args['mem'] is True and args['rep_fna'] is not None: 1231 | single_chrom_rep = detect_single_chrom(args['rep_fna']) 1232 | 1233 | # write output files 1234 | start = time.time() 1235 | print("Writing snps to VCF; start") 1236 | if args['mem']: 1237 | header_ready = False 1238 | coords_buffer = [] 1239 | for align_assemb in align_assembs: 1240 | if len(align_assemb.snps) > 0: 1241 | if not header_ready: 1242 | vcf_io.write_coords_header(coords_buffer, args['out_dir']) 1243 | vcf_io.write_vcf_header(align_assemb.snps, args['out_dir'], cmdl_str) 1244 | header_ready = True 1245 | 1246 | # vcf_io.write_genome(core_genome.consensus_genome, args['out_dir']) 1247 | coords_buffer = coords_buffer + align_assemb.coords 1248 | vcf_io.write_vcf(align_assemb.snps, args['out_dir'], single_chrom_rep) 1249 | 1250 | vcf_io.write_coords(vcf_io.merge_coords(coords_buffer), args['out_dir']) 1251 | # vcf_io.write_coords(coords_buffer, args['out_dir']) 1252 | else: 1253 | vcf_io.write_coords_header(align_assembs.coords, args['out_dir']) 1254 | vcf_io.write_vcf_header(align_assembs.snps, args['out_dir'], cmdl_str) 1255 | vcf_io.write_coords(align_assembs.coords, args['out_dir']) 1256 | # vcf_io.write_genome(core_genome.consensus_genome, args['out_dir']) 1257 | vcf_io.write_vcf(align_assembs.snps, args['out_dir']) 1258 | print("Writing snps to VCF; done!") 1259 | print("Elapsed time: {}".format(time.time()-start)) 1260 | 1261 | 1262 | def build_db_main(args): 1263 | print("Database building; start") 1264 | args['kmer_size'] = 31 1265 | 1266 | genome_path, vcf_path, coords_path, tag_list_path = args['ref_genome'], args['vcf'], args['coords'], args['tag_list'] 1267 | k_size, k_type = args['kmer_size'], args['kmer_type'] 1268 | 1269 | if args['fna_dir'] is not None: 1270 | locate_fpaths(args, args['fna_dir']) 1271 | 1272 | genome_seq = build_db.open_genome_seq(genome_path) 1273 | #snps = build_db.open_vcf_file(vcf_path) 1274 | 1275 | coords = None 1276 | if coords_path is not None: 1277 | coords = build_db.read_coords(coords_path) 1278 | 1279 | snp_gb_pos, snp_alleles = build_db.open_vcf_file_local(vcf_path) 1280 | 
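# [Editor's note] Judging from the commented-out extraction kept below, snp_gb_pos and
# snp_alleles are presumed to hold, for each SNP, the genome-wide position (the VCF ID
# field) and the [REF, ALT] allele pair consumed by fetch_all_from_msa.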
#snp_gb_pos = [int(snp.ID) for snp in snps] 1281 | #snp_alleles = [[str(snp.REF), str(snp.ALT[0])] for snp in snps] 1282 | #snp_kmers = fetch_snp_kmers(genome_seq, snp_gb_pos, snp_alleles, k_size, k_type, coords) 1283 | 1284 | genome_seqs = build_db.load_msa(args['msa']) 1285 | snp_kmers = build_db.fetch_all_from_msa(genome_seqs, genome_seq, snp_gb_pos, snp_alleles, k_size, coords) 1286 | 1287 | args['kmer_set'] = args['out_dir'] + '/nr_kmer_set.tsv' 1288 | 1289 | build_db.dump_tsv(snp_kmers, args['kmer_set']) 1290 | 1291 | run_kmerset_validate(args) 1292 | 1293 | filter_kmers(args) 1294 | 1295 | run_build_db(args) 1296 | 1297 | print("Database building; finished") 1298 | 1299 | 1300 | def genotype_main(args): 1301 | print("Genotyping; start") 1302 | read_input_dir(args, args['in_dir'], args['subset_list']) 1303 | 1304 | try: os.makedirs(args['out_dir']) 1305 | except: pass 1306 | 1307 | if len(args["fna_paths"]) > 0: 1308 | print("Genomes found; start") 1309 | genotype_single_genomes(args) 1310 | print("Genomes found; done") 1311 | 1312 | if len(args["fq_paths"]) > 0: 1313 | print("Reads found; start") 1314 | genotype_reads(args) 1315 | print("Reads found; done") 1316 | 1317 | print("Genotyping; finished") 1318 | 1319 | def tree_main(args): 1320 | print("SNP tree building; start") 1321 | concat_alleles.concat_allele_tree(args) 1322 | print("SNP tree building; finished") 1323 | 1324 | def end2end_main(args): 1325 | try: os.makedirs(args['out_dir']) 1326 | except: pass 1327 | 1328 | args['fna_dir'] = args['in_dir'] 1329 | locate_fpaths(args, args['in_dir'], args['rep_fna'], args['subset_list']) 1330 | call_snps_main(args) 1331 | 1332 | args['kmer_size'] = 31 1333 | args['ref_genome'] = args['rep_fna'] 1334 | args['vcf'] = args['out_dir'].rstrip('/') + '/core_snps.vcf' 1335 | args['coords'] = args['out_dir'].rstrip('/') + '/coords.tsv' 1336 | args['tag_list'] = args['out_dir'].rstrip('/') + '/tag_paths.list' 1337 | args['msa'] = args['out_dir'].rstrip('/') + '/tag_msa.fna' 1338 | 1339 | build_db_main(args) 1340 | 1341 | print("Genotyping; start") 1342 | read_input_dir(args, args['in_dir'], args['subset_list']) 1343 | if len(args["fna_paths"]) > 0: 1344 | print("Genomes found; start") 1345 | genotype_single_genomes(args) 1346 | print("Genomes found; done") 1347 | 1348 | if len(args["fq_paths"]) > 0: 1349 | print("Reads found; start") 1350 | genotype_reads(args) 1351 | print("Reads found; done") 1352 | print("Genotyping; finished") 1353 | print("All output files are in {}".format(args['out_dir'])) 1354 | print("The output files include the following") 1355 | print(" reference.fna (selected reference genome)") 1356 | print(" tag_paths.list (list of selected tag genomes)") 1357 | print(" tag_msa.fna (multiple sequence alignment of tag genomes)") 1358 | print(" coords.tsv (coordinates of consensus genome)") 1359 | print(" core_snps.vcf (called SNPs in VCF format)") 1360 | print(" nr_kmer_set.tsv (raw SNP-covering k-mers)") 1361 | print(" check_fna_paths.list (a list of genomes used for validating SNP-covering k-mers)") 1362 | print(" kmer_prof.tsv (hit profile of SNP-covering k-mers)") 1363 | print(" selected_kmers.tsv (validated SNP-covering k-mers)") 1364 | print(" kmer_db.bin (optimized database of SNP-covering k-mers)") 1365 | print("The directories include") 1366 | print(" gt_results (SNP genotyping results)") 1367 | print(" temp (temporary directory hosting intermediate files)") 1368 | 1369 | def main(): 1370 | args = parse_args() 1371 | 1372 | if args['overwrite'] is True:
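# [Editor's note] os.rmdir, used here originally, only removes *empty* directories, so
# --overwrite silently did nothing for populated output directories; shutil.rmtree
# (assuming the module-level shutil import used by the copy/move calls above) removes the
# whole tree, and the bare except still keeps a missing directory from being fatal.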
1373 | try: shutil.rmtree(args['out_dir']) 1374 | except: pass 1375 | 1376 | try: os.makedirs(args['out_dir']) 1377 | except: pass 1378 | 1379 | args['logfile'] = "{}/logfile".format(args['out_dir'].rstrip('/')) 1380 | 1381 | if args['data_type'] == 'genomes': 1382 | call_snps_main(args) 1383 | elif args['data_type'] == 'db': 1384 | build_db_main(args) 1385 | elif args['data_type'] == 'genotype': 1386 | genotype_main(args) 1387 | elif args['data_type'] == 'tree': 1388 | tree_main(args) 1389 | elif args['data_type'] == 'end_to_end': 1390 | end2end_main(args) 1391 | else: 1392 | sys.exit("\nError: invalid subcommand\nSupported subcommands: genomes, db, genotype, tree, end_to_end\n") 1393 | 1394 | if __name__ == "__main__": 1395 | main() 1396 | --------------------------------------------------------------------------------