├── utils
│   ├── __init__.py
│   ├── spades_wrapper.py
│   ├── VStrains_PE_Inference.py
│   ├── VStrains_SPAdes.py
│   ├── VStrains_Preprocess.py
│   ├── VStrains_Alignment.py
│   ├── VStrains_IO.py
│   ├── VStrains_Extension.py
│   └── VStrains_Decomposition.py
├── requirements.txt
├── VStrains_logo.png
├── environment.yml
├── MANIFEST.in
├── .gitignore
├── LICENSE
├── recipe
│   └── meta.yaml
├── setup.py
├── evals
│   ├── sampling.py
│   └── quast_evaluation.py
├── vstrains
└── README.md
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | graph-tool 2 | minimap2 3 | numpy 4 | gfapy 5 | matplotlib 6 | -------------------------------------------------------------------------------- /VStrains_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metagentools/VStrains/HEAD/VStrains_logo.png -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: vstrains 2 | channels: 3 | - defaults 4 | - bioconda 5 | - conda-forge 6 | dependencies: 7 | - python=3 8 | - graph-tool>=2.45 9 | - minimap2>=2.24 10 | - numpy>=1.23.5 11 | - gfapy>=1.2.3 12 | - matplotlib>=3.6.2 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | include LICENSE 4 | include VStrains_logo.png 5 | include environment.yml 6 | include setup.py 7 | 8 | include vstrains 9 | 10 | 11 | recursive-include recipe * 12 | recursive-include utils * 13 | recursive-include evals * 14 | 15 | global-exclude utils/__pycache__/*.pyc 16 | global-exclude evals/__pycache__/*.pyc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | *.sh 10 | 11 | # Packages # 12 | ############ 13 | # it's better to unpack these files and commit the raw source 14 | # git has its own built in compression methods 15 | *.7z 16 | *.dmg 17 | *.gz 18 | *.iso 19 | *.jar 20 | *.rar 21 | *.tar 22 | *.zip 23 | 24 | # Logs and databases # 25 | ###################### 26 | *.log 27 | *.sql 28 | *.sqlite 29 | 30 | # OS generated files # 31 | ###################### 32 | .DS_Store 33 | .DS_Store? 
34 | ._* 35 | .Spotlight-V100 36 | .Trashes 37 | ehthumbs.db 38 | Thumbs.db 39 | 40 | # Evaluation result # 41 | ##################### 42 | eval_result/ 43 | example/ 44 | benchmark/* 45 | quast*/ 46 | acc*/ 47 | testcase/ 48 | src/tmp/* 49 | *.fa 50 | *.fq 51 | *.fasta 52 | *.fastq 53 | *.gfa 54 | *.csv 55 | *.paf 56 | *.pyc 57 | *.sh 58 | # pycache # 59 | ########### 60 | */__pycache__/* 61 | *.cpython* 62 | src/__pycache__/* 63 | legacy/ 64 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2022] [Runpeng Luo] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "VStrains" %} 2 | {% set version = "1.1.0" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | source: 9 | url: https://github.com/metagentools/{{ name }}/releases/download/v{{ version }}/{{ name }}-{{ version }}.tar.gz 10 | sha256: 79a77435dd0f648fe55bb5930ef8fdd874d4aec990850ab20dd8b067d8df5ec0 11 | 12 | build: 13 | number: 0 14 | noarch: python 15 | script: 16 | - "{{ PYTHON }} -m pip install . 
-vv" 17 | 18 | requirements: 19 | host: 20 | - pip>=22.3.1 21 | - python=3 22 | - graph-tool>=2.45 23 | - minimap2>=2.24 24 | - numpy>=1.23.5 25 | - gfapy>=1.2.3 26 | - matplotlib>=3.6.2 27 | run: 28 | - python=3 29 | - graph-tool>=2.45 30 | - minimap2>=2.24 31 | - numpy>=1.23.5 32 | - gfapy>=1.2.3 33 | - matplotlib>=3.6.2 34 | 35 | test: 36 | commands: 37 | - vstrains -h 38 | 39 | about: 40 | home: "https://github.com/metagentools/MetaCoAG" 41 | license: MIT 42 | license_file: LICENSE 43 | summary: "VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs" 44 | doc_url: "https://github.com/metagentools/VStrains/blob/master/README.md" 45 | dev_url: "https://github.com/metagentools/VStrains" 46 | 47 | extra: 48 | recipe-maintainers: 49 | - JohnLuo 50 | # identifiers: 51 | # - doi:10.1101/2022.10.21.513181v3 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from setuptools import setup, find_packages 4 | 5 | # read the contents of your README file 6 | from pathlib import Path 7 | 8 | this_directory = Path(__file__).parent 9 | long_description = (this_directory / "README.md").read_text() 10 | 11 | packages = find_packages() 12 | package_data = {"utils": ["utils/*"]} 13 | 14 | data_files = [(".", ["LICENSE", "README.md"])] 15 | 16 | setup( 17 | name="vstrains", 18 | version="1.1.0", 19 | zip_safe=True, 20 | author="Runpeng Luo and Yu Lin", 21 | author_email="runpengluo@gmail.com", 22 | description="VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs", 23 | long_description=long_description, 24 | long_description_content_type="text/markdown", 25 | url="https://github.com/metagentools/VStrains", 26 | license="MIT", 27 | packages=packages, 28 | package_data=package_data, 29 | data_files=data_files, 30 | include_package_data=True, 31 | scripts=["vstrains"], 32 | classifiers=[ 33 | "Development Status :: 5 - Production/Stable", 34 | "Programming Language :: Python :: 3", 35 | "License :: OSI Approved :: MIT License", 36 | "Natural Language :: English", 37 | "Topic :: Scientific/Engineering :: Bio-Informatics", 38 | "Operating System :: OS Independent", 39 | ], 40 | install_requires=[ 41 | # "graph-tool", # not distributed via Pip 42 | # "minimap2", # not distributed via Pip 43 | "numpy", 44 | "gfapy", 45 | "matplotlib", 46 | ], 47 | python_requires=">=3", 48 | ) 49 | -------------------------------------------------------------------------------- /utils/spades_wrapper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import argparse 4 | import time 5 | 6 | if __name__ == "__main__": 7 | parser = argparse.ArgumentParser( 8 | prog="spades_wrapper.py", 9 | description="""Build assembly graph&contig using SPAdes --careful mode, 10 | with input pair-end reads and store the graph.""", 11 | ) 12 | parser.add_argument( 13 | "-f", 14 | "--forward", 15 | dest="forward", 16 | type=str, 17 | required=True, 18 | help="Forward reads, fastq format", 19 | ) 20 | parser.add_argument( 21 | "-r", 22 | "--reverse", 23 | dest="reverse", 24 | type=str, 25 | required=True, 26 | help="Reverse reads, fastq format", 27 | ) 28 | parser.add_argument( 29 | "-spades", 30 | "--spades_path", 31 | dest="spades", 32 | type=str, 33 | required=True, 34 | help="absolute path to spades executable", 35 | ) 36 | 
parser.add_argument( 37 | "-t", 38 | "--threads", 39 | dest="thread_count", 40 | type=int, default=8, 41 | help="Set number of threads used for SPAdes.", 42 | ) 43 | parser.add_argument( 44 | "-o", "--output_dir", dest="output_dir", type=str, required=True 45 | ) 46 | args = parser.parse_args() 47 | 48 | global_t1_start = time.perf_counter() 49 | global_t2_start = time.process_time() 50 | 51 | filepath = os.path.dirname(os.path.abspath(__file__)) 52 | spades = args.spades 53 | 54 | if spades: 55 | print(filepath) 56 | subprocess.check_call( 57 | "rm -rf {0} && mkdir {0}".format(args.output_dir), shell=True 58 | ) 59 | 60 | subprocess.check_call( 61 | spades 62 | + " -1 {0} -2 {1} --careful -t {2} -o {3}".format( 63 | args.forward, args.reverse, args.thread_count, args.output_dir 64 | ), 65 | shell=True, 66 | ) 67 | else: 68 | print("SPAdes executable path hasn't been specified.") 69 | 70 | t1_stop = time.perf_counter() 71 | t2_stop = time.process_time() 72 | 73 | print("\nSPAdes assembly completed") 74 | print("Elapsed time: {:.1f} seconds".format(t1_stop - global_t1_start)) 75 | print("CPU process time: {:.1f} seconds".format(t2_stop - global_t2_start)) 76 | -------------------------------------------------------------------------------- /evals/sampling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import subprocess 4 | import sys 5 | import random 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser( 10 | prog="sampling", 11 | description="""Sample the paired-end fastq files""", 12 | ) 13 | 14 | parser.add_argument( 15 | "-s", 16 | "--sampling_ratio", 17 | dest="sratio", 18 | type=int, 19 | required=True, 20 | help="sampling ratio, e.g., 2 to sample half of the dataset", 21 | ) 22 | parser.add_argument( 23 | "-f", 24 | "--forward", 25 | dest="fwd", 26 | type=str, 27 | required=True, 28 | help="forward .fastq file", 29 | ) 30 | 31 | parser.add_argument( 32 | "-r", 33 | "--reverse", 34 | dest="rve", 35 | type=str, 36 | required=True, 37 | help="reverse .fastq file", 38 | ) 39 | 40 | parser.add_argument( 41 | "-of", 42 | "--out_forward", 43 | dest="ofwd", 44 | type=str, 45 | required=True, 46 | help="output forward .fastq file", 47 | ) 48 | 49 | parser.add_argument( 50 | "-or", 51 | "--out_reverse", 52 | dest="orve", 53 | type=str, 54 | required=True, 55 | help="output reverse .fastq file", 56 | ) 57 | 58 | args = parser.parse_args() 59 | 60 | if args.sratio <= 1: 61 | print("invalid sampling ratio, please input an integer greater than 1") 62 | sys.exit(1) 63 | 64 | subprocess.check_call("echo " " > {0}".format(args.ofwd), shell=True) 65 | subprocess.check_call("echo " " > {0}".format(args.orve), shell=True) 66 | 67 | with open(args.fwd, "r") as fwd: 68 | with open(args.rve, "r") as rve: 69 | with open(args.ofwd, "w") as ofwd: 70 | with open(args.orve, "w") as orve: 71 | flines = fwd.readlines() 72 | rlines = rve.readlines() 73 | n = len(flines) // 4 74 | k = 0 75 | print("total number of reads: ", n) 76 | for i in range(n): 77 | if random.random() > 1 / args.sratio: 78 | continue 79 | k += 1 80 | for fcurr in flines[i * 4 : i * 4 + 4]: 81 | ofwd.write(fcurr) 82 | for rcurr in rlines[i * 4 : i * 4 + 4]: 83 | orve.write(rcurr) 84 | print("sampled {0} read pairs given ratio {1}".format(k, args.sratio)) 85 | orve.close() 86 | ofwd.close() 87 | rve.close() 88 | fwd.close() 89 | 90 | return 91 | 92 | 93 | if __name__ == "__main__": 94 | sys.exit(main()) 95 | 
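A minimal invocation sketch for the sampling script above, assuming hypothetical input files `forward.fastq`/`reverse.fastq`; with `-s 2`, roughly half of the read pairs are kept:

```bash
python evals/sampling.py -s 2 -f forward.fastq -r reverse.fastq \
    -of sub_forward.fastq -or sub_reverse.fastq
```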
-------------------------------------------------------------------------------- /evals/quast_evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import subprocess 3 | import argparse 4 | import sys 5 | import os 6 | 7 | usage = "Use MetaQUAST to evaluate assembly result" 8 | Author = "Runpeng Luo" 9 | 10 | 11 | def sep_ref(ref_file, id=0):  # assumes a two-line FASTA record (header + sequence) per reference strain 12 | ref_file_list = [] 13 | i = 0 14 | with open(ref_file, "r") as ref: 15 | j = 0 16 | lines = ref.readlines() 17 | l = len(lines) 18 | while j < l - 1: 19 | name_in_file = lines[j] 20 | name = str(lines[j][1:-1]) 21 | name = name.split(" ")[0] 22 | name = name.split(".")[0] 23 | strain = lines[j + 1] 24 | j = j + 2 25 | file_name = "sub_" + str(id) + "_" + str(name) + "_ref.fasta" 26 | subprocess.check_call("touch {0}".format(file_name), shell=True) 27 | with open(file_name, "w") as sub_file: 28 | sub_file.write(name_in_file) 29 | sub_file.write(strain) 30 | sub_file.close() 31 | ref_file_list.append(file_name) 32 | i = i + 1 33 | ref.close() 34 | print("ref list: ", ref_file_list) 35 | return ref_file_list 36 | 37 | 38 | def quast_eval(files, ref, o, quast, id=0): 39 | subprocess.check_call("rm -rf sub_{0}_*_ref.fasta".format(id), shell=True) 40 | 41 | ref_file_list = sep_ref(ref, id) 42 | 43 | command = "python2 {0} --unique-mapping --report-all-metrics -m 500 -t 8 ".format( 44 | quast 45 | ) 46 | for fname in files: 47 | command += fname + " " 48 | 49 | command += "-o " + o + " -R " 50 | 51 | for file in ref_file_list: 52 | command += file + "," 53 | command = command[:-1] 54 | 55 | print(command) 56 | subprocess.check_call(command, shell=True) 57 | 58 | # clean up 59 | subprocess.check_call("rm -rf sub_{0}_*_ref.fasta".format(id), shell=True) 60 | return 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser(prog="quast_evaluation.py", description=usage) 65 | parser.add_argument( 66 | "-quast", 67 | "--path_to_quast", 68 | dest="quast", 69 | required=True, 70 | help="path to MetaQuast python script, version >= 5.2.0", 71 | ) 72 | parser.add_argument( 73 | "-cs", 74 | "--contig_files", 75 | dest="files", 76 | default=None, 77 | nargs="+", 78 | help="contig files from different tools, separated by space", 79 | ) 80 | parser.add_argument( 81 | "-d", 82 | "--contig_dir", 83 | dest="idir", 84 | default=None, 85 | help="contig files from different tools, stored in the directory, .fasta format", 86 | ) 87 | parser.add_argument( 88 | "-ref", 89 | "--ref_file", 90 | dest="ref_file", 91 | type=str, 92 | required=True, 93 | help="ref file (single)", 94 | ) 95 | parser.add_argument( 96 | "-o", 97 | "--output_dir", 98 | dest="output_dir", 99 | type=str, 100 | required=True, 101 | help="output directory", 102 | ) 103 | args = parser.parse_args() 104 | 105 | if args.idir is None and args.files is None: 106 | print("Please provide correct query input") 107 | sys.exit(1) 108 | 109 | if args.idir is not None and ( 110 | not os.path.exists(args.idir) or not os.path.isdir(args.idir) 111 | ): 112 | print("Please provide correct directory") 113 | sys.exit(1) 114 | 115 | files = [] 116 | if args.files is not None: 117 | files.extend(args.files) 118 | if args.idir is not None: 119 | files.extend( 120 | [ 121 | os.path.join(args.idir, s) 122 | for s in sorted(os.listdir(args.idir)) 123 | if s.endswith(".fasta") or s.endswith(".fa") 124 | ] 125 | ) 126 | 127 | quast_eval(files, args.ref_file, args.output_dir, args.quast) 128 | 
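A hypothetical invocation of the MetaQUAST wrapper above, mirroring its argparse options; the QUAST path and all file names are placeholders:

```bash
python evals/quast_evaluation.py -quast /path/to/quast/metaquast.py \
    -cs vstrains_strain.fasta other_tool_contigs.fasta \
    -ref references.fasta -o quast_out
```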
-------------------------------------------------------------------------------- /utils/VStrains_PE_Inference.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import time 5 | import subprocess 6 | import numpy 7 | import sys 8 | 9 | rev_dict = {"A": "T", "T": "A", "C": "G", "G": "C"} 10 | 11 | 12 | def reverse_seq(seq: str): 13 | return "".join(rev_dict[x] for x in reversed(seq)) 14 | 15 | 16 | def single_end_read_mapping( 17 | seq: str, kmer_htable: dict, index2seqlen: list, split_len: int, len_index2id: int 18 | ): 19 | nodes = numpy.zeros(len_index2id, dtype=int) 20 | coords = [sys.maxsize for _ in range(len_index2id)] 21 | kindices = [sys.maxsize for _ in range(len_index2id)] 22 | 23 | rlen = len(seq) 24 | for i in range(rlen - split_len + 1): 25 | kmer = seq[i : i + split_len] 26 | if kmer in kmer_htable: 27 | # found a colliding node 28 | for rid, rcord in kmer_htable[kmer]: 29 | nodes[rid] += 1 30 | coords[rid] = min(coords[rid], rcord) 31 | kindices[rid] = min(kindices[rid], i) 32 | 33 | saturates = [] 34 | L = 0 35 | R = 0 36 | for i, v in enumerate(nodes): 37 | if coords[i] == sys.maxsize or kindices[i] == sys.maxsize: 38 | continue 39 | L = max(coords[i], coords[i] - kindices[i]) 40 | R = min(coords[i] + index2seqlen[i] - 1, coords[i] - kindices[i] + rlen - 1) 41 | saturate = R - L - (split_len - 1) + 1 42 | expected = ( 43 | (min(rlen, index2seqlen[i]) - split_len + 1) * (rlen - split_len) / rlen 44 | ) 45 | if v >= max(min(saturate, expected), 1): 46 | # print(i,v,"passed") 47 | saturates.append(i) 48 | return saturates 49 | 50 | 51 | def main(): 52 | print( 53 | "----------------------Paired-End Information Alignment----------------------" 54 | ) 55 | parser = argparse.ArgumentParser( 56 | prog="pe_info", 57 | description="""Align paired-end reads to nodes in the graph to obtain strong links""", 58 | ) 59 | 60 | parser.add_argument( 61 | "-g", "--gfa", dest="gfa", type=str, required=True, help="graph, .gfa format" 62 | ) 63 | 64 | parser.add_argument( 65 | "-o", 66 | "--output_dir", 67 | dest="dir", 68 | type=str, 69 | required=True, 70 | help="output directory", 71 | ) 72 | 73 | parser.add_argument( 74 | "-f", "--forward", dest="fwd", required=True, help="forward read, .fastq" 75 | ) 76 | 77 | parser.add_argument( 78 | "-r", "--reverse", dest="rve", required=True, help="reverse read, .fastq" 79 | ) 80 | 81 | parser.add_argument( 82 | "-k", 83 | "--kmer_size", 84 | dest="kmer_size", 85 | type=int, 86 | default=128, 87 | help="unique kmer size", 88 | ) 89 | 90 | args = parser.parse_args() 91 | 92 | # initialize output directory 93 | if args.dir[-1] == "/": 94 | args.dir = args.dir[:-1] 95 | subprocess.check_call("rm -rf {0}".format(args.dir), shell=True) 96 | os.makedirs(args.dir, exist_ok=True) 97 | 98 | glb_start = time.time() 99 | 100 | # collect gfa node information 101 | index2id = [] 102 | index2seq = [] 103 | index2seqlen = [] 104 | 105 | with open(args.gfa, "r") as gfa: 106 | for Line in gfa: 107 | splited = (Line[:-1]).split("\t") 108 | if splited[0] == "S": 109 | index2id.append(splited[1]) 110 | index2seq.append(splited[2]) 111 | index2seqlen.append(len(splited[2])) 112 | gfa.close() 113 | 114 | split_len = args.kmer_size + 1 115 | 116 | # construct hash table for gfa nodes with chunk k-mers 117 | kmer_htable = {} 118 | for i, seq in enumerate(index2seq): 119 | seqlen = index2seqlen[i] 120 | for sub_i in range(seqlen - split_len + 1): 121 | kmer = seq[sub_i : sub_i + 
split_len] 122 | rev_kmer = reverse_seq(kmer) 123 | if kmer in kmer_htable: 124 | # not unique 125 | kmer_htable[kmer].append((i, sub_i)) 126 | else: 127 | # unique 128 | kmer_htable[kmer] = [(i, sub_i)] 129 | 130 | if rev_kmer in kmer_htable: 131 | # not unique 132 | kmer_htable[rev_kmer].append((i, sub_i)) 133 | else: 134 | # unique 135 | kmer_htable[rev_kmer] = [(i, sub_i)] 136 | 137 | # init nodes pairwise relationship 138 | len_index2id = len(index2id) 139 | node_mat = numpy.zeros((len_index2id, len_index2id), dtype=int) 140 | short_mat = numpy.zeros((len_index2id, len_index2id), dtype=int) 141 | 142 | n_reads = 0 143 | short_reads = 0 144 | used_reads = 0 145 | 146 | print("Start aligning reads to gfa nodes") 147 | fwd_fd = open(args.fwd, "r") 148 | rve_fd = open(args.rve, "r") 149 | fwd_reads = fwd_fd.readlines() 150 | rve_reads = rve_fd.readlines() 151 | fwd_fd.close() 152 | rve_fd.close() 153 | 154 | total_size = min(len(fwd_reads) // 4, len(rve_reads) // 4) 155 | for read_idx in range(total_size): 156 | if read_idx % 100000 == 0: 157 | print("Number of processed reads: ", read_idx) 158 | [_, fseq, _, _] = [s[:-1] for s in fwd_reads[read_idx * 4 : (read_idx + 1) * 4]] 159 | [_, rseq, _, _] = [s[:-1] for s in rve_reads[read_idx * 4 : (read_idx + 1) * 4]] 160 | if fseq.count("N") or rseq.count("N"): 161 | n_reads += 1 162 | elif len(fseq) < split_len or len(rseq) < split_len: 163 | short_reads += 1 164 | else: 165 | used_reads += 1 166 | # valid read pair 167 | lefts = single_end_read_mapping( 168 | fseq, kmer_htable, index2seqlen, split_len, len_index2id 169 | ) 170 | rights = single_end_read_mapping( 171 | rseq, kmer_htable, index2seqlen, split_len, len_index2id 172 | ) 173 | 174 | k = 0 175 | for i in lefts: 176 | for i2 in lefts[k:]: 177 | short_mat[i][i2] += 1 178 | k += 1 179 | 180 | k = 0 181 | for j in rights: 182 | for j2 in rights[k:]: 183 | short_mat[j][j2] += 1 184 | k += 1 185 | 186 | for i in lefts: 187 | for j in rights: 188 | node_mat[i][j] += 1 189 | 190 | out_file = "{0}/pe_info".format(args.dir) 191 | out_file2 = "{0}/st_info".format(args.dir) 192 | subprocess.check_call("touch {0}; echo " " > {0}".format(out_file), shell=True) 193 | subprocess.check_call("touch {0}; echo " " > {0}".format(out_file2), shell=True) 194 | with open(out_file, "w") as outfile: 195 | with open(out_file2, "w") as outfile2: 196 | for i in range(len_index2id): 197 | for j in range(len_index2id): 198 | outfile.write( 199 | "{0}:{1}:{2}\n".format(index2id[i], index2id[j], node_mat[i][j]) 200 | ) 201 | outfile2.write( 202 | "{0}:{1}:{2}\n".format( 203 | index2id[i], index2id[j], short_mat[i][j] 204 | ) 205 | ) 206 | outfile2.close() 207 | outfile.close() 208 | 209 | glb_elapsed = time.time() - glb_start 210 | print("Global time elapsed: ", glb_elapsed) 211 | print("result stored in: ", out_file) 212 | 213 | 214 | if __name__ == "__main__": 215 | main() 216 | sys.exit(0) 217 | -------------------------------------------------------------------------------- /vstrains: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import sys 5 | import os 6 | import platform 7 | import numpy 8 | import logging 9 | import time 10 | from datetime import date 11 | 12 | from utils import VStrains_SPAdes 13 | 14 | __author__ = "Runpeng Luo" 15 | __copyright__ = "Copyright 2022-2025, VStrains Project" 16 | __credits__ = ["Runpeng Luo", "Yu Lin"] 17 | __license__ = "MIT" 18 | __version__ = "1.1.0" 19 | __maintainer__ = "Runpeng 
Luo" 20 | __email__ = "John.Luo@anu.edu.au" 21 | __status__ = "Production" 22 | 23 | 24 | def run(args, logger): 25 | numpy.seterr(all="raise") 26 | RUNNER = { 27 | "spades": VStrains_SPAdes.run, 28 | } 29 | RUNNER[args.assembler](args, logger) 30 | 31 | 32 | def main(): 33 | parser = argparse.ArgumentParser( 34 | prog="VStrains", 35 | description="""Construct full-length viral strains under de novo approach 36 | from contigs and assembly graph, currently supports SPAdes""", 37 | ) 38 | 39 | parser.add_argument( 40 | "-a", 41 | "--assembler", 42 | dest="assembler", 43 | type=str, 44 | required=True, 45 | choices=["spades"], 46 | help="name of the assembler used. [spades]", 47 | ) 48 | 49 | parser.add_argument( 50 | "-g", 51 | "--graph", 52 | dest="gfa_file", 53 | type=str, 54 | required=True, 55 | help="path to the assembly graph, (.gfa format)", 56 | ) 57 | 58 | parser.add_argument( 59 | "-p", 60 | "--path", 61 | dest="path_file", 62 | type=str, 63 | required=False, 64 | help="contig file from SPAdes (.paths format), only required for SPAdes. e.g., contigs.paths", 65 | ) 66 | 67 | parser.add_argument( 68 | "-mc", 69 | "--minimum_coverage", 70 | dest="min_cov", 71 | default=None, 72 | type=int, 73 | help=argparse.SUPPRESS, 74 | # ( 75 | # "minimum node coverage cutoff [default: auto]" 76 | # ), 77 | ) 78 | 79 | parser.add_argument( 80 | "-ml", 81 | "--minimum_contig_length", 82 | dest="min_len", 83 | default=None, 84 | type=int, 85 | help=argparse.SUPPRESS, 86 | # ("minimum initial contig length [default: 250]"), 87 | ) 88 | 89 | parser.add_argument( 90 | "-r", 91 | "--reference_fa", 92 | dest="ref_file", 93 | default=None, 94 | type=str, 95 | help=argparse.SUPPRESS, 96 | ) 97 | 98 | parser.add_argument( 99 | "-o", 100 | "--output_dir", 101 | dest="output_dir", 102 | default="acc/", 103 | type=str, 104 | help="path to the output directory [default: acc/]", 105 | ) 106 | 107 | parser.add_argument( 108 | "-d", 109 | "--dev_mode", 110 | dest="dev", 111 | action="store_true", 112 | default=False, 113 | help=argparse.SUPPRESS, 114 | ) 115 | 116 | parser.add_argument( 117 | "-fwd", 118 | "--fwd_file", 119 | dest="fwd", 120 | required=True, 121 | default=None, 122 | type=str, 123 | help="paired-end sequencing reads, forward strand (.fastq format)", 124 | ) 125 | 126 | parser.add_argument( 127 | "-rve", 128 | "--rve_file", 129 | dest="rve", 130 | required=True, 131 | default=None, 132 | type=str, 133 | help="paired-end sequencing reads, reverse strand (.fastq format)", 134 | ) 135 | 136 | args = parser.parse_args() 137 | 138 | # parsing arguments, sanity check 139 | if (not args.gfa_file) or (not os.path.exists(args.gfa_file)): 140 | print("\nPath to the assembly graph is required, (.gfa format)") 141 | print("Please ensure the path is correct") 142 | print("\nExiting...\n") 143 | sys.exit(1) 144 | 145 | args.assembler = args.assembler.lower() 146 | 147 | if args.assembler.lower() == "spades": 148 | if (not args.path_file) or (not os.path.exists(args.path_file)): 149 | print( 150 | "\nPath to Contig file from SPAdes (.paths format) is required for SPAdes assmbler option. e.g., contigs.paths" 151 | ) 152 | print("\nExiting...\n") 153 | sys.exit(1) 154 | else: 155 | print("\nPlease make sure to provide the correct assembler type (SPAdes).") 156 | print("\nExiting...\n") 157 | sys.exit(1) 158 | 159 | if args.min_len != None: 160 | if args.min_len < 0: 161 | print( 162 | "\nPlease make sure to provide the correct option (invalid value for min_len or min_cov)." 
163 | ) 164 | print("\nExiting...\n") 165 | sys.exit(1) 166 | else: 167 | args.min_len = 250 168 | 169 | if args.min_cov is not None: 170 | if args.min_cov < 0: 171 | print( 172 | "\nPlease make sure to provide the correct option (invalid value for min_len or min_cov)." 173 | ) 174 | print("\nExiting...\n") 175 | sys.exit(1) 176 | 177 | if args.output_dir[-1] == "/": 178 | args.output_dir = args.output_dir[:-1] 179 | 180 | # initialize output directory 181 | os.makedirs(args.output_dir, exist_ok=True) 182 | try: 183 | os.makedirs(args.output_dir + "/gfa/") 184 | os.makedirs(args.output_dir + "/tmp/") 185 | os.makedirs(args.output_dir + "/paf/") 186 | os.makedirs(args.output_dir + "/aln/") 187 | except OSError as _: 188 | print("\nCurrent output directory is not empty") 189 | print("Please empty/re-create the output directory: " + str(args.output_dir)) 190 | print("\nExiting...\n") 191 | sys.exit(1) 192 | 193 | if os.path.exists(args.output_dir + "/vstrains.log"): 194 | os.remove(args.output_dir + "/vstrains.log") 195 | 196 | # Setup logger 197 | # ----------------------- 198 | logger = logging.getLogger("VStrains %s" % __version__) 199 | logger.setLevel(logging.DEBUG if args.dev else logging.INFO) 200 | 201 | consoleHeader = logging.StreamHandler() 202 | consoleHeader.setLevel(logging.INFO) 203 | consoleHeader.setFormatter(logging.Formatter("%(message)s")) 204 | logger.addHandler(consoleHeader) 205 | 206 | fileHandler = logging.FileHandler(args.output_dir + "/vstrains.log") 207 | fileHandler.setLevel(logging.DEBUG if args.dev else logging.INFO) 208 | fileHandler.setFormatter(logging.Formatter("%(message)s")) 209 | logger.addHandler(fileHandler) 210 | 211 | logger.info("Welcome to VStrains!") 212 | logger.info( 213 | "VStrains is a strain-aware assembly tool, which constructs full-length " 214 | ) 215 | logger.info("viral strains with the aid of a de Bruijn assembly graph and contigs.") 216 | logger.info("") 217 | logger.info("System information:") 218 | try: 219 | logger.info(" VStrains version: " + str(__version__).strip()) 220 | logger.info(" Python version: " + ".".join(map(str, sys.version_info[0:3]))) 221 | logger.info(" OS: " + platform.platform()) 222 | except Exception: 223 | logger.info(" Problem occurred when getting system information") 224 | 225 | logger.info("") 226 | start_time = time.time() 227 | 228 | logger.info("Input arguments:") 229 | logger.info("Assembler: " + args.assembler) 230 | logger.info("Assembly graph file: " + args.gfa_file) 231 | logger.info("Forward read file: " + args.fwd) 232 | logger.info("Reverse read file: " + args.rve) 233 | if args.assembler == "spades": 234 | logger.info("Contig paths file: " + args.path_file) 235 | logger.info("Output directory: " + os.path.abspath(args.output_dir)) 236 | if args.dev: 237 | logger.info("*DEBUG MODE is turned ON") 238 | logger.info("\n\n") 239 | logger.info( 240 | "======= VStrains pipeline started. 
Log can be found here: " 241 | + os.path.abspath(args.output_dir) 242 | + "/vstrains.log\n" 243 | ) 244 | 245 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 246 | consoleHeader.setFormatter(formatter) 247 | fileHandler.setFormatter(formatter) 248 | 249 | # all good 250 | run(args, logger) 251 | 252 | elapsed = time.time() - start_time 253 | 254 | consoleHeader.setFormatter(logging.Formatter("%(message)s")) 255 | fileHandler.setFormatter(logging.Formatter("%(message)s")) 256 | 257 | logger.info("") 258 | logger.info("Thanks for using VStrains") 259 | logger.info( 260 | "Result is stored in {0}/strain.fasta".format(os.path.abspath(args.output_dir)) 261 | ) 262 | logger.info( 263 | "You can visualise the path stored in {0}/strain.paths via {0}/gfa/graph_L0.gfa".format( 264 | os.path.abspath(args.output_dir) 265 | ) 266 | ) 267 | logger.info("Finished: {0}".format(date.today().strftime("%B %d, %Y"))) 268 | logger.info("Elapsed time: {0}".format(elapsed)) 269 | logger.info("Exiting...") 270 | logger.removeHandler(fileHandler) 271 | logger.removeHandler(consoleHeader) 272 | 273 | return 0 274 | 275 | 276 | if __name__ == "__main__": 277 | main() 278 | -------------------------------------------------------------------------------- /utils/VStrains_SPAdes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from utils.VStrains_Utilities import * 4 | from utils.VStrains_Preprocess import ( 5 | graph_simplification, 6 | reindexing, 7 | threshold_estimation, 8 | ) 9 | from utils.VStrains_IO import ( 10 | graph_to_gfa, 11 | flipped_gfa_to_graph, 12 | gfa_to_graph, 13 | contig_dict_to_path, 14 | contig_dict_to_fasta, 15 | spades_paths_parser, 16 | process_pe_info, 17 | store_reinit_graph, 18 | ) 19 | from utils.VStrains_Decomposition import * 20 | from utils.VStrains_Extension import path_extension, best_matching 21 | import os 22 | import sys 23 | 24 | 25 | def run(args, logger): 26 | TEMP_DIR = args.output_dir 27 | 28 | logger.info("VStrains-SPAdes started") 29 | 30 | logger.info(">>>STAGE: parsing graph and contigs") 31 | graph, simp_node_dict, simp_edge_dict = gfa_to_graph(args.gfa_file, logger) 32 | graph_to_gfa( 33 | graph, 34 | simp_node_dict, 35 | simp_edge_dict, 36 | logger, 37 | "{0}/gfa/graph_L0.gfa".format(TEMP_DIR), 38 | ) 39 | graph0, simp_node_dict0, simp_edge_dict0 = flipped_gfa_to_graph( 40 | "{0}/gfa/graph_L0.gfa".format(TEMP_DIR), logger 41 | ) 42 | graph0, simp_node_dict0, simp_edge_dict0, idx_mapping = reindexing( 43 | graph0, simp_node_dict0, simp_edge_dict0 44 | ) 45 | graph_to_gfa( 46 | graph0, 47 | simp_node_dict0, 48 | simp_edge_dict0, 49 | logger, 50 | "{0}/gfa/graph_L0r.gfa".format(TEMP_DIR), 51 | ) 52 | 53 | # cut-off coverage, graph preprocess parameter 54 | THRESHOLD = 0 55 | if args.min_cov != None: 56 | THRESHOLD = args.min_cov 57 | logger.info("user-defined node minimum coverage: {0}".format(THRESHOLD)) 58 | else: 59 | THRESHOLD = threshold_estimation(graph0, logger, TEMP_DIR) 60 | logger.info("computed node minimum coverage: {0}".format(THRESHOLD)) 61 | 62 | contig_dict, contig_info = spades_paths_parser( 63 | graph0, 64 | simp_node_dict0, 65 | simp_edge_dict0, 66 | idx_mapping, 67 | logger, 68 | args.path_file, 69 | args.min_len, 70 | THRESHOLD, 71 | ) 72 | copy_contig_dict = {} 73 | for cno, [contig, clen, ccov] in contig_dict.items(): 74 | copy_contig_dict[cno] = [list(contig), clen, ccov] 75 | # debug only 76 | contig_dict_to_path(contig_dict, 
"{0}/tmp/init_contigs.paths".format(TEMP_DIR)) 77 | contig_dict_to_fasta( 78 | graph0, 79 | simp_node_dict0, 80 | contig_dict, 81 | "{0}/tmp/init_contigs.fasta".format(TEMP_DIR), 82 | ) 83 | if args.ref_file: 84 | minimap_api( 85 | args.ref_file, 86 | "{0}/tmp/init_contigs.fasta".format(TEMP_DIR), 87 | "{0}/paf/init_contigs_to_strain.paf".format(TEMP_DIR), 88 | ) 89 | # debug only 90 | logger.info(">>>STAGE: preprocess") 91 | graph_simplification( 92 | graph0, simp_node_dict0, simp_edge_dict0, None, logger, THRESHOLD 93 | ) 94 | graph_to_gfa( 95 | graph0, 96 | simp_node_dict0, 97 | simp_edge_dict0, 98 | logger, 99 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR), 100 | ) 101 | graph1, simp_node_dict1, simp_edge_dict1 = flipped_gfa_to_graph( 102 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR), logger 103 | ) 104 | 105 | # filter out contig that contains erroroness nodes 106 | for cno, [contig, _, _] in list(contig_dict.items()): 107 | if any([c not in simp_node_dict1 for c in contig]): 108 | contig_dict.pop(cno) 109 | logger.debug("unreliable contig with low coverage: {0}".format(cno)) 110 | 111 | # get graph kmer size 112 | ksize = graph1.ep.overlap[list(graph1.edges())[0]] if graph1.num_edges() > 0 else 0 113 | logger.info("graph kmer size: {0}".format(ksize)) 114 | if ksize <= 0: 115 | logger.error("invalid kmer-size, the graph does not contain any edges, exit..") 116 | sys.exit(1) 117 | 118 | # obtain paired end information 119 | script_path = "{0}/VStrains_PE_Inference.py".format( 120 | os.path.abspath(os.path.dirname(__file__)) 121 | ) 122 | subprocess.check_call( 123 | "python {0} -g {1} -o {2} -f {3} -r {4} -k {5}".format( 124 | script_path, 125 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR), 126 | "{0}/aln".format(TEMP_DIR), 127 | args.fwd, 128 | args.rve, 129 | ksize, 130 | ), 131 | shell=True, 132 | ) 133 | logger.info("paired end information stored") 134 | pe_info_file = "{0}/aln/pe_info".format(TEMP_DIR) 135 | st_info_file = "{0}/aln/st_info".format(TEMP_DIR) 136 | pe_info, dcpy_pe_info = process_pe_info( 137 | simp_node_dict1.keys(), pe_info_file, st_info_file 138 | ) 139 | 140 | edge_cleaning(graph1, simp_edge_dict1, contig_dict, pe_info, logger) 141 | 142 | graph2, simp_node_dict2, simp_edge_dict2 = store_reinit_graph( 143 | graph1, 144 | simp_node_dict1, 145 | simp_edge_dict1, 146 | logger, 147 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR), 148 | ) 149 | 150 | contig_dict_to_path(contig_dict, "{0}/tmp/pre_contigs.paths".format(TEMP_DIR)) 151 | contig_dict_to_fasta( 152 | graph2, 153 | simp_node_dict2, 154 | contig_dict, 155 | "{0}/tmp/pre_contigs.fasta".format(TEMP_DIR), 156 | ) 157 | # stat evaluation 158 | if args.ref_file: 159 | map_ref_to_graph( 160 | args.ref_file, 161 | simp_node_dict2, 162 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR), 163 | logger, 164 | True, 165 | "{0}/paf/node_to_ref.paf".format(TEMP_DIR), 166 | "{0}/tmp/temp_gfa_to_fasta_pre.fasta".format(TEMP_DIR), 167 | ) 168 | minimap_api( 169 | args.ref_file, 170 | "{0}/tmp/pre_contigs.fasta".format(TEMP_DIR), 171 | "{0}/paf/pre_contigs_to_strain.paf".format(TEMP_DIR), 172 | ) 173 | map_ref_to_contig( 174 | contig_dict, logger, "{0}/paf/pre_contigs_to_strain.paf".format(TEMP_DIR) 175 | ) 176 | # end stat 177 | 178 | # split the branches using link information 179 | graphf, simp_node_dictf, simp_edge_dictf = iter_graph_disentanglement( 180 | graph2, 181 | simp_node_dict2, 182 | simp_edge_dict2, 183 | contig_dict, 184 | pe_info, 185 | args.ref_file, 186 | logger, 187 | 0.05 * numpy.median([graph2.vp.dp[node] for node in 
graph2.vertices()]), 188 | TEMP_DIR, 189 | ) 190 | 191 | contig_dict_to_path(contig_dict, "{0}/tmp/post_contigs.paths".format(TEMP_DIR)) 192 | contig_dict_to_fasta( 193 | graphf, 194 | simp_node_dictf, 195 | contig_dict, 196 | "{0}/tmp/post_contigs.fasta".format(TEMP_DIR), 197 | ) 198 | # stat evaluation 199 | if args.ref_file: 200 | map_ref_to_graph( 201 | args.ref_file, 202 | simp_node_dictf, 203 | "{0}/gfa/split_graph_final.gfa".format(TEMP_DIR), 204 | logger, 205 | True, 206 | "{0}/paf/node_to_ref_final.paf".format(TEMP_DIR), 207 | "{0}/tmp/temp_gfa_to_fasta_post.fasta".format(TEMP_DIR), 208 | ) 209 | minimap_api( 210 | args.ref_file, 211 | "{0}/tmp/post_contigs.fasta".format(TEMP_DIR), 212 | "{0}/paf/post_contigs_to_strain.paf".format(TEMP_DIR), 213 | ) 214 | map_ref_to_contig( 215 | contig_dict, logger, "{0}/paf/post_contigs_to_strain.paf".format(TEMP_DIR) 216 | ) 217 | # end stat 218 | logger.info(">>>STAGE: contig path extension") 219 | 220 | # refine partial links using best match 221 | full_link = best_matching( 222 | graphf, simp_node_dictf, simp_edge_dictf, contig_dict, pe_info, logger 223 | ) 224 | 225 | # update graph coverage on non-trivial branch, maximize 226 | increment_nt_branch_coverage(graphf, simp_node_dictf, logger) 227 | 228 | graph_to_gfa( 229 | graphf, 230 | simp_node_dictf, 231 | simp_edge_dictf, 232 | logger, 233 | "{0}/gfa/split_graph_final.gfa".format(TEMP_DIR), 234 | ) 235 | 236 | # extend the graph 237 | p_delta = 0.05 * numpy.median([graphf.vp.dp[node] for node in graphf.vertices()]) 238 | strain_dict, usages = path_extension( 239 | graphf, 240 | simp_node_dictf, 241 | simp_edge_dictf, 242 | contig_dict, 243 | full_link, 244 | dcpy_pe_info, 245 | logger, 246 | p_delta, 247 | TEMP_DIR, 248 | ) 249 | 250 | logger.info(">>>STAGE: final process") 251 | contig_resolve(strain_dict) 252 | graphl, simp_node_dictl, simp_edge_dictl = flipped_gfa_to_graph( 253 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR), logger 254 | ) 255 | trim_contig_dict(graphl, simp_node_dictl, strain_dict, logger) 256 | contig_dup_removed_s(strain_dict, logger) 257 | contig_dict_to_path( 258 | strain_dict, "{0}/tmp/tmp_strain.paths".format(TEMP_DIR), None, False 259 | ) 260 | 261 | # recover repeat nodes back to contig 262 | strain_repeat_resol( 263 | graph0, simp_node_dict0, strain_dict, contig_info, copy_contig_dict, logger 264 | ) 265 | 266 | logger.info(">>>STAGE: generate result") 267 | contig_dict_to_fasta( 268 | graph0, simp_node_dict0, strain_dict, "{0}/strain.fasta".format(TEMP_DIR) 269 | ) 270 | contig_dict_to_path( 271 | strain_dict, "{0}/strain.paths".format(TEMP_DIR), idx_mapping, True 272 | ) 273 | if args.ref_file: 274 | minimap_api( 275 | args.ref_file, 276 | "{0}/strain.fasta".format(TEMP_DIR), 277 | "{0}/paf/strain_to_ref.paf".format(TEMP_DIR), 278 | ) 279 | logger.info("VStrains-SPAdes finished") 280 | return 0 281 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | <img src="VStrains_logo.png" alt="VStrains logo"/> 3 | 4 | 5 | # VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs 6 | 7 | ![GitHub](https://img.shields.io/github/license/metagentools/VStrains) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 9 | 10 | Manual 11 | =========== 12 | 13 | Table of Contents 14 | ----------------- 15 | 16 | 1. [About VStrains](#sec1)
17 | 2. [Updates](#sec2)
18 | 3. [Installation](#sec3)
19 | 3.1. [Option 1. Quick Install](#sec3.1)
20 | 3.2. [Option 2. Manual Install](#sec3.2)
21 | 3.3. [Download & Install VStrains](#sec3.3)
22 | 4. [Running VStrains](#sec4)
23 | 4.1. [Quick Usage](#sec4.1)
24 | 4.2. [Support SPAdes](#sec4.2)
25 | 4.3. [Output](#sec4.3)
26 | 5. [Stand-alone binaries](#sec5)
27 | 6. [Experiment](#sec6)
28 | 7. [Citation](#sec7)
29 | 8. [Feedback and bug reports](#sec8)
30 | 31 | 32 | # About VStrains 33 | 34 | VStrains is a de novo approach for reconstructing strains from viral quasispecies. 35 | 36 | 37 | 38 | 39 | # Updates 40 | 41 | ## VStrains 1.1.0 Release (03 Feb 2023) 42 | * Replace the PE link inference module `VStrains_Alignment.py` with `VStrains_PE_Inference.py` 43 | 44 | `VStrains_PE_Inference.py` implements a hash-table approach that provides efficient perfect-match lookup. The new module leads to consistent evaluation results and substantially decreases the runtime and memory usage compared to the previous alignment approach. 45 | 46 | 47 | 48 | 49 | # Installation 50 | 51 | VStrains requires a 64-bit Linux system or macOS and Python (version 3.2 or higher). 52 | 53 | 54 | ## Option 1. Quick Install (**recommended**) 55 | 56 | Install [(mini)conda](https://conda.io/miniconda.html) as a lightweight package management tool. Run the following commands to initialize and set up the conda environment for VStrains: 57 | 58 | ```bash 59 | # add channels 60 | conda config --add channels defaults 61 | conda config --add channels bioconda 62 | conda config --add channels conda-forge 63 | 64 | # create conda environment 65 | conda create --name VStrains-env 66 | 67 | # activate conda environment 68 | conda activate VStrains-env 69 | 70 | conda install -c bioconda -c conda-forge python=3 graph-tool minimap2 numpy gfapy matplotlib 71 | ``` 72 | 73 | 74 | ## Option 2. Manual Install 75 | 76 | Manually install dependencies: 77 | - [minimap2](https://github.com/lh3/minimap2) 78 | 79 | And the following Python modules: 80 | - [graph-tool](https://graph-tool.skewed.de) 81 | - [numpy](https://numpy.org) 82 | - [gfapy](https://github.com/ggonnella/gfapy) 83 | - [matplotlib](https://matplotlib.org) 84 | 85 | 86 | ## Download & Install VStrains 87 | 88 | After successfully setting up the environment and dependencies, clone VStrains to your desired location. 89 | 90 | ```bash 91 | git clone https://github.com/metagentools/VStrains.git 92 | ``` 93 | 94 | Install VStrains via `pip`: 95 | 96 | ```bash 97 | cd VStrains; pip install . 98 | ``` 99 | 100 | Run the following command to ensure VStrains is correctly set up and installed. 101 | 102 | ```bash 103 | vstrains -h 104 | ``` 105 | 106 | 107 | # Running VStrains 108 | 109 | VStrains supports assembly results from [SPAdes](https://github.com/ablab/spades) (including metaSPAdes and metaviralSPAdes) and may support other graph-based assemblers in the future. 110 | 111 | 112 | ## Quick Usage 113 | 114 | ``` 115 | usage: VStrains [-h] -a {spades} -g GFA_FILE [-p PATH_FILE] [-o OUTPUT_DIR] -fwd FWD -rve RVE 116 | 117 | Construct full-length viral strains under de novo approach from contigs and assembly graph, currently supports 118 | SPAdes 119 | 120 | optional arguments: 121 | -h, --help show this help message and exit 122 | -a {spades}, --assembler {spades} 123 | name of the assembler used. [spades] 124 | -g GFA_FILE, --graph GFA_FILE 125 | path to the assembly graph, (.gfa format) 126 | -p PATH_FILE, --path PATH_FILE 127 | contig file from SPAdes (.paths format), only required for SPAdes. 
e.g., contigs.paths 128 | -o OUTPUT_DIR, --output_dir OUTPUT_DIR 129 | path to the output directory [default: acc/] 130 | -fwd FWD, --fwd_file FWD 131 | paired-end sequencing reads, forward strand (.fastq format) 132 | -rve RVE, --rve_file RVE 133 | paired-end sequencing reads, reverse strand (.fastq format) 134 | ``` 135 | 136 | VStrains takes as input an assembly graph in Graphical Fragment Assembly (GFA) format and associated contig information, together with the raw reads in paired-end format (e.g., forward.fastq, reverse.fastq). 137 | 138 | 139 | ## Support SPAdes 140 | 141 | When running SPAdes, we recommend using the `--careful` option for more accurate assembly results. For consistency, do not modify any contig/node names in the SPAdes assembly results. Please refer to [SPAdes](https://github.com/ablab/spades) for further guidelines. Example usage below: 142 | 143 | ```bash 144 | # SPAdes assembler example, paired-end reads 145 | python spades.py -1 forward.fastq -2 reverse.fastq --careful -t 16 -o output_dir 146 | ``` 147 | 148 | Both the assembly graph (`assembly_graph_after_simplification.gfa`) and the contig information (`contigs.paths`) can be found in the output directory after running the SPAdes assembler. Please use them together with the raw reads as inputs for VStrains, and set the `-a` flag to `spades`. Example usage below: 149 | 150 | ```bash 151 | vstrains -a spades -g assembly_graph_after_simplification.gfa -p contigs.paths -o output_dir -fwd forward.fastq -rve reverse.fastq 152 | ``` 153 | 154 | 155 | ## Output 156 | 157 | 158 | VStrains stores all output files in `<output_dir>`, which is set by the user. 159 | 160 | * `<output_dir>/aln/` directory contains paired-end (PE) linkage information, which is stored in `pe_info` and `st_info`. 161 | * `<output_dir>/gfa/` directory contains iteratively simplified assembly graphs, where `graph_L0.gfa` contains the assembly graph produced by SPAdes after Strandedness Canonization, `split_graph_final.gfa` contains the assembly graph after Graph Disentanglement, and `graph_S_final.gfa` contains the assembly graph after Contig-based Path Extraction; the rest are intermediate results. All the assembly graphs are in [GFA 1.0 format](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md). 162 | * `<output_dir>/paf/` and `<output_dir>/tmp/` are temporary directories, feel free to ignore them. 163 | * `<output_dir>/strain.fasta` contains the resulting strains in `.fasta` format; the header of each strain has the form `NODE_<id>_length_<length>_cov_<cov>`, which is compatible with the SPAdes contig format. 164 | * `<output_dir>/strain.paths` contains the paths in the assembly graph (input `GFA_FILE`) corresponding to `strain.fasta`, which can be visualized using [Bandage](https://github.com/rrwick/Bandage) for further downstream analysis. 165 | * `<output_dir>/vstrains.log` contains the VStrains log. 166 | 167 | 168 | 169 | 176 | 177 | 178 | # Stand-alone binaries 179 | 180 | `evals/quast_evaluation.py` is a wrapper script for strain-level experimental result analysis using [MetaQUAST](https://github.com/ablab/quast). 181 | 182 | ``` 183 | usage: quast_evaluation.py [-h] -quast QUAST [-cs FILES [FILES ...]] [-d IDIR] -ref REF_FILE -o OUTPUT_DIR 184 | 185 | Use MetaQUAST to evaluate assembly result 186 | 187 | options: 188 | -h, --help show this help message and exit 189 | -quast QUAST, --path_to_quast QUAST 190 | path to MetaQuast python script, version >= 5.2.0 191 | -cs FILES [FILES ...], --contig_files FILES [FILES ...] 
192 | contig files from different tools, separated by space 193 | -d IDIR, --contig_dir IDIR 194 | contig files from different tools, stored in the directory, .fasta format 195 | -ref REF_FILE, --ref_file REF_FILE 196 | ref file (single) 197 | -o OUTPUT_DIR, --output_dir OUTPUT_DIR 198 | output directory 199 | ``` 200 | 201 | 202 | # Experiment 203 | 204 | VStrains is evaluated on both simulated and real datasets under default settings; the sources of the datasets can be found at the links listed below: 205 | 1. Simulated datasets, available at [savage-benchmark](https://bitbucket.org/jbaaijens/savage-benchmarks/src/master/) (no preprocessing is required) 206 | - 6 Poliovirus (20,000x) 207 | - 10 HCV (20,000x) 208 | - 15 ZIKV (20,000x) 209 | 2. Real datasets (please refer to the [Supplementary Material](https://www.biorxiv.org/content/10.1101/2022.10.21.513181v3.supplementary-material) for preprocessing of the real datasets) 210 | - 5 HIV labmix (20,000x) [SRR961514](https://www.ncbi.nlm.nih.gov/sra/?term=SRR961514), reference genome sequences are available at [5 HIV References](https://github.com/cbg-ethz/5-virus-mix/blob/master/data/REF.fasta) 211 | - 2 SARS-COV-2 (4,000x) [SRR18009684](https://www.ncbi.nlm.nih.gov/sra/?term=SRR18009684), [SRR18009686](https://www.ncbi.nlm.nih.gov/sra/?term=SRR18009686), pre-processed reads and individually assembled ground-truth reference sequences can be found at [2 SARS-COV-2 Dataset](https://github.com/RunpengLuo/sarscov2-4000x) 212 | 213 | 214 | # Citation 215 | VStrains has been accepted at [RECOMB 2023](http://recomb2023.bilkent.edu.tr/program.html) and the manuscript is publicly available [here](https://link.springer.com/chapter/10.1007/978-3-031-29119-7_1). 216 | 217 | If you use VStrains in your work, please cite the following publication: 218 | 219 | Runpeng Luo and Yu Lin, VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs 220 | 221 | 222 | # Feedback and bug reports 223 | 224 | Thanks for using VStrains. 
If you experience any bugs during execution, please re-run the program with the additional `-d` flag and provide the `vstrains.log` together with your use case via `Issues`. 225 | -------------------------------------------------------------------------------- /utils/VStrains_Preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from logging import Logger 4 | import subprocess 5 | from graph_tool.all import Graph 6 | 7 | import numpy 8 | import matplotlib.pyplot as plt 9 | 10 | from utils.VStrains_Utilities import * 11 | 12 | 13 | def reindexing(graph: Graph, simp_node_dict: dict, simp_edge_dict: dict): 14 | """ 15 | Reindex the nodes, with idx-node_id mappings 16 | """ 17 | idx_mapping = {} 18 | idx_node_dict = {} 19 | idx_edge_dict = {} 20 | idx = 0 21 | for no, node in simp_node_dict.items(): 22 | if graph.vp.color[node] == "black": 23 | idx_mapping[no] = str(idx) 24 | graph.vp.id[node] = str(idx) 25 | idx_node_dict[str(idx)] = node 26 | idx += 1 27 | for (u, v), e in simp_edge_dict.items(): 28 | if ( 29 | graph.ep.color[e] == "black" 30 | and graph.vp.color[e.source()] == "black" 31 | and graph.vp.color[e.target()] == "black" 32 | ): 33 | idx_edge_dict[(idx_mapping[u], idx_mapping[v])] = e 34 | return graph, idx_node_dict, idx_edge_dict, idx_mapping 35 | 36 | 37 | def threshold_estimation(graph: Graph, logger: Logger, temp_dir): 38 | dps = [graph.vp.dp[node] for node in graph.vertices()] 39 | # handle edge case, when the graph contains uniform coverage 40 | if max(dps) == min(dps): 41 | return 0.00 42 | regions, bins = numpy.histogram( 43 | dps, bins=int((max(dps) - min(dps)) // (0.05 * numpy.median(dps))) 44 | ) 45 | pidx, _ = max(list(enumerate(regions)), key=lambda p: p[1]) 46 | ratio = 0.00 47 | if pidx == 0: 48 | ratio = 0.05 49 | # global peak belongs to first filter region, find maximum peak range, bound by 25% Median 50 | for i in range(0, 4): 51 | if i >= len(regions): 52 | logger.warning( 53 | "histogram is not properly set, reset cutoff to default (0.05*M)" 54 | ) 55 | ratio = 0.05 56 | break 57 | if regions[i] > regions[i + 1]: 58 | ratio += 0.05 59 | else: 60 | break 61 | threshold = ratio * numpy.median(dps) 62 | plt.figure(figsize=(128, 64)) 63 | for b in bins: 64 | plt.axvline(b, color="blue") 65 | plt.hist(x=dps, bins=len(dps)) 66 | plt.axvline(threshold, color="r") 67 | plt.title("node coverage bar plot") 68 | plt.xticks(numpy.arange(min(dps), max(dps) + 1, 50.0)) 69 | plt.savefig("{0}{1}".format(temp_dir, "/tmp/bar_plot.png")) 70 | return threshold 71 | 72 | 73 | def graph_simplification( 74 | graph: Graph, 75 | simp_node_dict: dict, 76 | simp_edge_dict: dict, 77 | contig_dict: dict, 78 | logger: Logger, 79 | min_cov, 80 | ): 81 | """ 82 | Directly remove every vertex with coverage at or below min_cov, together with its incident edges 83 | 84 | Nodes and edges that belong to any contig are never removed 85 | return: 86 | None, the graph and the dictionaries are modified in place 87 | 88 | """ 89 | logger.info("graph simplification") 90 | logger.debug( 91 | "Total nodes: " 92 | + str(len(simp_node_dict)) 93 | + " Total edges: " 94 | + str(len(simp_edge_dict)) 95 | ) 96 | node_to_contig_dict = {} 97 | edge_to_contig_dict = {} 98 | if contig_dict is not None: 99 | node_to_contig_dict, edge_to_contig_dict = contig_map_node(contig_dict) 100 | # remove low-coverage nodes (single pass over the current node set) 101 | for id, node in list(simp_node_dict.items()): 102 | if graph.vp.dp[node] <= min_cov: 103 | if id in node_to_contig_dict: 104 | continue 105 | 106 | 
graph_remove_vertex(graph, simp_node_dict, id, printout=False) 107 | 108 | for e in set(node.all_edges()): 109 | uid = graph.vp.id[e.source()] 110 | vid = graph.vp.id[e.target()] 111 | if (uid, vid) in edge_to_contig_dict: 112 | continue 113 | if (uid, vid) in simp_edge_dict: 114 | graph_remove_edge(graph, simp_edge_dict, uid, vid, printout=False) 115 | 116 | logger.debug( 117 | "Remaining nodes: " 118 | + str(len(simp_node_dict)) 119 | + " Remaining edges: " 120 | + str(len(simp_edge_dict)) 121 | ) 122 | logger.info("done") 123 | return 124 | 125 | 126 | # ------------------------------------LEGACY------------------------------------# 127 | def paths_from_src(graph: Graph, simp_node_dict: dict, self_node, src, maxlen): 128 | """ 129 | retrieve all paths from the src node to any node 130 | within the maxlen restriction, in the forward direction 131 | """ 132 | 133 | def dfs_rev(graph: Graph, u, curr_path: list, maxlen, visited, all_path): 134 | visited[u] = True 135 | curr_path.append(u) 136 | curr_len = path_len(graph, curr_path) 137 | if curr_len >= maxlen: 138 | all_path.append(list(curr_path)) 139 | else: 140 | for v in u.out_neighbors(): 141 | if not visited[v]: 142 | dfs_rev(graph, v, curr_path, maxlen, visited, all_path) 143 | curr_path.pop(-1) 144 | visited[u] = False 145 | return 146 | 147 | visited = {} 148 | for u in graph.vertices(): 149 | if graph.vp.id[u] not in simp_node_dict: 150 | visited[u] = True 151 | else: 152 | visited[u] = False 153 | visited[self_node] = True 154 | all_path = [] 155 | dfs_rev(graph, src, [], maxlen, visited, all_path) 156 | return all_path 157 | 158 | 159 | def paths_to_tgt(graph: Graph, simp_node_dict: dict, self_node, tgt, maxlen): 160 | """ 161 | retrieve all paths from any node to the tgt node 162 | within the maxlen restriction, in the reverse direction 163 | """ 164 | 165 | def dfs_rev(graph: Graph, v, curr_path: list, maxlen, visited, all_path): 166 | visited[v] = True 167 | curr_path.insert(0, v) 168 | curr_len = path_len(graph, curr_path) 169 | if curr_len >= maxlen: 170 | all_path.append(list(curr_path)) 171 | else: 172 | for u in v.in_neighbors(): 173 | if not visited[u]: 174 | dfs_rev(graph, u, curr_path, maxlen, visited, all_path) 175 | curr_path.pop(0) 176 | visited[v] = False 177 | return 178 | 179 | visited = {} 180 | for u in graph.vertices(): 181 | if graph.vp.id[u] not in simp_node_dict: 182 | visited[u] = True 183 | else: 184 | visited[u] = False 185 | visited[self_node] = True 186 | all_path = [] 187 | dfs_rev(graph, tgt, [], maxlen, visited, all_path) 188 | return all_path 189 | 190 | 191 | def tip_removal_s( 192 | graph: Graph, 193 | simp_node_dict: dict, 194 | contig_dict: dict, 195 | logger: Logger, 196 | tempdir, 197 | accept_rate=0.99, 198 | ): 199 | if not graph_is_DAG(graph, simp_node_dict): 200 | logger.info("Graph is cyclic, tip removal started...") 201 | tip_removed = False 202 | while not tip_removed: 203 | tip_removed = tip_removal( 204 | graph, simp_node_dict, logger, tempdir, accept_rate 205 | ) 206 | for cno, [contig, _, ccov] in list(contig_dict.items()): 207 | if not all([no in simp_node_dict for no in contig]): 208 | subcontigs = [] 209 | curr_contig = [] 210 | addLast = False 211 | for no in contig: 212 | if no in simp_node_dict: 213 | addLast = True 214 | curr_contig.append(no) 215 | else: 216 | addLast = False 217 | if curr_contig != []: 218 | subcontigs.append(curr_contig[:]) 219 | curr_contig = [] 220 | if addLast: 221 | subcontigs.append(curr_contig[:]) 222 | 223 | contig_dict.pop(cno) 224 | for i, subc in enumerate(subcontigs): 
225 | sublen = path_len(graph, [simp_node_dict[c] for c in subc]) 226 | contig_dict[cno + "^" + str(i)] = [subc, sublen, ccov] 227 | else: 228 | logger.info("Graph is DAG, tip removal skipped.") 229 | logger.info("done") 230 | return 231 | 232 | 233 | def tip_removal( 234 | graph: Graph, simp_node_dict: dict, logger: Logger, tempdir, accept_rate 235 | ): 236 | """ 237 | retrieve all the source/sink simple paths, and merge them into an adjacent neighbor path if possible 238 | 239 | the collapse step can be done before node depth rebalancing, since it only depends on the 240 | matching score within the node sequence length 241 | 242 | if that is the case, the SPAdes contigs may also be modified. 243 | """ 244 | 245 | def remove_tip(graph: Graph, simp_node_dict: dict, from_node, to_path): 246 | """ 247 | collapse the node into the given path, increment the path depth, and remove related information 248 | about the node. 249 | """ 250 | graph.vp.color[from_node] = "gray" 251 | pending_dp = graph.vp.dp[from_node] 252 | for node in to_path: 253 | graph.vp.dp[node] += pending_dp 254 | simp_node_dict.pop(graph.vp.id[from_node]) 255 | for e in from_node.all_edges(): 256 | graph.ep.color[e] = "gray" 257 | logger.debug( 258 | path_to_id_string( 259 | graph, 260 | to_path, 261 | "Tip Node {0} collapsed to path".format(graph.vp.id[from_node]), 262 | ) 263 | ) 264 | return 265 | 266 | def cand_collapse_path(graph: Graph, from_node, to_paths, temp_dir): 267 | """ 268 | use minimap2 -c to evaluate the node-path similarity, sorted by matching score in descending order 269 | 270 | return: the most similar path if there exists a path with score >= accept rate, else return None 271 | """ 272 | ref_loc = "{0}/ref.fa".format(temp_dir) 273 | query_loc = "{0}/query.fa".format(temp_dir) 274 | overlap_loc = "{0}/overlap.paf".format(temp_dir) 275 | subprocess.check_call( 276 | "touch {0}; echo > {0}; touch {1}; echo > {1}".format(ref_loc, query_loc), 277 | shell=True, 278 | ) 279 | 280 | id_path_dict = {} 281 | for id, path in list(enumerate(to_paths)): 282 | id_path_dict[id] = path 283 | 284 | # retrieve all the path information and save into ref.fa 285 | with open(ref_loc, "w") as ref_file: 286 | for id, path in id_path_dict.items(): 287 | name = ">" + str(id) + "\n" 288 | seq = path_to_seq(graph, path, id) + "\n" 289 | ref_file.write(name) 290 | ref_file.write(seq) 291 | ref_file.close() 292 | 293 | # save the from_node info to query.fa 294 | with open(query_loc, "w") as query_file: 295 | name = ">" + graph.vp.id[from_node] + "\n" 296 | seq = path_to_seq(graph, [from_node], name) + "\n" 297 | query_file.write(name) 298 | query_file.write(seq) 299 | query_file.close() 300 | 301 | # minimap to obtain matching scores for all node-path pairs 302 | id_evalscore = {} 303 | minimap_api(ref_loc, query_loc, overlap_loc) 304 | with open(overlap_loc, "r") as overlap_file: 305 | for Line in overlap_file: 306 | splited = (Line[:-1]).split("\t") 307 | path_no = int(splited[5]) 308 | nmatch = int(splited[9]) 309 | nblock = int(splited[10]) 310 | if path_no not in id_evalscore: 311 | id_evalscore[path_no] = [nmatch / nblock] 312 | else: 313 | id_evalscore[path_no].append(nmatch / nblock) 314 | overlap_file.close() 315 | 316 | # remove temp file 317 | subprocess.check_call( 318 | "rm {0}; rm {1}; rm {2}".format(ref_loc, query_loc, overlap_loc), shell=True 319 | ) 320 | 321 | id_evalscore_sum = [] 322 | for id, scores in id_evalscore.items(): 323 | mean_score = numpy.mean(scores) if len(scores) != 0 else 0 324 | id_evalscore_sum.append((id, mean_score)) 325 | 326 | 
best_match = sorted(id_evalscore_sum, key=lambda t: t[1], reverse=True) 327 | logger.debug("Tip Node: " + str(graph.vp.id[from_node]) + str(best_match)) 328 | if len(best_match) == 0: 329 | return None 330 | elif best_match[0][1] >= accept_rate: 331 | return id_path_dict[best_match[0][0]] 332 | else: 333 | return None 334 | 335 | is_removed = True 336 | # get all the source simple path 337 | src_nodes = [] 338 | tgt_nodes = [] 339 | isolated_node = [] 340 | for node in simp_node_dict.values(): 341 | if node.in_degree() + node.out_degree() == 0: 342 | isolated_node.append(node) 343 | elif node.in_degree() == 0: 344 | src_nodes.append(node) 345 | elif node.out_degree() == 0: 346 | tgt_nodes.append(node) 347 | else: 348 | None 349 | 350 | # src node collapse 351 | src_nodes = sorted(src_nodes, key=lambda x: graph.vp.dp[x]) 352 | for src in src_nodes: 353 | src_len = path_len(graph, [src]) 354 | potential_paths = [] 355 | # path retrieve 356 | for out_branch in src.out_neighbors(): 357 | if graph.vp.id[out_branch] not in simp_node_dict: 358 | continue 359 | # print("current out branch: ", graph.vp.id[out_branch]) 360 | for in_tgt in out_branch.in_neighbors(): 361 | if graph.vp.id[in_tgt] == graph.vp.id[src]: 362 | # coincidence path 363 | continue 364 | if graph.vp.id[in_tgt] not in simp_node_dict: 365 | # collapsed path in previous iteration 366 | continue 367 | # print("current in tgt: ", graph.vp.id[in_tgt]) 368 | potential_paths.extend( 369 | paths_to_tgt(graph, simp_node_dict, src, in_tgt, src_len) 370 | ) 371 | cand_path = cand_collapse_path(graph, src, potential_paths, tempdir) 372 | if cand_path != None: 373 | remove_tip(graph, simp_node_dict, src, cand_path) 374 | is_removed = False 375 | 376 | # target node collapse 377 | tgt_nodes = sorted(tgt_nodes, key=lambda x: graph.vp.dp[x]) 378 | for tgt in tgt_nodes: 379 | tgt_len = path_len(graph, [tgt]) 380 | potential_paths = [] 381 | # path retrieve 382 | for in_branch in tgt.in_neighbors(): 383 | if graph.vp.id[in_branch] not in simp_node_dict: 384 | continue 385 | # print("current in branch: ", graph.vp.id[in_branch]) 386 | for out_src in in_branch.out_neighbors(): 387 | if graph.vp.id[out_src] == graph.vp.id[tgt]: 388 | # coincidence path 389 | continue 390 | if graph.vp.id[out_src] not in simp_node_dict: 391 | # collapsed path in previous iteration 392 | continue 393 | # print("current out src: ", graph.vp.id[out_src]) 394 | potential_paths.extend( 395 | paths_from_src(graph, simp_node_dict, tgt, out_src, tgt_len) 396 | ) 397 | cand_path = cand_collapse_path(graph, tgt, potential_paths, tempdir) 398 | if cand_path != None: 399 | remove_tip(graph, simp_node_dict, tgt, cand_path) 400 | is_removed = False 401 | return is_removed 402 | -------------------------------------------------------------------------------- /utils/VStrains_Alignment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import os 4 | import time 5 | import subprocess 6 | import numpy 7 | import sys 8 | 9 | 10 | def process_paf_file( 11 | index2id, 12 | index2reflen, 13 | len_index2id, 14 | read_ids, 15 | fwd_paf_file, 16 | rve_paf_file, 17 | split_len, 18 | tid, 19 | ): 20 | print("Batch {0} start".format(tid)) 21 | print("current pid: {0}".format(os.getpid())) 22 | start = time.time() 23 | 24 | node_mat = numpy.zeros((len_index2id, len_index2id), dtype=int) 25 | short_mat = numpy.zeros((len_index2id, len_index2id), dtype=int) 26 | 27 | id2index = {} 28 | for i in range(len_index2id): 
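# invert index2id once, so a PAF reference name can be mapped back to its matrix index in O(1)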
29 | id2index[index2id[i]] = i 30 | 31 | read2index = {} 32 | index2read = numpy.array( 33 | [(k, fwdlen, revlen) for (k, _, _, fwdlen, revlen) in read_ids], dtype=int 34 | ) 35 | 36 | conf_alns_f = [None for _ in index2read] 37 | # numpy.array([None for _ in index2read], dtype=object) 38 | conf_cords_f = [None for _ in index2read] 39 | # numpy.array([None for _ in index2read], dtype=object) 40 | 41 | conf_alns_r = [None for _ in index2read] 42 | # numpy.array([None for _ in index2read], dtype=object) 43 | conf_cords_r = [None for _ in index2read] 44 | # numpy.array([None for _ in index2read], dtype=object) 45 | 46 | for i, (glb_index, f_local_inds, r_local_inds, _, _) in enumerate(read_ids): 47 | read2index[glb_index] = i 48 | conf_alns_f[i] = [[] for _ in range(f_local_inds)] 49 | # numpy.array([[] for _ in range(f_local_inds)], dtype=object) 50 | conf_cords_f[i] = [[] for _ in range(f_local_inds)] 51 | # numpy.array([[] for _ in range(f_local_inds)], dtype=object) 52 | conf_alns_r[i] = [[] for _ in range(r_local_inds)] 53 | # numpy.array([[] for _ in range(r_local_inds)], dtype=object) 54 | conf_cords_r[i] = [[] for _ in range(r_local_inds)] 55 | # numpy.array([[] for _ in range(r_local_inds)], dtype=object) 56 | 57 | for file in [fwd_paf_file, rve_paf_file]: 58 | with open(file, "r") as fwd_paf: 59 | file_count = 0 60 | for line in fwd_paf: 61 | if line == "\n": 62 | break 63 | splited = (line[:-1]).split("\t") 64 | seg_no = splited[0] 65 | [glb_seg_no, sub_no] = seg_no.split("_") 66 | ref_no = str(splited[5]) 67 | ref_start_coord = int(splited[7]) # 0-based 68 | nm = int(splited[10]) - int(splited[9]) 69 | if nm == 0 and int(splited[10]) == split_len: 70 | if file == fwd_paf_file: 71 | conf_alns_f[read2index[int(glb_seg_no)]][int(sub_no)].append( 72 | id2index[ref_no] 73 | ) 74 | conf_cords_f[read2index[int(glb_seg_no)]][int(sub_no)].append( 75 | ref_start_coord 76 | ) 77 | else: 78 | conf_alns_r[read2index[int(glb_seg_no)]][int(sub_no)].append( 79 | id2index[ref_no] 80 | ) 81 | conf_cords_r[read2index[int(glb_seg_no)]][int(sub_no)].append( 82 | ref_start_coord 83 | ) 84 | file_count += 1 85 | fwd_paf.close() 86 | # print("Batch {0} finished alignment file parsing".format(tid)) 87 | subprocess.check_call("rm {0}".format(fwd_paf_file), shell=True) 88 | subprocess.check_call("rm {0}".format(rve_paf_file), shell=True) 89 | # nonunique_counter = 0 90 | 91 | def retrieve_single_end_saturation(glb_index, conf_alns, conf_cords, rlen, ks): 92 | nodes = numpy.zeros(len_index2id, dtype=int) 93 | coords = [None for _ in range(len_index2id)] 94 | kindices = [None for _ in range(len_index2id)] 95 | for i, sub_aln_statuses in enumerate(conf_alns[glb_index]): 96 | # if len(sub_aln_statuses) > 1: 97 | # nonunique_counter += 1 98 | for j, sub_aln_status in enumerate(sub_aln_statuses): 99 | nodes[sub_aln_status] += 1 100 | if coords[sub_aln_status] == None: 101 | coords[sub_aln_status] = conf_cords[glb_index][i][j] 102 | else: 103 | coords[sub_aln_status] = min( 104 | coords[sub_aln_status], conf_cords[glb_index][i][j] 105 | ) 106 | if kindices[sub_aln_status] == None: 107 | kindices[sub_aln_status] = i 108 | else: 109 | kindices[sub_aln_status] = min(kindices[sub_aln_status], i) 110 | saturates = [] 111 | L = 0 112 | R = 0 113 | for i, v in enumerate(nodes): 114 | if coords[i] == None or kindices[i] == None: 115 | continue 116 | L = max(coords[i], coords[i] - kindices[i]) 117 | R = min(coords[i] + index2reflen[i] - 1, coords[i] - kindices[i] + rlen - 1) 118 | saturate = R - L - (split_len - 1) + 1 119 
| expected = (min(rlen, index2reflen[i]) - ks + 1) * (rlen - ks) / rlen 120 | if v >= max(min(saturate, expected), 1): 121 | # print(i,v,"passed") 122 | saturates.append(i) 123 | return saturates 124 | 125 | for glb_id, fwdlen, revlen in index2read: 126 | glb_index = read2index[glb_id] 127 | lefts = retrieve_single_end_saturation( 128 | glb_index, conf_alns_f, conf_cords_f, fwdlen, split_len 129 | ) 130 | rights = retrieve_single_end_saturation( 131 | glb_index, conf_alns_r, conf_cords_r, revlen, split_len 132 | ) 133 | 134 | k = 0 135 | for i in lefts: 136 | for i2 in lefts[k:]: 137 | short_mat[i][i2] += 1 138 | k += 1 139 | k = 0 140 | for j in rights: 141 | for j2 in rights[k:]: 142 | short_mat[j][j2] += 1 143 | k += 1 144 | 145 | for i in lefts: 146 | for j in rights: 147 | node_mat[i][j] += 1 148 | 149 | # free up memory 150 | conf_alns_f[glb_index] = None 151 | conf_alns_r[glb_index] = None 152 | 153 | elapsed = time.time() - start 154 | print("Batch {0} finished".format(tid)) 155 | # print("Batch: {0} found non unique kmer count: {1}".format(tid, nonunique_counter)) 156 | print("Batch: {0} time spent for processing paf file: {1}".format(tid, elapsed)) 157 | return node_mat, short_mat 158 | 159 | 160 | def batch_split( 161 | fwd_file: str, 162 | rve_file: str, 163 | temp_dir: str, 164 | batch_size: int, 165 | do_split: bool, 166 | split_len, 167 | ): 168 | """split the read file into several 169 | Args: 170 | fwd_file (str): _description_ 171 | rve_file (str): _description_ 172 | batch_size (int): _description_ 173 | Returns: 174 | list: list of batch files 175 | """ 176 | n_reads = 0 177 | short_reads = 0 178 | used_reads = 0 179 | fkmer = 0 180 | rkmer = 0 181 | 182 | temp_file_fwd = None 183 | temp_file_rve = None 184 | local_reads = 0 185 | local_list = [] 186 | batch_count = 0 187 | read_summary = [] 188 | sub_files = [] 189 | # forward reverse read processing 190 | with open(fwd_file, "r") as fwd: 191 | with open(rve_file, "r") as rve: 192 | fwd_reads = fwd.readlines() 193 | rev_reads = rve.readlines() 194 | total_size = min(len(fwd_reads) // 4, len(rev_reads) // 4) 195 | # marker_test = 1 196 | # total_size = min(marker_test, total_size) 197 | for i in range(total_size): 198 | # if i % batch_size == 0: 199 | # print("Processed {0} reads up to now.".format(i)) 200 | [_, fseq, _, feval] = [s[:-1] for s in fwd_reads[i * 4 : (i + 1) * 4]] 201 | [_, rseq, _, reval] = [s[:-1] for s in rev_reads[i * 4 : (i + 1) * 4]] 202 | if fseq.count("N") or rseq.count("N"): 203 | n_reads += 1 204 | elif len(fseq) < split_len or len(rseq) < split_len: 205 | short_reads += 1 206 | else: 207 | used_reads += 1 208 | local_reads += 1 209 | local_list.append((fseq, feval, rseq, reval)) 210 | if local_reads == batch_size or ( 211 | local_reads > 0 and i == total_size - 1 212 | ): 213 | # file creation 214 | sub_fwd_filename = "{0}/temp_forward_{1}.fastq".format( 215 | temp_dir, batch_count 216 | ) 217 | sub_rve_filename = "{0}/temp_reverse_{1}.fastq".format( 218 | temp_dir, batch_count 219 | ) 220 | subprocess.check_call( 221 | "touch {0}; echo " " > {0}".format(sub_fwd_filename), shell=True 222 | ) 223 | subprocess.check_call( 224 | "touch {0}; echo " " > {0}".format(sub_rve_filename), shell=True 225 | ) 226 | temp_file_fwd = open(sub_fwd_filename, "w") 227 | temp_file_rve = open(sub_rve_filename, "w") 228 | 229 | read_ids = [] 230 | if do_split: 231 | for j, (fseq, feval, rseq, reval) in enumerate(local_list): 232 | fread_id_subs = len(fseq) - split_len + 1 233 | rread_id_subs = len(rseq) - split_len 
+ 1 234 | prefix_name = "@{0}_".format(j) 235 | # forward 236 | for sub_i in range(len(fseq) - split_len + 1): 237 | subfread = fseq[sub_i : sub_i + split_len] 238 | subfeval = feval[sub_i : sub_i + split_len] 239 | temp_file_fwd.write( 240 | prefix_name + "{0} /1\n".format(sub_i) 241 | ) 242 | temp_file_fwd.write(subfread + "\n") 243 | temp_file_fwd.write("+\n") 244 | temp_file_fwd.write(subfeval + "\n") 245 | fkmer += len(fseq) - split_len + 1 246 | # reverse 247 | for sub_i in range(len(rseq) - split_len + 1): 248 | subrread = rseq[sub_i : sub_i + split_len] 249 | subreval = reval[sub_i : sub_i + split_len] 250 | temp_file_rve.write( 251 | prefix_name + "{0} /2\n".format(sub_i) 252 | ) 253 | temp_file_rve.write(subrread + "\n") 254 | temp_file_rve.write("+\n") 255 | temp_file_rve.write(subreval + "\n") 256 | rkmer += len(rseq) - split_len + 1 257 | read_ids.append( 258 | (j, fread_id_subs, rread_id_subs, len(fseq), len(rseq)) 259 | ) 260 | else: 261 | for j, (fseq, feval, rseq, reval) in enumerate(local_list): 262 | prefix_name = "@{0}_".format(j) 263 | temp_file_fwd.write(prefix_name + "{0} /1\n".format(0)) 264 | temp_file_fwd.write(fseq + "\n") 265 | temp_file_fwd.write("+\n") 266 | temp_file_fwd.write(feval + "\n") 267 | 268 | temp_file_rve.write(prefix_name + "{0} /2\n".format(0)) 269 | temp_file_rve.write(rseq + "\n") 270 | temp_file_rve.write("+\n") 271 | temp_file_rve.write(reval + "\n") 272 | read_ids.append((j, 1, 1, len(fseq), len(rseq))) 273 | temp_file_fwd.close() 274 | temp_file_rve.close() 275 | read_summary.append(read_ids) 276 | sub_files.append((sub_fwd_filename, sub_rve_filename)) 277 | local_reads = 0 278 | local_list = [] 279 | batch_count += 1 280 | fwd.close() 281 | rve.close() 282 | 283 | print("total number of reads (before): ", total_size) 284 | print("total reads containing N: ", n_reads) 285 | print("total reads too short [<{0}]: ".format(split_len), short_reads) 286 | print("total number of reads (used): ", used_reads) 287 | print("total number of forward reads kmer: ", fkmer) 288 | print("total number of reverse reads kmer: ", rkmer) 289 | return read_summary, sub_files 290 | 291 | 292 | def minimap_alignment(fasta_file, sub_files, temp_dir): 293 | paf_files = [] 294 | for i, (sub_fwd_filename, sub_rve_filename) in enumerate(sub_files): 295 | print( 296 | "minimap reads {0},{1} to graph..".format( 297 | sub_fwd_filename, sub_rve_filename 298 | ) 299 | ) 300 | start = time.time() 301 | sub_fwd_paf = "{0}/temp_fwd_aln_{1}.paf".format(temp_dir, i) 302 | subprocess.check_call( 303 | "minimap2 -c -t 16 {0} {1} > {2}".format( 304 | fasta_file, sub_fwd_filename, sub_fwd_paf 305 | ), 306 | shell=True, 307 | ) 308 | # -B 40 -O 20,50 -E 30,10 -z 1,1 -k 27 -w 18 -s 256 309 | subprocess.check_call("rm {0}".format(sub_fwd_filename), shell=True) 310 | 311 | sub_rve_paf = "{0}/temp_rve_aln_{1}.paf".format(temp_dir, i) 312 | subprocess.check_call( 313 | "minimap2 -c -t 16 {0} {1} > {2}".format( 314 | fasta_file, sub_rve_filename, sub_rve_paf 315 | ), 316 | shell=True, 317 | ) 318 | subprocess.check_call("rm {0}".format(sub_rve_filename), shell=True) 319 | 320 | paf_files.append((sub_fwd_paf, sub_rve_paf)) 321 | elapsed = time.time() - start 322 | print("Time spent for minimap2: ", elapsed) 323 | return paf_files 324 | 325 | 326 | def main(): 327 | print( 328 | "----------------------Paired-End Information Alignment----------------------" 329 | ) 330 | parser = argparse.ArgumentParser( 331 | prog="pe_info", 332 | description="""Align Paired-End reads to nodes in graph to 
obtain strong links""", 333 | ) 334 | 335 | parser.add_argument( 336 | "-g", "--gfa,", dest="gfa", type=str, required=True, help="graph, .gfa format" 337 | ) 338 | 339 | parser.add_argument( 340 | "-o", 341 | "--output_dir", 342 | dest="dir", 343 | type=str, 344 | required=True, 345 | help="output directory", 346 | ) 347 | 348 | parser.add_argument( 349 | "-f", "--forward", dest="fwd", required=True, help="forward read, .fastq" 350 | ) 351 | 352 | parser.add_argument( 353 | "-r", "--reverse", dest="rve", required=True, help="reverse read, .fastq" 354 | ) 355 | 356 | parser.add_argument( 357 | "-k", 358 | "--kmer_size", 359 | dest="kmer_size", 360 | type=int, 361 | default=128, 362 | help="unique kmer size", 363 | ) 364 | 365 | args = parser.parse_args() 366 | 367 | # initialize output directory 368 | if args.dir[-1] == "/": 369 | args.dir = args.dir[:-1] 370 | subprocess.check_call("rm -rf {0}".format(args.dir), shell=True) 371 | os.makedirs(args.dir, exist_ok=True) 372 | 373 | glb_start = time.time() 374 | tmp_g2s_file = "{0}/temp_graph_seq.fasta".format(args.dir) 375 | 376 | # convert gfa to fasta file 377 | index2id = [] 378 | index2reflen = [] 379 | with open(args.gfa, "r") as gfa: 380 | with open(tmp_g2s_file, "w") as fasta: 381 | for Line in gfa: 382 | splited = (Line[:-1]).split("\t") 383 | if splited[0] == "S": 384 | fasta.write(">{0}\n{1}\n".format(splited[1], splited[2])) 385 | index2id.append(splited[1]) 386 | index2reflen.append(len(splited[2])) 387 | fasta.close() 388 | gfa.close() 389 | 390 | split_len = args.kmer_size + 1 391 | # split reads to several batches 392 | read_summary, sub_files = batch_split( 393 | args.fwd, args.rve, args.dir, 40000, True, split_len 394 | ) 395 | # minimap2 reads to fasta file 396 | paf_files = minimap_alignment(tmp_g2s_file, sub_files, args.dir) 397 | 398 | len_index2id = len(index2id) 399 | node_mats = [] 400 | strand_mats = [] 401 | 402 | for i in range(len(paf_files)): 403 | (node_mat, strand_mat) = process_paf_file( 404 | index2id, 405 | index2reflen, 406 | len_index2id, 407 | read_summary[i], 408 | paf_files[i][0], 409 | paf_files[i][1], 410 | split_len, 411 | i, 412 | ) 413 | node_mats.append(node_mat) 414 | strand_mats.append(strand_mat) 415 | 416 | print("All processes have finished their job, combine the result.") 417 | # combine all the outputs 418 | glb_node_mat = numpy.sum(numpy.array(node_mats), axis=0) 419 | glb_strand_mat = numpy.sum(numpy.array(strand_mats), axis=0) 420 | out_file = "{0}/pe_info".format(args.dir) 421 | out_file2 = "{0}/st_info".format(args.dir) 422 | subprocess.check_call("touch {0}; echo " " > {0}".format(out_file), shell=True) 423 | with open(out_file, "w") as outfile: 424 | with open(out_file2, "w") as outfile2: 425 | for i in range(len_index2id): 426 | for j in range(len_index2id): 427 | outfile.write( 428 | "{0}:{1}:{2}\n".format( 429 | index2id[i], index2id[j], glb_node_mat[i][j] 430 | ) 431 | ) 432 | outfile2.write( 433 | "{0}:{1}:{2}\n".format( 434 | index2id[i], index2id[j], glb_strand_mat[i][j] 435 | ) 436 | ) 437 | outfile2.close() 438 | outfile.close() 439 | 440 | glb_elapsed = time.time() - glb_start 441 | print("Global time elapsed: ", glb_elapsed) 442 | print("result stored in: ", out_file) 443 | 444 | 445 | if __name__ == "__main__": 446 | main() 447 | sys.exit(0) 448 | -------------------------------------------------------------------------------- /utils/VStrains_IO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from 
logging import Logger 4 | from graph_tool.all import Graph 5 | import gfapy 6 | import subprocess 7 | import sys 8 | import re 9 | 10 | from utils.VStrains_Utilities import * 11 | 12 | 13 | def init_graph(): 14 | graph = Graph(directed=True) 15 | graph.vp.seq = graph.new_vertex_property("string", val="") 16 | graph.vp.dp = graph.new_vertex_property("double") 17 | graph.vp.id = graph.new_vertex_property("string", val="UD") 18 | graph.vp.color = graph.new_vertex_property("string") 19 | 20 | graph.ep.overlap = graph.new_edge_property("int", val=0) 21 | graph.ep.flow = graph.new_edge_property("double", val=0.0) 22 | graph.ep.color = graph.new_edge_property("string") 23 | 24 | return graph 25 | 26 | 27 | def gfa_to_graph(gfa_file, logger: Logger, init_ori=1): 28 | """ 29 | Convert assembly graph gfa file to graph 30 | Nodes: segment with corresponding 31 | """ 32 | 33 | logger.info("Parsing GFA format graph") 34 | gfa = gfapy.Gfa().from_file(filename=gfa_file) 35 | logger.info( 36 | "Parsed gfa file length: {0}, version: {1}".format(len(gfa.lines), gfa.version) 37 | ) 38 | 39 | graph = init_graph() 40 | graph.vp.visited = graph.new_vertex_property("int16_t", val=0) 41 | graph.vp.ori = graph.new_vertex_property("int16_t") # 1 = +, -1 = - 42 | 43 | graph.ep.visited = graph.new_edge_property("int", val=0) 44 | 45 | # S 46 | node_dict = {} 47 | dp_dict = {} 48 | edge_dict = {} 49 | for line in gfa.segments: 50 | # segment, convert into Node^- and Node^+ 51 | [t, seg_no, seg] = (str(line).split("\t"))[:3] 52 | tags = (str(line).split("\t"))[3:] 53 | dp_float = 0 54 | ln = 0 55 | kc = 0 56 | for tag in tags: 57 | if tag.startswith("dp") or tag.startswith("DP"): 58 | dp_float = float(tag.split(":")[2]) 59 | break 60 | if tag.startswith("ln") or tag.startswith("LN"): 61 | ln = int(tag.split(":")[2]) 62 | if tag.startswith("kc") or tag.startswith("KC"): 63 | kc = int(tag.split(":")[2]) 64 | if ln != 0 and kc != 0: 65 | break 66 | 67 | # gfa format check 68 | if t != "S" or (dp_float == 0 and (ln == 0 or kc == 0)): 69 | logger.error( 70 | "file: {0}, Illegal graph format, please double check if the graph has been contaminated".format( 71 | gfa_file 72 | ) 73 | ) 74 | sys.exit(1) 75 | 76 | if dp_float == 0: 77 | dp_float = kc / ln 78 | 79 | v_pos = graph.add_vertex() 80 | graph.vp.seq[v_pos] = seg 81 | graph.vp.dp[v_pos] = dp_float 82 | graph.vp.id[v_pos] = seg_no 83 | graph.vp.ori[v_pos] = 1 84 | graph.vp.visited[v_pos] = -1 85 | graph.vp.color[v_pos] = "black" 86 | 87 | v_neg = graph.add_vertex() 88 | graph.vp.seq[v_neg] = reverse_seq(seg) 89 | graph.vp.dp[v_neg] = dp_float 90 | graph.vp.id[v_neg] = seg_no 91 | graph.vp.ori[v_neg] = -1 92 | graph.vp.visited[v_neg] = -1 93 | graph.vp.color[v_neg] = "black" 94 | 95 | node_dict[seg_no] = (v_pos, v_neg) 96 | dp_dict[seg_no] = dp_float 97 | # L 98 | for edge in gfa.edges: 99 | [t, seg_no_l, ori_l, seg_no_r, ori_r] = (str(edge).split("\t"))[:5] 100 | tags = (str(edge).split("\t"))[5:] 101 | overlap_len = [tag for tag in tags if tag.endswith("m") or tag.endswith("M")][0] 102 | # gfa format check 103 | assert t == "L" and overlap_len[-1] == "M" 104 | 105 | u_pos, u_neg = node_dict[seg_no_l] 106 | v_pos, v_neg = node_dict[seg_no_r] 107 | u = u_pos if ori_l == "+" else u_neg 108 | v = v_pos if ori_r == "+" else v_neg 109 | 110 | if (seg_no_l, graph.vp.ori[u], seg_no_r, graph.vp.ori[v]) in edge_dict: 111 | logger.error( 112 | "parallel edge found, invalid case in assembly graph, please double-check the assembly graph format" 113 | ) 114 | logger.error("Pipeline 
aborted") 115 | sys.exit(1) 116 | 117 | if seg_no_l == seg_no_r: 118 | graph.vp.seq[u] = str.lower(graph.vp.seq[u]) 119 | graph.vp.seq[v] = str.lower(graph.vp.seq[v]) 120 | continue 121 | 122 | e = graph.add_edge(source=u, target=v) 123 | graph.ep.overlap[e] = int(overlap_len[:-1]) 124 | graph.ep.color[e] = "black" 125 | 126 | edge_dict[(seg_no_l, graph.vp.ori[u], seg_no_r, graph.vp.ori[v])] = e 127 | 128 | graph, simp_node_dict, simp_edge_dict = flip_graph_bfs( 129 | graph, node_dict, edge_dict, dp_dict, logger, init_ori 130 | ) 131 | red_graph, red_node_dict, red_edge_dict = reduce_graph( 132 | graph, simp_node_dict, simp_edge_dict 133 | ) 134 | return red_graph, red_node_dict, red_edge_dict 135 | 136 | 137 | def flip_graph_bfs( 138 | graph: Graph, 139 | node_dict: dict, 140 | edge_dict: dict, 141 | dp_dict: dict, 142 | logger: Logger, 143 | init_ori=1, 144 | ): 145 | """ 146 | Flip all the node orientation. 147 | 148 | return an node_dict, which only contains one orientation per node for simplicity. 149 | rename all the used node to positive, and forbidden the opponent node. 150 | """ 151 | 152 | def source_node_via_dp(dp_dict: dict): 153 | """ 154 | return the pos-neg node with maximum depth 155 | """ 156 | return max(dp_dict, key=dp_dict.get) 157 | 158 | def reverse_edge(graph: Graph, edge, node_dict: dict, edge_dict: dict): 159 | """ 160 | reverse an edge with altered orientation and direction. 161 | """ 162 | tmp_s = edge.source() 163 | tmp_t = edge.target() 164 | 165 | edge_dict.pop( 166 | ( 167 | graph.vp.id[tmp_s], 168 | graph.vp.ori[tmp_s], 169 | graph.vp.id[tmp_t], 170 | graph.vp.ori[tmp_t], 171 | ) 172 | ) 173 | 174 | tmp_s_pos, tmp_s_neg = node_dict[graph.vp.id[tmp_s]] 175 | tmp_t_pos, tmp_t_neg = node_dict[graph.vp.id[tmp_t]] 176 | s = tmp_t_pos if graph.vp.ori[tmp_t] == -1 else tmp_t_neg 177 | t = tmp_s_pos if graph.vp.ori[tmp_s] == -1 else tmp_s_neg 178 | 179 | o = graph.ep.overlap[edge] 180 | graph.remove_edge(edge) 181 | e = graph.add_edge(s, t) 182 | graph.ep.overlap[e] = o 183 | edge_dict[ 184 | (graph.vp.id[s], graph.vp.ori[s], graph.vp.id[t], graph.vp.ori[t]) 185 | ] = e 186 | 187 | return graph, e, edge_dict 188 | 189 | logger.info("flip graph orientation..") 190 | pick_dict = {} 191 | while set(dp_dict): 192 | seg_no = source_node_via_dp(dp_dict) 193 | source_pos, source_neg = node_dict[seg_no] 194 | graph.vp.visited[source_pos] = 0 195 | graph.vp.visited[source_neg] = 0 196 | fifo_queue = [[node_dict[seg_no], init_ori]] 197 | 198 | while fifo_queue: 199 | (v_pos, v_neg), ori = fifo_queue.pop() 200 | dp_dict.pop(graph.vp.id[v_pos]) 201 | 202 | u = None 203 | if ori == 1: 204 | u = v_pos 205 | pick_dict[graph.vp.id[u]] = "+" 206 | # print_vertex(graph, v_neg, "node to reverse pos") 207 | for e in set(v_neg.all_edges()): 208 | graph, r_e, edge_dict = reverse_edge(graph, e, node_dict, edge_dict) 209 | # print_edge(graph, r_e, "after reverse: ") 210 | else: 211 | u = v_neg 212 | pick_dict[graph.vp.id[u]] = "-" 213 | # print_vertex(graph, v_pos, "node to reverse neg") 214 | for e in set(v_pos.all_edges()): 215 | graph, r_e, edge_dict = reverse_edge(graph, e, node_dict, edge_dict) 216 | # print_edge(graph, r_e, "after reverse: ") 217 | 218 | graph.vp.visited[v_pos] = 1 219 | graph.vp.visited[v_neg] = 1 220 | # add further nodes into the fifo_queue 221 | for adj_node in u.all_neighbors(): 222 | if graph.vp.visited[adj_node] == -1: 223 | vpos, vneg = node_dict[graph.vp.id[adj_node]] 224 | graph.vp.visited[vpos] = 0 225 | graph.vp.visited[vneg] = 0 226 | # print("appending 
node {0} to queue".format(graph.vp.id[adj_node])) 227 | fifo_queue.append( 228 | [node_dict[graph.vp.id[adj_node]], graph.vp.ori[adj_node]] 229 | ) 230 | 231 | # verify sorted graph 232 | logger.info("final verifying graph..") 233 | assert len(pick_dict) == len(node_dict) 234 | for key, item in list(pick_dict.items()): 235 | v_pos, v_neg = node_dict[key] 236 | if item == "+": 237 | # FIXME split v_neg to a new node 238 | if v_neg.in_degree() + v_neg.out_degree() > 0: 239 | print_vertex( 240 | graph, v_neg, logger, "pick ambiguous found, pick both, split node" 241 | ) 242 | pick_dict[key] = "t" 243 | else: 244 | # FIXME split v_neg to a new node 245 | if v_pos.in_degree() + v_pos.out_degree() > 0: 246 | print_vertex( 247 | graph, v_pos, logger, "pick ambiguous found, pick both, split node" 248 | ) 249 | pick_dict[key] = "t" 250 | logger.info("Graph is verified") 251 | 252 | simp_node_dict = {} 253 | for seg_no, pick in pick_dict.items(): 254 | if pick == "+": 255 | simp_node_dict[seg_no] = node_dict[seg_no][0] 256 | elif pick == "-": 257 | simp_node_dict["-" + seg_no] = node_dict[seg_no][1] 258 | graph.vp.id[node_dict[seg_no][1]] = "-" + seg_no 259 | else: 260 | simp_node_dict[seg_no] = node_dict[seg_no][0] 261 | graph.vp.id[node_dict[seg_no][0]] = seg_no 262 | simp_node_dict["-" + seg_no] = node_dict[seg_no][1] 263 | graph.vp.id[node_dict[seg_no][1]] = "-" + seg_no 264 | 265 | simp_edge_dict = {} 266 | for e in edge_dict.values(): 267 | simp_edge_dict[(graph.vp.id[e.source()], graph.vp.id[e.target()])] = e 268 | logger.info("done") 269 | return graph, simp_node_dict, simp_edge_dict 270 | 271 | 272 | def reduce_graph(unsimp_graph: Graph, simp_node_dict: dict, simp_edge_dict: dict): 273 | graph = init_graph() 274 | red_node_dict = {} 275 | red_edge_dict = {} 276 | 277 | for no, node in simp_node_dict.items(): 278 | v = graph.add_vertex() 279 | graph.vp.seq[v] = unsimp_graph.vp.seq[node] 280 | graph.vp.dp[v] = unsimp_graph.vp.dp[node] 281 | graph.vp.id[v] = unsimp_graph.vp.id[node] 282 | graph.vp.color[v] = "black" 283 | red_node_dict[no] = v 284 | 285 | for (u, v), e in simp_edge_dict.items(): 286 | source = red_node_dict[u] 287 | sink = red_node_dict[v] 288 | 289 | re = graph.add_edge(source, sink) 290 | graph.ep.overlap[re] = unsimp_graph.ep.overlap[e] 291 | graph.ep.flow[re] = unsimp_graph.ep.flow[e] 292 | graph.ep.color[re] = "black" 293 | red_edge_dict[(u, v)] = re 294 | 295 | return graph, red_node_dict, red_edge_dict 296 | 297 | 298 | def flipped_gfa_to_graph(gfa_file, logger: Logger): 299 | """ 300 | read flipped gfa format graph in. 
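Assumes the fixed layout written by graph_to_gfa below: every S line is "S <id> <seq> DP:f:<depth>" and every L line is "L <u> + <v> + <overlap>M", so both record types are unpacked positionally.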
301 | """ 302 | logger.debug("Parsing GFA format graph") 303 | gfa = gfapy.Gfa().from_file(filename=gfa_file) 304 | logger.debug( 305 | "Parsed gfa file length: {0}, version: {1}".format(len(gfa.lines), gfa.version) 306 | ) 307 | 308 | graph = init_graph() 309 | red_node_dict = {} 310 | red_edge_dict = {} 311 | 312 | # S 313 | for line in gfa.segments: 314 | [_, seg_no, seg, dp] = str(line).split("\t") 315 | dp_float = float(dp.split(":")[2]) 316 | v = graph.add_vertex() 317 | graph.vp.seq[v] = seg 318 | graph.vp.dp[v] = dp_float 319 | graph.vp.id[v] = seg_no 320 | graph.vp.color[v] = "black" 321 | red_node_dict[seg_no] = v 322 | # L 323 | for edge in gfa.edges: 324 | [_, seg_no_l, ori_l, seg_no_r, ori_r, overlap_len] = str(edge).split("\t") 325 | source = red_node_dict[seg_no_l] 326 | sink = red_node_dict[seg_no_r] 327 | 328 | assert overlap_len[-1] == "M" and ori_l == ori_r 329 | re = graph.add_edge(source, sink) 330 | graph.ep.overlap[re] = int(overlap_len[:-1]) 331 | graph.ep.color[re] = "black" 332 | red_edge_dict[(seg_no_l, seg_no_r)] = re 333 | 334 | return graph, red_node_dict, red_edge_dict 335 | 336 | 337 | def graph_to_gfa( 338 | graph: Graph, simp_node_dict: dict, simp_edge_dict: dict, logger: Logger, filename 339 | ): 340 | """ 341 | store the swapped graph in simplifed_graph. 342 | """ 343 | subprocess.check_call("touch {0}; echo > {0}".format(filename), shell=True) 344 | 345 | with open(filename, "w") as gfa: 346 | for v in simp_node_dict.values(): 347 | if graph.vp.color[v] == "black": 348 | name = graph.vp.id[v] 349 | gfa.write( 350 | "S\t{0}\t{1}\tDP:f:{2}\n".format( 351 | name, graph.vp.seq[v], graph.vp.dp[v] 352 | ) 353 | ) 354 | 355 | for (u, v), e in simp_edge_dict.items(): 356 | node_u = simp_node_dict[u] if u in simp_node_dict else None 357 | node_v = simp_node_dict[v] if v in simp_node_dict else None 358 | 359 | if node_u == None or node_v == None: 360 | continue 361 | if graph.vp.color[node_u] != "black" or graph.vp.color[node_v] != "black": 362 | continue 363 | if graph.ep.color[e] != "black": 364 | continue 365 | gfa.write( 366 | "L\t{0}\t{1}\t{2}\t{3}\t{4}M\n".format( 367 | u, "+", v, "+", graph.ep.overlap[e] 368 | ) 369 | ) 370 | gfa.close() 371 | logger.info(filename + " is stored..") 372 | return 0 373 | 374 | 375 | def is_valid(p: list, idx_mapping: dict, simp_node_dict: dict, simp_edge_dict: dict): 376 | if len(p) == 0: 377 | return False 378 | if len(p) == 1: 379 | if p[0] not in idx_mapping: 380 | return False 381 | if idx_mapping[p[0]] not in simp_node_dict: 382 | return False 383 | return True 384 | for i in range(len(p) - 1): 385 | if p[i] not in idx_mapping or p[i + 1] not in idx_mapping: 386 | return False 387 | mu = idx_mapping[p[i]] 388 | mv = idx_mapping[p[i + 1]] 389 | if mu not in simp_node_dict: 390 | return False 391 | if mv not in simp_node_dict: 392 | return False 393 | if (mu, mv) not in simp_edge_dict: 394 | return False 395 | return True 396 | 397 | 398 | def spades_paths_parser( 399 | graph: Graph, 400 | simp_node_dict: dict, 401 | simp_edge_dict: dict, 402 | idx_mapping: dict, 403 | logger: Logger, 404 | path_file, 405 | min_len=250, 406 | min_cov=0, 407 | ): 408 | """ 409 | Map SPAdes's contig to the graph, return all the suitable contigs. 
410 | """ 411 | 412 | def get_paths(fd, path): 413 | subpaths = [] 414 | total_nodes = 0 415 | while path.endswith(";\n"): 416 | subpath = str(path[:-2]).split(",") 417 | subpath = list( 418 | map( 419 | lambda v: str(v[:-1]) if v[-1] == "+" else "-" + str(v[:-1]), 420 | subpath, 421 | ) 422 | ) 423 | subpathred = list(dict.fromkeys(subpath)) 424 | # validity check 425 | if is_valid(subpathred, idx_mapping, simp_node_dict, simp_edge_dict): 426 | subpath = list(map(lambda v: idx_mapping[v], subpath)) 427 | subpaths.append(subpath) 428 | total_nodes += len(subpath) 429 | path = fd.readline() 430 | 431 | subpath = path.rstrip().split(",") 432 | subpath = list( 433 | map(lambda v: str(v[:-1]) if v[-1] == "+" else "-" + str(v[:-1]), subpath) 434 | ) 435 | subpathred = list(dict.fromkeys(subpath)) 436 | # validity check 437 | if is_valid(subpathred, idx_mapping, simp_node_dict, simp_edge_dict): 438 | subpath = list(map(lambda v: idx_mapping[v], subpath)) 439 | subpaths.append(subpath) 440 | total_nodes += len(subpath) 441 | 442 | return subpaths, total_nodes 443 | 444 | logger.info("parsing SPAdes .paths file..") 445 | contig_dict = {} 446 | contig_info = {} 447 | try: 448 | with open(path_file, "r") as contigs_file: 449 | name = contigs_file.readline() 450 | path = contigs_file.readline() 451 | 452 | while name != "" and path != "": 453 | (cno, clen, ccov) = re.search( 454 | "%s(.*)%s(.*)%s(.*)" % ("NODE_", "_length_", "_cov_"), name.strip() 455 | ).group(1, 2, 3) 456 | subpaths, total_nodes = get_paths(contigs_file, path) 457 | 458 | name_r = contigs_file.readline() 459 | path_r = contigs_file.readline() 460 | (cno_r, clen_r, ccov_r) = re.search( 461 | "%s(.*)%s(.*)%s(.*)%s" % ("NODE_", "_length_", "_cov_", "'"), 462 | name_r.strip(), 463 | ).group(1, 2, 3) 464 | subpaths_r, total_nodes_r = get_paths(contigs_file, path_r) 465 | 466 | if not (cno == cno_r and clen == clen_r and ccov == ccov_r): 467 | raise BaseException 468 | 469 | # next contig group 470 | name = contigs_file.readline() 471 | path = contigs_file.readline() 472 | 473 | # pick one direction only 474 | (segments, total_n) = max( 475 | [(subpaths, total_nodes), (subpaths_r, total_nodes_r)], 476 | key=lambda t: t[1], 477 | ) 478 | 479 | # filter contig 480 | if segments == []: 481 | continue 482 | if total_n < 2 and (float(ccov) <= min_cov or int(clen) < min_len): 483 | continue 484 | for i, subpath in enumerate(segments): 485 | repeat_dict = {} 486 | for k in subpath: 487 | if k not in repeat_dict: 488 | repeat_dict[k] = 1 489 | else: 490 | repeat_dict[k] += 1 491 | subpath = list(dict.fromkeys(subpath)) 492 | 493 | if len(segments) != 1: 494 | contig_dict[cno + "$" + str(i)] = [ 495 | subpath, 496 | path_len(graph, [simp_node_dict[id] for id in subpath]), 497 | float(ccov), 498 | ] 499 | contig_info[cno + "$" + str(i)] = (None, repeat_dict) 500 | else: 501 | contig_dict[cno] = [subpath, int(clen), float(ccov)] 502 | contig_info[cno] = (None, repeat_dict) 503 | 504 | contigs_file.close() 505 | except BaseException as err: 506 | logger.error( 507 | err, 508 | "\nPlease make sure the correct SPAdes contigs .paths file is provided.", 509 | ) 510 | logger.error("Pipeline aborted") 511 | sys.exit(1) 512 | logger.debug(str(contig_dict)) 513 | logger.debug(str(contig_info)) 514 | logger.info("done") 515 | return contig_dict, contig_info 516 | 517 | 518 | def contig_dict_to_fasta( 519 | graph: Graph, simp_node_dict: dict, contig_dict: dict, output_file 520 | ): 521 | """ 522 | Store contig dict into fastq file 523 | """ 524 | 
subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True) 525 | 526 | with open(output_file, "w") as fasta: 527 | for cno, (contig, clen, ccov) in sorted( 528 | contig_dict.items(), key=lambda x: x[1][1], reverse=True 529 | ): 530 | contig_name = ( 531 | ">" + str(cno) + "_" + str(clen) + "_" + str(round(ccov, 2)) + "\n" 532 | ) 533 | seq = path_ids_to_seq(graph, contig, contig_name, simp_node_dict) + "\n" 534 | fasta.write(contig_name) 535 | fasta.write(seq) 536 | fasta.close() 537 | 538 | 539 | def strain_dict_to_fasta(strain_dict: dict, output_file): 540 | """ 541 | Store strain dict into fastq file 542 | """ 543 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True) 544 | 545 | with open(output_file, "w") as fasta: 546 | for cno, (sseq, clen, ccov) in sorted( 547 | strain_dict.items(), key=lambda x: x[1][1], reverse=True 548 | ): 549 | contig_name = ( 550 | ">" + str(cno) + "_" + str(clen) + "_" + str(round(ccov, 2)) + "\n" 551 | ) 552 | seq = sseq + "\n" 553 | fasta.write(contig_name) 554 | fasta.write(seq) 555 | fasta.close() 556 | 557 | 558 | def contig_dict_to_path( 559 | contig_dict: dict, output_file, id_mapping: dict = None, keep_original=False 560 | ): 561 | """ 562 | Store contig dict into paths file 563 | """ 564 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True) 565 | rev_id_mapping = {} 566 | if id_mapping != None: 567 | for id, map in id_mapping.items(): 568 | rev_id_mapping[map] = id 569 | with open(output_file, "w") as paths: 570 | for cno, (contig, clen, ccov) in sorted( 571 | contig_dict.items(), key=lambda x: x[1][1], reverse=True 572 | ): 573 | contig_name = "NODE_" + str(cno) + "_" + str(clen) + "_" + str(ccov) + "\n" 574 | path_ids = "" 575 | for id in contig: 576 | if keep_original: 577 | for iid in str(id).split("&"): 578 | if iid.find("*") != -1: 579 | rid = rev_id_mapping[iid[: iid.find("*")]] 580 | else: 581 | rid = rev_id_mapping[iid] 582 | if rid[0] == "-": 583 | rid = rid[1:] + "-" 584 | path_ids += rid + "," 585 | else: 586 | for iid in str(id).split("&"): 587 | if iid.find("*") != -1: 588 | rid = iid[: iid.find("*")] 589 | else: 590 | rid = iid 591 | path_ids += str(rid) + "," 592 | path_ids = path_ids[:-1] + "\n" 593 | paths.write(contig_name) 594 | paths.write(path_ids) 595 | paths.close() 596 | 597 | 598 | def process_pe_info(node_ids, pe_info_file, st_info_file): 599 | pe_info = {} 600 | for u in node_ids: 601 | for v in node_ids: 602 | pe_info[(min(u, v), max(u, v))] = 0 603 | with open(pe_info_file, "r") as file: 604 | for line in file: 605 | if line == "\n": 606 | break 607 | [u, v, mark] = line[:-1].split(":")[:3] 608 | # bidirection 609 | key = (min(u, v), max(u, v)) 610 | if pe_info.get(key) != None: 611 | pe_info[key] += int(mark) 612 | file.close() 613 | 614 | with open(st_info_file, "r") as file: 615 | for line in file: 616 | if line == "\n": 617 | break 618 | [u, v, mark] = line[:-1].split(":")[:3] 619 | # bidirection 620 | key = (min(u, v), max(u, v)) 621 | if pe_info.get(key) != None: 622 | pe_info[key] += int(mark) 623 | file.close() 624 | dcpy_pe_info = {} 625 | for (uid, wid), u in pe_info.items(): 626 | dcpy_pe_info[(uid, wid)] = u 627 | return pe_info, dcpy_pe_info 628 | 629 | 630 | def store_reinit_graph( 631 | graph: Graph, 632 | simp_node_dict: dict, 633 | simp_edge_dict: dict, 634 | logger: Logger, 635 | opt_filename, 636 | ): 637 | graph_to_gfa(graph, simp_node_dict, simp_edge_dict, logger, opt_filename) 638 | grapho, simp_node_dicto, simp_edge_dicto = 
flipped_gfa_to_graph( 639 | opt_filename, logger 640 | ) 641 | assign_edge_flow(grapho, simp_node_dicto, simp_edge_dicto) 642 | return grapho, simp_node_dicto, simp_edge_dicto 643 | -------------------------------------------------------------------------------- /utils/VStrains_Extension.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | from graph_tool.all import Graph 5 | from utils.VStrains_Utilities import * 6 | from utils.VStrains_Decomposition import get_non_trivial_branches, global_trivial_split 7 | from utils.VStrains_IO import store_reinit_graph 8 | 9 | 10 | def best_matching( 11 | graph: Graph, 12 | simp_node_dict: dict, 13 | simp_edge_dict: dict, 14 | contig_dict: dict, 15 | pe_info: dict, 16 | logger: Logger, 17 | ): 18 | full_link = {} 19 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict) 20 | node_to_contig_dict, _ = contig_map_node(contig_dict) 21 | for no, node in non_trivial_branches.items(): 22 | us = [graph.vp.id[src] for src in node.in_neighbors()] 23 | ws = [graph.vp.id[tgt] for tgt in node.out_neighbors()] 24 | logger.debug("---------------------------------------------") 25 | logger.debug( 26 | "current non trivial branch: {0}, in-degree: {1}, out-degree: {2}".format( 27 | no, len(us), len(ws) 28 | ) 29 | ) 30 | # add contig supports 31 | support_contigs = node_to_contig_dict.get(no, []) 32 | con_info = {} 33 | for cno in support_contigs: 34 | [contig, clen, ccov] = contig_dict[cno] 35 | loc = contig.index(no) 36 | if loc > 0 and loc < len(contig) - 1: 37 | con_info[(contig[loc - 1], contig[loc + 1])] = con_info.get( 38 | (contig[loc - 1], contig[loc + 1]), [] 39 | ) 40 | con_info[(contig[loc - 1], contig[loc + 1])].append((cno, clen, ccov)) 41 | print_contig( 42 | cno, 43 | clen, 44 | round(ccov, 2), 45 | contig[max(loc - 1, 0) : loc + 2], 46 | logger, 47 | "support contig", 48 | ) 49 | kept_link = {} 50 | sec_comb = [] 51 | # init node usage for current branch 52 | in_usage = dict.fromkeys(us, 0) 53 | out_usage = dict.fromkeys(ws, 0) 54 | 55 | # align contig link first, and update status 56 | logger.debug("align contig link first") 57 | for uid in us: 58 | for wid in ws: 59 | logger.debug("---------------------") 60 | u = simp_node_dict[uid] 61 | w = simp_node_dict[wid] 62 | curr_pe = pe_info[(min(uid, wid), max(uid, wid))] 63 | 64 | logger.debug("{0} -> {1} PE: {2}".format(uid, wid, curr_pe)) 65 | logger.debug( 66 | "cov info: {0}[{1}] -> {2}[{3}]".format( 67 | graph.ep.flow[graph.edge(u, node)], 68 | pe_info[(min(uid, no), max(uid, no))], 69 | graph.ep.flow[graph.edge(node, w)], 70 | pe_info[(min(no, wid), max(no, wid))], 71 | ) 72 | ) 73 | accept = False 74 | if (uid, wid) in con_info: 75 | logger.debug( 76 | "current link supported by contig: {0}, added".format( 77 | con_info[(uid, wid)] 78 | ) 79 | ) 80 | accept = True 81 | if uid == wid: 82 | logger.debug( 83 | "current link is a self link: {0}, potential cyclic strain, added".format( 84 | uid 85 | ) 86 | ) 87 | accept = True 88 | 89 | if accept: 90 | in_usage[uid] += 1 91 | out_usage[wid] += 1 92 | kept_link[(uid, wid)] = curr_pe 93 | else: 94 | logger.debug("current link is secondary choice, process later") 95 | sec_comb.append((uid, wid, curr_pe)) 96 | 97 | logger.debug( 98 | "align paired end/single end information first (if any) to isolated nodes" 99 | ) 100 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True) 101 | for uid, wid, pe in sorted_sec_comb: 102 | if pe > 0: 103 | logger.debug( 104 
| "-----SEC LINK {0} -> {1} PE: {2}-----".format(uid, wid, pe) 105 | ) 106 | logger.debug("- link [ > 0] supported case, added") 107 | in_usage[uid] += 1 108 | out_usage[wid] += 1 109 | kept_link[(uid, wid)] = pe 110 | full_link[no] = kept_link 111 | return full_link 112 | 113 | 114 | # extend contigs on both end, until a non distinct extension 115 | def contig_extension( 116 | graph: Graph, 117 | simp_node_dict: dict, 118 | contig: list, 119 | ccov, 120 | full_link: dict, 121 | logger: Logger, 122 | threshold, 123 | ): 124 | visited = dict.fromkeys(simp_node_dict.keys(), False) 125 | for no in contig[1:-1]: 126 | visited[no] = True 127 | final_path = [] 128 | final_path.extend([simp_node_dict[no] for no in contig][1:-1]) 129 | 130 | curr = simp_node_dict[contig[-1]] 131 | logger.debug("c-t extension") 132 | while curr != None and not visited[graph.vp.id[curr]]: 133 | visited[graph.vp.id[curr]] = True 134 | final_path.append(curr) 135 | out_branches = list([n for n in curr.out_neighbors()]) 136 | if len(out_branches) == 0: 137 | curr = None 138 | logger.debug("Reach the end") 139 | elif len(out_branches) == 1: 140 | curr = out_branches[0] 141 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr])) 142 | else: 143 | f_assigned = False 144 | if graph.vp.id[curr] in full_link and len(final_path) > 1: 145 | logger.debug("Curr is Branch") 146 | curr_links = [ 147 | simp_node_dict[wid] 148 | for (uid, wid) in full_link[graph.vp.id[curr]].keys() 149 | if uid == graph.vp.id[final_path[-2]] 150 | ] 151 | if len(curr_links) == 1: 152 | # curr = curr_links[0] 153 | # logger.debug("single link next: {0}".format(graph.vp.id[curr])) 154 | if graph.vp.dp[curr_links[0]] - ccov <= -2 * threshold: 155 | curr = None 156 | logger.debug( 157 | "{0} single link < 2delta, use coverage".format( 158 | graph.vp.id[curr_links[0]] 159 | ) 160 | ) 161 | else: 162 | curr = curr_links[0] 163 | logger.debug("single link next: {0}".format(graph.vp.id[curr])) 164 | elif len(curr_links) > 1: 165 | logger.debug("Ambiguous, stop extension") 166 | curr = None 167 | else: 168 | logger.debug("No link in here, use coverage information") 169 | f_assigned = True 170 | else: 171 | curr = None 172 | logger.debug("Not in full link or len of path <= 1") 173 | if f_assigned: 174 | in_branches = list([n for n in curr.in_neighbors()]) 175 | if len(final_path) > 1 and len(in_branches) > 0: 176 | curru = final_path[-2] 177 | opt_ws = sorted( 178 | out_branches, 179 | key=lambda ww: abs(graph.vp.dp[curru] - graph.vp.dp[ww]), 180 | ) 181 | bestw = opt_ws[0] 182 | opt_us = sorted( 183 | in_branches, 184 | key=lambda uu: abs(graph.vp.dp[bestw] - graph.vp.dp[uu]), 185 | ) 186 | if opt_us[0] == curru: 187 | delta = max( 188 | 2 * abs(graph.vp.dp[curru] - graph.vp.dp[bestw]), threshold 189 | ) 190 | if ( 191 | len(opt_us) > 1 192 | and abs(graph.vp.dp[opt_us[1]] - graph.vp.dp[bestw]) 193 | <= delta 194 | ): 195 | logger.debug("ambiguous best matching, stop extension") 196 | continue 197 | if ( 198 | len(opt_ws) > 1 199 | and abs(graph.vp.dp[curru] - graph.vp.dp[opt_ws[1]]) 200 | <= delta 201 | ): 202 | logger.debug("ambiguous best matching, stop extension") 203 | continue 204 | logger.debug("best matching") 205 | curr = bestw 206 | else: 207 | logger.debug("Not best match") 208 | curr = None 209 | else: 210 | curr = None 211 | logger.debug("No Link + Not trivial, stop extension") 212 | if curr == None: 213 | single_bests = sorted( 214 | [(onode, graph.vp.dp[onode]) for onode in out_branches], 215 | key=lambda tp: tp[1], 216 | 
reverse=True, 217 | ) 218 | logger.debug( 219 | "Try last bit: 1st: {0}, 2nd: {1}, delta: {2}, cov: {3}".format( 220 | (graph.vp.id[single_bests[0][0]], single_bests[0][1]), 221 | (graph.vp.id[single_bests[1][0]], single_bests[1][1]), 222 | threshold, 223 | ccov, 224 | ) 225 | ) 226 | if ( 227 | single_bests[0][1] - ccov > -threshold 228 | and single_bests[1][1] - ccov <= -threshold 229 | ): 230 | logger.debug("Last bit succ") 231 | curr = single_bests[0][0] 232 | else: 233 | logger.debug("Last bit fail") 234 | unode = simp_node_dict[contig[0]] 235 | if len(contig) == 1 and final_path[-1] not in unode.in_neighbors(): 236 | visited[contig[0]] = False 237 | final_path.pop(0) 238 | curr = unode 239 | logger.debug("s-c extension") 240 | while curr != None and not visited[graph.vp.id[curr]]: 241 | visited[graph.vp.id[curr]] = True 242 | final_path.insert(0, curr) 243 | in_branches = list([n for n in curr.in_neighbors()]) 244 | if len(in_branches) == 0: 245 | curr = None 246 | logger.debug("Reach the end") 247 | elif len(in_branches) == 1: 248 | curr = in_branches[0] 249 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr])) 250 | else: 251 | f_assigned = False 252 | if graph.vp.id[curr] in full_link and len(final_path) > 1: 253 | logger.debug("Curr is Branch") 254 | curr_links = [ 255 | simp_node_dict[uid] 256 | for (uid, wid) in full_link[graph.vp.id[curr]].keys() 257 | if wid == graph.vp.id[final_path[1]] 258 | ] 259 | if len(curr_links) == 1: 260 | # curr = curr_links[0] 261 | # logger.debug("single link next: {0}".format(graph.vp.id[curr])) 262 | if graph.vp.dp[curr_links[0]] - ccov <= -2 * threshold: 263 | curr = None 264 | logger.debug( 265 | "{0} single link < 2delta, use coverage".format( 266 | graph.vp.id[curr_links[0]] 267 | ) 268 | ) 269 | else: 270 | curr = curr_links[0] 271 | logger.debug("prev: {0}".format(graph.vp.id[curr])) 272 | elif len(curr_links) > 1: 273 | logger.debug("Ambiguous, stop extension") 274 | curr = None 275 | else: 276 | logger.debug("No link in here, use coverage information") 277 | f_assigned = True 278 | else: 279 | curr = None 280 | logger.debug("Not in full link or len of path <= 1") 281 | if f_assigned: 282 | out_branches = list([n for n in curr.out_neighbors()]) 283 | if len(final_path) > 1 and len(out_branches) > 0: 284 | currw = final_path[1] 285 | opt_us = sorted( 286 | in_branches, 287 | key=lambda uu: abs(graph.vp.dp[currw] - graph.vp.dp[uu]), 288 | ) 289 | bestu = opt_us[0] 290 | opt_ws = sorted( 291 | out_branches, 292 | key=lambda ww: abs(graph.vp.dp[bestu] - graph.vp.dp[ww]), 293 | ) 294 | if opt_ws[0] == currw: 295 | delta = max( 296 | 2 * abs(graph.vp.dp[currw] - graph.vp.dp[bestu]), threshold 297 | ) 298 | if ( 299 | len(opt_us) > 1 300 | and abs(graph.vp.dp[opt_us[1]] - graph.vp.dp[currw]) 301 | <= delta 302 | ): 303 | logger.debug("ambiguous best matching, stop extension") 304 | continue 305 | if ( 306 | len(opt_ws) > 1 307 | and abs(graph.vp.dp[bestu] - graph.vp.dp[opt_ws[1]]) 308 | <= delta 309 | ): 310 | logger.debug("ambiguous best matching, stop extension") 311 | continue 312 | logger.debug("best matching") 313 | curr = bestu 314 | else: 315 | logger.debug("Not best match") 316 | curr = None 317 | else: 318 | logger.debug("No Link + Not trivial, stop extension") 319 | curr = None 320 | if curr == None: 321 | single_bests = sorted( 322 | [(inode, graph.vp.dp[inode]) for inode in in_branches], 323 | key=lambda tp: tp[1], 324 | reverse=True, 325 | ) 326 | logger.debug( 327 | "Try last bit: 1st: {0}, 2nd: {1}, delta: {2}, cov: 
{3}".format( 328 | (graph.vp.id[single_bests[0][0]], single_bests[0][1]), 329 | (graph.vp.id[single_bests[1][0]], single_bests[1][1]), 330 | threshold, 331 | ccov, 332 | ) 333 | ) 334 | if ( 335 | single_bests[0][1] - ccov > -threshold 336 | and single_bests[1][1] - ccov <= -threshold 337 | ): 338 | logger.debug("Last bit succ") 339 | curr = single_bests[0][0] 340 | else: 341 | logger.debug("Last bit fail") 342 | return final_path 343 | 344 | 345 | def final_extension( 346 | graph: Graph, simp_node_dict: dict, contig: list, full_link: dict, logger: Logger 347 | ): 348 | visited = dict.fromkeys(simp_node_dict.keys(), False) 349 | for no in contig[1:-1]: 350 | visited[no] = True 351 | curr = simp_node_dict[contig[-1]] 352 | final_path = [] 353 | final_path.extend([simp_node_dict[no] for no in contig][1:-1]) 354 | # from curr to the tail, or to the non-extendable end 355 | logger.debug("c-t extension") 356 | while curr != None and not visited[graph.vp.id[curr]]: 357 | visited[graph.vp.id[curr]] = True 358 | final_path.append(curr) 359 | out_branches = list([n for n in curr.out_neighbors()]) 360 | if len(out_branches) == 0: 361 | curr = None 362 | logger.debug("Reach the end") 363 | elif len(out_branches) == 1: 364 | curr = out_branches[0] 365 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr])) 366 | else: 367 | if graph.vp.id[curr] in full_link and len(final_path) > 1: 368 | logger.debug("Curr is Branch") 369 | curr_links = [ 370 | simp_node_dict[wid] 371 | for (uid, wid) in full_link[graph.vp.id[curr]].keys() 372 | if uid == graph.vp.id[final_path[-2]] 373 | ] 374 | if len(curr_links) == 1: 375 | curr = curr_links[0] 376 | logger.debug("single link next: {0}".format(graph.vp.id[curr])) 377 | else: 378 | logger.debug("No/more link in here, end entension") 379 | curr = None 380 | else: 381 | curr = None 382 | logger.debug("Not in full link or len of path <= 1") 383 | 384 | unode = simp_node_dict[contig[0]] 385 | if len(contig) == 1 and final_path[-1] not in unode.in_neighbors(): 386 | visited[contig[0]] = False 387 | final_path.pop(0) 388 | curr = unode 389 | # from head to the curr, or to the non-extendable end 390 | logger.debug("s-c extension") 391 | while curr != None and not visited[graph.vp.id[curr]]: 392 | visited[graph.vp.id[curr]] = True 393 | final_path.insert(0, curr) 394 | in_branches = list([n for n in curr.in_neighbors()]) 395 | if len(in_branches) == 0: 396 | curr = None 397 | logger.debug("Reach the end") 398 | elif len(in_branches) == 1: 399 | curr = in_branches[0] 400 | logger.debug("direct extending.. 
{0}".format(graph.vp.id[curr])) 401 | else: 402 | if graph.vp.id[curr] in full_link and len(final_path) > 1: 403 | logger.debug("Curr is Branch") 404 | curr_links = [ 405 | simp_node_dict[uid] 406 | for (uid, wid) in full_link[graph.vp.id[curr]].keys() 407 | if wid == graph.vp.id[final_path[1]] 408 | ] 409 | if len(curr_links) == 1: 410 | curr = curr_links[0] 411 | logger.debug("single link next: {0}".format(graph.vp.id[curr])) 412 | else: 413 | logger.debug("No/more link in here, end extension") 414 | curr = None 415 | else: 416 | curr = None 417 | logger.debug("Not in full link or len of path <= 1") 418 | return final_path 419 | 420 | 421 | def get_bubble_nodes(simp_node_dict: dict, contig: list): 422 | bubbles = [] 423 | for no in contig: 424 | if simp_node_dict[no].in_degree() == 1 and simp_node_dict[no].out_degree() == 1: 425 | bubbles.append(simp_node_dict[no]) 426 | return bubbles 427 | 428 | 429 | def reduce_graph( 430 | graph: Graph, 431 | simp_node_dict: dict, 432 | usages: dict, 433 | full_link: dict, 434 | logger: Logger, 435 | path, 436 | pcov, 437 | threshold, 438 | ): 439 | del_nodes_ids = [] 440 | for node in path: 441 | usages[graph.vp.id[node]] += 1 442 | graph.vp.dp[node] -= pcov 443 | if graph.vp.dp[node] <= threshold: 444 | del_nodes_ids.append(graph.vp.id[node]) 445 | graph.vp.color[node] = "gray" 446 | usages.pop(graph.vp.id[node]) 447 | logger.debug(list_to_string(del_nodes_ids, "invalid nodes")) 448 | for links in full_link.values(): 449 | for uid, wid in list(links.keys()): 450 | if ( 451 | graph.vp.color[simp_node_dict[uid]] != "black" 452 | or graph.vp.color[simp_node_dict[wid]] != "black" 453 | ): 454 | links.pop((uid, wid)) 455 | logger.debug("[D]{0}, {1}".format(uid, wid)) 456 | 457 | 458 | def reduce_id_simple(id_l: list): 459 | ids = [] 460 | for id in id_l: 461 | for iid in id.split("&"): 462 | if iid.find("*") != -1: 463 | ids.append(iid[: iid.find("*")]) 464 | else: 465 | ids.append(iid) 466 | return ids 467 | 468 | 469 | def reduce_Anode(id: str, sno2ids: dict): 470 | ids = [id] 471 | while any([iid.startswith("A") for iid in ids]): 472 | len_ids = len(ids) 473 | for i in range(len_ids): 474 | if ids[i].startswith("A"): 475 | id_v = ids.pop(i).split("*")[0] 476 | j = i 477 | for subid in sno2ids[id_v]: 478 | ids.insert(j, subid) 479 | j += 1 480 | break 481 | return ids 482 | 483 | 484 | def path_extension( 485 | graph: Graph, 486 | simp_node_dict: dict, 487 | simp_edge_dict: dict, 488 | contig_dict: dict, 489 | full_link: dict, 490 | pe_info: dict, 491 | logger: Logger, 492 | threshold, 493 | temp_dir, 494 | ): 495 | logger.debug( 496 | "-------------------------PATH Extension, delta: {0}".format(threshold) 497 | ) 498 | usages = dict.fromkeys(simp_node_dict.keys(), 0) # record the usage of each nodes 499 | strain_dict = {} 500 | rid = 1 501 | sno2ids = dict() 502 | while len(contig_dict) > 0: 503 | # perform trivial split 504 | prev_ids = list(simp_node_dict.keys()) 505 | trivial_split_count, id_mapping = global_trivial_split( 506 | graph, simp_node_dict, simp_edge_dict, logger 507 | ) 508 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 509 | graph, 510 | simp_node_dict, 511 | simp_edge_dict, 512 | logger, 513 | "{0}/gfa/graph_S{1}.gfa".format(temp_dir, rid), 514 | ) 515 | red_id_mapping = contig_dict_remapping( 516 | graph, 517 | simp_node_dict, 518 | simp_edge_dict, 519 | contig_dict, 520 | id_mapping, 521 | prev_ids, 522 | logger, 523 | ) 524 | # update links 525 | for no in list(full_link.keys()): 526 | if no not in simp_node_dict: 
527 | full_link.pop(no) 528 | else: 529 | kept_link = full_link.pop(no) 530 | node = simp_node_dict[no] 531 | for (uid, wid), pe in list(kept_link.items()): 532 | # if len(red_id_mapping[uid]) != 1 or len(red_id_mapping[wid]) != 1: 533 | # kept_link.pop((uid, wid)) 534 | # else: 535 | # kept_link[(list(red_id_mapping[uid])[0], list(red_id_mapping[wid])[0])] = pe 536 | kept_link.pop((uid, wid)) 537 | if len(red_id_mapping[uid]) == 1 or len(red_id_mapping[wid]) == 1: 538 | for uuid in red_id_mapping[uid]: 539 | for wwid in red_id_mapping[wid]: 540 | if ( 541 | (uuid, wwid) not in kept_link 542 | and (simp_node_dict[uuid] in node.in_neighbors()) 543 | and (simp_node_dict[wwid] in node.out_neighbors()) 544 | ): 545 | kept_link[(uuid, wwid)] = pe 546 | full_link[no] = kept_link 547 | # update usages 548 | for no, u in list(usages.items()): 549 | usages.pop(no) 550 | for new_no in red_id_mapping[no]: 551 | usages[new_no] = u 552 | ############################ 553 | # get longest contig 554 | (longest_cno, [contig, clen, ccov]) = max( 555 | contig_dict.items(), key=lambda tp: tp[1][1] 556 | ) 557 | contig_dict.pop(longest_cno) 558 | if all(usages[cn] > 0 for cn in contig): 559 | print_contig( 560 | longest_cno, clen, ccov, contig, logger, "-----> Used previously" 561 | ) 562 | continue 563 | if any(graph.vp.color[simp_node_dict[no]] == "gray" for no in contig): 564 | print_contig( 565 | longest_cno, 566 | clen, 567 | ccov, 568 | contig, 569 | logger, 570 | "-----> Some node low cov, skip", 571 | ) 572 | continue 573 | 574 | cbubbles = get_bubble_nodes(simp_node_dict, contig) 575 | bbl_cov = ( 576 | numpy.median([graph.vp.dp[node] for node in cbubbles]) 577 | if len(cbubbles) != 0 578 | else ccov 579 | ) 580 | print_contig( 581 | longest_cno, 582 | clen, 583 | bbl_cov, 584 | contig, 585 | logger, 586 | "-----> Current extending contig: org ccov: {0}, use min {1}".format( 587 | ccov, min(ccov, bbl_cov) 588 | ), 589 | ) 590 | 591 | path = contig_extension( 592 | graph, 593 | simp_node_dict, 594 | contig, 595 | min(ccov, bbl_cov), 596 | full_link, 597 | logger, 598 | threshold, 599 | ) 600 | pno = "A" + str(rid) 601 | plen = path_len(graph, path) 602 | path_ids = [graph.vp.id[n] for n in path] 603 | sno2ids[pno] = [] 604 | for pid in path_ids: 605 | if pid in sno2ids: 606 | sno2ids[pno].extend(sno2ids[pid]) 607 | else: 608 | sno2ids[pno].append(pid) 609 | pbubbles = get_bubble_nodes(simp_node_dict, path_ids) 610 | bbl_pcov = ( 611 | numpy.median([graph.vp.dp[node] for node in pbubbles]) 612 | if len(pbubbles) != 0 613 | else ccov 614 | ) 615 | pcov = min([ccov, bbl_pcov, bbl_cov]) 616 | logger.debug( 617 | path_to_id_string( 618 | graph, path, "---*extended from contig {0}".format(longest_cno) 619 | ) 620 | ) 621 | logger.debug( 622 | "name: {0}, plen: {1}, pcov: {2}, bubble cov: {3}".format( 623 | pno, plen, pcov, bbl_pcov 624 | ) 625 | ) 626 | strain_dict[pno] = [sno2ids[pno], plen, pcov] 627 | for pid in path_ids: 628 | if pid in strain_dict: 629 | strain_dict.pop(pid) 630 | path_ins = [n for n in path[0].in_neighbors()] 631 | path_outs = [n for n in path[-1].out_neighbors()] 632 | if len(path_ins) == 0 and len(path_outs) == 0: 633 | # both end st 634 | logger.debug("st isolated, add to strain") 635 | reduce_graph( 636 | graph, simp_node_dict, usages, full_link, logger, path, pcov, threshold 637 | ) 638 | elif len(path_ins) != 0 and len(path_outs) == 0: 639 | if len(path) > 1: 640 | logger.debug("left connected, wait") 641 | reduce_graph( 642 | graph, 643 | simp_node_dict, 644 | usages, 645 | 
full_link, 646 | logger, 647 | path[1:], 648 | pcov, 649 | threshold, 650 | ) 651 | pnode = graph_add_vertex( 652 | graph, simp_node_dict, pno, pcov, path_to_seq(graph, path[1:], pno) 653 | ) 654 | graph_add_edge( 655 | graph, 656 | simp_edge_dict, 657 | path[0], 658 | pnode, 659 | graph.ep.overlap[graph.edge(path[0], path[1])], 660 | pcov, 661 | ) 662 | usages[pno] = 0 663 | elif len(path_ins) == 0 and len(path_outs) != 0: 664 | if len(path) > 1: 665 | logger.debug("right connected, wait") 666 | reduce_graph( 667 | graph, 668 | simp_node_dict, 669 | usages, 670 | full_link, 671 | logger, 672 | path[:-1], 673 | pcov, 674 | threshold, 675 | ) 676 | pnode = graph_add_vertex( 677 | graph, simp_node_dict, pno, pcov, path_to_seq(graph, path[:-1], pno) 678 | ) 679 | graph_add_edge( 680 | graph, 681 | simp_edge_dict, 682 | pnode, 683 | path[-1], 684 | graph.ep.overlap[graph.edge(path[-2], path[-1])], 685 | pcov, 686 | ) 687 | usages[pno] = 0 688 | else: 689 | if len(path) > 1: 690 | logger.debug("both connected, wait") 691 | reduce_graph( 692 | graph, 693 | simp_node_dict, 694 | usages, 695 | full_link, 696 | logger, 697 | path[1:-1], 698 | pcov, 699 | threshold, 700 | ) 701 | if len(path[1:-1]) > 0: 702 | pnode = graph_add_vertex( 703 | graph, 704 | simp_node_dict, 705 | pno, 706 | pcov, 707 | path_to_seq(graph, path[1:-1], pno), 708 | ) 709 | graph_add_edge( 710 | graph, 711 | simp_edge_dict, 712 | path[0], 713 | pnode, 714 | graph.ep.overlap[graph.edge(path[0], path[1])], 715 | pcov, 716 | ) 717 | graph_add_edge( 718 | graph, 719 | simp_edge_dict, 720 | pnode, 721 | path[-1], 722 | graph.ep.overlap[graph.edge(path[-2], path[-1])], 723 | pcov, 724 | ) 725 | usages[pno] = 0 726 | 727 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 728 | graph, 729 | simp_node_dict, 730 | simp_edge_dict, 731 | logger, 732 | "{0}/gfa/graph_S{1}post.gfa".format(temp_dir, rid), 733 | ) 734 | for cno in list(contig_dict.keys()): 735 | delete = False 736 | for no in contig_dict[cno][0]: 737 | if no not in simp_node_dict: 738 | delete = True 739 | if delete: 740 | contig_dict.pop(cno) 741 | rid += 1 742 | 743 | # remove trivial split multiple nodes 744 | seq_dict = {} 745 | for node in graph.vertices(): 746 | if graph.vp.seq[node] not in seq_dict: 747 | seq_dict[graph.vp.seq[node]] = [] 748 | seq_dict[graph.vp.seq[node]].append(node) 749 | 750 | for _, sp_nodes in seq_dict.items(): 751 | if len(sp_nodes) > 1: 752 | sorted_sp_nodes = sorted( 753 | sp_nodes, key=lambda vnode: graph.vp.dp[vnode], reverse=True 754 | ) 755 | for vnode in sorted_sp_nodes[1:]: 756 | graph_remove_vertex(graph, simp_node_dict, graph.vp.id[vnode]) 757 | usages.pop(graph.vp.id[vnode]) 758 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 759 | graph, 760 | simp_node_dict, 761 | simp_edge_dict, 762 | logger, 763 | "{0}/gfa/graph_S_final.gfa".format(temp_dir), 764 | ) 765 | # assign link information 766 | final_link_info = {} 767 | for node in graph.vertices(): 768 | for node2 in graph.vertices(): 769 | if node > node2: 770 | continue 771 | 772 | nid1s = reduce_id_simple(reduce_Anode(graph.vp.id[node], sno2ids)) 773 | # nid1s = ( 774 | # reduce_id_simple([graph.vp.id[node]]) 775 | # if graph.vp.id[node][0] != "A" 776 | # else reduce_id_simple(sno2ids[graph.vp.id[node].split("*")[0]]) 777 | # ) 778 | nid2s = reduce_id_simple(reduce_Anode(graph.vp.id[node2], sno2ids)) 779 | # nid2s = ( 780 | # reduce_id_simple([graph.vp.id[node2]]) 781 | # if graph.vp.id[node2][0] != "A" 782 | # else 
reduce_id_simple(sno2ids[graph.vp.id[node2].split("*")[0]]) 783 | # ) 784 | kpair = ( 785 | min(graph.vp.id[node], graph.vp.id[node2]), 786 | max(graph.vp.id[node], graph.vp.id[node2]), 787 | ) 788 | 789 | logger.debug("nid1s: {0}, nid2s: {1}".format(nid1s, nid2s)) 790 | logger.debug( 791 | "node1id: {0}, node2id: {1}".format( 792 | graph.vp.id[node], graph.vp.id[node2] 793 | ) 794 | ) 795 | final_link_info[kpair] = 0 796 | for id1 in nid1s: 797 | for id2 in nid2s: 798 | inner_kpair = (min(id1, id2), max(id1, id2)) 799 | final_link_info[kpair] += pe_info[inner_kpair] 800 | 801 | nt_branches = get_non_trivial_branches(graph, simp_node_dict) 802 | final_links = {} 803 | for no, node in nt_branches.items(): 804 | final_links[no] = {} 805 | us = [graph.vp.id[src] for src in node.in_neighbors()] 806 | ws = [graph.vp.id[tgt] for tgt in node.out_neighbors()] 807 | logger.debug("---------------------------------------------") 808 | logger.debug( 809 | "current non-trivial branch: {0}, in-degree: {1}, out-degree: {2}".format( 810 | no, len(us), len(ws) 811 | ) 812 | ) 813 | combs = [] 814 | in_usage = dict.fromkeys(us, 0) 815 | 816 | out_usage = dict.fromkeys(ws, 0) 817 | for uid in us: 818 | for wid in ws: 819 | combs.append( 820 | (uid, wid, final_link_info[(min(uid, wid), max(uid, wid))]) 821 | ) 822 | sorted_comb = sorted(combs, key=lambda x: x[2], reverse=True) 823 | for uid, wid, lf in sorted_comb: 824 | logger.debug("---------------------") 825 | if lf > 0 and in_usage[uid] == 0 and out_usage[wid] == 0: 826 | logger.debug( 827 | "-----SEC LINK {0} -> {1} LINK: {2}-----".format(uid, wid, lf) 828 | ) 829 | logger.debug("- unique link [ > 0] supported case, added") 830 | final_links[no][(uid, wid)] = lf 831 | in_usage[uid] += 1 832 | out_usage[wid] += 1 833 | 834 | # add all nodes that were not used during contig extension to the final result set 835 | for node in sorted( 836 | graph.vertices(), key=lambda nd: len(graph.vp.seq[nd]), reverse=True 837 | ): 838 | if len(graph.vp.seq[node]) <= 600: 839 | break 840 | if usages[graph.vp.id[node]] == 0: 841 | logger.debug("Extend from free node: {0}".format(graph.vp.id[node])) 842 | ccov = graph.vp.dp[node] 843 | path = final_extension( 844 | graph, simp_node_dict, [graph.vp.id[node]], final_links, logger 845 | ) 846 | pno = "N" + str(rid) 847 | plen = path_len(graph, path) 848 | path_ids = [graph.vp.id[n] for n in path] 849 | pids = [] 850 | for pid in path_ids: 851 | if pid in sno2ids: 852 | pids.extend(sno2ids[pid]) 853 | else: 854 | pids.append(pid) 855 | for pid in path_ids: 856 | if pid in strain_dict: 857 | strain_dict.pop(pid) 858 | pbubbles = get_bubble_nodes(simp_node_dict, path_ids) 859 | pcov = ( 860 | numpy.median([graph.vp.dp[node] for node in pbubbles]) 861 | if len(pbubbles) != 0 862 | else graph.vp.dp[node] 863 | ) 864 | logger.debug( 865 | path_to_id_string( 866 | graph, 867 | path, 868 | "---*extended from free node {0}".format(graph.vp.id[node]), 869 | ) 870 | ) 871 | logger.debug("name: {0}, plen: {1}, pcov: {2}".format(pno, plen, pcov)) 872 | strain_dict[pno] = [pids, plen, pcov] 873 | for node in path: 874 | usages[graph.vp.id[node]] += 1 875 | rid += 1 876 | for sno, [_, _, scov] in list(strain_dict.items()): 877 | if scov <= 2 * threshold: 878 | strain_dict.pop(sno) 879 | 880 | # split zipped vertices 881 | rid = "" 882 | for cno in strain_dict.keys(): 883 | [contig, clen, ccov] = strain_dict[cno] 884 | rcontig = [] 885 | for id in contig: 886 | rcontig.extend(reduce_id_simple(reduce_Anode(id, sno2ids))) 887 | # for iid in
str(id).split("&"): 888 | # if iid.find("*") != -1: 889 | # rid = iid[: iid.find("*")] 890 | # else: 891 | # rid = iid 892 | 893 | # if rid in sno2ids: 894 | # rcontig.extend(sno2ids[rid]) 895 | # else: 896 | # rcontig.append(rid) 897 | strain_dict[cno] = [rcontig, clen, ccov] 898 | 899 | return strain_dict, usages 900 | -------------------------------------------------------------------------------- /utils/VStrains_Decomposition.py: -------------------------------------------------------------------------------- 1 | from utils.VStrains_Utilities import * 2 | from utils.VStrains_IO import store_reinit_graph 3 | import matplotlib.pyplot as plt 4 | import numpy 5 | 6 | 7 | def link_split( 8 | sec_comb: list, 9 | kept_link: dict, 10 | in_usage: dict, 11 | in_capacity: dict, 12 | out_usage: dict, 13 | out_capacity: dict, 14 | logger, 15 | ): 16 | """update split plan using paired end & single end information""" 17 | logger.debug("attempt to split via paired end information") 18 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True) 19 | for uid, wid, pe in sorted_sec_comb: 20 | if pe <= 0: 21 | break 22 | logger.debug("-----SEC LINK {0} -> {1} PE: {2}".format(uid, wid, pe)) 23 | logger.debug("Capacity: {0} -> {1}".format(in_capacity[uid], out_capacity[wid])) 24 | logger.debug("- distinct compatiable case, added") 25 | in_usage[uid] += 1 26 | out_usage[wid] += 1 27 | kept_link[(uid, wid)] = ((in_capacity[uid] + out_capacity[wid]) / 2, pe) 28 | return 29 | 30 | 31 | def cov_split( 32 | us: list, 33 | ws: list, 34 | pe_info: dict, 35 | sec_comb: list, 36 | kept_link: dict, 37 | in_usage: dict, 38 | in_capacity: dict, 39 | out_usage: dict, 40 | out_capacity: dict, 41 | logger, 42 | ): 43 | """update split plan using coverage information""" 44 | logger.debug("attempt to split via coverage information") 45 | logger.debug( 46 | "align paired end/single end information first (if any) to isolated nodes" 47 | ) 48 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True) 49 | for uid, wid, pe in sorted_sec_comb: 50 | if pe <= 0: 51 | break 52 | if in_usage[uid] > 0 or out_usage[wid] > 0: 53 | continue 54 | logger.debug("-----SEC LINK {0} -> {1} PE: {2}-----".format(uid, wid, pe)) 55 | logger.debug("Capacity: {0} -> {1}".format(in_capacity[uid], out_capacity[wid])) 56 | logger.debug("- link [ > 0] supported case, added") 57 | in_usage[uid] += 1 58 | out_usage[wid] += 1 59 | kept_link[(uid, wid)] = ((in_capacity[uid] + out_capacity[wid]) / 2, pe) 60 | 61 | logger.debug("obtain best match via coverage similarity") 62 | for uid in us: 63 | if in_usage[uid] > 0: 64 | continue 65 | opt_ws = sorted(ws, key=lambda wwid: abs(in_capacity[uid] - out_capacity[wwid])) 66 | wid = opt_ws[0] 67 | opt_us = sorted(us, key=lambda uuid: abs(in_capacity[uuid] - out_capacity[wid])) 68 | if opt_us[0] == uid and out_usage[wid] == 0 and (uid, wid) not in kept_link: 69 | delta = 2 * abs(in_capacity[uid] - out_capacity[wid]) 70 | logger.debug( 71 | "Found coverage best match: {0} -> {1} with cov: {2}, {3}, checking delta bound: {4}".format( 72 | uid, wid, in_capacity[uid], out_capacity[wid], delta 73 | ) 74 | ) 75 | if ( 76 | abs(in_capacity[opt_us[1]] - out_capacity[wid]) <= delta 77 | or abs(in_capacity[uid] - out_capacity[opt_ws[1]]) <= delta 78 | ): 79 | logger.debug("ambiguous matching, skip") 80 | else: 81 | logger.debug("added") 82 | in_usage[uid] += 1 83 | out_usage[wid] += 1 84 | kept_link[(uid, wid)] = ( 85 | (in_capacity[uid] + out_capacity[wid]) / 2, 86 | pe_info[(min(uid, wid), max(uid, 
wid))], 87 | ) 88 | return 89 | 90 | 91 | def balance_split( 92 | graph: Graph, 93 | simp_node_dict: dict, 94 | simp_edge_dict: dict, 95 | contig_dict: dict, 96 | pe_info: dict, 97 | logger: Logger, 98 | ref_file: str, 99 | temp_dir: str, 100 | count_id: int, 101 | threshold, 102 | is_prim: bool, 103 | ): 104 | logger.info( 105 | "balance split using contigs & paired-end links & coverage information.. isPrim: {0}".format( 106 | is_prim 107 | ) 108 | ) 109 | correct_X = [] 110 | correct_Y = [] 111 | false_error_X = [] 112 | false_error_Y = [] 113 | error_X = [] 114 | error_Y = [] 115 | error_text = [] 116 | cut = 100 117 | 118 | # detect all non-trivial branches right now 119 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict) 120 | split_branches = [] 121 | node_to_contig_dict, _ = contig_map_node(contig_dict) 122 | for no, node in non_trivial_branches.items(): 123 | us = [ 124 | graph.vp.id[e.source()] 125 | for e in node.in_edges() 126 | if graph.ep.color[e] == "black" 127 | ] 128 | ws = [ 129 | graph.vp.id[e.target()] 130 | for e in node.out_edges() 131 | if graph.ep.color[e] == "black" 132 | ] 133 | logger.debug("---------------------------------------------") 134 | logger.debug( 135 | "current non-trivial branch: {0}, in-degree: {1}, out-degree: {2}".format( 136 | no, len(us), len(ws) 137 | ) 138 | ) 139 | 140 | # verify whether this branch is split-able 141 | if any([pe_info[(uid, uid)] == None for uid in us]) or any( 142 | [pe_info[(wid, wid)] == None for wid in ws] 143 | ): 144 | logger.debug( 145 | "current non-trivial branch: {0} is related to current iteration, split later".format( 146 | no 147 | ) 148 | ) 149 | continue 150 | if not is_non_trivial(graph, node): 151 | logger.debug( 152 | "current non-trivial branch: {0} is not non-trivial, potential bug".format( 153 | no 154 | ) 155 | ) 156 | continue 157 | if len(us) != len(ws): 158 | logger.debug("Not N-N split, skip") 159 | continue 160 | 161 | # check if link-split 162 | split_via_link = True 163 | 164 | # do not perform link-split if any leaf is from a split node 165 | for id in us + ws: 166 | singles = id.split("&") 167 | if all([single.count("*") > 0 for single in singles]): 168 | logger.debug( 169 | "leaf:{0} consists entirely of split branch nodes, no link information, skip link split".format( 170 | id 171 | ) 172 | ) 173 | split_via_link = False 174 | break 175 | 176 | # do not perform link-split if no combination has link information 177 | if all( 178 | [pe_info[(min(uid, wid), max(uid, wid))] == 0 for uid in us for wid in ws] 179 | ): 180 | logger.debug( 181 | "current branch node too long, no link information, skip link split" 182 | ) 183 | split_via_link = False 184 | 185 | # add contig supports 186 | support_contigs = node_to_contig_dict.get(no, []) 187 | con_info = {} 188 | for cno in support_contigs: 189 | [contig, clen, ccov] = contig_dict[cno] 190 | loc = contig.index(no) 191 | if loc > 0 and loc < len(contig) - 1: 192 | con_info[(contig[loc - 1], contig[loc + 1])] = con_info.get( 193 | (contig[loc - 1], contig[loc + 1]), [] 194 | ) 195 | con_info[(contig[loc - 1], contig[loc + 1])].append((cno, clen, ccov)) 196 | print_contig( 197 | cno, 198 | clen, 199 | round(ccov, 2), 200 | contig[max(loc - 1, 0) : loc + 2], 201 | logger, 202 | "support contig", 203 | ) 204 | 205 | # debug only 206 | # obtain perfect split via reference 207 | expect_link = [] 208 | ref_pair_dict = {} 209 | ref_all_dict = {} 210 | if ref_file: 211 | lrefs = set() 212 | rrefs = set() 213 | error_nos = set() 214 | for uid in us: 215 | for wid in ws: 216 | u =
simp_node_dict[uid] 217 | w = simp_node_dict[wid] 218 | ref_l = best_aln_score(graph, "L", [u], ref_file, temp_dir) 219 | best_ref_l = [ 220 | ref 221 | for [_, l, ref, nm] in ref_l 222 | if nm == 0 and l == len(graph.vp.seq[u]) 223 | ] 224 | ref_r = best_aln_score(graph, "R", [w], ref_file, temp_dir) 225 | best_ref_r = [ 226 | ref 227 | for [_, l, ref, nm] in ref_r 228 | if nm == 0 and l == len(graph.vp.seq[w]) 229 | ] 230 | lrefs = lrefs.union(best_ref_l) 231 | rrefs = rrefs.union(best_ref_r) 232 | ref_pair_dict[(uid, wid)] = set(best_ref_l).intersection( 233 | set(best_ref_r) 234 | ) 235 | ref_all_dict[(uid, wid)] = set( 236 | [ref for [_, _, ref, nm] in ref_l if nm < 5] 237 | ).union(set([ref for [_, _, ref, nm] in ref_r if nm < 5])) 238 | if len(ref_pair_dict[(uid, wid)]) > 0: 239 | expect_link.append((uid, wid)) 240 | if len(best_ref_l) == 0: 241 | error_nos.add(uid) 242 | if len(best_ref_r) == 0: 243 | error_nos.add(wid) 244 | sym_diff = lrefs.symmetric_difference(rrefs) 245 | if len(sym_diff) > 0: 246 | logger.debug( 247 | "Current branch has forced mismatch connections for the following strains: {0}".format( 248 | sym_diff 249 | ) 250 | ) 251 | # debug only 252 | 253 | kept_link = {} 254 | sec_comb = [] 255 | # init node usage for current branch 256 | in_usage = dict.fromkeys(us, 0) 257 | in_capacity = {} 258 | for uid in us: 259 | in_capacity[uid] = graph.ep.flow[simp_edge_dict[(uid, no)]] 260 | 261 | out_usage = dict.fromkeys(ws, 0) 262 | out_capacity = {} 263 | for wid in ws: 264 | out_capacity[wid] = graph.ep.flow[simp_edge_dict[(no, wid)]] 265 | 266 | # align contig link first, and update status 267 | logger.debug("align contig link first") 268 | for uid in us: 269 | for wid in ws: 270 | logger.debug("---------------------") 271 | u = simp_node_dict[uid] 272 | w = simp_node_dict[wid] 273 | curr_pe = pe_info[(min(uid, wid), max(uid, wid))] 274 | 275 | logger.debug("{0} -> {1} PE: {2}".format(uid, wid, curr_pe)) 276 | logger.debug( 277 | "cov info: {0}[{1}] -> {2}[{3}]".format( 278 | graph.ep.flow[graph.edge(u, node)], 279 | pe_info[(min(uid, no), max(uid, no))], 280 | graph.ep.flow[graph.edge(node, w)], 281 | pe_info[(min(no, wid), max(no, wid))], 282 | ) 283 | ) 284 | if ref_file: 285 | logger.debug( 286 | "intersect reference: {0}".format(ref_pair_dict[(uid, wid)]) 287 | ) 288 | # potential incorrect matching, but supported by links 289 | if len(ref_pair_dict[(uid, wid)]) == 0 and curr_pe > 0: 290 | logger.debug("False Positive case, WARN") 291 | accept = False 292 | if (uid, wid) in con_info: 293 | logger.debug( 294 | "current link supported by contig: {0}, added".format( 295 | con_info[(uid, wid)] 296 | ) 297 | ) 298 | accept = True 299 | if uid == wid: 300 | logger.debug( 301 | "current link is a self link: {0}, potential cyclic strain, added".format( 302 | uid 303 | ) 304 | ) 305 | accept = True 306 | 307 | if accept: 308 | in_usage[uid] += 1 309 | out_usage[wid] += 1 310 | kept_link[(uid, wid)] = ( 311 | (in_capacity[uid] + out_capacity[wid]) / 2, 312 | curr_pe, 313 | ) 314 | else: 315 | logger.debug("current link is secondary choice, process later") 316 | sec_comb.append((uid, wid, curr_pe)) 317 | if is_prim: 318 | if split_via_link: 319 | link_split( 320 | sec_comb, 321 | kept_link, 322 | in_usage, 323 | in_capacity, 324 | out_usage, 325 | out_capacity, 326 | logger, 327 | ) 328 | else: 329 | # secondary split, via link first, then coverage 330 | cov_split( 331 | us, 332 | ws, 333 | pe_info, 334 | sec_comb, 335 | kept_link, 336 | in_usage, 337 | in_capacity, 338 |
out_usage, 339 | out_capacity, 340 | logger, 341 | ) 342 | if not ( 343 | all([u == 1 for u in in_usage.values()]) 344 | and all([v == 1 for v in out_usage.values()]) 345 | ): 346 | logger.debug("->N-N split not satisfied, skip: {0}".format(kept_link)) 347 | continue 348 | worst_pair_diff = max( 349 | [ 350 | abs(in_capacity[uid] - out_capacity[wid]) 351 | for (uid, wid) in kept_link.keys() 352 | ] 353 | ) 354 | if worst_pair_diff > 4 * threshold: 355 | logger.debug( 356 | "worst pair coverage diff greater than 4 delta: {0} > {1}, too uneven, skip: {2}".format( 357 | worst_pair_diff, 4 * threshold, kept_link 358 | ) 359 | ) 360 | continue 361 | logger.debug("->perform split, all kept links: {0}".format(kept_link)) 362 | if ref_file: 363 | logger.debug("->expected links: {0}".format(expect_link)) 364 | if set(kept_link) != set(expect_link): 365 | logger.debug("Incorrect split") 366 | else: 367 | logger.debug("Correct split") 368 | 369 | split_branches.append(no) 370 | link2subs = {} 371 | counter = 0 372 | for (uid, wid), (sub_flow, pe) in kept_link.items(): 373 | logger.debug("--------> {0} - {1}".format(uid, wid)) 374 | # debug only 375 | if ref_file: 376 | if len(ref_pair_dict[(uid, wid)]) != 0: 377 | logger.debug("best pair") 378 | if pe <= cut: 379 | correct_X.append(pe) 380 | correct_Y.append(sub_flow) 381 | if pe < 5: 382 | logger.debug( 383 | "correct node with 0 pest {0}->{1}->{2}, with branch size: {3}".format( 384 | uid, no, wid, len(graph.vp.seq[node]) 385 | ) 386 | ) 387 | else: 388 | is_graph_error = False 389 | if uid in error_nos: 390 | logger.debug( 391 | "src: {0} is an erroneous graph node, no optimal ref".format( 392 | uid 393 | ) 394 | ) 395 | is_graph_error = True 396 | if wid in error_nos: 397 | logger.debug( 398 | "tgt: {0} is an erroneous graph node, no optimal ref".format( 399 | wid 400 | ) 401 | ) 402 | is_graph_error = True 403 | if len(ref_all_dict[(uid, wid)].intersection(sym_diff)) > 0: 404 | is_graph_error = True 405 | if is_graph_error: 406 | if pe <= cut: 407 | false_error_X.append(pe) 408 | false_error_Y.append(sub_flow) 409 | logger.debug("false positive error pair") 410 | else: 411 | if pe <= cut: 412 | error_X.append(pe) 413 | error_Y.append(sub_flow) 414 | error_text.append("{0}:{1}:{2}".format(uid, wid, pe)) 415 | logger.debug("error pair") 416 | # debug only 417 | # perform split 418 | sub_id = no + "*" + str(counter) 419 | counter += 1 420 | sub_node = graph_add_vertex( 421 | graph, simp_node_dict, sub_id, sub_flow, graph.vp.seq[node] 422 | ) 423 | 424 | graph_add_edge( 425 | graph, 426 | simp_edge_dict, 427 | simp_node_dict[uid], 428 | sub_node, 429 | graph.ep.overlap[simp_edge_dict[(uid, no)]], 430 | sub_flow, 431 | ) 432 | 433 | graph_add_edge( 434 | graph, 435 | simp_edge_dict, 436 | sub_node, 437 | simp_node_dict[wid], 438 | graph.ep.overlap[simp_edge_dict[(no, wid)]], 439 | sub_flow, 440 | ) 441 | link2subs[(uid, wid)] = sub_id 442 | 443 | # keep track of related contig record 444 | for cno in support_contigs: 445 | curr_contig, clen, ccov = contig_dict.pop(cno) 446 | branch_ind = curr_contig.index(no) 447 | uid = curr_contig[branch_ind - 1] if branch_ind > 0 else None 448 | wid = ( 449 | curr_contig[branch_ind + 1] 450 | if branch_ind < len(curr_contig) - 1 451 | else None 452 | ) 453 | if uid != None and wid != None: 454 | # unique mapping 455 | curr_contig[branch_ind] = link2subs[(uid, wid)] 456 | contig_dict[cno] = [curr_contig, clen, ccov] 457 | elif uid == None and wid == None: 458 | for sub_id in link2subs.values(): 459 |
# all possible contigs 460 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [ 461 | [sub_id], 462 | len(graph.vp.seq[simp_node_dict[sub_id]]), 463 | graph.vp.dp[simp_node_dict[sub_id]], 464 | ] 465 | elif uid != None and wid == None: 466 | for (uid2, _), sub_id in link2subs.items(): 467 | if uid == uid2: 468 | curr_contig[branch_ind] = sub_id 469 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [ 470 | list(curr_contig), 471 | clen, 472 | ccov, 473 | ] 474 | else: 475 | for (_, wid2), sub_id in link2subs.items(): 476 | if wid == wid2: 477 | curr_contig[branch_ind] = sub_id 478 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [ 479 | list(curr_contig), 480 | clen, 481 | ccov, 482 | ] 483 | 484 | # remove related edges and vertex, update contig tracker 485 | for uid in us: 486 | graph_remove_edge(graph, simp_edge_dict, uid, no) 487 | for wid in ws: 488 | graph_remove_edge(graph, simp_edge_dict, no, wid) 489 | graph_remove_vertex(graph, simp_node_dict, no) 490 | node_to_contig_dict, _ = contig_map_node(contig_dict) 491 | 492 | # update link info 493 | for (uid, wid), sub_id in link2subs.items(): 494 | for nno in simp_node_dict.keys(): 495 | pe_info[(min(sub_id, nno), max(sub_id, nno))] = None 496 | for pu, pv in list(pe_info.keys()): 497 | if pu == no or pv == no: 498 | # out of date 499 | pe_info.pop((min(pu, pv), max(pu, pv))) 500 | # final step: assign all None-valued pe links to 0 501 | for k in pe_info.keys(): 502 | if pe_info[k] == None: 503 | pe_info[k] = 0 504 | logger.debug("No. of branches removed: " + str(len(set(split_branches)))) 505 | logger.debug("Split branches: " + list_to_string(set(split_branches))) 506 | logger.info("done") 507 | 508 | # plot the data 509 | if ref_file: 510 | _, (ax1) = plt.subplots(1, 1, figsize=(32, 32)) 511 | ax1.scatter(correct_X, correct_Y, color="red", s=100, label="Correct") 512 | ax1.scatter( 513 | false_error_X, false_error_Y, color="blue", s=100, label="False-Positive" 514 | ) 515 | ax1.scatter(error_X, error_Y, color="green", marker="^", s=100, label="Error") 516 | 517 | for index in range(len(error_X)): 518 | ax1.text(error_X[index], error_Y[index], error_text[index], size=10) 519 | 520 | ax1.set_xlabel("PE") 521 | ax1.set_ylabel("FLOW") 522 | ax1.set_title("Scatter Plot - flow vs pe") 523 | ax1.legend() 524 | plt.yticks(numpy.arange(0, 500, 10)) 525 | plt.xticks(numpy.arange(0, cut + 1, 1)) 526 | plt.savefig( 527 | "{0}{1}".format(temp_dir, "/tmp/scatter_plot_pest_{0}.png".format(count_id)) 528 | ) 529 | 530 | return len(set(split_branches)) 531 | 532 | 533 | def trivial_split( 534 | graph: Graph, 535 | simp_node_dict: dict, 536 | simp_edge_dict: dict, 537 | pe_info: dict, 538 | logger: Logger, 539 | ): 540 | """ 541 | Split the graph: for any (0|1)->N or N->(0|1) branch, split by forking the single edge into N edges.
542 | """ 543 | logger.info("graph trivial split on NT related vertices..") 544 | # detect all non-trivial branches right now 545 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict) 546 | trivial_split_count = 0 547 | id_mapping = {} 548 | for id in simp_node_dict.keys(): 549 | id_mapping[id] = set() 550 | 551 | for ntno, ntnode in non_trivial_branches.items(): 552 | if graph.vp.color[ntnode] != "black": 553 | continue 554 | logger.debug("Current involving NT branch: {0}".format(ntno)) 555 | for inode in set(ntnode.in_neighbors()): 556 | if graph.vp.color[inode] != "black": 557 | continue 558 | ino = graph.vp.id[inode] 559 | if ino not in id_mapping: 560 | id_mapping[ino] = set() 561 | ines = [ue for ue in inode.in_edges() if graph.ep.color[ue] == "black"] 562 | outes = [ve for ve in inode.out_edges() if graph.ep.color[ve] == "black"] 563 | if len(ines) > 1 and len(outes) == 1: 564 | # n to 1 565 | logger.debug("{0}, n->1 split right".format(ino)) 566 | graph.vp.color[inode] = "gray" 567 | graph.ep.color[graph.edge(inode, ntnode)] = "gray" 568 | s = "A" 569 | for i in range(len(ines)): 570 | ine = ines[i] 571 | src = ine.source() 572 | snode = graph_add_vertex( 573 | graph, 574 | simp_node_dict, 575 | ino + "*" + chr(ord(s) + i), 576 | graph.ep.flow[ine], 577 | graph.vp.seq[inode], 578 | ) 579 | graph.ep.color[ine] = "gray" 580 | sedge_in = graph_add_edge( 581 | graph, 582 | simp_edge_dict, 583 | src, 584 | snode, 585 | graph.ep.overlap[ine], 586 | graph.ep.flow[ine], 587 | ) 588 | simp_node_dict[graph.vp.id[snode]] = snode 589 | simp_edge_dict[ 590 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()]) 591 | ] = sedge_in 592 | 593 | sedge_out = graph_add_edge( 594 | graph, 595 | simp_edge_dict, 596 | snode, 597 | ntnode, 598 | graph.ep.overlap[graph.edge(inode, ntnode)], 599 | graph.ep.flow[ine], 600 | ) 601 | simp_edge_dict[ 602 | ( 603 | graph.vp.id[sedge_out.source()], 604 | graph.vp.id[sedge_out.target()], 605 | ) 606 | ] = sedge_out 607 | id_mapping[ino].add(graph.vp.id[snode]) 608 | for nno in simp_node_dict.keys(): 609 | pe_info[ 610 | (min(graph.vp.id[snode], nno), max(graph.vp.id[snode], nno)) 611 | ] = None 612 | trivial_split_count += 1 613 | # update link information 614 | for pu, pv in list(pe_info.keys()): 615 | if pu == ino or pv == ino: 616 | # out of date 617 | pe_info.pop((min(pu, pv), max(pu, pv))) 618 | 619 | for onode in set(ntnode.out_neighbors()): 620 | if graph.vp.color[onode] != "black": 621 | continue 622 | ono = graph.vp.id[onode] 623 | if ono not in id_mapping: 624 | id_mapping[ono] = set() 625 | ines = [ue for ue in onode.in_edges() if graph.ep.color[ue] == "black"] 626 | outes = [ve for ve in onode.out_edges() if graph.ep.color[ve] == "black"] 627 | if len(ines) == 1 and len(outes) > 1: 628 | # 1 to n 629 | logger.debug("{0}, 1->n split left".format(ono)) 630 | graph.vp.color[onode] = "gray" 631 | graph.ep.color[graph.edge(ntnode, onode)] = "gray" 632 | s = "A" 633 | for i in range(len(outes)): 634 | oute = outes[i] 635 | tgt = oute.target() 636 | snode = graph_add_vertex( 637 | graph, 638 | simp_node_dict, 639 | ono + "*" + chr(ord(s) + i), 640 | graph.ep.flow[oute], 641 | graph.vp.seq[onode], 642 | ) 643 | graph.ep.color[oute] = "gray" 644 | sedge_out = graph_add_edge( 645 | graph, 646 | simp_edge_dict, 647 | snode, 648 | tgt, 649 | graph.ep.overlap[oute], 650 | graph.ep.flow[oute], 651 | ) 652 | simp_node_dict[graph.vp.id[snode]] = snode 653 | simp_edge_dict[ 654 | ( 655 | graph.vp.id[sedge_out.source()], 656 | 
graph.vp.id[sedge_out.target()], 657 | ) 658 | ] = sedge_out 659 | 660 | sedge_in = graph_add_edge( 661 | graph, 662 | simp_edge_dict, 663 | ntnode, 664 | snode, 665 | graph.ep.overlap[graph.edge(ntnode, onode)], 666 | graph.ep.flow[oute], 667 | ) 668 | simp_edge_dict[ 669 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()]) 670 | ] = sedge_in 671 | id_mapping[ono].add(graph.vp.id[snode]) 672 | for nno in simp_node_dict.keys(): 673 | pe_info[ 674 | (min(graph.vp.id[snode], nno), max(graph.vp.id[snode], nno)) 675 | ] = None 676 | trivial_split_count += 1 677 | # update link information 678 | for pu, pv in list(pe_info.keys()): 679 | if pu == ono or pv == ono: 680 | # out of date 681 | pe_info.pop((min(pu, pv), max(pu, pv))) 682 | for k in pe_info.keys(): 683 | if pe_info[k] == None: 684 | pe_info[k] = 0 685 | logger.debug( 686 | "Total split trivial branch count: {0}".format(trivial_split_count) 687 | ) 688 | return trivial_split_count, id_mapping 689 | 690 | 691 | def global_trivial_split( 692 | graph: Graph, simp_node_dict: dict, simp_edge_dict: dict, logger: Logger 693 | ): 694 | """ 695 | Split the graph: for any (0|1)->N or N->(0|1) branch, split by forking the single edge into N edges. 696 | """ 697 | logger.info("graph trivial split..") 698 | 699 | BOUND_ITER = len(simp_node_dict) ** 2 700 | has_split = True 701 | trivial_split_count = 0 702 | id_mapping = {} 703 | for id in simp_node_dict.keys(): 704 | id_mapping[id] = set() 705 | while has_split and trivial_split_count < BOUND_ITER: 706 | has_split = False 707 | for id in list(simp_node_dict.keys()): 708 | node = simp_node_dict[id] 709 | if graph.vp.color[node] != "black": 710 | continue 711 | if id not in id_mapping: 712 | id_mapping[id] = set() 713 | ines = [ue for ue in node.in_edges() if graph.ep.color[ue] == "black"] 714 | outes = [ve for ve in node.out_edges() if graph.ep.color[ve] == "black"] 715 | if len(ines) == 1 and len(outes) > 1: 716 | logger.debug(id + " split left") 717 | graph.vp.color[node] = "gray" 718 | ine = ines[0] 719 | src = ine.source() 720 | graph.ep.color[ine] = "gray" 721 | s = "A" 722 | for i in range(len(outes)): 723 | oute = outes[i] 724 | tgt = oute.target() 725 | snode = graph_add_vertex( 726 | graph, 727 | simp_node_dict, 728 | id + "*" + chr(ord(s) + i), 729 | graph.ep.flow[oute], 730 | graph.vp.seq[node], 731 | ) 732 | graph.ep.color[oute] = "gray" 733 | sedge_out = graph_add_edge( 734 | graph, 735 | simp_edge_dict, 736 | snode, 737 | tgt, 738 | graph.ep.overlap[oute], 739 | graph.ep.flow[oute], 740 | ) 741 | simp_node_dict[graph.vp.id[snode]] = snode 742 | simp_edge_dict[ 743 | ( 744 | graph.vp.id[sedge_out.source()], 745 | graph.vp.id[sedge_out.target()], 746 | ) 747 | ] = sedge_out 748 | 749 | sedge_in = graph_add_edge( 750 | graph, 751 | simp_edge_dict, 752 | src, 753 | snode, 754 | graph.ep.overlap[ine], 755 | graph.ep.flow[oute], 756 | ) 757 | simp_edge_dict[ 758 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()]) 759 | ] = sedge_in 760 | id_mapping[id].add(graph.vp.id[snode]) 761 | has_split = True 762 | trivial_split_count += 1 763 | elif len(ines) > 1 and len(outes) == 1: 764 | logger.debug(id + " split right") 765 | graph.vp.color[node] = "gray" 766 | oute = outes[0] 767 | tgt = oute.target() 768 | graph.ep.color[oute] = "gray" 769 | s = "A" 770 | for i in range(len(ines)): 771 | ine = ines[i] 772 | src = ine.source() 773 | snode = graph_add_vertex( 774 | graph, 775 | simp_node_dict, 776 | id + "*" + chr(ord(s) + i), 777 | graph.ep.flow[ine], 778 |
graph.vp.seq[node], 779 | ) 780 | graph.ep.color[ine] = "gray" 781 | sedge_in = graph_add_edge( 782 | graph, 783 | simp_edge_dict, 784 | src, 785 | snode, 786 | graph.ep.overlap[ine], 787 | graph.ep.flow[ine], 788 | ) 789 | simp_node_dict[graph.vp.id[snode]] = snode 790 | simp_edge_dict[ 791 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()]) 792 | ] = sedge_in 793 | 794 | sedge_out = graph_add_edge( 795 | graph, 796 | simp_edge_dict, 797 | snode, 798 | tgt, 799 | graph.ep.overlap[oute], 800 | graph.ep.flow[ine], 801 | ) 802 | simp_edge_dict[ 803 | ( 804 | graph.vp.id[sedge_out.source()], 805 | graph.vp.id[sedge_out.target()], 806 | ) 807 | ] = sedge_out 808 | id_mapping[id].add(graph.vp.id[snode]) 809 | has_split = True 810 | trivial_split_count += 1 811 | else: 812 | pass 813 | if trivial_split_count >= BOUND_ITER: 814 | logger.warning("Strange topology detected, exit trivial split immediately") 815 | return None, id_mapping 816 | else: 817 | logger.debug("No. of trivial branches removed: " + str(trivial_split_count)) 818 | logger.info("done") 819 | return trivial_split_count, id_mapping 820 | 821 | 822 | def edge_cleaning( 823 | graph: Graph, simp_edge_dict: dict, contig_dict: dict, pe_info: dict, logger: Logger 824 | ): 825 | """ 826 | Detect the crossing edges and select the confident edges only. 827 | """ 828 | un_assigned_edge = graph.num_edges() 829 | assigned = dict.fromkeys( 830 | [(graph.vp.id[e.source()], graph.vp.id[e.target()]) for e in graph.edges()], 831 | False, 832 | ) 833 | _, edge_to_contig_dict = contig_map_node(contig_dict) 834 | logger.debug("Total edges: " + str(un_assigned_edge)) 835 | # coverage iteration 836 | coverage_flag = 0 837 | while True: 838 | for node in graph.vertices(): 839 | in_d = node.in_degree() 840 | in_e = [] 841 | for e in node.in_edges(): 842 | if assigned[(graph.vp.id[e.source()], graph.vp.id[e.target()])]: 843 | in_d = in_d - 1 844 | else: 845 | in_e.append(e) 846 | 847 | out_d = node.out_degree() 848 | out_e = [] 849 | for e in node.out_edges(): 850 | if assigned[(graph.vp.id[e.source()], graph.vp.id[e.target()])]: 851 | out_d = out_d - 1 852 | else: 853 | out_e.append(e) 854 | 855 | if in_d == 1: 856 | assigned[ 857 | (graph.vp.id[in_e[0].source()], graph.vp.id[in_e[0].target()]) 858 | ] = True 859 | un_assigned_edge = un_assigned_edge - 1 860 | if out_d == 1: 861 | assigned[ 862 | (graph.vp.id[out_e[0].source()], graph.vp.id[out_e[0].target()]) 863 | ] = True 864 | un_assigned_edge = un_assigned_edge - 1 865 | if coverage_flag == un_assigned_edge: 866 | break 867 | else: 868 | coverage_flag = un_assigned_edge 869 | 870 | logger.debug( 871 | "un-assigned edges after node-weight coverage iteration : {0}".format( 872 | un_assigned_edge 873 | ) 874 | ) 875 | for u, v in assigned.keys(): 876 | if not assigned[(u, v)]: 877 | logger.debug( 878 | "***cross un-assigned edge: {0} -> {1}, with paired end link {2}".format( 879 | u, v, pe_info[(min(u, v), max(u, v))] 880 | ) 881 | ) 882 | if (u, v) in edge_to_contig_dict: 883 | logger.debug( 884 | "support contig: {0}, force assign".format( 885 | edge_to_contig_dict[(u, v)] 886 | ) 887 | ) 888 | assigned[(u, v)] = True 889 | else: 890 | logger.debug("support contig: None") 891 | for u, v in assigned.keys(): 892 | if not assigned[(u, v)]: 893 | force_assign = True 894 | for w, z in assigned.keys(): 895 | if (u == w or v == z) and assigned[(w, z)]: 896 | force_assign = False 897 | break 898 | if not force_assign: 899 | graph.remove_edge(simp_edge_dict.pop((u, v))) 900 | logger.debug(
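# the removed edge shares an endpoint with an already-assigned edge, so it cannot be confidently assigned and is dropped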
901 | "intersect unsupported edge: {0} -> {1}, removed".format(u, v) 902 | ) 903 | else: 904 | logger.debug("disjoint unsupported edge: {0} -> {1}, kept".format(u, v)) 905 | return assigned 906 | 907 | 908 | def iter_graph_disentanglement( 909 | graph: Graph, 910 | simp_node_dict: dict, 911 | simp_edge_dict: dict, 912 | contig_dict: dict, 913 | pe_info: dict, 914 | ref_file: str, 915 | logger: Logger, 916 | threshold, 917 | temp_dir, 918 | ): 919 | BOUND_ITER = len(simp_node_dict) ** 2 920 | it = 0 921 | total_removed_branch = 0 922 | num_split = 0 923 | iterCount = "A" 924 | for is_prim in [True, False]: # False 925 | do_trivial_split = True 926 | while it < BOUND_ITER: 927 | num_split = balance_split( 928 | graph, 929 | simp_node_dict, 930 | simp_edge_dict, 931 | contig_dict, 932 | pe_info, 933 | logger, 934 | ref_file, 935 | temp_dir, 936 | it, 937 | threshold, 938 | is_prim, 939 | ) 940 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 941 | graph, 942 | simp_node_dict, 943 | simp_edge_dict, 944 | logger, 945 | "{0}/gfa/split_graph_L{1}d.gfa".format(temp_dir, iterCount), 946 | ) 947 | simp_path_compactification( 948 | graph, simp_node_dict, simp_edge_dict, contig_dict, pe_info, logger 949 | ) 950 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 951 | graph, 952 | simp_node_dict, 953 | simp_edge_dict, 954 | logger, 955 | "{0}/gfa/split_graph_L{1}dc.gfa".format(temp_dir, iterCount), 956 | ) 957 | 958 | if num_split > 0: 959 | do_trivial_split = True 960 | else: 961 | if do_trivial_split: 962 | # trivial split nt branch related cases FIXME 963 | prev_ids = list(simp_node_dict.keys()) 964 | trivial_split_count, id_mapping = trivial_split( 965 | graph, simp_node_dict, simp_edge_dict, pe_info, logger 966 | ) 967 | logger.debug("my id mapping: {0}".format(id_mapping)) 968 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 969 | graph, 970 | simp_node_dict, 971 | simp_edge_dict, 972 | logger, 973 | "{0}/gfa/split_graph_L{1}dct.gfa".format(temp_dir, iterCount), 974 | ) 975 | 976 | contig_dict_remapping( 977 | graph, 978 | simp_node_dict, 979 | simp_edge_dict, 980 | contig_dict, 981 | id_mapping, 982 | prev_ids, 983 | logger, 984 | ) 985 | simp_path_compactification( 986 | graph, 987 | simp_node_dict, 988 | simp_edge_dict, 989 | contig_dict, 990 | pe_info, 991 | logger, 992 | ) 993 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 994 | graph, 995 | simp_node_dict, 996 | simp_edge_dict, 997 | logger, 998 | "{0}/gfa/split_graph_L{1}dctd.gfa".format(temp_dir, iterCount), 999 | ) 1000 | 1001 | contig_dup_removed_s(contig_dict, logger) 1002 | trim_contig_dict(graph, simp_node_dict, contig_dict, logger) 1003 | # analysis 1004 | if ref_file: 1005 | map_ref_to_graph( 1006 | ref_file, 1007 | simp_node_dict, 1008 | "{0}/gfa/split_graph_L{1}dc.gfa".format(temp_dir, iterCount), 1009 | logger, 1010 | True, 1011 | "{0}/paf/node_to_ref_{1}.paf".format(temp_dir, iterCount), 1012 | "{0}/tmp/temp_gfa_to_fasta_{1}.fasta".format(temp_dir, iterCount), 1013 | ) 1014 | # analysis 1015 | total_removed_branch += num_split 1016 | it += 1 1017 | iterCount = chr(ord(iterCount) + 1) 1018 | if num_split == 0: 1019 | if do_trivial_split: 1020 | do_trivial_split = False 1021 | else: 1022 | break 1023 | 1024 | logger.debug("Total non-trivial branches removed: " + str(total_removed_branch)) 1025 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict) 1026 | logger.debug( 1027 | list_to_string( 1028 | non_trivial_branches.keys(), 1029 | "non-trivial branches ({0}) 
left after paired-end&single-strand links".format( 1030 | len(non_trivial_branches) 1031 | ), 1032 | ) 1033 | ) 1034 | 1035 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph( 1036 | graph, 1037 | simp_node_dict, 1038 | simp_edge_dict, 1039 | logger, 1040 | "{0}/gfa/split_graph_final.gfa".format(temp_dir), 1041 | ) 1042 | return graph, simp_node_dict, simp_edge_dict 1043 | 1044 | 1045 | def best_aln_score(graph: Graph, ori, strain, ref_file, temp_dir): 1046 | fname = "{0}/temp_{1}.fa".format(temp_dir, ori) 1047 | pafname = "{0}/temp_{1}_aln.paf".format(temp_dir, ori) 1048 | subprocess.check_call('echo "" > {0}'.format(fname), shell=True) 1049 | with open(fname, "w") as f: 1050 | f.write(">{0}\n".format(ori)) 1051 | f.write("{0}\n".format(path_to_seq(graph, strain, ""))) 1052 | f.close() 1053 | minimap_api(ref_file, fname, pafname) 1054 | subprocess.check_call("rm {0}".format(fname), shell=True) 1055 | best_aln = [] 1056 | with open(pafname, "r") as paf: 1057 | for line in paf.readlines(): 1058 | splited = line[:-1].split("\t") 1059 | if len(splited) < 12: 1060 | continue 1061 | best_aln.append( 1062 | [ 1063 | splited[0], 1064 | int(splited[10]), 1065 | splited[5], 1066 | int(splited[10]) - int(splited[9]), 1067 | ] 1068 | ) 1069 | paf.close() 1070 | subprocess.check_call("rm {0}".format(pafname), shell=True) 1071 | return best_aln 1072 | --------------------------------------------------------------------------------