├── utils
│   ├── __init__.py
│   ├── spades_wrapper.py
│   ├── VStrains_PE_Inference.py
│   ├── VStrains_SPAdes.py
│   ├── VStrains_Preprocess.py
│   ├── VStrains_Alignment.py
│   ├── VStrains_IO.py
│   ├── VStrains_Extension.py
│   └── VStrains_Decomposition.py
├── requirements.txt
├── VStrains_logo.png
├── environment.yml
├── MANIFEST.in
├── .gitignore
├── LICENSE
├── recipe
│   └── meta.yaml
├── setup.py
├── evals
│   ├── sampling.py
│   └── quast_evaluation.py
├── vstrains
└── README.md
/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | graph-tool
2 | minimap2
3 | numpy
4 | gfapy
5 | matplotlib
6 |
--------------------------------------------------------------------------------
/VStrains_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metagentools/VStrains/HEAD/VStrains_logo.png
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: vstrains
2 | channels:
3 | - defaults
4 | - bioconda
5 | - conda-forge
6 | dependencies:
7 | - python=3
8 | - graph-tool>=2.45
9 | - minimap2>=2.24
10 | - numpy>=1.23.5
11 | - gfapy>=1.2.3
12 | - matplotlib>=3.6.2
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include requirements.txt
3 | include LICENSE
4 | include VStrains_logo.png
5 | include environment.yml
6 | include setup.py
7 |
8 | include vstrains
9 |
10 |
11 | recursive-include recipe *
12 | recursive-include utils *
13 | recursive-include evals *
14 |
15 | global-exclude *.pyc
16 | global-exclude */__pycache__/*
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.com
4 | *.class
5 | *.dll
6 | *.exe
7 | *.o
8 | *.so
9 | *.sh
10 |
11 | # Packages #
12 | ############
13 | # it's better to unpack these files and commit the raw source
14 | # git has its own built in compression methods
15 | *.7z
16 | *.dmg
17 | *.gz
18 | *.iso
19 | *.jar
20 | *.rar
21 | *.tar
22 | *.zip
23 |
24 | # Logs and databases #
25 | ######################
26 | *.log
27 | *.sql
28 | *.sqlite
29 |
30 | # OS generated files #
31 | ######################
32 | .DS_Store
33 | .DS_Store?
34 | ._*
35 | .Spotlight-V100
36 | .Trashes
37 | ehthumbs.db
38 | Thumbs.db
39 |
40 | # Evaluation result #
41 | #####################
42 | eval_result/
43 | example/
44 | benchmark/*
45 | quast*/
46 | acc*/
47 | testcase/
48 | src/tmp/*
49 | *.fa
50 | *.fq
51 | *.fasta
52 | *.fastq
53 | *.gfa
54 | *.csv
55 | *.paf
56 | *.pyc
57 | *.sh
58 | # pycache #
59 | ###########
60 | */__pycache__/*
61 | *.cpython*
62 | src/__pycache__/*
63 | legacy/
64 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) [2022] [Runpeng Luo]
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "VStrains" %}
2 | {% set version = "1.1.0" %}
3 |
4 | package:
5 | name: "{{ name|lower }}"
6 | version: "{{ version }}"
7 |
8 | source:
9 | url: https://github.com/metagentools/{{ name }}/releases/download/v{{ version }}/{{ name }}-{{ version }}.tar.gz
10 | sha256: 79a77435dd0f648fe55bb5930ef8fdd874d4aec990850ab20dd8b067d8df5ec0
11 |
12 | build:
13 | number: 0
14 | noarch: python
15 | script:
16 | - "{{ PYTHON }} -m pip install . -vv"
17 |
18 | requirements:
19 | host:
20 | - pip>=22.3.1
21 | - python=3
22 | - graph-tool>=2.45
23 | - minimap2>=2.24
24 | - numpy>=1.23.5
25 | - gfapy>=1.2.3
26 | - matplotlib>=3.6.2
27 | run:
28 | - python=3
29 | - graph-tool>=2.45
30 | - minimap2>=2.24
31 | - numpy>=1.23.5
32 | - gfapy>=1.2.3
33 | - matplotlib>=3.6.2
34 |
35 | test:
36 | commands:
37 | - vstrains -h
38 |
39 | about:
40 | home: "https://github.com/metagentools/MetaCoAG"
41 | license: MIT
42 | license_file: LICENSE
43 | summary: "VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs"
44 | doc_url: "https://github.com/metagentools/VStrains/blob/master/README.md"
45 | dev_url: "https://github.com/metagentools/VStrains"
46 |
47 | extra:
48 | recipe-maintainers:
49 | - JohnLuo
50 | # identifiers:
51 | # - doi:10.1101/2022.10.21.513181v3
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from setuptools import setup, find_packages
4 |
5 | # read the contents of your README file
6 | from pathlib import Path
7 |
8 | this_directory = Path(__file__).parent
9 | long_description = (this_directory / "README.md").read_text()
10 |
11 | packages = find_packages()
12 | package_data = {"utils": ["*"]}
13 |
14 | data_files = [(".", ["LICENSE", "README.md"])]
15 |
16 | setup(
17 | name="vstrains",
18 | version="1.1.0",
19 | zip_safe=True,
20 | author="Runpeng Luo and Yu Lin",
21 | author_email="runpengluo@gmail.com",
22 | description="VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs",
23 | long_description=long_description,
24 | long_description_content_type="text/markdown",
25 | url="https://github.com/metagentools/VStrains",
26 | license="MIT",
27 | packages=packages,
28 | package_data=package_data,
29 | data_files=data_files,
30 | include_package_data=True,
31 | scripts=["vstrains"],
32 | classifiers=[
33 | "Development Status :: 5 - Production/Stable",
34 | "Programming Language :: Python :: 3",
35 | "License :: OSI Approved :: MIT License",
36 | "Natural Language :: English",
37 | "Topic :: Scientific/Engineering :: Bio-Informatics",
38 | "Operating System :: OS Independent",
39 | ],
40 | install_requires=[
41 | # "graph-tool", # not distributed via Pip
42 | # "minimap2", # not distributed via Pip
43 | "numpy",
44 | "gfapy",
45 | "matplotlib",
46 | ],
47 | python_requires=">=3",
48 | )
49 |
--------------------------------------------------------------------------------
/utils/spades_wrapper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import argparse
4 | import time
5 |
6 | if __name__ == "__main__":
7 | parser = argparse.ArgumentParser(
8 | prog="spades_wrapper.py",
9 | description="""Build assembly graph&contig using SPAdes --careful mode,
10 | with input pair-end reads and store the graph.""",
11 | )
12 | parser.add_argument(
13 | "-f",
14 | "--forward",
15 | dest="forward",
16 | type=str,
17 | required=True,
18 | help="Forward reads, fastq format",
19 | )
20 | parser.add_argument(
21 | "-r",
22 | "--reverse",
23 | dest="reverse",
24 | type=str,
25 | required=True,
26 | help="Reverse reads, fastq format",
27 | )
28 | parser.add_argument(
29 | "-spades",
30 | "--spades_path",
31 | dest="spades",
32 | type=str,
33 | required=True,
34 | help="absolute path to spades executable",
35 | )
36 | parser.add_argument(
37 | "-t",
38 | "--threads",
39 | dest="thread_count",
40 | default=8,
41 | help="Set number of threads used for SPAdes.",
42 | )
43 | parser.add_argument(
44 | "-o", "--output_dir", dest="output_dir", type=str, required=True
45 | )
46 | args = parser.parse_args()
47 |
48 | global_t1_start = time.perf_counter()
49 | global_t2_start = time.process_time()
50 |
51 | filepath = os.path.dirname(os.path.abspath(__file__))
52 | spades = args.spades
53 |
54 | if spades:
55 | print(filepath)
56 | subprocess.check_call(
57 | "rm -rf {0} && mkdir {0}".format(args.output_dir), shell=True
58 | )
59 |
60 | subprocess.check_call(
61 | spades
62 | + " -1 {0} -2 {1} --careful -t {3} -o {4}".format(
63 | args.forward, args.reverse, args.thread_count, args.output_dir
64 | ),
65 | shell=True,
66 | )
67 | else:
68 | print("SPAdes executable path haven't specified.")
69 |
70 | t1_stop = time.perf_counter()
71 | t2_stop = time.process_time()
72 |
73 | print("\SPAdes assembly completed")
74 | print("Elapsed time: {:.1f} seconds".format(t1_stop - global_t1_start))
75 | print("CPU process time: {:.1f} seconds".format(t2_stop - global_t2_start))
76 |
--------------------------------------------------------------------------------
/evals/sampling.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import subprocess
4 | import sys
5 | import random
6 |
7 |
8 | def main():
9 | parser = argparse.ArgumentParser(
10 | prog="sampling",
11 | description="""Sampling the pairend fastq file""",
12 | )
13 |
14 | parser.add_argument(
15 | "-s",
16 | "--sampling_ratio",
17 | dest="sratio",
18 | type=int,
19 | required=True,
20 | help="sampling ratio, 2 for sampling half the dataset, etc.,",
21 | )
22 | parser.add_argument(
23 | "-f",
24 | "--forward",
25 | dest="fwd",
26 | type=str,
27 | required=True,
28 | help="forward .fastq file",
29 | )
30 |
31 | parser.add_argument(
32 | "-r",
33 | "--reverse",
34 | dest="rve",
35 | type=str,
36 | required=True,
37 | help="reverse .fastq file",
38 | )
39 |
40 | parser.add_argument(
41 | "-of",
42 | "--out_forward",
43 | dest="ofwd",
44 | type=str,
45 | required=True,
46 | help="output forward .fastq file",
47 | )
48 |
49 | parser.add_argument(
50 | "-or",
51 | "--out_reverse",
52 | dest="orve",
53 | type=str,
54 | required=True,
55 | help="output reverse .fastq file",
56 | )
57 |
58 | args = parser.parse_args()
59 |
60 |     if args.sratio <= 1:
61 |         print("invalid ratio, please input an integer ratio greater than 1")
62 |         sys.exit(1)
63 |
64 |     subprocess.check_call('echo "" > {0}'.format(args.ofwd), shell=True)
65 |     subprocess.check_call('echo "" > {0}'.format(args.orve), shell=True)
66 |
67 | with open(args.fwd, "r") as fwd:
68 | with open(args.rve, "r") as rve:
69 | with open(args.ofwd, "w") as ofwd:
70 | with open(args.orve, "w") as orve:
71 | flines = fwd.readlines()
72 | rlines = rve.readlines()
73 | n = len(flines) // 4
74 | k = 0
75 | print("total number of reads: ", n)
76 | for i in range(n):
77 | if random.random() > 1 / args.sratio:
78 | continue
79 | k += 1
80 | for fcurr in flines[i * 4 : i * 4 + 4]:
81 | ofwd.write(fcurr)
82 | for rcurr in rlines[i * 4 : i * 4 + 4]:
83 | orve.write(rcurr)
84 | print("sample {0} reads given ratio {1}".format(k, args.sratio))
85 | orve.close()
86 | ofwd.close()
87 | rve.close()
88 | fwd.close()
89 |
90 | return
91 |
92 |
93 | if __name__ == "__main__":
94 | sys.exit(main())
95 |
--------------------------------------------------------------------------------
/evals/quast_evaluation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import subprocess
3 | import argparse
4 | import sys
5 | import os
6 |
7 | usage = "Use MetaQUAST to evaluate assembly result"
8 | Author = "Runpeng Luo"
9 |
10 |
11 | def sep_ref(ref_file, id=0):
12 | ref_file_list = []
13 | i = 0
14 | with open(ref_file, "r") as ref:
15 | j = 0
16 | lines = ref.readlines()
17 | l = len(lines)
18 | while j < l - 1:
19 | name_in_file = lines[j]
20 | name = str(lines[j][1:-1])
21 | name = name.split(" ")[0]
22 | name = name.split(".")[0]
23 | strain = lines[j + 1]
24 | j = j + 2
25 | file_name = "sub_" + str(id) + "_" + str(name) + "_ref.fasta"
26 | subprocess.check_call("touch {0}".format(file_name), shell=True)
27 | with open(file_name, "w") as sub_file:
28 | sub_file.write(name_in_file)
29 | sub_file.write(strain)
30 | sub_file.close()
31 | ref_file_list.append(file_name)
32 | i = i + 1
33 | ref.close()
34 | print("ref list: ", ref_file_list)
35 | return ref_file_list
36 |
37 |
38 | def quast_eval(files, ref, o, quast, id=0):
39 | subprocess.check_call("rm -rf sub_{0}_*_ref.fasta".format(id), shell=True)
40 |
41 | ref_file_list = sep_ref(ref, id)
42 |
43 | command = "python2 {0} --unique-mapping --report-all-metrics -m 500 -t 8 ".format(
44 | quast
45 | )
46 | for fname in files:
47 | command += fname + " "
48 |
49 | command += "-o " + o + " -R "
50 |
51 | for file in ref_file_list:
52 | command += file + ","
53 | command = command[:-1]
54 |
55 | print(command)
56 | subprocess.check_call(command, shell=True)
57 |
58 | # clean up
59 | subprocess.check_call("rm -rf sub_{0}_*_ref.fasta".format(id), shell=True)
60 | return
61 |
62 |
63 | if __name__ == "__main__":
64 | parser = argparse.ArgumentParser(prog="quast_evaluation.py", description=usage)
65 | parser.add_argument(
66 | "-quast",
67 | "--path_to_quast",
68 | dest="quast",
69 | required=True,
70 | help="path to MetaQuast python script, version >= 5.2.0",
71 | )
72 | parser.add_argument(
73 | "-cs",
74 | "--contig_files",
75 | dest="files",
76 | default=None,
77 | nargs="+",
78 | help="contig files from different tools, separated by space",
79 | )
80 | parser.add_argument(
81 | "-d",
82 | "--contig_dir",
83 | dest="idir",
84 | default=None,
85 | help="contig files from different tools, stored in the directory, .fasta format",
86 | )
87 | parser.add_argument(
88 | "-ref",
89 | "--ref_file",
90 | dest="ref_file",
91 | type=str,
92 | required=True,
93 | help="ref file (single)",
94 | )
95 | parser.add_argument(
96 | "-o",
97 | "--output_dir",
98 | dest="output_dir",
99 | type=str,
100 | required=True,
101 | help="output directory",
102 | )
103 | args = parser.parse_args()
104 |
105 |     if args.idir is None and args.files is None:
106 |         print("Please provide correct query input")
107 |         sys.exit(1)
108 |
109 |     if args.idir is not None and (
110 |         not os.path.exists(args.idir) or not os.path.isdir(args.idir)
111 |     ):
112 |         print("Please provide correct directory")
113 |         sys.exit(1)
114 |
115 |     files = []
116 |     if args.files is not None:
117 |         files.extend(args.files)
118 |     if args.idir is not None:
119 |         files.extend(
120 |             [
121 |                 os.path.join(args.idir, s)
122 |                 for s in sorted(os.listdir(args.idir))
123 |                 if s.endswith(".fasta") or s.endswith(".fa")
124 |             ]
125 |         )
126 |
127 | quast_eval(files, args.ref_file, args.output_dir, args.quast)
128 |
--------------------------------------------------------------------------------
/utils/VStrains_PE_Inference.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import os
4 | import time
5 | import subprocess
6 | import numpy
7 | import sys
8 |
9 | rev_dict = {"A": "T", "T": "A", "C": "G", "G": "C"}
10 |
11 |
12 | def reverse_seq(seq: str):
13 | return "".join(rev_dict[x] for x in reversed(seq))
14 |
15 |
16 | def single_end_read_mapping(
17 | seq: str, kmer_htable: dict, index2seqlen: list, split_len: int, len_index2id: int
18 | ):
19 | nodes = numpy.zeros(len_index2id, dtype=int)
20 | coords = [sys.maxsize for _ in range(len_index2id)]
21 | kindices = [sys.maxsize for _ in range(len_index2id)]
22 |
23 | rlen = len(seq)
24 | for i in range(rlen - split_len + 1):
25 | kmer = seq[i : i + split_len]
26 | if kmer in kmer_htable:
27 | # found a collide node
28 | for rid, rcord in kmer_htable[kmer]:
29 | nodes[rid] += 1
30 | coords[rid] = min(coords[rid], rcord)
31 | kindices[rid] = min(kindices[rid], i)
32 |
33 | saturates = []
34 | L = 0
35 | R = 0
36 | for i, v in enumerate(nodes):
37 | if coords[i] == sys.maxsize or kindices[i] == sys.maxsize:
38 | continue
39 | L = max(coords[i], coords[i] - kindices[i])
40 | R = min(coords[i] + index2seqlen[i] - 1, coords[i] - kindices[i] + rlen - 1)
41 | saturate = R - L - (split_len - 1) + 1
42 | expected = (
43 | (min(rlen, index2seqlen[i]) - split_len + 1) * (rlen - split_len) / rlen
44 | )
45 | if v >= max(min(saturate, expected), 1):
46 | # print(i,v,"passed")
47 | saturates.append(i)
48 | return saturates
49 |
50 |
51 | def main():
52 | print(
53 | "----------------------Paired-End Information Alignment----------------------"
54 | )
55 | parser = argparse.ArgumentParser(
56 | prog="pe_info",
57 | description="""Align Paired-End reads to nodes in graph to obtain strong links""",
58 | )
59 |
60 | parser.add_argument(
61 | "-g", "--gfa,", dest="gfa", type=str, required=True, help="graph, .gfa format"
62 | )
63 |
64 | parser.add_argument(
65 | "-o",
66 | "--output_dir",
67 | dest="dir",
68 | type=str,
69 | required=True,
70 | help="output directory",
71 | )
72 |
73 | parser.add_argument(
74 | "-f", "--forward", dest="fwd", required=True, help="forward read, .fastq"
75 | )
76 |
77 | parser.add_argument(
78 | "-r", "--reverse", dest="rve", required=True, help="reverse read, .fastq"
79 | )
80 |
81 | parser.add_argument(
82 | "-k",
83 | "--kmer_size",
84 | dest="kmer_size",
85 | type=int,
86 | default=128,
87 | help="unique kmer size",
88 | )
89 |
90 | args = parser.parse_args()
91 |
92 | # initialize output directory
93 | if args.dir[-1] == "/":
94 | args.dir = args.dir[:-1]
95 | subprocess.check_call("rm -rf {0}".format(args.dir), shell=True)
96 | os.makedirs(args.dir, exist_ok=True)
97 |
98 | glb_start = time.time()
99 |
100 |     # get gfa node information
101 | index2id = []
102 | index2seq = []
103 | index2seqlen = []
104 |
105 | with open(args.gfa, "r") as gfa:
106 | for Line in gfa:
107 | splited = (Line[:-1]).split("\t")
108 | if splited[0] == "S":
109 | index2id.append(splited[1])
110 | index2seq.append(splited[2])
111 | index2seqlen.append(len(splited[2]))
112 | gfa.close()
113 |
114 | split_len = args.kmer_size + 1
115 |
116 |     # construct hash table for gfa nodes with chunk k-mers
117 | kmer_htable = {}
118 | for i, seq in enumerate(index2seq):
119 | seqlen = index2seqlen[i]
120 | for sub_i in range(seqlen - split_len + 1):
121 | kmer = seq[sub_i : sub_i + split_len]
122 | rev_kmer = reverse_seq(kmer)
123 | if kmer in kmer_htable:
124 | # not unique
125 | kmer_htable[kmer].append((i, sub_i))
126 | else:
127 | # unique
128 | kmer_htable[kmer] = [(i, sub_i)]
129 |
130 | if rev_kmer in kmer_htable:
131 | # not unique
132 | kmer_htable[rev_kmer].append((i, sub_i))
133 | else:
134 | # unique
135 | kmer_htable[rev_kmer] = [(i, sub_i)]
136 |
137 | # init nodes pairwise relationship
138 | len_index2id = len(index2id)
139 | node_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
140 | short_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
141 |
142 | n_reads = 0
143 | short_reads = 0
144 | used_reads = 0
145 |
146 | print("Start aligning reads to gfa nodes")
147 | fwd_fd = open(args.fwd, "r")
148 | rve_fd = open(args.rve, "r")
149 | fwd_reads = fwd_fd.readlines()
150 | rve_reads = rve_fd.readlines()
151 | fwd_fd.close()
152 | rve_fd.close()
153 |
154 | total_size = min(len(fwd_reads) // 4, len(rve_reads) // 4)
155 | for read_idx in range(total_size):
156 | if read_idx % 100000 == 0:
157 | print("Number of processed reads: ", read_idx)
158 | [_, fseq, _, _] = [s[:-1] for s in fwd_reads[read_idx * 4 : (read_idx + 1) * 4]]
159 | [_, rseq, _, _] = [s[:-1] for s in rve_reads[read_idx * 4 : (read_idx + 1) * 4]]
160 | if fseq.count("N") or rseq.count("N"):
161 | n_reads += 1
162 | elif len(fseq) < split_len or len(rseq) < split_len:
163 | short_reads += 1
164 | else:
165 | used_reads += 1
166 | # valid read pair
167 | lefts = single_end_read_mapping(
168 | fseq, kmer_htable, index2seqlen, split_len, len_index2id
169 | )
170 | rights = single_end_read_mapping(
171 | rseq, kmer_htable, index2seqlen, split_len, len_index2id
172 | )
173 |
174 | k = 0
175 | for i in lefts:
176 | for i2 in lefts[k:]:
177 | short_mat[i][i2] += 1
178 | k += 1
179 |
180 | k = 0
181 | for j in rights:
182 | for j2 in rights[k:]:
183 | short_mat[j][j2] += 1
184 | k += 1
185 |
186 | for i in lefts:
187 | for j in rights:
188 | node_mat[i][j] += 1
189 |
190 | out_file = "{0}/pe_info".format(args.dir)
191 | out_file2 = "{0}/st_info".format(args.dir)
192 | subprocess.check_call("touch {0}; echo " " > {0}".format(out_file), shell=True)
193 | subprocess.check_call("touch {0}; echo " " > {0}".format(out_file2), shell=True)
194 | with open(out_file, "w") as outfile:
195 | with open(out_file2, "w") as outfile2:
196 | for i in range(len_index2id):
197 | for j in range(len_index2id):
198 | outfile.write(
199 | "{0}:{1}:{2}\n".format(index2id[i], index2id[j], node_mat[i][j])
200 | )
201 | outfile2.write(
202 | "{0}:{1}:{2}\n".format(
203 | index2id[i], index2id[j], short_mat[i][j]
204 | )
205 | )
206 | outfile2.close()
207 | outfile.close()
208 |
209 | glb_elapsed = time.time() - glb_start
210 | print("Global time elapsed: ", glb_elapsed)
211 | print("result stored in: ", out_file)
212 |
213 |
214 | if __name__ == "__main__":
215 | main()
216 | sys.exit(0)
217 |
--------------------------------------------------------------------------------
/vstrains:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import sys
5 | import os
6 | import platform
7 | import numpy
8 | import logging
9 | import time
10 | from datetime import date
11 |
12 | from utils import VStrains_SPAdes
13 |
14 | __author__ = "Runpeng Luo"
15 | __copyright__ = "Copyright 2022-2025, VStrains Project"
16 | __credits__ = ["Runpeng Luo", "Yu Lin"]
17 | __license__ = "MIT"
18 | __version__ = "1.1.0"
19 | __maintainer__ = "Runpeng Luo"
20 | __email__ = "John.Luo@anu.edu.au"
21 | __status__ = "Production"
22 |
23 |
24 | def run(args, logger):
25 | numpy.seterr(all="raise")
26 | RUNNER = {
27 | "spades": VStrains_SPAdes.run,
28 | }
29 | RUNNER[args.assembler](args, logger)
30 |
31 |
32 | def main():
33 | parser = argparse.ArgumentParser(
34 | prog="VStrains",
35 | description="""Construct full-length viral strains under de novo approach
36 | from contigs and assembly graph, currently supports SPAdes""",
37 | )
38 |
39 | parser.add_argument(
40 | "-a",
41 | "--assembler",
42 | dest="assembler",
43 | type=str,
44 | required=True,
45 | choices=["spades"],
46 | help="name of the assembler used. [spades]",
47 | )
48 |
49 | parser.add_argument(
50 | "-g",
51 | "--graph",
52 | dest="gfa_file",
53 | type=str,
54 | required=True,
55 | help="path to the assembly graph, (.gfa format)",
56 | )
57 |
58 | parser.add_argument(
59 | "-p",
60 | "--path",
61 | dest="path_file",
62 | type=str,
63 | required=False,
64 | help="contig file from SPAdes (.paths format), only required for SPAdes. e.g., contigs.paths",
65 | )
66 |
67 | parser.add_argument(
68 | "-mc",
69 | "--minimum_coverage",
70 | dest="min_cov",
71 | default=None,
72 | type=int,
73 | help=argparse.SUPPRESS,
74 | # (
75 | # "minimum node coverage cutoff [default: auto]"
76 | # ),
77 | )
78 |
79 | parser.add_argument(
80 | "-ml",
81 | "--minimum_contig_length",
82 | dest="min_len",
83 | default=None,
84 | type=int,
85 | help=argparse.SUPPRESS,
86 | # ("minimum initial contig length [default: 250]"),
87 | )
88 |
89 | parser.add_argument(
90 | "-r",
91 | "--reference_fa",
92 | dest="ref_file",
93 | default=None,
94 | type=str,
95 | help=argparse.SUPPRESS,
96 | )
97 |
98 | parser.add_argument(
99 | "-o",
100 | "--output_dir",
101 | dest="output_dir",
102 | default="acc/",
103 | type=str,
104 | help="path to the output directory [default: acc/]",
105 | )
106 |
107 | parser.add_argument(
108 | "-d",
109 | "--dev_mode",
110 | dest="dev",
111 | action="store_true",
112 | default=False,
113 | help=argparse.SUPPRESS,
114 | )
115 |
116 | parser.add_argument(
117 | "-fwd",
118 | "--fwd_file",
119 | dest="fwd",
120 | required=True,
121 | default=None,
122 | type=str,
123 | help="paired-end sequencing reads, forward strand (.fastq format)",
124 | )
125 |
126 | parser.add_argument(
127 | "-rve",
128 | "--rve_file",
129 | dest="rve",
130 | required=True,
131 | default=None,
132 | type=str,
133 | help="paired-end sequencing reads, reverse strand (.fastq format)",
134 | )
135 |
136 | args = parser.parse_args()
137 |
138 | # parsing arguments, sanity check
139 | if (not args.gfa_file) or (not os.path.exists(args.gfa_file)):
140 | print("\nPath to the assembly graph is required, (.gfa format)")
141 | print("Please ensure the path is correct")
142 | print("\nExiting...\n")
143 | sys.exit(1)
144 |
145 | args.assembler = args.assembler.lower()
146 |
147 | if args.assembler.lower() == "spades":
148 | if (not args.path_file) or (not os.path.exists(args.path_file)):
149 | print(
150 | "\nPath to Contig file from SPAdes (.paths format) is required for SPAdes assmbler option. e.g., contigs.paths"
151 | )
152 | print("\nExiting...\n")
153 | sys.exit(1)
154 | else:
155 | print("\nPlease make sure to provide the correct assembler type (SPAdes).")
156 | print("\nExiting...\n")
157 | sys.exit(1)
158 |
159 |     if args.min_len is not None:
160 |         if args.min_len < 0:
161 |             print(
162 |                 "\nPlease make sure to provide a valid option (invalid value for min_len)."
163 |             )
164 |             print("\nExiting...\n")
165 |             sys.exit(1)
166 |     else:
167 |         args.min_len = 250
168 |
169 |     if args.min_cov is not None:
170 |         if args.min_cov < 0:
171 |             print(
172 |                 "\nPlease make sure to provide a valid option (invalid value for min_cov)."
173 |             )
174 |             print("\nExiting...\n")
175 |             sys.exit(1)
176 |
177 | if args.output_dir[-1] == "/":
178 | args.output_dir = args.output_dir[:-1]
179 |
180 | # initialize output directory
181 | os.makedirs(args.output_dir, exist_ok=True)
182 | try:
183 | os.makedirs(args.output_dir + "/gfa/")
184 | os.makedirs(args.output_dir + "/tmp/")
185 | os.makedirs(args.output_dir + "/paf/")
186 | os.makedirs(args.output_dir + "/aln/")
187 |     except OSError:
188 | print("\nCurrent output directory is not empty")
189 | print("Please empty/re-create the output directory: " + str(args.output_dir))
190 | print("\nExiting...\n")
191 | sys.exit(1)
192 |
193 | if os.path.exists(args.output_dir + "/vstrains.log"):
194 | os.remove(args.output + "/vstrains.log")
195 |
196 | # Setup logger
197 | # -----------------------
198 | logger = logging.getLogger("VStrains %s" % __version__)
199 | logger.setLevel(logging.DEBUG if args.dev else logging.INFO)
200 |
201 | consoleHeader = logging.StreamHandler()
202 | consoleHeader.setLevel(logging.INFO)
203 | consoleHeader.setFormatter(logging.Formatter("%(message)s"))
204 | logger.addHandler(consoleHeader)
205 |
206 | fileHandler = logging.FileHandler(args.output_dir + "/vstrains.log")
207 | fileHandler.setLevel(logging.DEBUG if args.dev else logging.INFO)
208 | fileHandler.setFormatter(logging.Formatter("%(message)s"))
209 | logger.addHandler(fileHandler)
210 |
211 | logger.info("Welcome to VStrains!")
212 | logger.info(
213 | "VStrains is a strain-aware assembly tools, which constructs full-length "
214 | )
215 | logger.info("virus strain with aid from de Bruijn assembly graph and contigs.")
216 | logger.info("")
217 | logger.info("System information:")
218 | try:
219 | logger.info(" VStrains version: " + str(__version__).strip())
220 | logger.info(" Python version: " + ".".join(map(str, sys.version_info[0:3])))
221 | logger.info(" OS: " + platform.platform())
222 | except Exception:
223 | logger.info(" Problem occurred when getting system information")
224 |
225 | logger.info("")
226 | start_time = time.time()
227 |
228 | logger.info("Input arguments:")
229 | logger.info("Assembly type: " + args.assembler)
230 | logger.info("Assembly graph file: " + args.gfa_file)
231 | logger.info("Forward read file: " + args.fwd)
232 | logger.info("Reverse read file: " + args.rve)
233 | if args.assembler == "spades":
234 | logger.info("Contig paths file: " + args.path_file)
235 | logger.info("Output directory: " + os.path.abspath(args.output_dir))
236 | if args.dev:
237 | logger.info("*DEBUG MODE is turned ON")
238 | logger.info("\n\n")
239 | logger.info(
240 | "======= VStrains pipeline started. Log can be found here: "
241 | + os.path.abspath(args.output_dir)
242 | + "/vstrains.log\n"
243 | )
244 |
245 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
246 | consoleHeader.setFormatter(formatter)
247 | fileHandler.setFormatter(formatter)
248 |
249 | # all good
250 | run(args, logger)
251 |
252 | elapsed = time.time() - start_time
253 |
254 | consoleHeader.setFormatter(logging.Formatter("%(message)s"))
255 | fileHandler.setFormatter(logging.Formatter("%(message)s"))
256 |
257 | logger.info("")
258 | logger.info("Thanks for using VStrains")
259 | logger.info(
260 | "Result is stored in {0}/strain.fasta".format(os.path.abspath(args.output_dir))
261 | )
262 | logger.info(
263 | "You can visualise the path stored in {0}/strain.paths via {0}/gfa/graph_L0.gfa".format(
264 | os.path.abspath(args.output_dir)
265 | )
266 | )
267 | logger.info("Finished: {0}".format(date.today().strftime("%B %d, %Y")))
268 | logger.info("Elapsed time: {0}".format(elapsed))
269 | logger.info("Exiting...")
270 | logger.removeHandler(fileHandler)
271 | logger.removeHandler(consoleHeader)
272 |
273 | return 0
274 |
275 |
276 | if __name__ == "__main__":
277 | main()
278 |
--------------------------------------------------------------------------------
/utils/VStrains_SPAdes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from utils.VStrains_Utilities import *
4 | from utils.VStrains_Preprocess import (
5 | graph_simplification,
6 | reindexing,
7 | threshold_estimation,
8 | )
9 | from utils.VStrains_IO import (
10 | graph_to_gfa,
11 | flipped_gfa_to_graph,
12 | gfa_to_graph,
13 | contig_dict_to_path,
14 | contig_dict_to_fasta,
15 | spades_paths_parser,
16 | process_pe_info,
17 | store_reinit_graph,
18 | )
19 | from utils.VStrains_Decomposition import *
20 | from utils.VStrains_Extension import path_extension, best_matching
21 | import os
22 | import sys
23 |
24 |
25 | def run(args, logger):
26 | TEMP_DIR = args.output_dir
27 |
28 | logger.info("VStrains-SPAdes started")
29 |
30 | logger.info(">>>STAGE: parsing graph and contigs")
31 | graph, simp_node_dict, simp_edge_dict = gfa_to_graph(args.gfa_file, logger)
32 | graph_to_gfa(
33 | graph,
34 | simp_node_dict,
35 | simp_edge_dict,
36 | logger,
37 | "{0}/gfa/graph_L0.gfa".format(TEMP_DIR),
38 | )
39 | graph0, simp_node_dict0, simp_edge_dict0 = flipped_gfa_to_graph(
40 | "{0}/gfa/graph_L0.gfa".format(TEMP_DIR), logger
41 | )
42 | graph0, simp_node_dict0, simp_edge_dict0, idx_mapping = reindexing(
43 | graph0, simp_node_dict0, simp_edge_dict0
44 | )
45 | graph_to_gfa(
46 | graph0,
47 | simp_node_dict0,
48 | simp_edge_dict0,
49 | logger,
50 | "{0}/gfa/graph_L0r.gfa".format(TEMP_DIR),
51 | )
52 |
53 | # cut-off coverage, graph preprocess parameter
54 | THRESHOLD = 0
55 | if args.min_cov != None:
56 | THRESHOLD = args.min_cov
57 | logger.info("user-defined node minimum coverage: {0}".format(THRESHOLD))
58 | else:
59 | THRESHOLD = threshold_estimation(graph0, logger, TEMP_DIR)
60 | logger.info("computed node minimum coverage: {0}".format(THRESHOLD))
61 |
62 | contig_dict, contig_info = spades_paths_parser(
63 | graph0,
64 | simp_node_dict0,
65 | simp_edge_dict0,
66 | idx_mapping,
67 | logger,
68 | args.path_file,
69 | args.min_len,
70 | THRESHOLD,
71 | )
72 | copy_contig_dict = {}
73 | for cno, [contig, clen, ccov] in contig_dict.items():
74 | copy_contig_dict[cno] = [list(contig), clen, ccov]
75 | # debug only
76 | contig_dict_to_path(contig_dict, "{0}/tmp/init_contigs.paths".format(TEMP_DIR))
77 | contig_dict_to_fasta(
78 | graph0,
79 | simp_node_dict0,
80 | contig_dict,
81 | "{0}/tmp/init_contigs.fasta".format(TEMP_DIR),
82 | )
83 | if args.ref_file:
84 | minimap_api(
85 | args.ref_file,
86 | "{0}/tmp/init_contigs.fasta".format(TEMP_DIR),
87 | "{0}/paf/init_contigs_to_strain.paf".format(TEMP_DIR),
88 | )
89 | # debug only
90 | logger.info(">>>STAGE: preprocess")
91 | graph_simplification(
92 | graph0, simp_node_dict0, simp_edge_dict0, None, logger, THRESHOLD
93 | )
94 | graph_to_gfa(
95 | graph0,
96 | simp_node_dict0,
97 | simp_edge_dict0,
98 | logger,
99 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR),
100 | )
101 | graph1, simp_node_dict1, simp_edge_dict1 = flipped_gfa_to_graph(
102 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR), logger
103 | )
104 |
105 |     # filter out contigs that contain erroneous (low-coverage) nodes
106 | for cno, [contig, _, _] in list(contig_dict.items()):
107 | if any([c not in simp_node_dict1 for c in contig]):
108 | contig_dict.pop(cno)
109 | logger.debug("unreliable contig with low coverage: {0}".format(cno))
110 |
111 | # get graph kmer size
112 | ksize = graph1.ep.overlap[list(graph1.edges())[0]] if graph1.num_edges() > 0 else 0
113 | logger.info("graph kmer size: {0}".format(ksize))
114 | if ksize <= 0:
115 | logger.error("invalid kmer-size, the graph does not contain any edges, exit..")
116 | sys.exit(1)
117 |
118 | # obtain paired end information
119 | script_path = "{0}/VStrains_PE_Inference.py".format(
120 | os.path.abspath(os.path.dirname(__file__))
121 | )
122 | subprocess.check_call(
123 | "python {0} -g {1} -o {2} -f {3} -r {4} -k {5}".format(
124 | script_path,
125 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR),
126 | "{0}/aln".format(TEMP_DIR),
127 | args.fwd,
128 | args.rve,
129 | ksize,
130 | ),
131 | shell=True,
132 | )
133 | logger.info("paired end information stored")
134 | pe_info_file = "{0}/aln/pe_info".format(TEMP_DIR)
135 | st_info_file = "{0}/aln/st_info".format(TEMP_DIR)
136 | pe_info, dcpy_pe_info = process_pe_info(
137 | simp_node_dict1.keys(), pe_info_file, st_info_file
138 | )
139 |
140 | edge_cleaning(graph1, simp_edge_dict1, contig_dict, pe_info, logger)
141 |
142 | graph2, simp_node_dict2, simp_edge_dict2 = store_reinit_graph(
143 | graph1,
144 | simp_node_dict1,
145 | simp_edge_dict1,
146 | logger,
147 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR),
148 | )
149 |
150 | contig_dict_to_path(contig_dict, "{0}/tmp/pre_contigs.paths".format(TEMP_DIR))
151 | contig_dict_to_fasta(
152 | graph2,
153 | simp_node_dict2,
154 | contig_dict,
155 | "{0}/tmp/pre_contigs.fasta".format(TEMP_DIR),
156 | )
157 | # stat evaluation
158 | if args.ref_file:
159 | map_ref_to_graph(
160 | args.ref_file,
161 | simp_node_dict2,
162 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR),
163 | logger,
164 | True,
165 | "{0}/paf/node_to_ref.paf".format(TEMP_DIR),
166 | "{0}/tmp/temp_gfa_to_fasta_pre.fasta".format(TEMP_DIR),
167 | )
168 | minimap_api(
169 | args.ref_file,
170 | "{0}/tmp/pre_contigs.fasta".format(TEMP_DIR),
171 | "{0}/paf/pre_contigs_to_strain.paf".format(TEMP_DIR),
172 | )
173 | map_ref_to_contig(
174 | contig_dict, logger, "{0}/paf/pre_contigs_to_strain.paf".format(TEMP_DIR)
175 | )
176 | # end stat
177 |
178 | # split the branches using link information
179 | graphf, simp_node_dictf, simp_edge_dictf = iter_graph_disentanglement(
180 | graph2,
181 | simp_node_dict2,
182 | simp_edge_dict2,
183 | contig_dict,
184 | pe_info,
185 | args.ref_file,
186 | logger,
187 | 0.05 * numpy.median([graph2.vp.dp[node] for node in graph2.vertices()]),
188 | TEMP_DIR,
189 | )
190 |
191 | contig_dict_to_path(contig_dict, "{0}/tmp/post_contigs.paths".format(TEMP_DIR))
192 | contig_dict_to_fasta(
193 | graphf,
194 | simp_node_dictf,
195 | contig_dict,
196 | "{0}/tmp/post_contigs.fasta".format(TEMP_DIR),
197 | )
198 | # stat evaluation
199 | if args.ref_file:
200 | map_ref_to_graph(
201 | args.ref_file,
202 | simp_node_dictf,
203 | "{0}/gfa/split_graph_final.gfa".format(TEMP_DIR),
204 | logger,
205 | True,
206 | "{0}/paf/node_to_ref_final.paf".format(TEMP_DIR),
207 | "{0}/tmp/temp_gfa_to_fasta_post.fasta".format(TEMP_DIR),
208 | )
209 | minimap_api(
210 | args.ref_file,
211 | "{0}/tmp/post_contigs.fasta".format(TEMP_DIR),
212 | "{0}/paf/post_contigs_to_strain.paf".format(TEMP_DIR),
213 | )
214 | map_ref_to_contig(
215 | contig_dict, logger, "{0}/paf/post_contigs_to_strain.paf".format(TEMP_DIR)
216 | )
217 | # end stat
218 | logger.info(">>>STAGE: contig path extension")
219 |
220 | # refine partial links using best match
221 | full_link = best_matching(
222 | graphf, simp_node_dictf, simp_edge_dictf, contig_dict, pe_info, logger
223 | )
224 |
225 | # update graph coverage on non-trivial branch, maximize
226 | increment_nt_branch_coverage(graphf, simp_node_dictf, logger)
227 |
228 | graph_to_gfa(
229 | graphf,
230 | simp_node_dictf,
231 | simp_edge_dictf,
232 | logger,
233 | "{0}/gfa/split_graph_final.gfa".format(TEMP_DIR),
234 | )
235 |
236 | # extend the graph
237 | p_delta = 0.05 * numpy.median([graphf.vp.dp[node] for node in graphf.vertices()])
238 | strain_dict, usages = path_extension(
239 | graphf,
240 | simp_node_dictf,
241 | simp_edge_dictf,
242 | contig_dict,
243 | full_link,
244 | dcpy_pe_info,
245 | logger,
246 | p_delta,
247 | TEMP_DIR,
248 | )
249 |
250 | logger.info(">>>STAGE: final process")
251 | contig_resolve(strain_dict)
252 | graphl, simp_node_dictl, simp_edge_dictl = flipped_gfa_to_graph(
253 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR), logger
254 | )
255 | trim_contig_dict(graphl, simp_node_dictl, strain_dict, logger)
256 | contig_dup_removed_s(strain_dict, logger)
257 | contig_dict_to_path(
258 | strain_dict, "{0}/tmp/tmp_strain.paths".format(TEMP_DIR), None, False
259 | )
260 |
261 | # recover repeat nodes back to contig
262 | strain_repeat_resol(
263 | graph0, simp_node_dict0, strain_dict, contig_info, copy_contig_dict, logger
264 | )
265 |
266 | logger.info(">>>STAGE: generate result")
267 | contig_dict_to_fasta(
268 | graph0, simp_node_dict0, strain_dict, "{0}/strain.fasta".format(TEMP_DIR)
269 | )
270 | contig_dict_to_path(
271 | strain_dict, "{0}/strain.paths".format(TEMP_DIR), idx_mapping, True
272 | )
273 | if args.ref_file:
274 | minimap_api(
275 | args.ref_file,
276 | "{0}/strain.fasta".format(TEMP_DIR),
277 | "{0}/paf/strain_to_ref.paf".format(TEMP_DIR),
278 | )
279 | logger.info("VStrains-SPAdes finished")
280 | return 0
281 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs
6 |
7 | 
8 | [](https://github.com/psf/black)
9 |
10 | Manual
11 | ===========
12 |
13 | Table of Contents
14 | -----------------
15 |
16 | 1. [About VStrains](#sec1)
17 | 2. [Updates](#sec2)
18 | 3. [Installation](#sec3)
19 | 3.1. [Option 1. Quick Install](#sec3.1)
20 | 3.2. [Option 2. Manual Install](#sec3.2)
21 | 3.3. [Download & Install VStrains](#sec3.3)
22 | 4. [Running VStrains](#sec4)
23 | 4.1. [Quick Usage](#sec4.1)
24 | 4.2. [Support SPAdes](#sec4.2)
25 | 4.3. [Output](#sec4.3)
26 | 5. [Stand-alone binaries](#sec5)
27 | 6. [Experiment](#sec6)
28 | 7. [Citation](#sec7)
29 | 8. [Feedback and bug reports](#sec8)
30 |
31 |
32 | # About VStrains
33 |
34 | VStrains is a de novo approach for reconstructing strains from viral quasispecies.
35 |
36 |
37 |
38 |
39 | # Updates
40 |
41 | ## VStrains 1.1.0 Release (03 Feb 2023)
42 | * Replace the PE link inference module `VStrains_Alignment.py` with `VStrains_PE_Inference.py`
43 |
44 | `VStrains_PE_Inference.py` implements a hash-table approach that provides efficient perfect-match lookup. The new module leads to consistent evaluation results and substantially decreases runtime and memory usage compared to the previous alignment approach. A minimal sketch of the idea is shown below.
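
A simplified illustration of the hash-table lookup (hypothetical code, not the actual module; function names are made up): every k-length substring of each graph node is indexed in a dictionary, and read k-mers are then matched exactly against that index.

```python
# Hypothetical sketch of the perfect-match lookup idea behind
# VStrains_PE_Inference.py; simplified for illustration.
def build_kmer_table(node_seqs, k):
    table = {}  # k-mer -> list of (node_index, offset) occurrences
    for idx, seq in enumerate(node_seqs):
        for off in range(len(seq) - k + 1):
            table.setdefault(seq[off : off + k], []).append((idx, off))
    return table

def nodes_hit_by_read(read, table, k):
    # exact k-mer lookups; each hit votes for the node it came from
    hits = set()
    for off in range(len(read) - k + 1):
        for idx, _ in table.get(read[off : off + k], []):
            hits.add(idx)
    return hits
```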
45 |
46 |
47 |
48 |
49 | # Installation
50 |
51 | VStrains requires a 64-bit Linux system or macOS, and Python 3 (version 3.2 or higher).
52 |
53 |
54 | ## Option 1. Quick Install (**recommended**)
55 |
56 | Install [(mini)conda](https://conda.io/miniconda.html) as a lightweight package manager. Run the following commands to initialize and set up the conda environment for VStrains:
57 |
58 | ```bash
59 | # add channels
60 | conda config --add channels defaults
61 | conda config --add channels bioconda
62 | conda config --add channels conda-forge
63 |
64 | # create conda environment
65 | conda create --name VStrains-env
66 |
67 | # activate conda environment
68 | conda activate VStrains-env
69 |
70 | conda install -c bioconda -c conda-forge python=3 graph-tool minimap2 numpy gfapy matplotlib
71 | ```
72 |
73 |
74 | ## Option 2. Manual Install
75 |
76 | Manually install the dependencies listed below (an example install sequence follows the lists):
77 | - [minimap2](https://github.com/lh3/minimap2)
78 |
79 | And python modules:
80 | - [graph-tool](https://graph-tool.skewed.de)
81 | - [numpy](https://numpy.org)
82 | - [gfapy](https://github.com/ggonnella/gfapy)
83 | - [matplotlib](https://matplotlib.org)
84 |
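The Python modules published on PyPI can be installed with pip, while graph-tool and minimap2 are not distributed via pip (see the comments in `setup.py`) and should come from conda or your system package manager. One possible sequence, assuming a conda environment is active:

```bash
# graph-tool and minimap2 are not on PyPI; install them via conda channels
conda install -c conda-forge graph-tool
conda install -c bioconda minimap2

# the remaining Python modules are available via pip
pip install numpy gfapy matplotlib
```
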
85 |
86 | ## Download & Install VStrains
87 |
88 | After successfully setting up the environment and dependencies, clone VStrains to your desired location.
89 |
90 | ```bash
91 | git clone https://github.com/metagentools/VStrains.git
92 | ```
93 |
94 | Install VStrains via `pip`:
95 |
96 | ```bash
97 | cd VStrains; pip install .
98 | ```
99 |
100 | Run the following command to ensure VStrains is correctly set up and installed:
101 |
102 | ```bash
103 | vstrains -h
104 | ```
105 |
106 |
107 | # Running VStrains
108 |
109 | VStrains supports assembly results from [SPAdes](https://github.com/ablab/spades) (including metaSPAdes and metaviralSPAdes) and may support other graph-based assemblers in the future.
110 |
111 |
112 | ## Quick Usage
113 |
114 | ```
115 | usage: VStrains [-h] -a {spades} -g GFA_FILE [-p PATH_FILE] [-o OUTPUT_DIR] -fwd FWD -rve RVE
116 |
117 | Construct full-length viral strains under de novo approach from contigs and assembly graph, currently supports
118 | SPAdes
119 |
120 | optional arguments:
121 | -h, --help show this help message and exit
122 | -a {spades}, --assembler {spades}
123 | name of the assembler used. [spades]
124 | -g GFA_FILE, --graph GFA_FILE
125 | path to the assembly graph, (.gfa format)
126 | -p PATH_FILE, --path PATH_FILE
127 | contig file from SPAdes (.paths format), only required for SPAdes. e.g., contigs.paths
128 | -o OUTPUT_DIR, --output_dir OUTPUT_DIR
129 | path to the output directory [default: acc/]
130 | -fwd FWD, --fwd_file FWD
131 | paired-end sequencing reads, forward strand (.fastq format)
132 | -rve RVE, --rve_file RVE
133 | paired-end sequencing reads, reverse strand (.fastq format)
134 | ```
135 |
136 | VStrains takes as input an assembly graph in Graphical Fragment Assembly (GFA) format and the associated contig information, together with the raw reads in paired-end format (e.g., forward.fastq, reverse.fastq).
137 |
138 |
139 | ## Support SPAdes
140 |
141 | When running SPAdes, we recommend using the `--careful` option for more accurate assembly results. For consistency, do not modify any contig/node names in the SPAdes assembly results. Please refer to [SPAdes](https://github.com/ablab/spades) for further guidance. Example usage:
142 |
143 | ```bash
144 | # SPAdes assembler example, paired-end reads
145 | python spades.py -1 forward.fastq -2 reverse.fastq --careful -t 16 -o output_dir
146 | ```
147 |
148 | Both the assembly graph (`assembly_graph_after_simplification.gfa`) and the contig information (`contigs.paths`) can be found in the output directory after running the SPAdes assembler. Please use them together with the raw reads as inputs for VStrains, and set the `-a` flag to `spades`. Example usage:
149 |
150 | ```bash
151 | vstrains -a spades -g assembly_graph_after_simplification.gfa -p contigs.paths -o output_dir -fwd forward.fastq -rve reverse.fastq
152 | ```
153 |
154 |
155 | ## Output
156 |
157 |
158 | VStrains stores all output files in `<output_dir>`, which is set by the user.
159 |
160 | * `<output_dir>/aln/` directory contains paired-end (PE) linkage information, which is stored in `pe_info` and `st_info` (a format example is shown after this list).
161 | * `<output_dir>/gfa/` directory contains iteratively simplified assembly graphs, where `graph_L0.gfa` contains the assembly graph produced by SPAdes after Strandedness Canonization, `split_graph_final.gfa` contains the assembly graph after Graph Disentanglement, and `graph_S_final.gfa` contains the assembly graph after Contig-Based Path Extraction; the rest are intermediate results. All the assembly graphs are in [GFA 1.0 format](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md).
162 | * `<output_dir>/paf/` and `<output_dir>/tmp/` are temporary directories; feel free to ignore them.
163 | * `<output_dir>/strain.fasta` contains the resulting strains in `.fasta` format; the header for each strain has the form `NODE___`, which is compatible with the SPAdes contig format.
164 | * `<output_dir>/strain.paths` contains the paths in the assembly graph (input `GFA_FILE`) corresponding to `strain.fasta`; these can be visualized with [Bandage](https://github.com/rrwick/Bandage) for further downstream analysis.
165 | * `<output_dir>/vstrains.log` contains the VStrains log.
166 |
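For reference, `pe_info` and `st_info` are plain-text files with one `node_id:node_id:count` entry per line, where `count` is the number of read pairs (for `pe_info`) or same-read co-occurrences (for `st_info`) linking the two nodes. A hypothetical excerpt (node IDs and counts are made up):

```
0:1:27
0:2:0
1:2:35
```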
167 |
168 |
169 |
176 |
177 |
178 | # Stand-alone binaries
179 |
180 | `evals/quast_evaluation.py` is a wrapper script for strain-level experimental result analysis using [MetaQUAST](https://github.com/ablab/quast).
181 |
182 | ```
183 | usage: quast_evaluation.py [-h] -quast QUAST [-cs FILES [FILES ...]] [-d IDIR] -ref REF_FILE -o OUTPUT_DIR
184 |
185 | Use MetaQUAST to evaluate assembly result
186 |
187 | options:
188 | -h, --help show this help message and exit
189 | -quast QUAST, --path_to_quast QUAST
190 | path to MetaQuast python script, version >= 5.2.0
191 | -cs FILES [FILES ...], --contig_files FILES [FILES ...]
192 | contig files from different tools, separated by space
193 | -d IDIR, --contig_dir IDIR
194 | contig files from different tools, stored in the directory, .fasta format
195 | -ref REF_FILE, --ref_file REF_FILE
196 | ref file (single)
197 | -o OUTPUT_DIR, --output_dir OUTPUT_DIR
198 | output directory
199 | ```
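
For example, to evaluate two assemblies against a multi-strain reference (all paths below are placeholders):

```bash
python evals/quast_evaluation.py \
    -quast /path/to/quast/metaquast.py \
    -cs vstrains_out/strain.fasta other_tool/contigs.fasta \
    -ref references.fasta \
    -o quast_out
```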
200 |
201 |
202 | # Experiment
203 |
204 | VStrains is evaluated on both simulated and real datasets under default settings; the sources of the datasets can be found at the links listed below:
205 | 1. Simulated Dataset, can be found at [savage-benchmark](https://bitbucket.org/jbaaijens/savage-benchmarks/src/master/) (No preprocessing is required)
206 | - 6 Poliovirus (20,000x)
207 | - 10 HCV (20,000x)
208 | - 15 ZIKV (20,000x)
209 | 2. Real Dataset (please refer to [Supplementary Material](https://www.biorxiv.org/content/10.1101/2022.10.21.513181v3.supplementary-material) for preprocessing the real datasets)
210 | - 5 HIV labmix (20,000x) [SRR961514](https://www.ncbi.nlm.nih.gov/sra/?term=SRR961514), reference genome sequences are available at [5 HIV References](https://github.com/cbg-ethz/5-virus-mix/blob/master/data/REF.fasta)
211 |    - 2 SARS-COV-2 (4,000x) [SRR18009684](https://www.ncbi.nlm.nih.gov/sra/?term=SRR18009684), [SRR18009686](https://www.ncbi.nlm.nih.gov/sra/?term=SRR18009686); pre-processed reads and individually assembled ground-truth reference sequences can be found at [2 SARS-COV-2 Dataset](https://github.com/RunpengLuo/sarscov2-4000x)
212 |
213 |
214 | # Citation
215 | VStrains has been accepted at [RECOMB 2023](http://recomb2023.bilkent.edu.tr/program.html), and the manuscript is publicly available [here](https://link.springer.com/chapter/10.1007/978-3-031-29119-7_1).
216 |
217 | If you use VStrains in your work, please cite the following publications.
218 |
219 | Runpeng Luo and Yu Lin, VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs
220 |
221 |
222 | # Feedback and bug reports
223 |
224 | Thanks for using VStrains. If you encounter any bugs during execution, please re-run the program with the additional `-d` flag and provide `vstrains.log` together with your use case via `Issues`.
225 |
--------------------------------------------------------------------------------
/utils/VStrains_Preprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from logging import Logger
4 | import subprocess
5 | from graph_tool.all import Graph
6 |
7 | import numpy
8 | import matplotlib.pyplot as plt
9 |
10 | from utils.VStrains_Utilities import *
11 |
12 |
13 | def reindexing(graph: Graph, simp_node_dict: dict, simp_edge_dict: dict):
14 | """
15 | Reindex the nodes, with idx-node_id mappings
16 | """
17 | idx_mapping = {}
18 | idx_node_dict = {}
19 | idx_edge_dict = {}
20 | idx = 0
21 | for no, node in simp_node_dict.items():
22 | if graph.vp.color[node] == "black":
23 | idx_mapping[no] = str(idx)
24 | graph.vp.id[node] = str(idx)
25 | idx_node_dict[str(idx)] = node
26 | idx += 1
27 | for (u, v), e in simp_edge_dict.items():
28 | if (
29 | graph.ep.color[e] == "black"
30 | and graph.vp.color[e.source()] == "black"
31 | and graph.vp.color[e.target()] == "black"
32 | ):
33 | idx_edge_dict[(idx_mapping[u], idx_mapping[v])] = e
34 | return graph, idx_node_dict, idx_edge_dict, idx_mapping
35 |
36 |
37 | def threshold_estimation(graph: Graph, logger: Logger, temp_dir):
38 | dps = [graph.vp.dp[node] for node in graph.vertices()]
39 | # handle edge case, when the graph contains uniform coverage
40 | if max(dps) == min(dps):
41 | return 0.00
42 | regions, bins = numpy.histogram(
43 | dps, bins=int((max(dps) - min(dps)) // (0.05 * numpy.median(dps)))
44 | )
45 | pidx, _ = max(list(enumerate(regions)), key=lambda p: p[1])
46 | ratio = 0.00
47 | if pidx == 0:
48 | ratio = 0.05
49 | # global peak belongs to first filter region, find maximum peak range, bound by 25% Median
50 | for i in range(0, 4):
51 |             if i + 1 >= len(regions):
52 | logger.warning(
53 | "histogram is not properly set, reset cutoff to default (0.05*M)"
54 | )
55 | ratio = 0.05
56 | break
57 | if regions[i] > regions[i + 1]:
58 | ratio += 0.05
59 | else:
60 | break
61 | threshold = ratio * numpy.median(dps)
62 | plt.figure(figsize=(128, 64))
63 | for b in bins:
64 | plt.axvline(b, color="blue")
65 | plt.hist(x=dps, bins=len(dps))
66 | plt.axvline(threshold, color="r")
67 | plt.title("node coverage bar plot")
68 | plt.xticks(numpy.arange(min(dps), max(dps) + 1, 50.0))
69 | plt.savefig("{0}{1}".format(temp_dir, "/tmp/bar_plot.png"))
70 | return threshold
71 |
72 |
73 | def graph_simplification(
74 | graph: Graph,
75 | simp_node_dict: dict,
76 | simp_edge_dict: dict,
77 | contig_dict: dict,
78 | logger: Logger,
79 | min_cov,
80 | ):
81 | """
82 |     Directly remove all vertices with coverage at or below the minimum coverage,
83 |     together with their incident edges.
84 |
85 |     Nodes that belong to any contig are not removed.
86 |     Modifies graph, simp_node_dict, and simp_edge_dict in place.
87 |
88 | """
89 | logger.info("graph simplification")
90 | logger.debug(
91 | "Total nodes: "
92 | + str(len(simp_node_dict))
93 | + " Total edges: "
94 | + str(len(simp_edge_dict))
95 | )
96 | node_to_contig_dict = {}
97 | edge_to_contig_dict = {}
98 | if contig_dict != None:
99 | node_to_contig_dict, edge_to_contig_dict = contig_map_node(contig_dict)
100 |     # single pass over nodes; contig nodes are protected from removal
101 | for id, node in list(simp_node_dict.items()):
102 | if graph.vp.dp[node] <= min_cov:
103 | if id in node_to_contig_dict:
104 | continue
105 |
106 | graph_remove_vertex(graph, simp_node_dict, id, printout=False)
107 |
108 | for e in set(node.all_edges()):
109 | uid = graph.vp.id[e.source()]
110 | vid = graph.vp.id[e.target()]
111 | if (uid, vid) in edge_to_contig_dict:
112 | continue
113 | if (uid, vid) in simp_edge_dict:
114 | graph_remove_edge(graph, simp_edge_dict, uid, vid, printout=False)
115 |
116 | logger.debug(
117 | "Remain nodes: "
118 | + str(len(simp_node_dict))
119 | + " Total edges: "
120 | + str(len(simp_edge_dict))
121 | )
122 | logger.info("done")
123 | return
124 |
125 |
126 | # ------------------------------------LEGACY------------------------------------#
127 | def paths_from_src(graph: Graph, simp_node_dict: dict, self_node, src, maxlen):
128 | """
129 | retrieve all the path from src node to any node
130 | within maxlen restriction, in straight direction
131 | """
132 |
133 | def dfs_rev(graph: Graph, u, curr_path: list, maxlen, visited, all_path):
134 | visited[u] = True
135 | curr_path.append(u)
136 | curr_len = path_len(graph, curr_path)
137 | if curr_len >= maxlen:
138 | all_path.append(list(curr_path))
139 | else:
140 | for v in u.out_neighbors():
141 | if not visited[v]:
142 | dfs_rev(graph, v, curr_path, maxlen, visited, all_path)
143 | curr_path.pop(-1)
144 | visited[u] = False
145 | return
146 |
147 | visited = {}
148 | for u in graph.vertices():
149 | if graph.vp.id[u] not in simp_node_dict:
150 | visited[u] = True
151 | else:
152 | visited[u] = False
153 | visited[self_node] = True
154 | all_path = []
155 | dfs_rev(graph, src, [], maxlen, visited, all_path)
156 | return all_path
157 |
158 |
159 | def paths_to_tgt(graph: Graph, simp_node_dict: dict, self_node, tgt, maxlen):
160 | """
161 | retrieve all the path from any node to tgt node
162 | within maxlen restriction, in reverse direction
163 | """
164 |
165 | def dfs_rev(graph: Graph, v, curr_path: list, maxlen, visited, all_path):
166 | visited[v] = True
167 | curr_path.insert(0, v)
168 | curr_len = path_len(graph, curr_path)
169 | if curr_len >= maxlen:
170 | all_path.append(list(curr_path))
171 | else:
172 | for u in v.in_neighbors():
173 | if not visited[u]:
174 | dfs_rev(graph, u, curr_path, maxlen, visited, all_path)
175 | curr_path.pop(0)
176 | visited[v] = False
177 | return
178 |
179 | visited = {}
180 | for u in graph.vertices():
181 | if graph.vp.id[u] not in simp_node_dict:
182 | visited[u] = True
183 | else:
184 | visited[u] = False
185 | visited[self_node] = True
186 | all_path = []
187 | dfs_rev(graph, tgt, [], maxlen, visited, all_path)
188 | return all_path
189 |
190 |
191 | def tip_removal_s(
192 | graph: Graph,
193 | simp_node_dict: dict,
194 | contig_dict: dict,
195 | logger: Logger,
196 | tempdir,
197 | accept_rate=0.99,
198 | ):
199 | if not graph_is_DAG(graph, simp_node_dict):
200 | logger.info("Graph is Cyclic, tip removal start..")
201 | tip_removed = False
202 | while not tip_removed:
203 | tip_removed = tip_removal(
204 | graph, simp_node_dict, logger, tempdir, accept_rate
205 | )
206 | for cno, [contig, _, ccov] in list(contig_dict.items()):
207 | if not all([no in simp_node_dict for no in contig]):
208 | subcontigs = []
209 | curr_contig = []
210 | addLast = False
211 | for no in contig:
212 | if no in simp_node_dict:
213 | addLast = True
214 | curr_contig.append(no)
215 | else:
216 | addLast = False
217 | if curr_contig != []:
218 | subcontigs.append(curr_contig[:])
219 | curr_contig = []
220 | if addLast:
221 | subcontigs.append(curr_contig[:])
222 |
223 | contig_dict.pop(cno)
224 | for i, subc in enumerate(subcontigs):
225 | sublen = path_len(graph, [simp_node_dict[c] for c in subc])
226 | contig_dict[cno + "^" + str(i)] = [subc, sublen, ccov]
227 | else:
228 | logger.info("Graph is DAG, tip removal skipped.")
229 | logger.info("done")
230 | return
231 |
232 |
233 | def tip_removal(
234 | graph: Graph, simp_node_dict: dict, logger: Logger, tempdir, accept_rate
235 | ):
236 | """
237 |     retrieve all the source/tail simple paths, and merge them into an adjacent neighbor path if possible
238 |
239 |     the collapse step can be done before node depth rebalancing, since it only depends on the
240 |     matching score within the node sequence length
241 |
242 |     if that is the case, then SPAdes contigs may also be modified.
243 | """
244 |
245 | def remove_tip(graph: Graph, simp_node_dict: dict, from_node, to_path):
246 | """
247 | collapse the node with the given path, increment given path depth, remove related information
248 | about the node.
249 | """
250 | graph.vp.color[from_node] = "gray"
251 | pending_dp = graph.vp.dp[from_node]
252 | for node in to_path:
253 | graph.vp.dp[node] += pending_dp
254 | simp_node_dict.pop(graph.vp.id[from_node])
255 | for e in from_node.all_edges():
256 | graph.ep.color[e] = "gray"
257 | logger.debug(
258 | path_to_id_string(
259 | graph,
260 | to_path,
261 | "Tip Node {0} collapsed to path".format(graph.vp.id[from_node]),
262 | )
263 | )
264 | return
265 |
266 | def cand_collapse_path(graph: Graph, from_node, to_paths, temp_dir):
267 |         """
268 |         use minimap2 -c to evaluate the node-path similarity, sorted by matching score in DESC order
269 |
270 |         return: the most similar path if there exists a path with score >= accept_rate, else None
271 |         """
272 | ref_loc = "{0}/ref.fa".format(temp_dir)
273 | query_loc = "{0}/query.fa".format(temp_dir)
274 | overlap_loc = "{0}/overlap.paf".format(temp_dir)
275 | subprocess.check_call(
276 | "touch {0}; echo > {0}; touch {1}; echo > {1}".format(ref_loc, query_loc),
277 | shell=True,
278 | )
279 |
280 | id_path_dict = {}
281 | for id, path in list(enumerate(to_paths)):
282 | id_path_dict[id] = path
283 |
284 | # retrieve all the path information and save into ref.fa
285 | with open(ref_loc, "w") as ref_file:
286 | for id, path in id_path_dict.items():
287 | name = ">" + str(id) + "\n"
288 | seq = path_to_seq(graph, path, id) + "\n"
289 | ref_file.write(name)
290 | ref_file.write(seq)
291 | ref_file.close()
292 |
293 | # save from node info to query.fa
294 | with open(query_loc, "w") as query_file:
295 | name = ">" + graph.vp.id[from_node] + "\n"
296 | seq = path_to_seq(graph, [from_node], name) + "\n"
297 | query_file.write(name)
298 | query_file.write(seq)
299 | query_file.close()
300 |
301 | # minimap to obtain matching score for all node-path
302 | id_evalscore = {}
303 | minimap_api(ref_loc, query_loc, overlap_loc)
304 | with open(overlap_loc, "r") as overlap_file:
305 | for Line in overlap_file:
306 | splited = (Line[:-1]).split("\t")
307 | path_no = int(splited[5])
308 | nmatch = int(splited[9])
309 | nblock = int(splited[10])
310 | if path_no not in id_evalscore:
311 | id_evalscore[path_no] = [nmatch / nblock]
312 | else:
313 | id_evalscore[path_no].append(nmatch / nblock)
314 | overlap_file.close()
315 |
316 | # remove temp file
317 | subprocess.check_call(
318 | "rm {0}; rm {1}; rm {2}".format(ref_loc, query_loc, overlap_loc), shell=True
319 | )
320 |
321 | id_evalscore_sum = []
322 | for id, scores in id_evalscore.items():
323 | mean_score = numpy.mean(scores) if len(scores) != 0 else 0
324 | id_evalscore_sum.append((id, mean_score))
325 |
326 | best_match = sorted(id_evalscore_sum, key=lambda t: t[1], reverse=True)
327 | logger.debug("Tip Node: " + str(graph.vp.id[from_node]) + str(best_match))
328 | if len(best_match) == 0:
329 | return None
330 | elif best_match[0][1] >= accept_rate:
331 | return id_path_dict[best_match[0][0]]
332 | else:
333 | return None
334 |
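    # Score sketch (assumed PAF values): PAF columns 10 and 11 hold the
    # residue-match count and the alignment block length, so a record with
    # nmatch = 980 and nblock = 1000 scores 980 / 1000 = 0.98; the mean score
    # over all alignments to a candidate path must reach accept_rate
    # (0.99 by default) before the tip is collapsed into that path.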
335 | is_removed = True
336 | # get all the source simple path
337 | src_nodes = []
338 | tgt_nodes = []
339 | isolated_node = []
340 | for node in simp_node_dict.values():
341 | if node.in_degree() + node.out_degree() == 0:
342 | isolated_node.append(node)
343 | elif node.in_degree() == 0:
344 | src_nodes.append(node)
345 | elif node.out_degree() == 0:
346 | tgt_nodes.append(node)
347 | else:
348 |             pass
349 |
350 | # src node collapse
351 | src_nodes = sorted(src_nodes, key=lambda x: graph.vp.dp[x])
352 | for src in src_nodes:
353 | src_len = path_len(graph, [src])
354 | potential_paths = []
355 | # path retrieve
356 | for out_branch in src.out_neighbors():
357 | if graph.vp.id[out_branch] not in simp_node_dict:
358 | continue
359 | # print("current out branch: ", graph.vp.id[out_branch])
360 | for in_tgt in out_branch.in_neighbors():
361 | if graph.vp.id[in_tgt] == graph.vp.id[src]:
362 | # coincidence path
363 | continue
364 | if graph.vp.id[in_tgt] not in simp_node_dict:
365 | # collapsed path in previous iteration
366 | continue
367 | # print("current in tgt: ", graph.vp.id[in_tgt])
368 | potential_paths.extend(
369 | paths_to_tgt(graph, simp_node_dict, src, in_tgt, src_len)
370 | )
371 | cand_path = cand_collapse_path(graph, src, potential_paths, tempdir)
372 | if cand_path != None:
373 | remove_tip(graph, simp_node_dict, src, cand_path)
374 | is_removed = False
375 |
376 | # target node collapse
377 | tgt_nodes = sorted(tgt_nodes, key=lambda x: graph.vp.dp[x])
378 | for tgt in tgt_nodes:
379 | tgt_len = path_len(graph, [tgt])
380 | potential_paths = []
381 | # path retrieve
382 | for in_branch in tgt.in_neighbors():
383 | if graph.vp.id[in_branch] not in simp_node_dict:
384 | continue
385 | # print("current in branch: ", graph.vp.id[in_branch])
386 | for out_src in in_branch.out_neighbors():
387 | if graph.vp.id[out_src] == graph.vp.id[tgt]:
388 | # coincidence path
389 | continue
390 | if graph.vp.id[out_src] not in simp_node_dict:
391 | # collapsed path in previous iteration
392 | continue
393 | # print("current out src: ", graph.vp.id[out_src])
394 | potential_paths.extend(
395 | paths_from_src(graph, simp_node_dict, tgt, out_src, tgt_len)
396 | )
397 | cand_path = cand_collapse_path(graph, tgt, potential_paths, tempdir)
398 | if cand_path != None:
399 | remove_tip(graph, simp_node_dict, tgt, cand_path)
400 | is_removed = False
401 | return is_removed
402 |
--------------------------------------------------------------------------------
/utils/VStrains_Alignment.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import os
4 | import time
5 | import subprocess
6 | import numpy
7 | import sys
8 |
9 |
10 | def process_paf_file(
11 | index2id,
12 | index2reflen,
13 | len_index2id,
14 | read_ids,
15 | fwd_paf_file,
16 | rve_paf_file,
17 | split_len,
18 | tid,
19 | ):
20 | print("Batch {0} start".format(tid))
21 | print("current pid: {0}".format(os.getpid()))
22 | start = time.time()
23 |
24 | node_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
25 | short_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
26 |
27 | id2index = {}
28 | for i in range(len_index2id):
29 | id2index[index2id[i]] = i
30 |
31 | read2index = {}
32 | index2read = numpy.array(
33 | [(k, fwdlen, revlen) for (k, _, _, fwdlen, revlen) in read_ids], dtype=int
34 | )
35 |
36 | conf_alns_f = [None for _ in index2read]
37 | # numpy.array([None for _ in index2read], dtype=object)
38 | conf_cords_f = [None for _ in index2read]
39 | # numpy.array([None for _ in index2read], dtype=object)
40 |
41 | conf_alns_r = [None for _ in index2read]
42 | # numpy.array([None for _ in index2read], dtype=object)
43 | conf_cords_r = [None for _ in index2read]
44 | # numpy.array([None for _ in index2read], dtype=object)
45 |
46 | for i, (glb_index, f_local_inds, r_local_inds, _, _) in enumerate(read_ids):
47 | read2index[glb_index] = i
48 | conf_alns_f[i] = [[] for _ in range(f_local_inds)]
49 | # numpy.array([[] for _ in range(f_local_inds)], dtype=object)
50 | conf_cords_f[i] = [[] for _ in range(f_local_inds)]
51 | # numpy.array([[] for _ in range(f_local_inds)], dtype=object)
52 | conf_alns_r[i] = [[] for _ in range(r_local_inds)]
53 | # numpy.array([[] for _ in range(r_local_inds)], dtype=object)
54 | conf_cords_r[i] = [[] for _ in range(r_local_inds)]
55 | # numpy.array([[] for _ in range(r_local_inds)], dtype=object)
56 |
57 | for file in [fwd_paf_file, rve_paf_file]:
58 | with open(file, "r") as fwd_paf:
59 | file_count = 0
60 | for line in fwd_paf:
61 | if line == "\n":
62 | break
63 | splited = (line[:-1]).split("\t")
64 | seg_no = splited[0]
65 | [glb_seg_no, sub_no] = seg_no.split("_")
66 | ref_no = str(splited[5])
67 | ref_start_coord = int(splited[7]) # 0-based
68 | nm = int(splited[10]) - int(splited[9])
69 | if nm == 0 and int(splited[10]) == split_len:
70 | if file == fwd_paf_file:
71 | conf_alns_f[read2index[int(glb_seg_no)]][int(sub_no)].append(
72 | id2index[ref_no]
73 | )
74 | conf_cords_f[read2index[int(glb_seg_no)]][int(sub_no)].append(
75 | ref_start_coord
76 | )
77 | else:
78 | conf_alns_r[read2index[int(glb_seg_no)]][int(sub_no)].append(
79 | id2index[ref_no]
80 | )
81 | conf_cords_r[read2index[int(glb_seg_no)]][int(sub_no)].append(
82 | ref_start_coord
83 | )
84 | file_count += 1
85 | fwd_paf.close()
86 | # print("Batch {0} finished alignment file parsing".format(tid))
87 | subprocess.check_call("rm {0}".format(fwd_paf_file), shell=True)
88 | subprocess.check_call("rm {0}".format(rve_paf_file), shell=True)
89 | # nonunique_counter = 0
90 |
91 | def retrieve_single_end_saturation(glb_index, conf_alns, conf_cords, rlen, ks):
92 | nodes = numpy.zeros(len_index2id, dtype=int)
93 | coords = [None for _ in range(len_index2id)]
94 | kindices = [None for _ in range(len_index2id)]
95 | for i, sub_aln_statuses in enumerate(conf_alns[glb_index]):
96 | # if len(sub_aln_statuses) > 1:
97 | # nonunique_counter += 1
98 | for j, sub_aln_status in enumerate(sub_aln_statuses):
99 | nodes[sub_aln_status] += 1
100 | if coords[sub_aln_status] == None:
101 | coords[sub_aln_status] = conf_cords[glb_index][i][j]
102 | else:
103 | coords[sub_aln_status] = min(
104 | coords[sub_aln_status], conf_cords[glb_index][i][j]
105 | )
106 | if kindices[sub_aln_status] == None:
107 | kindices[sub_aln_status] = i
108 | else:
109 | kindices[sub_aln_status] = min(kindices[sub_aln_status], i)
110 | saturates = []
111 | L = 0
112 | R = 0
113 | for i, v in enumerate(nodes):
114 | if coords[i] == None or kindices[i] == None:
115 | continue
116 | L = max(coords[i], coords[i] - kindices[i])
117 | R = min(coords[i] + index2reflen[i] - 1, coords[i] - kindices[i] + rlen - 1)
118 | saturate = R - L - (split_len - 1) + 1
119 | expected = (min(rlen, index2reflen[i]) - ks + 1) * (rlen - ks) / rlen
120 | if v >= max(min(saturate, expected), 1):
121 | # print(i,v,"passed")
122 | saturates.append(i)
123 | return saturates
124 |
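    # Worked example (made-up numbers): with rlen = 150 and ks = split_len = 129
    # a read yields 150 - 129 + 1 = 22 sub-reads. If its leftmost hit on node i
    # starts at ref coord 100 via sub-read 0 and the node is 500 bp long, then
    # L = 100, R = min(100 + 500 - 1, 100 + 150 - 1) = 249,
    # saturate = 249 - 100 - 128 + 1 = 22 and
    # expected = (150 - 129 + 1) * (150 - 129) / 150 = 3.08, so the read counts
    # for node i only if at least 4 of its 22 sub-reads aligned there perfectly
    # (v >= max(min(22, 3.08), 1)).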
125 | for glb_id, fwdlen, revlen in index2read:
126 | glb_index = read2index[glb_id]
127 | lefts = retrieve_single_end_saturation(
128 | glb_index, conf_alns_f, conf_cords_f, fwdlen, split_len
129 | )
130 | rights = retrieve_single_end_saturation(
131 | glb_index, conf_alns_r, conf_cords_r, revlen, split_len
132 | )
133 |
134 | k = 0
135 | for i in lefts:
136 | for i2 in lefts[k:]:
137 | short_mat[i][i2] += 1
138 | k += 1
139 | k = 0
140 | for j in rights:
141 | for j2 in rights[k:]:
142 | short_mat[j][j2] += 1
143 | k += 1
144 |
145 | for i in lefts:
146 | for j in rights:
147 | node_mat[i][j] += 1
148 |
149 | # free up memory
150 | conf_alns_f[glb_index] = None
151 | conf_alns_r[glb_index] = None
152 |
153 | elapsed = time.time() - start
154 | print("Batch {0} finished".format(tid))
155 | # print("Batch: {0} found non unique kmer count: {1}".format(tid, nonunique_counter))
156 | print("Batch: {0} time spent for processing paf file: {1}".format(tid, elapsed))
157 | return node_mat, short_mat
158 |
159 |
160 | def batch_split(
161 | fwd_file: str,
162 | rve_file: str,
163 | temp_dir: str,
164 | batch_size: int,
165 | do_split: bool,
166 | split_len,
167 | ):
168 |     """split the paired read files into several batches
169 |     Args:
170 |         fwd_file (str): forward read file, FASTQ format
171 |         rve_file (str): reverse read file, FASTQ format
172 |         batch_size (int): maximum number of read pairs per batch
173 |     Returns:
174 |         tuple: per-batch read id summaries and the list of batch file pairs
175 |     """
176 | n_reads = 0
177 | short_reads = 0
178 | used_reads = 0
179 | fkmer = 0
180 | rkmer = 0
181 |
182 | temp_file_fwd = None
183 | temp_file_rve = None
184 | local_reads = 0
185 | local_list = []
186 | batch_count = 0
187 | read_summary = []
188 | sub_files = []
189 | # forward reverse read processing
190 | with open(fwd_file, "r") as fwd:
191 | with open(rve_file, "r") as rve:
192 | fwd_reads = fwd.readlines()
193 | rev_reads = rve.readlines()
194 | total_size = min(len(fwd_reads) // 4, len(rev_reads) // 4)
195 | # marker_test = 1
196 | # total_size = min(marker_test, total_size)
197 | for i in range(total_size):
198 | # if i % batch_size == 0:
199 | # print("Processed {0} reads up to now.".format(i))
200 | [_, fseq, _, feval] = [s[:-1] for s in fwd_reads[i * 4 : (i + 1) * 4]]
201 | [_, rseq, _, reval] = [s[:-1] for s in rev_reads[i * 4 : (i + 1) * 4]]
202 | if fseq.count("N") or rseq.count("N"):
203 | n_reads += 1
204 | elif len(fseq) < split_len or len(rseq) < split_len:
205 | short_reads += 1
206 | else:
207 | used_reads += 1
208 | local_reads += 1
209 | local_list.append((fseq, feval, rseq, reval))
210 | if local_reads == batch_size or (
211 | local_reads > 0 and i == total_size - 1
212 | ):
213 | # file creation
214 | sub_fwd_filename = "{0}/temp_forward_{1}.fastq".format(
215 | temp_dir, batch_count
216 | )
217 | sub_rve_filename = "{0}/temp_reverse_{1}.fastq".format(
218 | temp_dir, batch_count
219 | )
220 |                     subprocess.check_call(
221 |                         "touch {0}; echo > {0}".format(sub_fwd_filename), shell=True
222 |                     )
223 |                     subprocess.check_call(
224 |                         "touch {0}; echo > {0}".format(sub_rve_filename), shell=True
225 |                     )
226 | temp_file_fwd = open(sub_fwd_filename, "w")
227 | temp_file_rve = open(sub_rve_filename, "w")
228 |
229 | read_ids = []
230 | if do_split:
231 | for j, (fseq, feval, rseq, reval) in enumerate(local_list):
232 | fread_id_subs = len(fseq) - split_len + 1
233 | rread_id_subs = len(rseq) - split_len + 1
234 | prefix_name = "@{0}_".format(j)
235 | # forward
236 | for sub_i in range(len(fseq) - split_len + 1):
237 | subfread = fseq[sub_i : sub_i + split_len]
238 | subfeval = feval[sub_i : sub_i + split_len]
239 | temp_file_fwd.write(
240 | prefix_name + "{0} /1\n".format(sub_i)
241 | )
242 | temp_file_fwd.write(subfread + "\n")
243 | temp_file_fwd.write("+\n")
244 | temp_file_fwd.write(subfeval + "\n")
245 | fkmer += len(fseq) - split_len + 1
246 | # reverse
247 | for sub_i in range(len(rseq) - split_len + 1):
248 | subrread = rseq[sub_i : sub_i + split_len]
249 | subreval = reval[sub_i : sub_i + split_len]
250 | temp_file_rve.write(
251 | prefix_name + "{0} /2\n".format(sub_i)
252 | )
253 | temp_file_rve.write(subrread + "\n")
254 | temp_file_rve.write("+\n")
255 | temp_file_rve.write(subreval + "\n")
256 | rkmer += len(rseq) - split_len + 1
257 | read_ids.append(
258 | (j, fread_id_subs, rread_id_subs, len(fseq), len(rseq))
259 | )
260 | else:
261 | for j, (fseq, feval, rseq, reval) in enumerate(local_list):
262 | prefix_name = "@{0}_".format(j)
263 | temp_file_fwd.write(prefix_name + "{0} /1\n".format(0))
264 | temp_file_fwd.write(fseq + "\n")
265 | temp_file_fwd.write("+\n")
266 | temp_file_fwd.write(feval + "\n")
267 |
268 | temp_file_rve.write(prefix_name + "{0} /2\n".format(0))
269 | temp_file_rve.write(rseq + "\n")
270 | temp_file_rve.write("+\n")
271 | temp_file_rve.write(reval + "\n")
272 | read_ids.append((j, 1, 1, len(fseq), len(rseq)))
273 | temp_file_fwd.close()
274 | temp_file_rve.close()
275 | read_summary.append(read_ids)
276 | sub_files.append((sub_fwd_filename, sub_rve_filename))
277 | local_reads = 0
278 | local_list = []
279 | batch_count += 1
280 | fwd.close()
281 | rve.close()
282 |
283 | print("total number of reads (before): ", total_size)
284 | print("total reads containing N: ", n_reads)
285 | print("total reads too short [<{0}]: ".format(split_len), short_reads)
286 | print("total number of reads (used): ", used_reads)
287 | print("total number of forward reads kmer: ", fkmer)
288 | print("total number of reverse reads kmer: ", rkmer)
289 | return read_summary, sub_files
290 |
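# Naming sketch: read pair j of a batch is written out as sub-reads
# "@j_0 /1", "@j_1 /1", ... (forward) and "@j_0 /2", ... (reverse), which is
# how process_paf_file recovers (glb_seg_no, sub_no) by splitting the PAF
# query name on "_"; e.g. a query named "12_3" maps back to read 12,
# sub-read 3 of its batch.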
291 |
292 | def minimap_alignment(fasta_file, sub_files, temp_dir):
293 | paf_files = []
294 | for i, (sub_fwd_filename, sub_rve_filename) in enumerate(sub_files):
295 | print(
296 | "minimap reads {0},{1} to graph..".format(
297 | sub_fwd_filename, sub_rve_filename
298 | )
299 | )
300 | start = time.time()
301 | sub_fwd_paf = "{0}/temp_fwd_aln_{1}.paf".format(temp_dir, i)
302 | subprocess.check_call(
303 | "minimap2 -c -t 16 {0} {1} > {2}".format(
304 | fasta_file, sub_fwd_filename, sub_fwd_paf
305 | ),
306 | shell=True,
307 | )
308 | # -B 40 -O 20,50 -E 30,10 -z 1,1 -k 27 -w 18 -s 256
309 | subprocess.check_call("rm {0}".format(sub_fwd_filename), shell=True)
310 |
311 | sub_rve_paf = "{0}/temp_rve_aln_{1}.paf".format(temp_dir, i)
312 | subprocess.check_call(
313 | "minimap2 -c -t 16 {0} {1} > {2}".format(
314 | fasta_file, sub_rve_filename, sub_rve_paf
315 | ),
316 | shell=True,
317 | )
318 | subprocess.check_call("rm {0}".format(sub_rve_filename), shell=True)
319 |
320 | paf_files.append((sub_fwd_paf, sub_rve_paf))
321 | elapsed = time.time() - start
322 | print("Time spent for minimap2: ", elapsed)
323 | return paf_files
324 |
325 |
326 | def main():
327 | print(
328 | "----------------------Paired-End Information Alignment----------------------"
329 | )
330 | parser = argparse.ArgumentParser(
331 | prog="pe_info",
332 | description="""Align Paired-End reads to nodes in graph to obtain strong links""",
333 | )
334 |
335 |     parser.add_argument(
336 |         "-g", "--gfa", dest="gfa", type=str, required=True, help="graph, .gfa format"
337 |     )
338 |
339 | parser.add_argument(
340 | "-o",
341 | "--output_dir",
342 | dest="dir",
343 | type=str,
344 | required=True,
345 | help="output directory",
346 | )
347 |
348 | parser.add_argument(
349 | "-f", "--forward", dest="fwd", required=True, help="forward read, .fastq"
350 | )
351 |
352 | parser.add_argument(
353 | "-r", "--reverse", dest="rve", required=True, help="reverse read, .fastq"
354 | )
355 |
356 | parser.add_argument(
357 | "-k",
358 | "--kmer_size",
359 | dest="kmer_size",
360 | type=int,
361 | default=128,
362 | help="unique kmer size",
363 | )
364 |
365 | args = parser.parse_args()
366 |
367 | # initialize output directory
368 | if args.dir[-1] == "/":
369 | args.dir = args.dir[:-1]
370 | subprocess.check_call("rm -rf {0}".format(args.dir), shell=True)
371 | os.makedirs(args.dir, exist_ok=True)
372 |
373 | glb_start = time.time()
374 | tmp_g2s_file = "{0}/temp_graph_seq.fasta".format(args.dir)
375 |
376 | # convert gfa to fasta file
377 | index2id = []
378 | index2reflen = []
379 | with open(args.gfa, "r") as gfa:
380 | with open(tmp_g2s_file, "w") as fasta:
381 | for Line in gfa:
382 | splited = (Line[:-1]).split("\t")
383 | if splited[0] == "S":
384 | fasta.write(">{0}\n{1}\n".format(splited[1], splited[2]))
385 | index2id.append(splited[1])
386 | index2reflen.append(len(splited[2]))
387 | fasta.close()
388 | gfa.close()
389 |
390 | split_len = args.kmer_size + 1
391 | # split reads to several batches
392 | read_summary, sub_files = batch_split(
393 | args.fwd, args.rve, args.dir, 40000, True, split_len
394 | )
395 | # minimap2 reads to fasta file
396 | paf_files = minimap_alignment(tmp_g2s_file, sub_files, args.dir)
397 |
398 | len_index2id = len(index2id)
399 | node_mats = []
400 | strand_mats = []
401 |
402 | for i in range(len(paf_files)):
403 | (node_mat, strand_mat) = process_paf_file(
404 | index2id,
405 | index2reflen,
406 | len_index2id,
407 | read_summary[i],
408 | paf_files[i][0],
409 | paf_files[i][1],
410 | split_len,
411 | i,
412 | )
413 | node_mats.append(node_mat)
414 | strand_mats.append(strand_mat)
415 |
416 | print("All processes have finished their job, combine the result.")
417 | # combine all the outputs
418 | glb_node_mat = numpy.sum(numpy.array(node_mats), axis=0)
419 | glb_strand_mat = numpy.sum(numpy.array(strand_mats), axis=0)
420 | out_file = "{0}/pe_info".format(args.dir)
421 | out_file2 = "{0}/st_info".format(args.dir)
422 |     subprocess.check_call("touch {0}; echo > {0}".format(out_file), shell=True)
423 | with open(out_file, "w") as outfile:
424 | with open(out_file2, "w") as outfile2:
425 | for i in range(len_index2id):
426 | for j in range(len_index2id):
427 | outfile.write(
428 | "{0}:{1}:{2}\n".format(
429 | index2id[i], index2id[j], glb_node_mat[i][j]
430 | )
431 | )
432 | outfile2.write(
433 | "{0}:{1}:{2}\n".format(
434 | index2id[i], index2id[j], glb_strand_mat[i][j]
435 | )
436 | )
437 | outfile2.close()
438 | outfile.close()
439 |
440 | glb_elapsed = time.time() - glb_start
441 | print("Global time elapsed: ", glb_elapsed)
442 | print("result stored in: ", out_file)
443 |
444 |
445 | if __name__ == "__main__":
446 | main()
447 | sys.exit(0)
448 |
--------------------------------------------------------------------------------
/utils/VStrains_IO.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from logging import Logger
4 | from graph_tool.all import Graph
5 | import gfapy
6 | import subprocess
7 | import sys
8 | import re
9 |
10 | from utils.VStrains_Utilities import *
11 |
12 |
13 | def init_graph():
14 | graph = Graph(directed=True)
15 | graph.vp.seq = graph.new_vertex_property("string", val="")
16 | graph.vp.dp = graph.new_vertex_property("double")
17 | graph.vp.id = graph.new_vertex_property("string", val="UD")
18 | graph.vp.color = graph.new_vertex_property("string")
19 |
20 | graph.ep.overlap = graph.new_edge_property("int", val=0)
21 | graph.ep.flow = graph.new_edge_property("double", val=0.0)
22 | graph.ep.color = graph.new_edge_property("string")
23 |
24 | return graph
25 |
26 |
27 | def gfa_to_graph(gfa_file, logger: Logger, init_ori=1):
28 |     """
29 |     Convert an assembly graph gfa file to a graph
30 |     Nodes: each segment yields two vertices, one per orientation (+/-)
31 |     """
32 |
33 | logger.info("Parsing GFA format graph")
34 | gfa = gfapy.Gfa().from_file(filename=gfa_file)
35 | logger.info(
36 | "Parsed gfa file length: {0}, version: {1}".format(len(gfa.lines), gfa.version)
37 | )
38 |
39 | graph = init_graph()
40 | graph.vp.visited = graph.new_vertex_property("int16_t", val=0)
41 | graph.vp.ori = graph.new_vertex_property("int16_t") # 1 = +, -1 = -
42 |
43 | graph.ep.visited = graph.new_edge_property("int", val=0)
44 |
45 | # S
46 | node_dict = {}
47 | dp_dict = {}
48 | edge_dict = {}
49 | for line in gfa.segments:
50 | # segment, convert into Node^- and Node^+
51 | [t, seg_no, seg] = (str(line).split("\t"))[:3]
52 | tags = (str(line).split("\t"))[3:]
53 | dp_float = 0
54 | ln = 0
55 | kc = 0
56 | for tag in tags:
57 | if tag.startswith("dp") or tag.startswith("DP"):
58 | dp_float = float(tag.split(":")[2])
59 | break
60 | if tag.startswith("ln") or tag.startswith("LN"):
61 | ln = int(tag.split(":")[2])
62 | if tag.startswith("kc") or tag.startswith("KC"):
63 | kc = int(tag.split(":")[2])
64 | if ln != 0 and kc != 0:
65 | break
66 |
67 | # gfa format check
68 | if t != "S" or (dp_float == 0 and (ln == 0 or kc == 0)):
69 | logger.error(
70 | "file: {0}, Illegal graph format, please double check if the graph has been contaminated".format(
71 | gfa_file
72 | )
73 | )
74 | sys.exit(1)
75 |
76 | if dp_float == 0:
77 | dp_float = kc / ln
78 |
79 | v_pos = graph.add_vertex()
80 | graph.vp.seq[v_pos] = seg
81 | graph.vp.dp[v_pos] = dp_float
82 | graph.vp.id[v_pos] = seg_no
83 | graph.vp.ori[v_pos] = 1
84 | graph.vp.visited[v_pos] = -1
85 | graph.vp.color[v_pos] = "black"
86 |
87 | v_neg = graph.add_vertex()
88 | graph.vp.seq[v_neg] = reverse_seq(seg)
89 | graph.vp.dp[v_neg] = dp_float
90 | graph.vp.id[v_neg] = seg_no
91 | graph.vp.ori[v_neg] = -1
92 | graph.vp.visited[v_neg] = -1
93 | graph.vp.color[v_neg] = "black"
94 |
95 | node_dict[seg_no] = (v_pos, v_neg)
96 | dp_dict[seg_no] = dp_float
97 | # L
98 | for edge in gfa.edges:
99 | [t, seg_no_l, ori_l, seg_no_r, ori_r] = (str(edge).split("\t"))[:5]
100 | tags = (str(edge).split("\t"))[5:]
101 | overlap_len = [tag for tag in tags if tag.endswith("m") or tag.endswith("M")][0]
102 | # gfa format check
103 | assert t == "L" and overlap_len[-1] == "M"
104 |
105 | u_pos, u_neg = node_dict[seg_no_l]
106 | v_pos, v_neg = node_dict[seg_no_r]
107 | u = u_pos if ori_l == "+" else u_neg
108 | v = v_pos if ori_r == "+" else v_neg
109 |
110 | if (seg_no_l, graph.vp.ori[u], seg_no_r, graph.vp.ori[v]) in edge_dict:
111 | logger.error(
112 | "parallel edge found, invalid case in assembly graph, please double-check the assembly graph format"
113 | )
114 | logger.error("Pipeline aborted")
115 | sys.exit(1)
116 |
117 | if seg_no_l == seg_no_r:
118 | graph.vp.seq[u] = str.lower(graph.vp.seq[u])
119 | graph.vp.seq[v] = str.lower(graph.vp.seq[v])
120 | continue
121 |
122 | e = graph.add_edge(source=u, target=v)
123 | graph.ep.overlap[e] = int(overlap_len[:-1])
124 | graph.ep.color[e] = "black"
125 |
126 | edge_dict[(seg_no_l, graph.vp.ori[u], seg_no_r, graph.vp.ori[v])] = e
127 |
128 | graph, simp_node_dict, simp_edge_dict = flip_graph_bfs(
129 | graph, node_dict, edge_dict, dp_dict, logger, init_ori
130 | )
131 | red_graph, red_node_dict, red_edge_dict = reduce_graph(
132 | graph, simp_node_dict, simp_edge_dict
133 | )
134 | return red_graph, red_node_dict, red_edge_dict
135 |
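# Depth fallback sketch (made-up S-line): a segment carrying "LN:i:1200" and
# "KC:i:36000" but no DP/dp tag is assigned depth kc / ln = 36000 / 1200 = 30.0
# in the loop above, and both of its orientation vertices share that depth.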
136 |
137 | def flip_graph_bfs(
138 | graph: Graph,
139 | node_dict: dict,
140 | edge_dict: dict,
141 | dp_dict: dict,
142 | logger: Logger,
143 | init_ori=1,
144 | ):
145 |     """
146 |     Flip all node orientations via BFS.
147 |
148 |     return a node_dict which keeps only one orientation per node for simplicity:
149 |     the picked orientation is renamed to positive and its opposite is forbidden.
150 |     """
151 |
152 | def source_node_via_dp(dp_dict: dict):
153 | """
154 | return the pos-neg node with maximum depth
155 | """
156 | return max(dp_dict, key=dp_dict.get)
157 |
158 | def reverse_edge(graph: Graph, edge, node_dict: dict, edge_dict: dict):
159 | """
160 | reverse an edge with altered orientation and direction.
161 | """
162 | tmp_s = edge.source()
163 | tmp_t = edge.target()
164 |
165 | edge_dict.pop(
166 | (
167 | graph.vp.id[tmp_s],
168 | graph.vp.ori[tmp_s],
169 | graph.vp.id[tmp_t],
170 | graph.vp.ori[tmp_t],
171 | )
172 | )
173 |
174 | tmp_s_pos, tmp_s_neg = node_dict[graph.vp.id[tmp_s]]
175 | tmp_t_pos, tmp_t_neg = node_dict[graph.vp.id[tmp_t]]
176 | s = tmp_t_pos if graph.vp.ori[tmp_t] == -1 else tmp_t_neg
177 | t = tmp_s_pos if graph.vp.ori[tmp_s] == -1 else tmp_s_neg
178 |
179 | o = graph.ep.overlap[edge]
180 | graph.remove_edge(edge)
181 | e = graph.add_edge(s, t)
182 | graph.ep.overlap[e] = o
183 | edge_dict[
184 | (graph.vp.id[s], graph.vp.ori[s], graph.vp.id[t], graph.vp.ori[t])
185 | ] = e
186 |
187 | return graph, e, edge_dict
188 |
189 | logger.info("flip graph orientation..")
190 | pick_dict = {}
191 |     while dp_dict:
192 | seg_no = source_node_via_dp(dp_dict)
193 | source_pos, source_neg = node_dict[seg_no]
194 | graph.vp.visited[source_pos] = 0
195 | graph.vp.visited[source_neg] = 0
196 | fifo_queue = [[node_dict[seg_no], init_ori]]
197 |
198 | while fifo_queue:
199 | (v_pos, v_neg), ori = fifo_queue.pop()
200 | dp_dict.pop(graph.vp.id[v_pos])
201 |
202 | u = None
203 | if ori == 1:
204 | u = v_pos
205 | pick_dict[graph.vp.id[u]] = "+"
206 | # print_vertex(graph, v_neg, "node to reverse pos")
207 | for e in set(v_neg.all_edges()):
208 | graph, r_e, edge_dict = reverse_edge(graph, e, node_dict, edge_dict)
209 | # print_edge(graph, r_e, "after reverse: ")
210 | else:
211 | u = v_neg
212 | pick_dict[graph.vp.id[u]] = "-"
213 | # print_vertex(graph, v_pos, "node to reverse neg")
214 | for e in set(v_pos.all_edges()):
215 | graph, r_e, edge_dict = reverse_edge(graph, e, node_dict, edge_dict)
216 | # print_edge(graph, r_e, "after reverse: ")
217 |
218 | graph.vp.visited[v_pos] = 1
219 | graph.vp.visited[v_neg] = 1
220 | # add further nodes into the fifo_queue
221 | for adj_node in u.all_neighbors():
222 | if graph.vp.visited[adj_node] == -1:
223 | vpos, vneg = node_dict[graph.vp.id[adj_node]]
224 | graph.vp.visited[vpos] = 0
225 | graph.vp.visited[vneg] = 0
226 | # print("appending node {0} to queue".format(graph.vp.id[adj_node]))
227 | fifo_queue.append(
228 | [node_dict[graph.vp.id[adj_node]], graph.vp.ori[adj_node]]
229 | )
230 |
231 | # verify sorted graph
232 | logger.info("final verifying graph..")
233 | assert len(pick_dict) == len(node_dict)
234 | for key, item in list(pick_dict.items()):
235 | v_pos, v_neg = node_dict[key]
236 | if item == "+":
237 | # FIXME split v_neg to a new node
238 | if v_neg.in_degree() + v_neg.out_degree() > 0:
239 | print_vertex(
240 | graph, v_neg, logger, "pick ambiguous found, pick both, split node"
241 | )
242 | pick_dict[key] = "t"
243 | else:
244 | # FIXME split v_neg to a new node
245 | if v_pos.in_degree() + v_pos.out_degree() > 0:
246 | print_vertex(
247 | graph, v_pos, logger, "pick ambiguous found, pick both, split node"
248 | )
249 | pick_dict[key] = "t"
250 | logger.info("Graph is verified")
251 |
252 | simp_node_dict = {}
253 | for seg_no, pick in pick_dict.items():
254 | if pick == "+":
255 | simp_node_dict[seg_no] = node_dict[seg_no][0]
256 | elif pick == "-":
257 | simp_node_dict["-" + seg_no] = node_dict[seg_no][1]
258 | graph.vp.id[node_dict[seg_no][1]] = "-" + seg_no
259 | else:
260 | simp_node_dict[seg_no] = node_dict[seg_no][0]
261 | graph.vp.id[node_dict[seg_no][0]] = seg_no
262 | simp_node_dict["-" + seg_no] = node_dict[seg_no][1]
263 | graph.vp.id[node_dict[seg_no][1]] = "-" + seg_no
264 |
265 | simp_edge_dict = {}
266 | for e in edge_dict.values():
267 | simp_edge_dict[(graph.vp.id[e.source()], graph.vp.id[e.target()])] = e
268 | logger.info("done")
269 | return graph, simp_node_dict, simp_edge_dict
270 |
271 |
272 | def reduce_graph(unsimp_graph: Graph, simp_node_dict: dict, simp_edge_dict: dict):
273 | graph = init_graph()
274 | red_node_dict = {}
275 | red_edge_dict = {}
276 |
277 | for no, node in simp_node_dict.items():
278 | v = graph.add_vertex()
279 | graph.vp.seq[v] = unsimp_graph.vp.seq[node]
280 | graph.vp.dp[v] = unsimp_graph.vp.dp[node]
281 | graph.vp.id[v] = unsimp_graph.vp.id[node]
282 | graph.vp.color[v] = "black"
283 | red_node_dict[no] = v
284 |
285 | for (u, v), e in simp_edge_dict.items():
286 | source = red_node_dict[u]
287 | sink = red_node_dict[v]
288 |
289 | re = graph.add_edge(source, sink)
290 | graph.ep.overlap[re] = unsimp_graph.ep.overlap[e]
291 | graph.ep.flow[re] = unsimp_graph.ep.flow[e]
292 | graph.ep.color[re] = "black"
293 | red_edge_dict[(u, v)] = re
294 |
295 | return graph, red_node_dict, red_edge_dict
296 |
297 |
298 | def flipped_gfa_to_graph(gfa_file, logger: Logger):
299 | """
300 |     read a flipped gfa format graph in.
301 | """
302 | logger.debug("Parsing GFA format graph")
303 | gfa = gfapy.Gfa().from_file(filename=gfa_file)
304 | logger.debug(
305 | "Parsed gfa file length: {0}, version: {1}".format(len(gfa.lines), gfa.version)
306 | )
307 |
308 | graph = init_graph()
309 | red_node_dict = {}
310 | red_edge_dict = {}
311 |
312 | # S
313 | for line in gfa.segments:
314 | [_, seg_no, seg, dp] = str(line).split("\t")
315 | dp_float = float(dp.split(":")[2])
316 | v = graph.add_vertex()
317 | graph.vp.seq[v] = seg
318 | graph.vp.dp[v] = dp_float
319 | graph.vp.id[v] = seg_no
320 | graph.vp.color[v] = "black"
321 | red_node_dict[seg_no] = v
322 | # L
323 | for edge in gfa.edges:
324 | [_, seg_no_l, ori_l, seg_no_r, ori_r, overlap_len] = str(edge).split("\t")
325 | source = red_node_dict[seg_no_l]
326 | sink = red_node_dict[seg_no_r]
327 |
328 | assert overlap_len[-1] == "M" and ori_l == ori_r
329 | re = graph.add_edge(source, sink)
330 | graph.ep.overlap[re] = int(overlap_len[:-1])
331 | graph.ep.color[re] = "black"
332 | red_edge_dict[(seg_no_l, seg_no_r)] = re
333 |
334 | return graph, red_node_dict, red_edge_dict
335 |
336 |
337 | def graph_to_gfa(
338 | graph: Graph, simp_node_dict: dict, simp_edge_dict: dict, logger: Logger, filename
339 | ):
340 | """
341 |     store the flipped graph as a simplified gfa file.
342 | """
343 | subprocess.check_call("touch {0}; echo > {0}".format(filename), shell=True)
344 |
345 | with open(filename, "w") as gfa:
346 | for v in simp_node_dict.values():
347 | if graph.vp.color[v] == "black":
348 | name = graph.vp.id[v]
349 | gfa.write(
350 | "S\t{0}\t{1}\tDP:f:{2}\n".format(
351 | name, graph.vp.seq[v], graph.vp.dp[v]
352 | )
353 | )
354 |
355 | for (u, v), e in simp_edge_dict.items():
356 | node_u = simp_node_dict[u] if u in simp_node_dict else None
357 | node_v = simp_node_dict[v] if v in simp_node_dict else None
358 |
359 | if node_u == None or node_v == None:
360 | continue
361 | if graph.vp.color[node_u] != "black" or graph.vp.color[node_v] != "black":
362 | continue
363 | if graph.ep.color[e] != "black":
364 | continue
365 | gfa.write(
366 | "L\t{0}\t{1}\t{2}\t{3}\t{4}M\n".format(
367 | u, "+", v, "+", graph.ep.overlap[e]
368 | )
369 | )
370 | gfa.close()
371 | logger.info(filename + " is stored..")
372 | return 0
373 |
374 |
375 | def is_valid(p: list, idx_mapping: dict, simp_node_dict: dict, simp_edge_dict: dict):
376 | if len(p) == 0:
377 | return False
378 | if len(p) == 1:
379 | if p[0] not in idx_mapping:
380 | return False
381 | if idx_mapping[p[0]] not in simp_node_dict:
382 | return False
383 | return True
384 | for i in range(len(p) - 1):
385 | if p[i] not in idx_mapping or p[i + 1] not in idx_mapping:
386 | return False
387 | mu = idx_mapping[p[i]]
388 | mv = idx_mapping[p[i + 1]]
389 | if mu not in simp_node_dict:
390 | return False
391 | if mv not in simp_node_dict:
392 | return False
393 | if (mu, mv) not in simp_edge_dict:
394 | return False
395 | return True
396 |
397 |
398 | def spades_paths_parser(
399 | graph: Graph,
400 | simp_node_dict: dict,
401 | simp_edge_dict: dict,
402 | idx_mapping: dict,
403 | logger: Logger,
404 | path_file,
405 | min_len=250,
406 | min_cov=0,
407 | ):
408 | """
409 |     Map SPAdes contigs onto the graph and return all suitable contigs.
410 | """
411 |
412 | def get_paths(fd, path):
413 | subpaths = []
414 | total_nodes = 0
415 | while path.endswith(";\n"):
416 | subpath = str(path[:-2]).split(",")
417 | subpath = list(
418 | map(
419 | lambda v: str(v[:-1]) if v[-1] == "+" else "-" + str(v[:-1]),
420 | subpath,
421 | )
422 | )
423 | subpathred = list(dict.fromkeys(subpath))
424 | # validity check
425 | if is_valid(subpathred, idx_mapping, simp_node_dict, simp_edge_dict):
426 | subpath = list(map(lambda v: idx_mapping[v], subpath))
427 | subpaths.append(subpath)
428 | total_nodes += len(subpath)
429 | path = fd.readline()
430 |
431 | subpath = path.rstrip().split(",")
432 | subpath = list(
433 | map(lambda v: str(v[:-1]) if v[-1] == "+" else "-" + str(v[:-1]), subpath)
434 | )
435 | subpathred = list(dict.fromkeys(subpath))
436 | # validity check
437 | if is_valid(subpathred, idx_mapping, simp_node_dict, simp_edge_dict):
438 | subpath = list(map(lambda v: idx_mapping[v], subpath))
439 | subpaths.append(subpath)
440 | total_nodes += len(subpath)
441 |
442 | return subpaths, total_nodes
443 |
444 | logger.info("parsing SPAdes .paths file..")
445 | contig_dict = {}
446 | contig_info = {}
447 | try:
448 | with open(path_file, "r") as contigs_file:
449 | name = contigs_file.readline()
450 | path = contigs_file.readline()
451 |
452 | while name != "" and path != "":
453 | (cno, clen, ccov) = re.search(
454 | "%s(.*)%s(.*)%s(.*)" % ("NODE_", "_length_", "_cov_"), name.strip()
455 | ).group(1, 2, 3)
456 | subpaths, total_nodes = get_paths(contigs_file, path)
457 |
458 | name_r = contigs_file.readline()
459 | path_r = contigs_file.readline()
460 | (cno_r, clen_r, ccov_r) = re.search(
461 | "%s(.*)%s(.*)%s(.*)%s" % ("NODE_", "_length_", "_cov_", "'"),
462 | name_r.strip(),
463 | ).group(1, 2, 3)
464 | subpaths_r, total_nodes_r = get_paths(contigs_file, path_r)
465 |
466 | if not (cno == cno_r and clen == clen_r and ccov == ccov_r):
467 | raise BaseException
468 |
469 | # next contig group
470 | name = contigs_file.readline()
471 | path = contigs_file.readline()
472 |
473 | # pick one direction only
474 | (segments, total_n) = max(
475 | [(subpaths, total_nodes), (subpaths_r, total_nodes_r)],
476 | key=lambda t: t[1],
477 | )
478 |
479 | # filter contig
480 | if segments == []:
481 | continue
482 | if total_n < 2 and (float(ccov) <= min_cov or int(clen) < min_len):
483 | continue
484 | for i, subpath in enumerate(segments):
485 | repeat_dict = {}
486 | for k in subpath:
487 | if k not in repeat_dict:
488 | repeat_dict[k] = 1
489 | else:
490 | repeat_dict[k] += 1
491 | subpath = list(dict.fromkeys(subpath))
492 |
493 | if len(segments) != 1:
494 | contig_dict[cno + "$" + str(i)] = [
495 | subpath,
496 | path_len(graph, [simp_node_dict[id] for id in subpath]),
497 | float(ccov),
498 | ]
499 | contig_info[cno + "$" + str(i)] = (None, repeat_dict)
500 | else:
501 | contig_dict[cno] = [subpath, int(clen), float(ccov)]
502 | contig_info[cno] = (None, repeat_dict)
503 |
504 | contigs_file.close()
505 | except BaseException as err:
506 |         logger.error(
507 |             str(err)
508 |             + "\nPlease make sure the correct SPAdes contigs .paths file is provided."
509 |         )
510 | logger.error("Pipeline aborted")
511 | sys.exit(1)
512 | logger.debug(str(contig_dict))
513 | logger.debug(str(contig_info))
514 | logger.info("done")
515 | return contig_dict, contig_info
516 |
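# Parsing sketch (made-up .paths entry): a header line
# "NODE_3_length_2403_cov_30.5" yields (cno, clen, ccov) = ("3", "2403", "30.5");
# its path line "5+,7-,2+" becomes the id list ["5", "-7", "2"], and a path
# spread over several ";"-terminated lines is stored as one subpath per line
# under the keys "3$0", "3$1", ...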
517 |
518 | def contig_dict_to_fasta(
519 | graph: Graph, simp_node_dict: dict, contig_dict: dict, output_file
520 | ):
521 | """
522 |     Store contig dict into a fasta file
523 | """
524 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True)
525 |
526 | with open(output_file, "w") as fasta:
527 | for cno, (contig, clen, ccov) in sorted(
528 | contig_dict.items(), key=lambda x: x[1][1], reverse=True
529 | ):
530 | contig_name = (
531 | ">" + str(cno) + "_" + str(clen) + "_" + str(round(ccov, 2)) + "\n"
532 | )
533 | seq = path_ids_to_seq(graph, contig, contig_name, simp_node_dict) + "\n"
534 | fasta.write(contig_name)
535 | fasta.write(seq)
536 | fasta.close()
537 |
538 |
539 | def strain_dict_to_fasta(strain_dict: dict, output_file):
540 | """
541 |     Store strain dict into a fasta file
542 | """
543 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True)
544 |
545 | with open(output_file, "w") as fasta:
546 | for cno, (sseq, clen, ccov) in sorted(
547 | strain_dict.items(), key=lambda x: x[1][1], reverse=True
548 | ):
549 | contig_name = (
550 | ">" + str(cno) + "_" + str(clen) + "_" + str(round(ccov, 2)) + "\n"
551 | )
552 | seq = sseq + "\n"
553 | fasta.write(contig_name)
554 | fasta.write(seq)
555 | fasta.close()
556 |
557 |
558 | def contig_dict_to_path(
559 | contig_dict: dict, output_file, id_mapping: dict = None, keep_original=False
560 | ):
561 | """
562 |     Store contig dict into a SPAdes-style .paths file
563 | """
564 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True)
565 | rev_id_mapping = {}
566 | if id_mapping != None:
567 | for id, map in id_mapping.items():
568 | rev_id_mapping[map] = id
569 | with open(output_file, "w") as paths:
570 | for cno, (contig, clen, ccov) in sorted(
571 | contig_dict.items(), key=lambda x: x[1][1], reverse=True
572 | ):
573 | contig_name = "NODE_" + str(cno) + "_" + str(clen) + "_" + str(ccov) + "\n"
574 | path_ids = ""
575 | for id in contig:
576 | if keep_original:
577 | for iid in str(id).split("&"):
578 | if iid.find("*") != -1:
579 | rid = rev_id_mapping[iid[: iid.find("*")]]
580 | else:
581 | rid = rev_id_mapping[iid]
582 | if rid[0] == "-":
583 | rid = rid[1:] + "-"
584 | path_ids += rid + ","
585 | else:
586 | for iid in str(id).split("&"):
587 | if iid.find("*") != -1:
588 | rid = iid[: iid.find("*")]
589 | else:
590 | rid = iid
591 | path_ids += str(rid) + ","
592 | path_ids = path_ids[:-1] + "\n"
593 | paths.write(contig_name)
594 | paths.write(path_ids)
595 | paths.close()
596 |
597 |
598 | def process_pe_info(node_ids, pe_info_file, st_info_file):
599 | pe_info = {}
600 | for u in node_ids:
601 | for v in node_ids:
602 | pe_info[(min(u, v), max(u, v))] = 0
603 | with open(pe_info_file, "r") as file:
604 | for line in file:
605 | if line == "\n":
606 | break
607 | [u, v, mark] = line[:-1].split(":")[:3]
608 | # bidirection
609 | key = (min(u, v), max(u, v))
610 | if pe_info.get(key) != None:
611 | pe_info[key] += int(mark)
612 | file.close()
613 |
614 | with open(st_info_file, "r") as file:
615 | for line in file:
616 | if line == "\n":
617 | break
618 | [u, v, mark] = line[:-1].split(":")[:3]
619 | # bidirection
620 | key = (min(u, v), max(u, v))
621 | if pe_info.get(key) != None:
622 | pe_info[key] += int(mark)
623 | file.close()
624 | dcpy_pe_info = {}
625 | for (uid, wid), u in pe_info.items():
626 | dcpy_pe_info[(uid, wid)] = u
627 | return pe_info, dcpy_pe_info
628 |
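# Format sketch: pe_info and st_info lines read "u:v:count"; counts for (u, v)
# and (v, u) are folded into one symmetric key, e.g. the lines "2:5:3" and
# "5:2:1" both accumulate into pe_info[("2", "5")], giving 4.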
629 |
630 | def store_reinit_graph(
631 | graph: Graph,
632 | simp_node_dict: dict,
633 | simp_edge_dict: dict,
634 | logger: Logger,
635 | opt_filename,
636 | ):
637 | graph_to_gfa(graph, simp_node_dict, simp_edge_dict, logger, opt_filename)
638 | grapho, simp_node_dicto, simp_edge_dicto = flipped_gfa_to_graph(
639 | opt_filename, logger
640 | )
641 | assign_edge_flow(grapho, simp_node_dicto, simp_edge_dicto)
642 | return grapho, simp_node_dicto, simp_edge_dicto
643 |
--------------------------------------------------------------------------------
/utils/VStrains_Extension.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 |
4 | from graph_tool.all import Graph
5 | from utils.VStrains_Utilities import *
6 | from utils.VStrains_Decomposition import get_non_trivial_branches, global_trivial_split
7 | from utils.VStrains_IO import store_reinit_graph
8 |
9 |
10 | def best_matching(
11 | graph: Graph,
12 | simp_node_dict: dict,
13 | simp_edge_dict: dict,
14 | contig_dict: dict,
15 | pe_info: dict,
16 | logger: Logger,
17 | ):
18 | full_link = {}
19 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
20 | node_to_contig_dict, _ = contig_map_node(contig_dict)
21 | for no, node in non_trivial_branches.items():
22 | us = [graph.vp.id[src] for src in node.in_neighbors()]
23 | ws = [graph.vp.id[tgt] for tgt in node.out_neighbors()]
24 | logger.debug("---------------------------------------------")
25 | logger.debug(
26 | "current non trivial branch: {0}, in-degree: {1}, out-degree: {2}".format(
27 | no, len(us), len(ws)
28 | )
29 | )
30 | # add contig supports
31 | support_contigs = node_to_contig_dict.get(no, [])
32 | con_info = {}
33 | for cno in support_contigs:
34 | [contig, clen, ccov] = contig_dict[cno]
35 | loc = contig.index(no)
36 | if loc > 0 and loc < len(contig) - 1:
37 | con_info[(contig[loc - 1], contig[loc + 1])] = con_info.get(
38 | (contig[loc - 1], contig[loc + 1]), []
39 | )
40 | con_info[(contig[loc - 1], contig[loc + 1])].append((cno, clen, ccov))
41 | print_contig(
42 | cno,
43 | clen,
44 | round(ccov, 2),
45 | contig[max(loc - 1, 0) : loc + 2],
46 | logger,
47 | "support contig",
48 | )
49 | kept_link = {}
50 | sec_comb = []
51 | # init node usage for current branch
52 | in_usage = dict.fromkeys(us, 0)
53 | out_usage = dict.fromkeys(ws, 0)
54 |
55 | # align contig link first, and update status
56 | logger.debug("align contig link first")
57 | for uid in us:
58 | for wid in ws:
59 | logger.debug("---------------------")
60 | u = simp_node_dict[uid]
61 | w = simp_node_dict[wid]
62 | curr_pe = pe_info[(min(uid, wid), max(uid, wid))]
63 |
64 | logger.debug("{0} -> {1} PE: {2}".format(uid, wid, curr_pe))
65 | logger.debug(
66 | "cov info: {0}[{1}] -> {2}[{3}]".format(
67 | graph.ep.flow[graph.edge(u, node)],
68 | pe_info[(min(uid, no), max(uid, no))],
69 | graph.ep.flow[graph.edge(node, w)],
70 | pe_info[(min(no, wid), max(no, wid))],
71 | )
72 | )
73 | accept = False
74 | if (uid, wid) in con_info:
75 | logger.debug(
76 | "current link supported by contig: {0}, added".format(
77 | con_info[(uid, wid)]
78 | )
79 | )
80 | accept = True
81 | if uid == wid:
82 | logger.debug(
83 | "current link is a self link: {0}, potential cyclic strain, added".format(
84 | uid
85 | )
86 | )
87 | accept = True
88 |
89 | if accept:
90 | in_usage[uid] += 1
91 | out_usage[wid] += 1
92 | kept_link[(uid, wid)] = curr_pe
93 | else:
94 | logger.debug("current link is secondary choice, process later")
95 | sec_comb.append((uid, wid, curr_pe))
96 |
97 |         logger.debug(
98 |             "then align paired-end/single-end support (if any) to links without contig support"
99 |         )
100 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True)
101 | for uid, wid, pe in sorted_sec_comb:
102 | if pe > 0:
103 | logger.debug(
104 | "-----SEC LINK {0} -> {1} PE: {2}-----".format(uid, wid, pe)
105 | )
106 | logger.debug("- link [ > 0] supported case, added")
107 | in_usage[uid] += 1
108 | out_usage[wid] += 1
109 | kept_link[(uid, wid)] = pe
110 | full_link[no] = kept_link
111 | return full_link
112 |
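# Shape sketch (hypothetical ids): full_link maps each non-trivial branch id
# to its kept links, e.g. {"12": {("3", "7"): 15, ("4", "9"): 8}} says that at
# branch 12 the in-neighbor 3 pairs with out-neighbor 7 (15 paired-end hits)
# and 4 pairs with 9; the extension routines below only cross a branch through
# one of these kept links.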
113 |
114 | # extend contigs at both ends, until the extension is no longer distinct
115 | def contig_extension(
116 | graph: Graph,
117 | simp_node_dict: dict,
118 | contig: list,
119 | ccov,
120 | full_link: dict,
121 | logger: Logger,
122 | threshold,
123 | ):
124 | visited = dict.fromkeys(simp_node_dict.keys(), False)
125 | for no in contig[1:-1]:
126 | visited[no] = True
127 | final_path = []
128 | final_path.extend([simp_node_dict[no] for no in contig][1:-1])
129 |
130 | curr = simp_node_dict[contig[-1]]
131 | logger.debug("c-t extension")
132 | while curr != None and not visited[graph.vp.id[curr]]:
133 | visited[graph.vp.id[curr]] = True
134 | final_path.append(curr)
135 | out_branches = list([n for n in curr.out_neighbors()])
136 | if len(out_branches) == 0:
137 | curr = None
138 | logger.debug("Reach the end")
139 | elif len(out_branches) == 1:
140 | curr = out_branches[0]
141 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
142 | else:
143 | f_assigned = False
144 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
145 | logger.debug("Curr is Branch")
146 | curr_links = [
147 | simp_node_dict[wid]
148 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
149 | if uid == graph.vp.id[final_path[-2]]
150 | ]
151 | if len(curr_links) == 1:
152 | # curr = curr_links[0]
153 | # logger.debug("single link next: {0}".format(graph.vp.id[curr]))
154 | if graph.vp.dp[curr_links[0]] - ccov <= -2 * threshold:
155 | curr = None
156 | logger.debug(
157 | "{0} single link < 2delta, use coverage".format(
158 | graph.vp.id[curr_links[0]]
159 | )
160 | )
161 | else:
162 | curr = curr_links[0]
163 | logger.debug("single link next: {0}".format(graph.vp.id[curr]))
164 | elif len(curr_links) > 1:
165 | logger.debug("Ambiguous, stop extension")
166 | curr = None
167 | else:
168 | logger.debug("No link in here, use coverage information")
169 | f_assigned = True
170 | else:
171 | curr = None
172 | logger.debug("Not in full link or len of path <= 1")
173 | if f_assigned:
174 | in_branches = list([n for n in curr.in_neighbors()])
175 | if len(final_path) > 1 and len(in_branches) > 0:
176 | curru = final_path[-2]
177 | opt_ws = sorted(
178 | out_branches,
179 | key=lambda ww: abs(graph.vp.dp[curru] - graph.vp.dp[ww]),
180 | )
181 | bestw = opt_ws[0]
182 | opt_us = sorted(
183 | in_branches,
184 | key=lambda uu: abs(graph.vp.dp[bestw] - graph.vp.dp[uu]),
185 | )
186 | if opt_us[0] == curru:
187 | delta = max(
188 | 2 * abs(graph.vp.dp[curru] - graph.vp.dp[bestw]), threshold
189 | )
190 | if (
191 | len(opt_us) > 1
192 | and abs(graph.vp.dp[opt_us[1]] - graph.vp.dp[bestw])
193 | <= delta
194 | ):
195 | logger.debug("ambiguous best matching, stop extension")
196 | continue
197 | if (
198 | len(opt_ws) > 1
199 | and abs(graph.vp.dp[curru] - graph.vp.dp[opt_ws[1]])
200 | <= delta
201 | ):
202 | logger.debug("ambiguous best matching, stop extension")
203 | continue
204 | logger.debug("best matching")
205 | curr = bestw
206 | else:
207 | logger.debug("Not best match")
208 | curr = None
209 | else:
210 | curr = None
211 | logger.debug("No Link + Not trivial, stop extension")
212 | if curr == None:
213 | single_bests = sorted(
214 | [(onode, graph.vp.dp[onode]) for onode in out_branches],
215 | key=lambda tp: tp[1],
216 | reverse=True,
217 | )
218 | logger.debug(
219 | "Try last bit: 1st: {0}, 2nd: {1}, delta: {2}, cov: {3}".format(
220 | (graph.vp.id[single_bests[0][0]], single_bests[0][1]),
221 | (graph.vp.id[single_bests[1][0]], single_bests[1][1]),
222 | threshold,
223 | ccov,
224 | )
225 | )
226 | if (
227 | single_bests[0][1] - ccov > -threshold
228 | and single_bests[1][1] - ccov <= -threshold
229 | ):
230 | logger.debug("Last bit succ")
231 | curr = single_bests[0][0]
232 | else:
233 | logger.debug("Last bit fail")
234 | unode = simp_node_dict[contig[0]]
235 | if len(contig) == 1 and final_path[-1] not in unode.in_neighbors():
236 | visited[contig[0]] = False
237 | final_path.pop(0)
238 | curr = unode
239 | logger.debug("s-c extension")
240 | while curr != None and not visited[graph.vp.id[curr]]:
241 | visited[graph.vp.id[curr]] = True
242 | final_path.insert(0, curr)
243 | in_branches = list([n for n in curr.in_neighbors()])
244 | if len(in_branches) == 0:
245 | curr = None
246 | logger.debug("Reach the end")
247 | elif len(in_branches) == 1:
248 | curr = in_branches[0]
249 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
250 | else:
251 | f_assigned = False
252 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
253 | logger.debug("Curr is Branch")
254 | curr_links = [
255 | simp_node_dict[uid]
256 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
257 | if wid == graph.vp.id[final_path[1]]
258 | ]
259 | if len(curr_links) == 1:
260 | # curr = curr_links[0]
261 | # logger.debug("single link next: {0}".format(graph.vp.id[curr]))
262 | if graph.vp.dp[curr_links[0]] - ccov <= -2 * threshold:
263 | curr = None
264 | logger.debug(
265 | "{0} single link < 2delta, use coverage".format(
266 | graph.vp.id[curr_links[0]]
267 | )
268 | )
269 | else:
270 | curr = curr_links[0]
271 | logger.debug("prev: {0}".format(graph.vp.id[curr]))
272 | elif len(curr_links) > 1:
273 | logger.debug("Ambiguous, stop extension")
274 | curr = None
275 | else:
276 | logger.debug("No link in here, use coverage information")
277 | f_assigned = True
278 | else:
279 | curr = None
280 | logger.debug("Not in full link or len of path <= 1")
281 | if f_assigned:
282 | out_branches = list([n for n in curr.out_neighbors()])
283 | if len(final_path) > 1 and len(out_branches) > 0:
284 | currw = final_path[1]
285 | opt_us = sorted(
286 | in_branches,
287 | key=lambda uu: abs(graph.vp.dp[currw] - graph.vp.dp[uu]),
288 | )
289 | bestu = opt_us[0]
290 | opt_ws = sorted(
291 | out_branches,
292 | key=lambda ww: abs(graph.vp.dp[bestu] - graph.vp.dp[ww]),
293 | )
294 | if opt_ws[0] == currw:
295 | delta = max(
296 | 2 * abs(graph.vp.dp[currw] - graph.vp.dp[bestu]), threshold
297 | )
298 | if (
299 | len(opt_us) > 1
300 | and abs(graph.vp.dp[opt_us[1]] - graph.vp.dp[currw])
301 | <= delta
302 | ):
303 | logger.debug("ambiguous best matching, stop extension")
304 | continue
305 | if (
306 | len(opt_ws) > 1
307 | and abs(graph.vp.dp[bestu] - graph.vp.dp[opt_ws[1]])
308 | <= delta
309 | ):
310 | logger.debug("ambiguous best matching, stop extension")
311 | continue
312 | logger.debug("best matching")
313 | curr = bestu
314 | else:
315 | logger.debug("Not best match")
316 | curr = None
317 | else:
318 | logger.debug("No Link + Not trivial, stop extension")
319 | curr = None
320 | if curr == None:
321 | single_bests = sorted(
322 | [(inode, graph.vp.dp[inode]) for inode in in_branches],
323 | key=lambda tp: tp[1],
324 | reverse=True,
325 | )
326 | logger.debug(
327 | "Try last bit: 1st: {0}, 2nd: {1}, delta: {2}, cov: {3}".format(
328 | (graph.vp.id[single_bests[0][0]], single_bests[0][1]),
329 | (graph.vp.id[single_bests[1][0]], single_bests[1][1]),
330 | threshold,
331 | ccov,
332 | )
333 | )
334 | if (
335 | single_bests[0][1] - ccov > -threshold
336 | and single_bests[1][1] - ccov <= -threshold
337 | ):
338 | logger.debug("Last bit succ")
339 | curr = single_bests[0][0]
340 | else:
341 | logger.debug("Last bit fail")
342 | return final_path
343 |
344 |
345 | def final_extension(
346 | graph: Graph, simp_node_dict: dict, contig: list, full_link: dict, logger: Logger
347 | ):
348 | visited = dict.fromkeys(simp_node_dict.keys(), False)
349 | for no in contig[1:-1]:
350 | visited[no] = True
351 | curr = simp_node_dict[contig[-1]]
352 | final_path = []
353 | final_path.extend([simp_node_dict[no] for no in contig][1:-1])
354 | # from curr to the tail, or to the non-extendable end
355 | logger.debug("c-t extension")
356 | while curr != None and not visited[graph.vp.id[curr]]:
357 | visited[graph.vp.id[curr]] = True
358 | final_path.append(curr)
359 | out_branches = list([n for n in curr.out_neighbors()])
360 | if len(out_branches) == 0:
361 | curr = None
362 | logger.debug("Reach the end")
363 | elif len(out_branches) == 1:
364 | curr = out_branches[0]
365 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
366 | else:
367 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
368 | logger.debug("Curr is Branch")
369 | curr_links = [
370 | simp_node_dict[wid]
371 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
372 | if uid == graph.vp.id[final_path[-2]]
373 | ]
374 | if len(curr_links) == 1:
375 | curr = curr_links[0]
376 | logger.debug("single link next: {0}".format(graph.vp.id[curr]))
377 | else:
378 |                     logger.debug("No/more link in here, end extension")
379 | curr = None
380 | else:
381 | curr = None
382 | logger.debug("Not in full link or len of path <= 1")
383 |
384 | unode = simp_node_dict[contig[0]]
385 | if len(contig) == 1 and final_path[-1] not in unode.in_neighbors():
386 | visited[contig[0]] = False
387 | final_path.pop(0)
388 | curr = unode
389 | # from head to the curr, or to the non-extendable end
390 | logger.debug("s-c extension")
391 | while curr != None and not visited[graph.vp.id[curr]]:
392 | visited[graph.vp.id[curr]] = True
393 | final_path.insert(0, curr)
394 | in_branches = list([n for n in curr.in_neighbors()])
395 | if len(in_branches) == 0:
396 | curr = None
397 | logger.debug("Reach the end")
398 | elif len(in_branches) == 1:
399 | curr = in_branches[0]
400 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
401 | else:
402 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
403 | logger.debug("Curr is Branch")
404 | curr_links = [
405 | simp_node_dict[uid]
406 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
407 | if wid == graph.vp.id[final_path[1]]
408 | ]
409 | if len(curr_links) == 1:
410 | curr = curr_links[0]
411 | logger.debug("single link next: {0}".format(graph.vp.id[curr]))
412 | else:
413 | logger.debug("No/more link in here, end extension")
414 | curr = None
415 | else:
416 | curr = None
417 | logger.debug("Not in full link or len of path <= 1")
418 | return final_path
419 |
420 |
421 | def get_bubble_nodes(simp_node_dict: dict, contig: list):
422 | bubbles = []
423 | for no in contig:
424 | if simp_node_dict[no].in_degree() == 1 and simp_node_dict[no].out_degree() == 1:
425 | bubbles.append(simp_node_dict[no])
426 | return bubbles
427 |
428 |
429 | def reduce_graph(
430 | graph: Graph,
431 | simp_node_dict: dict,
432 | usages: dict,
433 | full_link: dict,
434 | logger: Logger,
435 | path,
436 | pcov,
437 | threshold,
438 | ):
439 | del_nodes_ids = []
440 | for node in path:
441 | usages[graph.vp.id[node]] += 1
442 | graph.vp.dp[node] -= pcov
443 | if graph.vp.dp[node] <= threshold:
444 | del_nodes_ids.append(graph.vp.id[node])
445 | graph.vp.color[node] = "gray"
446 | usages.pop(graph.vp.id[node])
447 | logger.debug(list_to_string(del_nodes_ids, "invalid nodes"))
448 | for links in full_link.values():
449 | for uid, wid in list(links.keys()):
450 | if (
451 | graph.vp.color[simp_node_dict[uid]] != "black"
452 | or graph.vp.color[simp_node_dict[wid]] != "black"
453 | ):
454 | links.pop((uid, wid))
455 | logger.debug("[D]{0}, {1}".format(uid, wid))
456 |
457 |
458 | def reduce_id_simple(id_l: list):
459 | ids = []
460 | for id in id_l:
461 | for iid in id.split("&"):
462 | if iid.find("*") != -1:
463 | ids.append(iid[: iid.find("*")])
464 | else:
465 | ids.append(iid)
466 | return ids
467 |
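# Id sketch: split/merged node ids are composites such as "3&5*2&-7", where
# "&" joins the merged original ids and the suffix after "*" (assumed here to
# tag a split copy) is dropped, so reduce_id_simple(["3&5*2&-7"]) returns
# ["3", "5", "-7"].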
468 |
469 | def reduce_Anode(id: str, sno2ids: dict):
470 | ids = [id]
471 | while any([iid.startswith("A") for iid in ids]):
472 | len_ids = len(ids)
473 | for i in range(len_ids):
474 | if ids[i].startswith("A"):
475 | id_v = ids.pop(i).split("*")[0]
476 | j = i
477 | for subid in sno2ids[id_v]:
478 | ids.insert(j, subid)
479 | j += 1
480 | break
481 | return ids
482 |
483 |
484 | def path_extension(
485 | graph: Graph,
486 | simp_node_dict: dict,
487 | simp_edge_dict: dict,
488 | contig_dict: dict,
489 | full_link: dict,
490 | pe_info: dict,
491 | logger: Logger,
492 | threshold,
493 | temp_dir,
494 | ):
495 | logger.debug(
496 | "-------------------------PATH Extension, delta: {0}".format(threshold)
497 | )
498 | usages = dict.fromkeys(simp_node_dict.keys(), 0) # record the usage of each nodes
499 | strain_dict = {}
500 | rid = 1
501 | sno2ids = dict()
502 | while len(contig_dict) > 0:
503 | # perform trivial split
504 | prev_ids = list(simp_node_dict.keys())
505 | trivial_split_count, id_mapping = global_trivial_split(
506 | graph, simp_node_dict, simp_edge_dict, logger
507 | )
508 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
509 | graph,
510 | simp_node_dict,
511 | simp_edge_dict,
512 | logger,
513 | "{0}/gfa/graph_S{1}.gfa".format(temp_dir, rid),
514 | )
515 | red_id_mapping = contig_dict_remapping(
516 | graph,
517 | simp_node_dict,
518 | simp_edge_dict,
519 | contig_dict,
520 | id_mapping,
521 | prev_ids,
522 | logger,
523 | )
524 | # update links
525 | for no in list(full_link.keys()):
526 | if no not in simp_node_dict:
527 | full_link.pop(no)
528 | else:
529 | kept_link = full_link.pop(no)
530 | node = simp_node_dict[no]
531 | for (uid, wid), pe in list(kept_link.items()):
532 |                     # drop the stale link, then re-insert it for every
533 |                     # (uuid, wwid) pair that is still adjacent to this
534 |                     # node, provided at least one side of the pair was
535 |                     # remapped uniquely
536 | kept_link.pop((uid, wid))
537 | if len(red_id_mapping[uid]) == 1 or len(red_id_mapping[wid]) == 1:
538 | for uuid in red_id_mapping[uid]:
539 | for wwid in red_id_mapping[wid]:
540 | if (
541 | (uuid, wwid) not in kept_link
542 | and (simp_node_dict[uuid] in node.in_neighbors())
543 | and (simp_node_dict[wwid] in node.out_neighbors())
544 | ):
545 | kept_link[(uuid, wwid)] = pe
546 | full_link[no] = kept_link
547 | # update usages
548 | for no, u in list(usages.items()):
549 | usages.pop(no)
550 | for new_no in red_id_mapping[no]:
551 | usages[new_no] = u
552 | ############################
553 | # get longest contig
554 | (longest_cno, [contig, clen, ccov]) = max(
555 | contig_dict.items(), key=lambda tp: tp[1][1]
556 | )
557 | contig_dict.pop(longest_cno)
558 | if all(usages[cn] > 0 for cn in contig):
559 | print_contig(
560 | longest_cno, clen, ccov, contig, logger, "-----> Used previously"
561 | )
562 | continue
563 | if any(graph.vp.color[simp_node_dict[no]] == "gray" for no in contig):
564 | print_contig(
565 | longest_cno,
566 | clen,
567 | ccov,
568 | contig,
569 | logger,
570 | "-----> Some node low cov, skip",
571 | )
572 | continue
573 |
574 | cbubbles = get_bubble_nodes(simp_node_dict, contig)
575 | bbl_cov = (
576 | numpy.median([graph.vp.dp[node] for node in cbubbles])
577 | if len(cbubbles) != 0
578 | else ccov
579 | )
580 | print_contig(
581 | longest_cno,
582 | clen,
583 | bbl_cov,
584 | contig,
585 | logger,
586 |             "-----> Current extending contig: orig ccov: {0}, using min {1}".format(
587 | ccov, min(ccov, bbl_cov)
588 | ),
589 | )
590 |
591 | path = contig_extension(
592 | graph,
593 | simp_node_dict,
594 | contig,
595 | min(ccov, bbl_cov),
596 | full_link,
597 | logger,
598 | threshold,
599 | )
600 | pno = "A" + str(rid)
601 | plen = path_len(graph, path)
602 | path_ids = [graph.vp.id[n] for n in path]
603 | sno2ids[pno] = []
604 | for pid in path_ids:
605 | if pid in sno2ids:
606 | sno2ids[pno].extend(sno2ids[pid])
607 | else:
608 | sno2ids[pno].append(pid)
609 | pbubbles = get_bubble_nodes(simp_node_dict, path_ids)
610 | bbl_pcov = (
611 | numpy.median([graph.vp.dp[node] for node in pbubbles])
612 | if len(pbubbles) != 0
613 | else ccov
614 | )
615 | pcov = min([ccov, bbl_pcov, bbl_cov])
616 | logger.debug(
617 | path_to_id_string(
618 | graph, path, "---*extended from contig {0}".format(longest_cno)
619 | )
620 | )
621 | logger.debug(
622 | "name: {0}, plen: {1}, pcov: {2}, bubble cov: {3}".format(
623 | pno, plen, pcov, bbl_pcov
624 | )
625 | )
626 | strain_dict[pno] = [sno2ids[pno], plen, pcov]
627 | for pid in path_ids:
628 | if pid in strain_dict:
629 | strain_dict.pop(pid)
630 | path_ins = [n for n in path[0].in_neighbors()]
631 | path_outs = [n for n in path[-1].out_neighbors()]
632 | if len(path_ins) == 0 and len(path_outs) == 0:
633 |             # both ends are tips: the path is fully isolated
634 |             logger.debug("isolated at both ends, add to strain")
635 | reduce_graph(
636 | graph, simp_node_dict, usages, full_link, logger, path, pcov, threshold
637 | )
638 | elif len(path_ins) != 0 and len(path_outs) == 0:
639 | if len(path) > 1:
640 | logger.debug("left connected, wait")
641 | reduce_graph(
642 | graph,
643 | simp_node_dict,
644 | usages,
645 | full_link,
646 | logger,
647 | path[1:],
648 | pcov,
649 | threshold,
650 | )
651 | pnode = graph_add_vertex(
652 | graph, simp_node_dict, pno, pcov, path_to_seq(graph, path[1:], pno)
653 | )
654 | graph_add_edge(
655 | graph,
656 | simp_edge_dict,
657 | path[0],
658 | pnode,
659 | graph.ep.overlap[graph.edge(path[0], path[1])],
660 | pcov,
661 | )
662 | usages[pno] = 0
663 | elif len(path_ins) == 0 and len(path_outs) != 0:
664 | if len(path) > 1:
665 | logger.debug("right connected, wait")
666 | reduce_graph(
667 | graph,
668 | simp_node_dict,
669 | usages,
670 | full_link,
671 | logger,
672 | path[:-1],
673 | pcov,
674 | threshold,
675 | )
676 | pnode = graph_add_vertex(
677 | graph, simp_node_dict, pno, pcov, path_to_seq(graph, path[:-1], pno)
678 | )
679 | graph_add_edge(
680 | graph,
681 | simp_edge_dict,
682 | pnode,
683 | path[-1],
684 | graph.ep.overlap[graph.edge(path[-2], path[-1])],
685 | pcov,
686 | )
687 | usages[pno] = 0
688 | else:
689 | if len(path) > 1:
690 | logger.debug("both connected, wait")
691 | reduce_graph(
692 | graph,
693 | simp_node_dict,
694 | usages,
695 | full_link,
696 | logger,
697 | path[1:-1],
698 | pcov,
699 | threshold,
700 | )
701 | if len(path[1:-1]) > 0:
702 | pnode = graph_add_vertex(
703 | graph,
704 | simp_node_dict,
705 | pno,
706 | pcov,
707 | path_to_seq(graph, path[1:-1], pno),
708 | )
709 | graph_add_edge(
710 | graph,
711 | simp_edge_dict,
712 | path[0],
713 | pnode,
714 | graph.ep.overlap[graph.edge(path[0], path[1])],
715 | pcov,
716 | )
717 | graph_add_edge(
718 | graph,
719 | simp_edge_dict,
720 | pnode,
721 | path[-1],
722 | graph.ep.overlap[graph.edge(path[-2], path[-1])],
723 | pcov,
724 | )
725 | usages[pno] = 0
726 |
727 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
728 | graph,
729 | simp_node_dict,
730 | simp_edge_dict,
731 | logger,
732 | "{0}/gfa/graph_S{1}post.gfa".format(temp_dir, rid),
733 | )
734 | for cno in list(contig_dict.keys()):
735 | delete = False
736 | for no in contig_dict[cno][0]:
737 | if no not in simp_node_dict:
738 | delete = True
739 | if delete:
740 | contig_dict.pop(cno)
741 | rid += 1
742 |
743 |     # remove duplicate nodes (identical sequence) left over from trivial splits
744 | seq_dict = {}
745 | for node in graph.vertices():
746 | if graph.vp.seq[node] not in seq_dict:
747 | seq_dict[graph.vp.seq[node]] = []
748 | seq_dict[graph.vp.seq[node]].append(node)
749 |
750 | for _, sp_nodes in seq_dict.items():
751 | if len(sp_nodes) > 1:
752 | sorted_sp_nodes = sorted(
753 | sp_nodes, key=lambda vnode: graph.vp.dp[vnode], reverse=True
754 | )
755 | for vnode in sorted_sp_nodes[1:]:
756 | graph_remove_vertex(graph, simp_node_dict, graph.vp.id[vnode])
757 | usages.pop(graph.vp.id[vnode])
758 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
759 | graph,
760 | simp_node_dict,
761 | simp_edge_dict,
762 | logger,
763 | "{0}/gfa/graph_S_final.gfa".format(temp_dir),
764 | )
765 |     # aggregate paired-end link counts for every pair of final nodes
766 | final_link_info = {}
767 | for node in graph.vertices():
768 | for node2 in graph.vertices():
769 | if node > node2:
770 | continue
771 |
772 |             # reduce_Anode unrolls "A"-strain ids into their member ids;
773 |             # reduce_id_simple then strips "&" and "*" so pe_info can be
774 |             # indexed by original node ids
775 |             nid1s = reduce_id_simple(reduce_Anode(graph.vp.id[node], sno2ids))
776 |             nid2s = reduce_id_simple(reduce_Anode(graph.vp.id[node2], sno2ids))
784 | kpair = (
785 | min(graph.vp.id[node], graph.vp.id[node2]),
786 | max(graph.vp.id[node], graph.vp.id[node2]),
787 | )
788 |
789 | logger.debug("nid1s: {0}, nid2s: {1}".format(nid1s, nid2s))
790 | logger.debug(
791 | "node1id: {0}, node2id: {1}".format(
792 | graph.vp.id[node], graph.vp.id[node2]
793 | )
794 | )
795 | final_link_info[kpair] = 0
796 | for id1 in nid1s:
797 | for id2 in nid2s:
798 | inner_kpair = (min(id1, id2), max(id1, id2))
799 | final_link_info[kpair] += pe_info[inner_kpair]
800 |
801 | nt_branches = get_non_trivial_branches(graph, simp_node_dict)
802 | final_links = {}
803 | for no, node in nt_branches.items():
804 | final_links[no] = {}
805 | us = [graph.vp.id[src] for src in node.in_neighbors()]
806 | ws = [graph.vp.id[tgt] for tgt in node.out_neighbors()]
807 | logger.debug("---------------------------------------------")
808 | logger.debug(
809 | "current non trivial branch: {0}, in-degree: {1}, out-degree: {2}".format(
810 | no, len(us), len(ws)
811 | )
812 | )
813 | combs = []
814 | in_usage = dict.fromkeys(us, 0)
815 |
816 | out_usage = dict.fromkeys(ws, 0)
817 | for uid in us:
818 | for wid in ws:
819 | combs.append(
820 | (uid, wid, final_link_info[(min(uid, wid), max(uid, wid))])
821 | )
822 | sorted_comb = sorted(combs, key=lambda x: x[2], reverse=True)
823 | for uid, wid, lf in sorted_comb:
824 | logger.debug("---------------------")
825 | if lf > 0 and in_usage[uid] == 0 and out_usage[wid] == 0:
826 | logger.debug(
827 | "-----SEC LINK {0} -> {1} LINK: {2}-----".format(uid, wid, lf)
828 | )
829 | logger.debug("- unique link [ > 0] supported case, added")
830 | final_links[no][(uid, wid)] = lf
831 | in_usage[uid] += 1
832 | out_usage[wid] += 1
833 |
834 |     # extend from nodes unused during contig extension and add the results to the final strain set
835 | for node in sorted(
836 | graph.vertices(), key=lambda nd: len(graph.vp.seq[nd]), reverse=True
837 | ):
838 |         if len(graph.vp.seq[node]) <= 600:  # nodes are sorted by length; stop at short nodes
839 | break
840 | if usages[graph.vp.id[node]] == 0:
841 | logger.debug("Extend from free node: {0}".format(graph.vp.id[node]))
842 | ccov = graph.vp.dp[node]
843 | path = final_extension(
844 | graph, simp_node_dict, [graph.vp.id[node]], final_links, logger
845 | )
846 | pno = "N" + str(rid)
847 | plen = path_len(graph, path)
848 | path_ids = [graph.vp.id[n] for n in path]
849 | pids = []
850 | for pid in path_ids:
851 | if pid in sno2ids:
852 | pids.extend(sno2ids[pid])
853 | else:
854 | pids.append(pid)
855 | for pid in path_ids:
856 | if pid in strain_dict:
857 | strain_dict.pop(pid)
858 | pbubbles = get_bubble_nodes(simp_node_dict, path_ids)
859 | pcov = (
860 | numpy.median([graph.vp.dp[node] for node in pbubbles])
861 | if len(pbubbles) != 0
862 | else graph.vp.dp[node]
863 | )
864 | logger.debug(
865 | path_to_id_string(
866 | graph,
867 | path,
868 | "---*extended from free node {0}".format(graph.vp.id[node]),
869 | )
870 | )
871 | logger.debug("name: {0}, plen: {1}, pcov: {2}".format(pno, plen, pcov))
872 | strain_dict[pno] = [pids, plen, pcov]
873 | for node in path:
874 | usages[graph.vp.id[node]] += 1
875 | rid += 1
876 | for sno, [_, _, scov] in list(strain_dict.items()):
877 | if scov <= 2 * threshold:
878 | strain_dict.pop(sno)
879 |
880 |     # expand zipped ("&") and split ("*") vertex ids in each strain back
881 |     # to the original node ids
882 | for cno in strain_dict.keys():
883 | [contig, clen, ccov] = strain_dict[cno]
884 | rcontig = []
885 | for id in contig:
886 | rcontig.extend(reduce_id_simple(reduce_Anode(id, sno2ids)))
897 | strain_dict[cno] = [rcontig, clen, ccov]
898 |
899 | return strain_dict, usages
900 |
--------------------------------------------------------------------------------
/utils/VStrains_Decomposition.py:
--------------------------------------------------------------------------------
1 | from utils.VStrains_Utilities import *
2 | from utils.VStrains_IO import store_reinit_graph
3 | import matplotlib.pyplot as plt
4 | import numpy
5 |
6 |
7 | def link_split(
8 | sec_comb: list,
9 | kept_link: dict,
10 | in_usage: dict,
11 | in_capacity: dict,
12 | out_usage: dict,
13 | out_capacity: dict,
14 | logger,
15 | ):
16 | """update split plan using paired end & single end information"""
17 | logger.debug("attempt to split via paired end information")
18 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True)
19 | for uid, wid, pe in sorted_sec_comb:
20 | if pe <= 0:
21 | break
22 | logger.debug("-----SEC LINK {0} -> {1} PE: {2}".format(uid, wid, pe))
23 | logger.debug("Capacity: {0} -> {1}".format(in_capacity[uid], out_capacity[wid]))
24 |         logger.debug("- distinct compatible case, added")
25 | in_usage[uid] += 1
26 | out_usage[wid] += 1
27 | kept_link[(uid, wid)] = ((in_capacity[uid] + out_capacity[wid]) / 2, pe)
28 | return
29 |
30 |
31 | def cov_split(
32 | us: list,
33 | ws: list,
34 | pe_info: dict,
35 | sec_comb: list,
36 | kept_link: dict,
37 | in_usage: dict,
38 | in_capacity: dict,
39 | out_usage: dict,
40 | out_capacity: dict,
41 | logger,
42 | ):
43 | """update split plan using coverage information"""
44 | logger.debug("attempt to split via coverage information")
45 | logger.debug(
46 | "align paired end/single end information first (if any) to isolated nodes"
47 | )
48 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True)
49 | for uid, wid, pe in sorted_sec_comb:
50 | if pe <= 0:
51 | break
52 | if in_usage[uid] > 0 or out_usage[wid] > 0:
53 | continue
54 | logger.debug("-----SEC LINK {0} -> {1} PE: {2}-----".format(uid, wid, pe))
55 | logger.debug("Capacity: {0} -> {1}".format(in_capacity[uid], out_capacity[wid]))
56 | logger.debug("- link [ > 0] supported case, added")
57 | in_usage[uid] += 1
58 | out_usage[wid] += 1
59 | kept_link[(uid, wid)] = ((in_capacity[uid] + out_capacity[wid]) / 2, pe)
60 |
61 | logger.debug("obtain best match via coverage similarity")
62 | for uid in us:
63 | if in_usage[uid] > 0:
64 | continue
65 | opt_ws = sorted(ws, key=lambda wwid: abs(in_capacity[uid] - out_capacity[wwid]))
66 | wid = opt_ws[0]
67 | opt_us = sorted(us, key=lambda uuid: abs(in_capacity[uuid] - out_capacity[wid]))
68 | if opt_us[0] == uid and out_usage[wid] == 0 and (uid, wid) not in kept_link:
69 | delta = 2 * abs(in_capacity[uid] - out_capacity[wid])
70 | logger.debug(
71 | "Found coverage best match: {0} -> {1} with cov: {2}, {3}, checking delta bound: {4}".format(
72 | uid, wid, in_capacity[uid], out_capacity[wid], delta
73 | )
74 | )
75 | if (
76 | abs(in_capacity[opt_us[1]] - out_capacity[wid]) <= delta
77 | or abs(in_capacity[uid] - out_capacity[opt_ws[1]]) <= delta
78 | ):
79 | logger.debug("ambiguous matching, skip")
80 | else:
81 | logger.debug("added")
82 | in_usage[uid] += 1
83 | out_usage[wid] += 1
84 | kept_link[(uid, wid)] = (
85 | (in_capacity[uid] + out_capacity[wid]) / 2,
86 | pe_info[(min(uid, wid), max(uid, wid))],
87 | )
88 | return
89 |
90 |
91 | def balance_split(
92 | graph: Graph,
93 | simp_node_dict: dict,
94 | simp_edge_dict: dict,
95 | contig_dict: dict,
96 | pe_info: dict,
97 | logger: Logger,
98 | ref_file: str,
99 | temp_dir: str,
100 | count_id: int,
101 | threshold,
102 | is_prim: bool,
103 | ):
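    """Split N-N non-trivial branches using contig, paired-end link and
    coverage evidence.

    Contig-supported (and self-loop) links are accepted first; remaining
    combinations go through link_split in primary rounds (is_prim=True)
    or cov_split in secondary rounds. A branch is split only when the
    kept links form a perfect N-N matching and the worst in/out flow
    difference stays within 4 * threshold. `ref_file` is debug-only:
    leaves are aligned to the reference to label each kept link as
    correct, false-positive or error in a scatter plot.
    """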
104 | logger.info(
105 |         "balance split using contigs, paired-end links and coverage information, is_prim: {0}".format(
106 | is_prim
107 | )
108 | )
109 | correct_X = []
110 | correct_Y = []
111 | false_error_X = []
112 | false_error_Y = []
113 | error_X = []
114 | error_Y = []
115 | error_text = []
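    # only combinations with PE support <= cut are recorded for the
    # debug scatter plot below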
116 | cut = 100
117 |
118 | # detect all non-trivial branches right now
119 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
120 | split_branches = []
121 | node_to_contig_dict, _ = contig_map_node(contig_dict)
122 | for no, node in non_trivial_branches.items():
123 | us = [
124 | graph.vp.id[e.source()]
125 | for e in node.in_edges()
126 | if graph.ep.color[e] == "black"
127 | ]
128 | ws = [
129 | graph.vp.id[e.target()]
130 | for e in node.out_edges()
131 | if graph.ep.color[e] == "black"
132 | ]
133 | logger.debug("---------------------------------------------")
134 | logger.debug(
135 | "current non trivial branch: {0}, in-degree: {1}, out-degree: {2}".format(
136 | no, len(us), len(ws)
137 | )
138 | )
139 |
140 |         # check whether this branch is splittable
141 |         if any([pe_info[(uid, uid)] is None for uid in us]) or any(
142 |             [pe_info[(wid, wid)] is None for wid in ws]
143 | ):
144 | logger.debug(
145 | "current non-trivial branch: {0} is related to current iteration, split later".format(
146 | no
147 | )
148 | )
149 | continue
150 | if not is_non_trivial(graph, node):
151 | logger.debug(
152 | "current non-trivial branch: {0} is not non-trivial, potential bug".format(
153 | no
154 | )
155 | )
156 | continue
157 | if len(us) != len(ws):
158 | logger.debug("Not N-N split, skip")
159 | continue
160 |
161 | # check if link-split
162 | split_via_link = True
163 |
164 |         # skip link-split if any leaf comes from previously split nodes
165 | for id in us + ws:
166 | singles = id.split("&")
167 | if all([single.count("*") > 0 for single in singles]):
168 | logger.debug(
169 |                     "leaf {0} consists only of split branch nodes, no link information, skip link split".format(
170 | id
171 | )
172 | )
173 | split_via_link = False
174 | break
175 |
176 |         # skip link-split if no combination has link information
177 | if all(
178 | [pe_info[(min(uid, wid), max(uid, wid))] == 0 for uid in us for wid in ws]
179 | ):
180 | logger.debug(
181 |                 "current branch node too long to be spanned by read pairs, no link information, skip link split"
182 | )
183 | split_via_link = False
184 |
185 | # add contig supports
186 | support_contigs = node_to_contig_dict.get(no, [])
187 | con_info = {}
188 | for cno in support_contigs:
189 | [contig, clen, ccov] = contig_dict[cno]
190 | loc = contig.index(no)
191 | if loc > 0 and loc < len(contig) - 1:
192 | con_info[(contig[loc - 1], contig[loc + 1])] = con_info.get(
193 | (contig[loc - 1], contig[loc + 1]), []
194 | )
195 | con_info[(contig[loc - 1], contig[loc + 1])].append((cno, clen, ccov))
196 | print_contig(
197 | cno,
198 | clen,
199 | round(ccov, 2),
200 | contig[max(loc - 1, 0) : loc + 2],
201 | logger,
202 | "support contig",
203 | )
204 |
205 | # debug only
206 | # obtain perfect split via reference
207 | expect_link = []
208 | ref_pair_dict = {}
209 | ref_all_dict = {}
210 | if ref_file:
211 | lrefs = set()
212 | rrefs = set()
213 | error_nos = set()
214 | for uid in us:
215 | for wid in ws:
216 | u = simp_node_dict[uid]
217 | w = simp_node_dict[wid]
218 | ref_l = best_aln_score(graph, "L", [u], ref_file, temp_dir)
219 | best_ref_l = [
220 | ref
221 | for [_, l, ref, nm] in ref_l
222 | if nm == 0 and l == len(graph.vp.seq[u])
223 | ]
224 | ref_r = best_aln_score(graph, "R", [w], ref_file, temp_dir)
225 | best_ref_r = [
226 | ref
227 | for [_, l, ref, nm] in ref_r
228 | if nm == 0 and l == len(graph.vp.seq[w])
229 | ]
230 | lrefs = lrefs.union(best_ref_l)
231 | rrefs = rrefs.union(best_ref_r)
232 | ref_pair_dict[(uid, wid)] = set(best_ref_l).intersection(
233 | set(best_ref_r)
234 | )
235 | ref_all_dict[(uid, wid)] = set(
236 | [ref for [_, _, ref, nm] in ref_l if nm < 5]
237 | ).union(set([ref for [_, _, ref, nm] in ref_r if nm < 5]))
238 | if len(ref_pair_dict[(uid, wid)]) > 0:
239 | expect_link.append((uid, wid))
240 | if len(best_ref_l) == 0:
241 | error_nos.add(uid)
242 | if len(best_ref_r) == 0:
243 | error_nos.add(wid)
244 | sym_diff = lrefs.symmetric_difference(rrefs)
245 | if len(sym_diff) > 0:
246 | logger.debug(
247 | "Current branch have force mismatch connection for following strains: {0}".format(
248 | sym_diff
249 | )
250 | )
251 | # debug only
252 |
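        # kept_link maps (uid, wid) -> (assigned sub-flow, PE support);
        # sec_comb collects undecided (uid, wid, pe) candidates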
253 | kept_link = {}
254 | sec_comb = []
255 | # init node usage for current branch
256 | in_usage = dict.fromkeys(us, 0)
257 | in_capacity = {}
258 | for uid in us:
259 | in_capacity[uid] = graph.ep.flow[simp_edge_dict[(uid, no)]]
260 |
261 | out_usage = dict.fromkeys(ws, 0)
262 | out_capacity = {}
263 | for wid in ws:
264 | out_capacity[wid] = graph.ep.flow[simp_edge_dict[(no, wid)]]
265 |
266 | # align contig link first, and update status
267 | logger.debug("align contig link first")
268 | for uid in us:
269 | for wid in ws:
270 | logger.debug("---------------------")
271 | u = simp_node_dict[uid]
272 | w = simp_node_dict[wid]
273 | curr_pe = pe_info[(min(uid, wid), max(uid, wid))]
274 |
275 | logger.debug("{0} -> {1} PE: {2}".format(uid, wid, curr_pe))
276 | logger.debug(
277 | "cov info: {0}[{1}] -> {2}[{3}]".format(
278 | graph.ep.flow[graph.edge(u, node)],
279 | pe_info[(min(uid, no), max(uid, no))],
280 | graph.ep.flow[graph.edge(node, w)],
281 | pe_info[(min(no, wid), max(no, wid))],
282 | )
283 | )
284 | if ref_file:
285 | logger.debug(
286 | "intersect reference: {0}".format(ref_pair_dict[(uid, wid)])
287 | )
288 | # potential incorrect matching, but supported by links
289 | if len(ref_pair_dict[(uid, wid)]) == 0 and curr_pe > 0:
290 | logger.debug("False Positive case, WARN")
291 | accept = False
292 | if (uid, wid) in con_info:
293 | logger.debug(
294 | "current link supported by contig: {0}, added".format(
295 | con_info[(uid, wid)]
296 | )
297 | )
298 | accept = True
299 | if uid == wid:
300 | logger.debug(
301 | "current link is a self link: {0}, potential cyclic strain, added".format(
302 | uid
303 | )
304 | )
305 | accept = True
306 |
307 | if accept:
308 | in_usage[uid] += 1
309 | out_usage[wid] += 1
310 | kept_link[(uid, wid)] = (
311 | (in_capacity[uid] + out_capacity[wid]) / 2,
312 | curr_pe,
313 | )
314 | else:
315 | logger.debug("current link is secondary choice, process later")
316 | sec_comb.append((uid, wid, curr_pe))
317 | if is_prim:
318 | if split_via_link:
319 | link_split(
320 | sec_comb,
321 | kept_link,
322 | in_usage,
323 | in_capacity,
324 | out_usage,
325 | out_capacity,
326 | logger,
327 | )
328 | else:
329 | # secondary split, via link first, then coverage
330 | cov_split(
331 | us,
332 | ws,
333 | pe_info,
334 | sec_comb,
335 | kept_link,
336 | in_usage,
337 | in_capacity,
338 | out_usage,
339 | out_capacity,
340 | logger,
341 | )
342 | if not (
343 | all([u == 1 for u in in_usage.values()])
344 | and all([v == 1 for v in out_usage.values()])
345 | ):
346 | logger.debug("->Not satisfy N-N split, skip: {0}".format(kept_link))
347 | continue
348 | worst_pair_diff = max(
349 | [
350 | abs(in_capacity[uid] - out_capacity[wid])
351 | for (uid, wid) in kept_link.keys()
352 | ]
353 | )
354 | if worst_pair_diff > 4 * threshold:
355 | logger.debug(
356 | "worst pair coverage diff greater than 4 delta: {0} > {1}, too uneven, skip: {2}".format(
357 | worst_pair_diff, 4 * threshold, kept_link
358 | )
359 | )
360 | continue
361 | logger.debug("->perform split, all kept links: {0}".format(kept_link))
362 | if ref_file:
363 | logger.debug("->expected links: {0}".format(expect_link))
364 | if set(kept_link) != set(expect_link):
365 | logger.debug("Incorrect split")
366 | else:
367 | logger.debug("Correct split")
368 |
369 | split_branches.append(no)
370 | link2subs = {}
371 | counter = 0
372 | for (uid, wid), (sub_flow, pe) in kept_link.items():
373 | logger.debug("--------> {0} - {1}".format(uid, wid))
374 | # debug only
375 | if ref_file:
376 | if len(ref_pair_dict[(uid, wid)]) != 0:
377 | logger.debug("best pair")
378 | if pe <= cut:
379 | correct_X.append(pe)
380 | correct_Y.append(sub_flow)
381 | if pe < 5:
382 | logger.debug(
383 |                                 "correct pair with near-zero PE support {0}->{1}->{2}, branch size: {3}".format(
384 | uid, no, wid, len(graph.vp.seq[node])
385 | )
386 | )
387 | else:
388 | is_graph_error = False
389 | if uid in error_nos:
390 | logger.debug(
391 |                                     "src: {0} is an erroneous graph node, no optimal ref".format(
392 | uid
393 | )
394 | )
395 | is_graph_error = True
396 | if wid in error_nos:
397 | logger.debug(
398 |                                     "tgt: {0} is an erroneous graph node, no optimal ref".format(
399 | wid
400 | )
401 | )
402 | is_graph_error = True
403 | if len(ref_all_dict[(uid, wid)].intersection(sym_diff)) > 0:
404 | is_graph_error = True
405 | if is_graph_error:
406 | if pe <= cut:
407 | false_error_X.append(pe)
408 | false_error_Y.append(sub_flow)
409 | logger.debug("false positive error pair")
410 | else:
411 | if pe <= cut:
412 | error_X.append(pe)
413 | error_Y.append(sub_flow)
414 | error_text.append("{0}:{1}:{2}".format(uid, wid, pe))
415 | logger.debug("error pair")
416 | # debug only
417 | # perform split
418 | sub_id = no + "*" + str(counter)
419 | counter += 1
420 | sub_node = graph_add_vertex(
421 | graph, simp_node_dict, sub_id, sub_flow, graph.vp.seq[node]
422 | )
423 |
424 | graph_add_edge(
425 | graph,
426 | simp_edge_dict,
427 | simp_node_dict[uid],
428 | sub_node,
429 | graph.ep.overlap[simp_edge_dict[(uid, no)]],
430 | sub_flow,
431 | )
432 |
433 | graph_add_edge(
434 | graph,
435 | simp_edge_dict,
436 | sub_node,
437 | simp_node_dict[wid],
438 | graph.ep.overlap[simp_edge_dict[(no, wid)]],
439 | sub_flow,
440 | )
441 | link2subs[(uid, wid)] = sub_id
442 |
443 | # keep track of related contig record
444 | for cno in support_contigs:
445 | curr_contig, clen, ccov = contig_dict.pop(cno)
446 | branch_ind = curr_contig.index(no)
447 | uid = curr_contig[branch_ind - 1] if branch_ind > 0 else None
448 | wid = (
449 | curr_contig[branch_ind + 1]
450 | if branch_ind < len(curr_contig) - 1
451 | else None
452 | )
453 |                 if uid is not None and wid is not None:
454 | # unique mapping
455 | curr_contig[branch_ind] = link2subs[(uid, wid)]
456 | contig_dict[cno] = [curr_contig, clen, ccov]
457 |                 elif uid is None and wid is None:
458 | for sub_id in link2subs.values():
459 | # all possible contigs
460 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [
461 | [sub_id],
462 | len(graph.vp.seq[simp_node_dict[sub_id]]),
463 | graph.vp.dp[simp_node_dict[sub_id]],
464 | ]
465 |                 elif uid is not None and wid is None:
466 | for (uid2, _), sub_id in link2subs.items():
467 | if uid == uid2:
468 | curr_contig[branch_ind] = sub_id
469 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [
470 | list(curr_contig),
471 | clen,
472 | ccov,
473 | ]
474 | else:
475 | for (_, wid2), sub_id in link2subs.items():
476 | if wid == wid2:
477 | curr_contig[branch_ind] = sub_id
478 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [
479 | list(curr_contig),
480 | clen,
481 | ccov,
482 | ]
483 |
484 | # remove related edges and vertex, update contig tracker
485 | for uid in us:
486 | graph_remove_edge(graph, simp_edge_dict, uid, no)
487 | for wid in ws:
488 | graph_remove_edge(graph, simp_edge_dict, no, wid)
489 | graph_remove_vertex(graph, simp_node_dict, no)
490 | node_to_contig_dict, _ = contig_map_node(contig_dict)
491 |
492 | # update link info
493 | for (uid, wid), sub_id in link2subs.items():
494 | for nno in simp_node_dict.keys():
495 | pe_info[(min(sub_id, nno), max(sub_id, nno))] = None
496 | for pu, pv in list(pe_info.keys()):
497 | if pu == no or pv == no:
498 | # out of date
499 | pe_info.pop((min(pu, pv), max(pu, pv)))
500 | # final step, assign all the none val pe link to 0
501 | for k in pe_info.keys():
502 |         if pe_info[k] is None:
503 | pe_info[k] = 0
504 |     logger.debug("Number of branches removed: " + str(len(set(split_branches))))
505 | logger.debug("Split branches: " + list_to_string(set(split_branches)))
506 | logger.info("done")
507 |
508 | # plot the data
509 | if ref_file:
510 | _, (ax1) = plt.subplots(1, 1, figsize=(32, 32))
511 | ax1.scatter(correct_X, correct_Y, color="red", s=100, label="Correct")
512 | ax1.scatter(
513 | false_error_X, false_error_Y, color="blue", s=100, label="False-Positive"
514 | )
515 | ax1.scatter(error_X, error_Y, color="green", marker="^", s=100, label="Error")
516 |
517 | for index in range(len(error_X)):
518 | ax1.text(error_X[index], error_Y[index], error_text[index], size=10)
519 |
520 | ax1.set_xlabel("PE")
521 | ax1.set_ylabel("FLOW")
522 | ax1.set_title("Scatter Plot - flow vs pe")
523 | ax1.legend()
524 | plt.yticks(numpy.arange(0, 500, 10))
525 | plt.xticks(numpy.arange(0, cut + 1, 1))
526 | plt.savefig(
527 | "{0}{1}".format(temp_dir, "/tmp/scatter_plot_pest_{0}.png".format(count_id))
528 | )
529 |
530 | return len(set(split_branches))
531 |
532 |
533 | def trivial_split(
534 | graph: Graph,
535 | simp_node_dict: dict,
536 | simp_edge_dict: dict,
537 | pe_info: dict,
538 | logger: Logger,
539 | ):
540 | """
541 |     Split the graph: for any (0|1)->N or N->(0|1) branch adjacent to a non-trivial branch, fork the single edge into N edges.
542 | """
543 | logger.info("graph trivial split on NT related vertices..")
544 | # detect all non-trivial branches right now
545 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
546 | trivial_split_count = 0
547 | id_mapping = {}
548 | for id in simp_node_dict.keys():
549 | id_mapping[id] = set()
550 |
551 | for ntno, ntnode in non_trivial_branches.items():
552 | if graph.vp.color[ntnode] != "black":
553 | continue
554 | logger.debug("Current involving NT branch: {0}".format(ntno))
555 | for inode in set(ntnode.in_neighbors()):
556 | if graph.vp.color[inode] != "black":
557 | continue
558 | ino = graph.vp.id[inode]
559 | if ino not in id_mapping:
560 | id_mapping[ino] = set()
561 | ines = [ue for ue in inode.in_edges() if graph.ep.color[ue] == "black"]
562 | outes = [ve for ve in inode.out_edges() if graph.ep.color[ve] == "black"]
563 | if len(ines) > 1 and len(outes) == 1:
564 | # n to 1
565 | logger.debug("{0}, n->1 split right".format(ino))
566 | graph.vp.color[inode] = "gray"
567 | graph.ep.color[graph.edge(inode, ntnode)] = "gray"
568 | s = "A"
569 | for i in range(len(ines)):
570 | ine = ines[i]
571 | src = ine.source()
572 | snode = graph_add_vertex(
573 | graph,
574 | simp_node_dict,
575 | ino + "*" + chr(ord(s) + i),
576 | graph.ep.flow[ine],
577 | graph.vp.seq[inode],
578 | )
579 | graph.ep.color[ine] = "gray"
580 | sedge_in = graph_add_edge(
581 | graph,
582 | simp_edge_dict,
583 | src,
584 | snode,
585 | graph.ep.overlap[ine],
586 | graph.ep.flow[ine],
587 | )
588 | simp_node_dict[graph.vp.id[snode]] = snode
589 | simp_edge_dict[
590 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
591 | ] = sedge_in
592 |
593 | sedge_out = graph_add_edge(
594 | graph,
595 | simp_edge_dict,
596 | snode,
597 | ntnode,
598 | graph.ep.overlap[graph.edge(inode, ntnode)],
599 | graph.ep.flow[ine],
600 | )
601 | simp_edge_dict[
602 | (
603 | graph.vp.id[sedge_out.source()],
604 | graph.vp.id[sedge_out.target()],
605 | )
606 | ] = sedge_out
607 | id_mapping[ino].add(graph.vp.id[snode])
608 | for nno in simp_node_dict.keys():
609 | pe_info[
610 | (min(graph.vp.id[snode], nno), max(graph.vp.id[snode], nno))
611 | ] = None
612 | trivial_split_count += 1
613 | # update link information
614 | for pu, pv in list(pe_info.keys()):
615 | if pu == ino or pv == ino:
616 | # out of date
617 | pe_info.pop((min(pu, pv), max(pu, pv)))
618 |
619 | for onode in set(ntnode.out_neighbors()):
620 | if graph.vp.color[onode] != "black":
621 | continue
622 | ono = graph.vp.id[onode]
623 | if ono not in id_mapping:
624 | id_mapping[ono] = set()
625 | ines = [ue for ue in onode.in_edges() if graph.ep.color[ue] == "black"]
626 | outes = [ve for ve in onode.out_edges() if graph.ep.color[ve] == "black"]
627 | if len(ines) == 1 and len(outes) > 1:
628 | # 1 to n
629 | logger.debug("{0}, 1->n split left".format(ono))
630 | graph.vp.color[onode] = "gray"
631 | graph.ep.color[graph.edge(ntnode, onode)] = "gray"
632 | s = "A"
633 | for i in range(len(outes)):
634 | oute = outes[i]
635 | tgt = oute.target()
636 | snode = graph_add_vertex(
637 | graph,
638 | simp_node_dict,
639 | ono + "*" + chr(ord(s) + i),
640 | graph.ep.flow[oute],
641 | graph.vp.seq[onode],
642 | )
643 | graph.ep.color[oute] = "gray"
644 | sedge_out = graph_add_edge(
645 | graph,
646 | simp_edge_dict,
647 | snode,
648 | tgt,
649 | graph.ep.overlap[oute],
650 | graph.ep.flow[oute],
651 | )
652 | simp_node_dict[graph.vp.id[snode]] = snode
653 | simp_edge_dict[
654 | (
655 | graph.vp.id[sedge_out.source()],
656 | graph.vp.id[sedge_out.target()],
657 | )
658 | ] = sedge_out
659 |
660 | sedge_in = graph_add_edge(
661 | graph,
662 | simp_edge_dict,
663 | ntnode,
664 | snode,
665 | graph.ep.overlap[graph.edge(ntnode, onode)],
666 | graph.ep.flow[oute],
667 | )
668 | simp_edge_dict[
669 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
670 | ] = sedge_in
671 | id_mapping[ono].add(graph.vp.id[snode])
672 | for nno in simp_node_dict.keys():
673 | pe_info[
674 | (min(graph.vp.id[snode], nno), max(graph.vp.id[snode], nno))
675 | ] = None
676 | trivial_split_count += 1
677 | # update link information
678 | for pu, pv in list(pe_info.keys()):
679 | if pu == ono or pv == ono:
680 | # out of date
681 | pe_info.pop((min(pu, pv), max(pu, pv)))
682 | for k in pe_info.keys():
683 |         if pe_info[k] is None:
684 | pe_info[k] = 0
685 | logger.debug(
686 |         "Total trivial branches split: {0}".format(trivial_split_count)
687 | )
688 | return trivial_split_count, id_mapping
689 |
690 |
691 | def global_trivial_split(
692 | graph: Graph, simp_node_dict: dict, simp_edge_dict: dict, logger: Logger
693 | ):
694 | """
695 |     Split the graph: for any (0|1)->N or N->(0|1) branch, fork the single edge into N edges.
696 | """
697 | logger.info("graph trivial split..")
698 |
699 | BOUND_ITER = len(simp_node_dict) ** 2
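    # safety bound on total splits, quadratic in node count, guaranteeing termination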
700 | has_split = True
701 | trivial_split_count = 0
702 | id_mapping = {}
703 | for id in simp_node_dict.keys():
704 | id_mapping[id] = set()
705 | while has_split and trivial_split_count < BOUND_ITER:
706 | has_split = False
707 | for id in list(simp_node_dict.keys()):
708 | node = simp_node_dict[id]
709 | if graph.vp.color[node] != "black":
710 | continue
711 | if id not in id_mapping:
712 | id_mapping[id] = set()
713 | ines = [ue for ue in node.in_edges() if graph.ep.color[ue] == "black"]
714 | outes = [ve for ve in node.out_edges() if graph.ep.color[ve] == "black"]
715 | if len(ines) == 1 and len(outes) > 1:
716 | logger.debug(id + " split left")
717 | graph.vp.color[node] = "gray"
718 | ine = ines[0]
719 | src = ine.source()
720 | graph.ep.color[ine] = "gray"
721 | s = "A"
722 | for i in range(len(outes)):
723 | oute = outes[i]
724 | tgt = oute.target()
725 | snode = graph_add_vertex(
726 | graph,
727 | simp_node_dict,
728 | id + "*" + chr(ord(s) + i),
729 | graph.ep.flow[oute],
730 | graph.vp.seq[node],
731 | )
732 | graph.ep.color[oute] = "gray"
733 | sedge_out = graph_add_edge(
734 | graph,
735 | simp_edge_dict,
736 | snode,
737 | tgt,
738 | graph.ep.overlap[oute],
739 | graph.ep.flow[oute],
740 | )
741 | simp_node_dict[graph.vp.id[snode]] = snode
742 | simp_edge_dict[
743 | (
744 | graph.vp.id[sedge_out.source()],
745 | graph.vp.id[sedge_out.target()],
746 | )
747 | ] = sedge_out
748 |
749 | sedge_in = graph_add_edge(
750 | graph,
751 | simp_edge_dict,
752 | src,
753 | snode,
754 | graph.ep.overlap[ine],
755 | graph.ep.flow[oute],
756 | )
757 | simp_edge_dict[
758 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
759 | ] = sedge_in
760 | id_mapping[id].add(graph.vp.id[snode])
761 | has_split = True
762 | trivial_split_count += 1
763 | elif len(ines) > 1 and len(outes) == 1:
764 | logger.debug(id + " split right")
765 | graph.vp.color[node] = "gray"
766 | oute = outes[0]
767 | tgt = oute.target()
768 | graph.ep.color[oute] = "gray"
769 | s = "A"
770 | for i in range(len(ines)):
771 | ine = ines[i]
772 | src = ine.source()
773 | snode = graph_add_vertex(
774 | graph,
775 | simp_node_dict,
776 | id + "*" + chr(ord(s) + i),
777 | graph.ep.flow[ine],
778 | graph.vp.seq[node],
779 | )
780 | graph.ep.color[ine] = "gray"
781 | sedge_in = graph_add_edge(
782 | graph,
783 | simp_edge_dict,
784 | src,
785 | snode,
786 | graph.ep.overlap[ine],
787 | graph.ep.flow[ine],
788 | )
789 | simp_node_dict[graph.vp.id[snode]] = snode
790 | simp_edge_dict[
791 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
792 | ] = sedge_in
793 |
794 | sedge_out = graph_add_edge(
795 | graph,
796 | simp_edge_dict,
797 | snode,
798 | tgt,
799 | graph.ep.overlap[oute],
800 | graph.ep.flow[ine],
801 | )
802 | simp_edge_dict[
803 | (
804 | graph.vp.id[sedge_out.source()],
805 | graph.vp.id[sedge_out.target()],
806 | )
807 | ] = sedge_out
808 | id_mapping[id].add(graph.vp.id[snode])
809 | has_split = True
810 | trivial_split_count += 1
811 |                 else:
812 |                     pass  # neither 1->n nor n->1: not a trivial branch
813 | if trivial_split_count >= BOUND_ITER:
814 | logger.warning("Strange topology detected, exit trivial split immediately")
815 | return None, id_mapping
816 | else:
817 |         logger.debug("Number of trivial branches removed: " + str(trivial_split_count))
818 | logger.info("done")
819 | return trivial_split_count, id_mapping
820 |
821 |
822 | def edge_cleaning(
823 | graph: Graph, simp_edge_dict: dict, contig_dict: dict, pe_info: dict, logger: Logger
824 | ):
825 | """
826 | Detect the crossing edges and select the confident edges only.
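
    An edge becomes confident when it is the only unassigned in- or
    out-edge at one of its endpoints, propagated to a fixed point.
    Remaining cross edges are force-assigned when a contig supports
    them, removed when a confident edge already shares their source or
    target, and kept otherwise.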
827 | """
828 | un_assigned_edge = graph.num_edges()
829 | assigned = dict.fromkeys(
830 | [(graph.vp.id[e.source()], graph.vp.id[e.target()]) for e in graph.edges()],
831 | False,
832 | )
833 | _, edge_to_contig_dict = contig_map_node(contig_dict)
834 | logger.debug("Total edges: " + str(un_assigned_edge))
835 |     # iterate until the number of unassigned edges converges
836 |     convergence_flag = 0
837 | while True:
838 | for node in graph.vertices():
839 | in_d = node.in_degree()
840 | in_e = []
841 | for e in node.in_edges():
842 | if assigned[(graph.vp.id[e.source()], graph.vp.id[e.target()])]:
843 | in_d = in_d - 1
844 | else:
845 | in_e.append(e)
846 |
847 | out_d = node.out_degree()
848 | out_e = []
849 | for e in node.out_edges():
850 | if assigned[(graph.vp.id[e.source()], graph.vp.id[e.target()])]:
851 | out_d = out_d - 1
852 | else:
853 | out_e.append(e)
854 |
855 | if in_d == 1:
856 | assigned[
857 | (graph.vp.id[in_e[0].source()], graph.vp.id[in_e[0].target()])
858 | ] = True
859 | un_assigned_edge = un_assigned_edge - 1
860 | if out_d == 1:
861 | assigned[
862 | (graph.vp.id[out_e[0].source()], graph.vp.id[out_e[0].target()])
863 | ] = True
864 | un_assigned_edge = un_assigned_edge - 1
865 |         if convergence_flag == un_assigned_edge:
866 | break
867 | else:
868 |             convergence_flag = un_assigned_edge
869 |
870 | logger.debug(
871 |         "un-assigned edges after the assignment iteration converged: {0}".format(
872 | un_assigned_edge
873 | )
874 | )
875 | for u, v in assigned.keys():
876 | if not assigned[(u, v)]:
877 | logger.debug(
878 | "***cross un-assigned edge: {0} -> {1}, with paired end link {2}".format(
879 | u, v, pe_info[(min(u, v), max(u, v))]
880 | )
881 | )
882 | if (u, v) in edge_to_contig_dict:
883 | logger.debug(
884 | "support contig: {0}, force assign".format(
885 | edge_to_contig_dict[(u, v)]
886 | )
887 | )
888 | assigned[(u, v)] = True
889 | else:
890 | logger.debug("support contig: None")
891 | for u, v in assigned.keys():
892 | if not assigned[(u, v)]:
893 | force_assign = True
894 | for w, z in assigned.keys():
895 | if (u == w or v == z) and assigned[(w, z)]:
896 | force_assign = False
897 | break
898 | if not force_assign:
899 | graph.remove_edge(simp_edge_dict.pop((u, v)))
900 | logger.debug(
901 | "intersect unsupported edge: {0} -> {1}, removed".format(u, v)
902 | )
903 | else:
904 | logger.debug("disjoint unsupported edge: {0} -> {1}, kept".format(u, v))
905 | return assigned
906 |
907 |
908 | def iter_graph_disentanglement(
909 | graph: Graph,
910 | simp_node_dict: dict,
911 | simp_edge_dict: dict,
912 | contig_dict: dict,
913 | pe_info: dict,
914 | ref_file: str,
915 | logger: Logger,
916 | threshold,
917 | temp_dir,
918 | ):
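    """Iteratively disentangle the graph via balanced N-N splits.

    Two phases (is_prim True, then False): each iteration runs
    balance_split followed by path compactification; when no balanced
    split succeeds, one round of trivial_split is attempted before the
    phase ends. Intermediate GFA snapshots are written per iteration.
    """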
919 | BOUND_ITER = len(simp_node_dict) ** 2
920 | it = 0
921 | total_removed_branch = 0
922 | num_split = 0
923 | iterCount = "A"
924 |     for is_prim in [True, False]:
925 | do_trivial_split = True
926 | while it < BOUND_ITER:
927 | num_split = balance_split(
928 | graph,
929 | simp_node_dict,
930 | simp_edge_dict,
931 | contig_dict,
932 | pe_info,
933 | logger,
934 | ref_file,
935 | temp_dir,
936 | it,
937 | threshold,
938 | is_prim,
939 | )
940 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
941 | graph,
942 | simp_node_dict,
943 | simp_edge_dict,
944 | logger,
945 | "{0}/gfa/split_graph_L{1}d.gfa".format(temp_dir, iterCount),
946 | )
947 | simp_path_compactification(
948 | graph, simp_node_dict, simp_edge_dict, contig_dict, pe_info, logger
949 | )
950 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
951 | graph,
952 | simp_node_dict,
953 | simp_edge_dict,
954 | logger,
955 | "{0}/gfa/split_graph_L{1}dc.gfa".format(temp_dir, iterCount),
956 | )
957 |
958 | if num_split > 0:
959 | do_trivial_split = True
960 | else:
961 | if do_trivial_split:
962 |                     # trivial split of NT-branch-related cases (FIXME)
963 | prev_ids = list(simp_node_dict.keys())
964 | trivial_split_count, id_mapping = trivial_split(
965 | graph, simp_node_dict, simp_edge_dict, pe_info, logger
966 | )
967 | logger.debug("my id mapping: {0}".format(id_mapping))
968 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
969 | graph,
970 | simp_node_dict,
971 | simp_edge_dict,
972 | logger,
973 | "{0}/gfa/split_graph_L{1}dct.gfa".format(temp_dir, iterCount),
974 | )
975 |
976 | contig_dict_remapping(
977 | graph,
978 | simp_node_dict,
979 | simp_edge_dict,
980 | contig_dict,
981 | id_mapping,
982 | prev_ids,
983 | logger,
984 | )
985 | simp_path_compactification(
986 | graph,
987 | simp_node_dict,
988 | simp_edge_dict,
989 | contig_dict,
990 | pe_info,
991 | logger,
992 | )
993 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
994 | graph,
995 | simp_node_dict,
996 | simp_edge_dict,
997 | logger,
998 | "{0}/gfa/split_graph_L{1}dctd.gfa".format(temp_dir, iterCount),
999 | )
1000 |
1001 | contig_dup_removed_s(contig_dict, logger)
1002 | trim_contig_dict(graph, simp_node_dict, contig_dict, logger)
1003 | # analysis
1004 | if ref_file:
1005 | map_ref_to_graph(
1006 | ref_file,
1007 | simp_node_dict,
1008 | "{0}/gfa/split_graph_L{1}dc.gfa".format(temp_dir, iterCount),
1009 | logger,
1010 | True,
1011 | "{0}/paf/node_to_ref_{1}.paf".format(temp_dir, iterCount),
1012 | "{0}/tmp/temp_gfa_to_fasta_{1}.fasta".format(temp_dir, iterCount),
1013 | )
1014 | # analysis
1015 | total_removed_branch += num_split
1016 | it += 1
1017 | iterCount = chr(ord(iterCount) + 1)
1018 | if num_split == 0:
1019 | if do_trivial_split:
1020 | do_trivial_split = False
1021 | else:
1022 | break
1023 |
1024 | logger.debug("Total non-trivial branches removed: " + str(total_removed_branch))
1025 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
1026 | logger.debug(
1027 | list_to_string(
1028 | non_trivial_branches.keys(),
1029 |             "non-trivial branches ({0}) left after paired-end & single-end link splitting".format(
1030 | len(non_trivial_branches)
1031 | ),
1032 | )
1033 | )
1034 |
1035 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
1036 | graph,
1037 | simp_node_dict,
1038 | simp_edge_dict,
1039 | logger,
1040 | "{0}/gfa/split_graph_final.gfa".format(temp_dir),
1041 | )
1042 | return graph, simp_node_dict, simp_edge_dict
1043 |
1044 |
1045 | def best_aln_score(graph: Graph, ori, strain, ref_file, temp_dir):
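    """Align a path's sequence against the reference set with minimap2.

    Returns one [query, block_len, ref_name, mismatch_estimate] record
    per PAF line; per the PAF format, column 10 holds the number of
    residue matches and column 11 the alignment block length, so their
    difference approximates the number of mismatched/unaligned bases.
    """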
1046 | fname = "{0}/temp_{1}.fa".format(temp_dir, ori)
1047 | pafname = "{0}/temp_{1}_aln.paf".format(temp_dir, ori)
1048 |     # the open(..., "w") below creates/truncates the query file
1049 | with open(fname, "w") as f:
1050 | f.write(">{0}\n".format(ori))
1051 | f.write("{0}\n".format(path_to_seq(graph, strain, "")))
1052 | f.close()
1053 | minimap_api(ref_file, fname, pafname)
1054 | subprocess.check_call("rm {0}".format(fname), shell=True)
1055 | best_aln = []
1056 | with open(pafname, "r") as paf:
1057 | for line in paf.readlines():
1058 |             fields = line[:-1].split("\t")
1059 |             if len(fields) < 12:
1060 |                 continue
1061 |             best_aln.append(
1062 |                 [
1063 |                     fields[0],
1064 |                     int(fields[10]),
1065 |                     fields[5],
1066 |                     int(fields[10]) - int(fields[9]),
1067 |                 ]
1068 |             )
1069 | paf.close()
1070 | subprocess.check_call("rm {0}".format(pafname), shell=True)
1071 | return best_aln
1072 |
--------------------------------------------------------------------------------