├── utils
│   ├── __init__.py
│   ├── spades_wrapper.py
│   ├── VStrains_PE_Inference.py
│   ├── VStrains_SPAdes.py
│   ├── VStrains_Preprocess.py
│   ├── VStrains_Alignment.py
│   ├── VStrains_IO.py
│   ├── VStrains_Extension.py
│   └── VStrains_Decomposition.py
├── requirements.txt
├── VStrains_logo.png
├── environment.yml
├── MANIFEST.in
├── .gitignore
├── LICENSE
├── recipe
│   └── meta.yaml
├── setup.py
├── evals
│   ├── sampling.py
│   └── quast_evaluation.py
├── vstrains
└── README.md
/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | graph-tool
2 | minimap2
3 | numpy
4 | gfapy
5 | matplotlib
6 |
--------------------------------------------------------------------------------
/VStrains_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metagentools/VStrains/HEAD/VStrains_logo.png
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: vstrains
2 | channels:
3 | - defaults
4 | - bioconda
5 | - conda-forge
6 | dependencies:
7 | - python=3
8 | - graph-tool>=2.45
9 | - minimap2>=2.24
10 | - numpy>=1.23.5
11 | - gfapy>=1.2.3
12 | - matplotlib>=3.6.2
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include requirements.txt
3 | include LICENSE
4 | include VStrains_logo.png
5 | include environment.yml
6 | include setup.py
7 |
8 | include vstrains
9 |
10 |
11 | recursive-include recipe *
12 | recursive-include utils *
13 | recursive-include evals *
14 |
15 | global-exclude *.pyc
16 | global-exclude */__pycache__/*
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.com
4 | *.class
5 | *.dll
6 | *.exe
7 | *.o
8 | *.so
9 | *.sh
10 |
11 | # Packages #
12 | ############
13 | # it's better to unpack these files and commit the raw source
14 | # git has its own built in compression methods
15 | *.7z
16 | *.dmg
17 | *.gz
18 | *.iso
19 | *.jar
20 | *.rar
21 | *.tar
22 | *.zip
23 |
24 | # Logs and databases #
25 | ######################
26 | *.log
27 | *.sql
28 | *.sqlite
29 |
30 | # OS generated files #
31 | ######################
32 | .DS_Store
33 | .DS_Store?
34 | ._*
35 | .Spotlight-V100
36 | .Trashes
37 | ehthumbs.db
38 | Thumbs.db
39 |
40 | # Evaluation result #
41 | #####################
42 | eval_result/
43 | example/
44 | benchmark/*
45 | quast*/
46 | acc*/
47 | testcase/
48 | src/tmp/*
49 | *.fa
50 | *.fq
51 | *.fasta
52 | *.fastq
53 | *.gfa
54 | *.csv
55 | *.paf
56 | *.pyc
57 | *.sh
58 | # pycache #
59 | ###########
60 | */__pycache__/*
61 | *.cpython*
62 | src/__pycache__/*
63 | legacy/
64 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) [2022] [Runpeng Luo]
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/recipe/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set name = "VStrains" %}
2 | {% set version = "1.1.0" %}
3 |
4 | package:
5 | name: "{{ name|lower }}"
6 | version: "{{ version }}"
7 |
8 | source:
9 | url: https://github.com/metagentools/{{ name }}/releases/download/v{{ version }}/{{ name }}-{{ version }}.tar.gz
10 | sha256: 79a77435dd0f648fe55bb5930ef8fdd874d4aec990850ab20dd8b067d8df5ec0
11 |
12 | build:
13 | number: 0
14 | noarch: python
15 | script:
16 | - "{{ PYTHON }} -m pip install . -vv"
17 |
18 | requirements:
19 | host:
20 | - pip>=22.3.1
21 | - python=3
22 | - graph-tool>=2.45
23 | - minimap2>=2.24
24 | - numpy>=1.23.5
25 | - gfapy>=1.2.3
26 | - matplotlib>=3.6.2
27 | run:
28 | - python=3
29 | - graph-tool>=2.45
30 | - minimap2>=2.24
31 | - numpy>=1.23.5
32 | - gfapy>=1.2.3
33 | - matplotlib>=3.6.2
34 |
35 | test:
36 | commands:
37 | - vstrains -h
38 |
39 | about:
40 | home: "https://github.com/metagentools/MetaCoAG"
41 | license: MIT
42 | license_file: LICENSE
43 | summary: "VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs"
44 | doc_url: "https://github.com/metagentools/VStrains/blob/master/README.md"
45 | dev_url: "https://github.com/metagentools/VStrains"
46 |
47 | extra:
48 | recipe-maintainers:
49 | - JohnLuo
50 | # identifiers:
51 | # - doi:10.1101/2022.10.21.513181v3
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from setuptools import setup, find_packages
4 |
5 | # read the contents of your README file
6 | from pathlib import Path
7 |
8 | this_directory = Path(__file__).parent
9 | long_description = (this_directory / "README.md").read_text()
10 |
11 | packages = find_packages()
12 | package_data = {"utils": ["*"]}
13 |
14 | data_files = [(".", ["LICENSE", "README.md"])]
15 |
16 | setup(
17 | name="vstrains",
18 | version="1.1.0",
19 | zip_safe=True,
20 | author="Runpeng Luo and Yu Lin",
21 | author_email="runpengluo@gmail.com",
22 | description="VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs",
23 | long_description=long_description,
24 | long_description_content_type="text/markdown",
25 | url="https://github.com/metagentools/VStrains",
26 | license="MIT",
27 | packages=packages,
28 | package_data=package_data,
29 | data_files=data_files,
30 | include_package_data=True,
31 | scripts=["vstrains"],
32 | classifiers=[
33 | "Development Status :: 5 - Production/Stable",
34 | "Programming Language :: Python :: 3",
35 | "License :: OSI Approved :: MIT License",
36 | "Natural Language :: English",
37 | "Topic :: Scientific/Engineering :: Bio-Informatics",
38 | "Operating System :: OS Independent",
39 | ],
40 | install_requires=[
41 | # "graph-tool", # not distributed via Pip
42 | # "minimap2", # not distributed via Pip
43 | "numpy",
44 | "gfapy",
45 | "matplotlib",
46 | ],
47 | python_requires=">=3",
48 | )
49 |
--------------------------------------------------------------------------------
/utils/spades_wrapper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import argparse
4 | import time
5 |
6 | if __name__ == "__main__":
7 | parser = argparse.ArgumentParser(
8 | prog="spades_wrapper.py",
9 | description="""Build assembly graph&contig using SPAdes --careful mode,
10 | with input pair-end reads and store the graph.""",
11 | )
12 | parser.add_argument(
13 | "-f",
14 | "--forward",
15 | dest="forward",
16 | type=str,
17 | required=True,
18 | help="Forward reads, fastq format",
19 | )
20 | parser.add_argument(
21 | "-r",
22 | "--reverse",
23 | dest="reverse",
24 | type=str,
25 | required=True,
26 | help="Reverse reads, fastq format",
27 | )
28 | parser.add_argument(
29 | "-spades",
30 | "--spades_path",
31 | dest="spades",
32 | type=str,
33 | required=True,
34 | help="absolute path to spades executable",
35 | )
36 | parser.add_argument(
37 | "-t",
38 | "--threads",
39 | dest="thread_count",
40 | default=8,
41 | help="Set number of threads used for SPAdes.",
42 | )
43 | parser.add_argument(
44 | "-o", "--output_dir", dest="output_dir", type=str, required=True
45 | )
46 | args = parser.parse_args()
47 |
48 | global_t1_start = time.perf_counter()
49 | global_t2_start = time.process_time()
50 |
51 | filepath = os.path.dirname(os.path.abspath(__file__))
52 | spades = args.spades
53 |
54 | if spades:
55 | print(filepath)
56 | subprocess.check_call(
57 | "rm -rf {0} && mkdir {0}".format(args.output_dir), shell=True
58 | )
59 |
60 | subprocess.check_call(
61 | spades
62 | + " -1 {0} -2 {1} --careful -t {3} -o {4}".format(
63 | args.forward, args.reverse, args.thread_count, args.output_dir
64 | ),
65 | shell=True,
66 | )
67 | else:
68 | print("SPAdes executable path haven't specified.")
69 |
70 | t1_stop = time.perf_counter()
71 | t2_stop = time.process_time()
72 |
73 | print("\SPAdes assembly completed")
74 | print("Elapsed time: {:.1f} seconds".format(t1_stop - global_t1_start))
75 | print("CPU process time: {:.1f} seconds".format(t2_stop - global_t2_start))
76 |
--------------------------------------------------------------------------------
/evals/sampling.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import subprocess
4 | import sys
5 | import random
6 |
7 |
8 | def main():
9 | parser = argparse.ArgumentParser(
10 | prog="sampling",
11 | description="""Sampling the pairend fastq file""",
12 | )
13 |
14 | parser.add_argument(
15 | "-s",
16 | "--sampling_ratio",
17 | dest="sratio",
18 | type=int,
19 | required=True,
20 | help="sampling ratio, 2 for sampling half the dataset, etc.,",
21 | )
22 | parser.add_argument(
23 | "-f",
24 | "--forward",
25 | dest="fwd",
26 | type=str,
27 | required=True,
28 | help="forward .fastq file",
29 | )
30 |
31 | parser.add_argument(
32 | "-r",
33 | "--reverse",
34 | dest="rve",
35 | type=str,
36 | required=True,
37 | help="reverse .fastq file",
38 | )
39 |
40 | parser.add_argument(
41 | "-of",
42 | "--out_forward",
43 | dest="ofwd",
44 | type=str,
45 | required=True,
46 | help="output forward .fastq file",
47 | )
48 |
49 | parser.add_argument(
50 | "-or",
51 | "--out_reverse",
52 | dest="orve",
53 | type=str,
54 | required=True,
55 | help="output reverse .fastq file",
56 | )
57 |
58 | args = parser.parse_args()
59 |
60 |     if args.sratio <= 1:
61 |         print("invalid ratio, please input an integer ratio greater than 1")
62 |         sys.exit(1)
63 |
64 |     subprocess.check_call('echo "" > {0}'.format(args.ofwd), shell=True)
65 |     subprocess.check_call('echo "" > {0}'.format(args.orve), shell=True)
66 |
67 | with open(args.fwd, "r") as fwd:
68 | with open(args.rve, "r") as rve:
69 | with open(args.ofwd, "w") as ofwd:
70 | with open(args.orve, "w") as orve:
71 | flines = fwd.readlines()
72 | rlines = rve.readlines()
73 | n = len(flines) // 4
74 | k = 0
75 | print("total number of reads: ", n)
76 | for i in range(n):
77 | if random.random() > 1 / args.sratio:
78 | continue
79 | k += 1
80 | for fcurr in flines[i * 4 : i * 4 + 4]:
81 | ofwd.write(fcurr)
82 | for rcurr in rlines[i * 4 : i * 4 + 4]:
83 | orve.write(rcurr)
84 | print("sample {0} reads given ratio {1}".format(k, args.sratio))
85 | orve.close()
86 | ofwd.close()
87 | rve.close()
88 | fwd.close()
89 |
90 | return
91 |
92 |
93 | if __name__ == "__main__":
94 | sys.exit(main())
95 |
--------------------------------------------------------------------------------
/evals/quast_evaluation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import subprocess
3 | import argparse
4 | import sys
5 | import os
6 |
7 | usage = "Use MetaQUAST to evaluate assembly result"
8 | Author = "Runpeng Luo"
9 |
10 |
11 | def sep_ref(ref_file, id=0):
12 | ref_file_list = []
13 | i = 0
14 | with open(ref_file, "r") as ref:
15 | j = 0
16 | lines = ref.readlines()
17 | l = len(lines)
18 | while j < l - 1:
19 | name_in_file = lines[j]
20 | name = str(lines[j][1:-1])
21 | name = name.split(" ")[0]
22 | name = name.split(".")[0]
23 | strain = lines[j + 1]
24 | j = j + 2
25 | file_name = "sub_" + str(id) + "_" + str(name) + "_ref.fasta"
26 | subprocess.check_call("touch {0}".format(file_name), shell=True)
27 | with open(file_name, "w") as sub_file:
28 | sub_file.write(name_in_file)
29 | sub_file.write(strain)
30 | sub_file.close()
31 | ref_file_list.append(file_name)
32 | i = i + 1
33 | ref.close()
34 | print("ref list: ", ref_file_list)
35 | return ref_file_list
36 |
37 |
38 | def quast_eval(files, ref, o, quast, id=0):
39 | subprocess.check_call("rm -rf sub_{0}_*_ref.fasta".format(id), shell=True)
40 |
41 | ref_file_list = sep_ref(ref, id)
42 |
43 | command = "python2 {0} --unique-mapping --report-all-metrics -m 500 -t 8 ".format(
44 | quast
45 | )
46 | for fname in files:
47 | command += fname + " "
48 |
49 | command += "-o " + o + " -R "
50 |
51 | for file in ref_file_list:
52 | command += file + ","
53 | command = command[:-1]
54 |
55 | print(command)
56 | subprocess.check_call(command, shell=True)
57 |
58 | # clean up
59 | subprocess.check_call("rm -rf sub_{0}_*_ref.fasta".format(id), shell=True)
60 | return
61 |
62 |
63 | if __name__ == "__main__":
64 | parser = argparse.ArgumentParser(prog="quast_evaluation.py", description=usage)
65 | parser.add_argument(
66 | "-quast",
67 | "--path_to_quast",
68 | dest="quast",
69 | required=True,
70 | help="path to MetaQuast python script, version >= 5.2.0",
71 | )
72 | parser.add_argument(
73 | "-cs",
74 | "--contig_files",
75 | dest="files",
76 | default=None,
77 | nargs="+",
78 | help="contig files from different tools, separated by space",
79 | )
80 | parser.add_argument(
81 | "-d",
82 | "--contig_dir",
83 | dest="idir",
84 | default=None,
85 | help="contig files from different tools, stored in the directory, .fasta format",
86 | )
87 | parser.add_argument(
88 | "-ref",
89 | "--ref_file",
90 | dest="ref_file",
91 | type=str,
92 | required=True,
93 | help="ref file (single)",
94 | )
95 | parser.add_argument(
96 | "-o",
97 | "--output_dir",
98 | dest="output_dir",
99 | type=str,
100 | required=True,
101 | help="output directory",
102 | )
103 | args = parser.parse_args()
104 |
105 |     if args.idir is None and args.files is None:
106 |         print("Please provide correct query input")
107 |         sys.exit(1)
108 |
109 |     if args.idir is not None and (
110 |         not os.path.exists(args.idir) or not os.path.isdir(args.idir)
111 |     ):
112 |         print("Please provide correct directory")
113 |         sys.exit(1)
114 |
115 |     files = []
116 |     if args.files is not None:
117 |         files.extend(args.files)
118 |     if args.idir is not None:
119 |         files.extend(
120 |             [
121 |                 os.path.join(args.idir, s)
122 |                 for s in sorted(os.listdir(args.idir))
123 |                 if s.endswith(".fasta") or s.endswith(".fa")
124 |             ]
125 |         )
126 |
127 | quast_eval(files, args.ref_file, args.output_dir, args.quast)
128 |
--------------------------------------------------------------------------------
/utils/VStrains_PE_Inference.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import os
4 | import time
5 | import subprocess
6 | import numpy
7 | import sys
8 |
9 | rev_dict = {"A": "T", "T": "A", "C": "G", "G": "C"}
10 |
11 |
12 | def reverse_seq(seq: str):
13 | return "".join(rev_dict[x] for x in reversed(seq))
14 |
15 |
16 | def single_end_read_mapping(
17 | seq: str, kmer_htable: dict, index2seqlen: list, split_len: int, len_index2id: int
18 | ):
19 | nodes = numpy.zeros(len_index2id, dtype=int)
20 | coords = [sys.maxsize for _ in range(len_index2id)]
21 | kindices = [sys.maxsize for _ in range(len_index2id)]
22 |
23 | rlen = len(seq)
24 | for i in range(rlen - split_len + 1):
25 | kmer = seq[i : i + split_len]
26 | if kmer in kmer_htable:
27 | # found a collide node
28 | for rid, rcord in kmer_htable[kmer]:
29 | nodes[rid] += 1
30 | coords[rid] = min(coords[rid], rcord)
31 | kindices[rid] = min(kindices[rid], i)
32 |
33 | saturates = []
34 | L = 0
35 | R = 0
36 | for i, v in enumerate(nodes):
37 | if coords[i] == sys.maxsize or kindices[i] == sys.maxsize:
38 | continue
39 | L = max(coords[i], coords[i] - kindices[i])
40 | R = min(coords[i] + index2seqlen[i] - 1, coords[i] - kindices[i] + rlen - 1)
41 | saturate = R - L - (split_len - 1) + 1
42 | expected = (
43 | (min(rlen, index2seqlen[i]) - split_len + 1) * (rlen - split_len) / rlen
44 | )
45 | if v >= max(min(saturate, expected), 1):
46 | # print(i,v,"passed")
47 | saturates.append(i)
48 | return saturates
49 |
50 |
51 | def main():
52 | print(
53 | "----------------------Paired-End Information Alignment----------------------"
54 | )
55 | parser = argparse.ArgumentParser(
56 | prog="pe_info",
57 | description="""Align Paired-End reads to nodes in graph to obtain strong links""",
58 | )
59 |
60 | parser.add_argument(
61 | "-g", "--gfa,", dest="gfa", type=str, required=True, help="graph, .gfa format"
62 | )
63 |
64 | parser.add_argument(
65 | "-o",
66 | "--output_dir",
67 | dest="dir",
68 | type=str,
69 | required=True,
70 | help="output directory",
71 | )
72 |
73 | parser.add_argument(
74 | "-f", "--forward", dest="fwd", required=True, help="forward read, .fastq"
75 | )
76 |
77 | parser.add_argument(
78 | "-r", "--reverse", dest="rve", required=True, help="reverse read, .fastq"
79 | )
80 |
81 | parser.add_argument(
82 | "-k",
83 | "--kmer_size",
84 | dest="kmer_size",
85 | type=int,
86 | default=128,
87 | help="unique kmer size",
88 | )
89 |
90 | args = parser.parse_args()
91 |
92 | # initialize output directory
93 | if args.dir[-1] == "/":
94 | args.dir = args.dir[:-1]
95 | subprocess.check_call("rm -rf {0}".format(args.dir), shell=True)
96 | os.makedirs(args.dir, exist_ok=True)
97 |
98 | glb_start = time.time()
99 |
100 |     # get gfa node information
101 | index2id = []
102 | index2seq = []
103 | index2seqlen = []
104 |
105 | with open(args.gfa, "r") as gfa:
106 | for Line in gfa:
107 | splited = (Line[:-1]).split("\t")
108 | if splited[0] == "S":
109 | index2id.append(splited[1])
110 | index2seq.append(splited[2])
111 | index2seqlen.append(len(splited[2]))
112 | gfa.close()
113 |
114 | split_len = args.kmer_size + 1
115 |
116 |     # construct hash table for gfa nodes with chunk k-mers
117 | kmer_htable = {}
118 | for i, seq in enumerate(index2seq):
119 | seqlen = index2seqlen[i]
120 | for sub_i in range(seqlen - split_len + 1):
121 | kmer = seq[sub_i : sub_i + split_len]
122 | rev_kmer = reverse_seq(kmer)
123 | if kmer in kmer_htable:
124 | # not unique
125 | kmer_htable[kmer].append((i, sub_i))
126 | else:
127 | # unique
128 | kmer_htable[kmer] = [(i, sub_i)]
129 |
130 | if rev_kmer in kmer_htable:
131 | # not unique
132 | kmer_htable[rev_kmer].append((i, sub_i))
133 | else:
134 | # unique
135 | kmer_htable[rev_kmer] = [(i, sub_i)]
136 |
137 | # init nodes pairwise relationship
138 | len_index2id = len(index2id)
139 | node_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
140 | short_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
141 |
142 | n_reads = 0
143 | short_reads = 0
144 | used_reads = 0
145 |
146 | print("Start aligning reads to gfa nodes")
147 | fwd_fd = open(args.fwd, "r")
148 | rve_fd = open(args.rve, "r")
149 | fwd_reads = fwd_fd.readlines()
150 | rve_reads = rve_fd.readlines()
151 | fwd_fd.close()
152 | rve_fd.close()
153 |
154 | total_size = min(len(fwd_reads) // 4, len(rve_reads) // 4)
155 | for read_idx in range(total_size):
156 | if read_idx % 100000 == 0:
157 | print("Number of processed reads: ", read_idx)
158 | [_, fseq, _, _] = [s[:-1] for s in fwd_reads[read_idx * 4 : (read_idx + 1) * 4]]
159 | [_, rseq, _, _] = [s[:-1] for s in rve_reads[read_idx * 4 : (read_idx + 1) * 4]]
160 | if fseq.count("N") or rseq.count("N"):
161 | n_reads += 1
162 | elif len(fseq) < split_len or len(rseq) < split_len:
163 | short_reads += 1
164 | else:
165 | used_reads += 1
166 | # valid read pair
167 | lefts = single_end_read_mapping(
168 | fseq, kmer_htable, index2seqlen, split_len, len_index2id
169 | )
170 | rights = single_end_read_mapping(
171 | rseq, kmer_htable, index2seqlen, split_len, len_index2id
172 | )
173 |
174 | k = 0
175 | for i in lefts:
176 | for i2 in lefts[k:]:
177 | short_mat[i][i2] += 1
178 | k += 1
179 |
180 | k = 0
181 | for j in rights:
182 | for j2 in rights[k:]:
183 | short_mat[j][j2] += 1
184 | k += 1
185 |
186 | for i in lefts:
187 | for j in rights:
188 | node_mat[i][j] += 1
189 |
190 | out_file = "{0}/pe_info".format(args.dir)
191 | out_file2 = "{0}/st_info".format(args.dir)
192 | subprocess.check_call("touch {0}; echo " " > {0}".format(out_file), shell=True)
193 | subprocess.check_call("touch {0}; echo " " > {0}".format(out_file2), shell=True)
194 | with open(out_file, "w") as outfile:
195 | with open(out_file2, "w") as outfile2:
196 | for i in range(len_index2id):
197 | for j in range(len_index2id):
198 | outfile.write(
199 | "{0}:{1}:{2}\n".format(index2id[i], index2id[j], node_mat[i][j])
200 | )
201 | outfile2.write(
202 | "{0}:{1}:{2}\n".format(
203 | index2id[i], index2id[j], short_mat[i][j]
204 | )
205 | )
206 | outfile2.close()
207 | outfile.close()
208 |
209 | glb_elapsed = time.time() - glb_start
210 | print("Global time elapsed: ", glb_elapsed)
211 | print("result stored in: ", out_file)
212 |
213 |
214 | if __name__ == "__main__":
215 | main()
216 | sys.exit(0)
217 |
--------------------------------------------------------------------------------
/vstrains:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import sys
5 | import os
6 | import platform
7 | import numpy
8 | import logging
9 | import time
10 | from datetime import date
11 |
12 | from utils import VStrains_SPAdes
13 |
14 | __author__ = "Runpeng Luo"
15 | __copyright__ = "Copyright 2022-2025, VStrains Project"
16 | __credits__ = ["Runpeng Luo", "Yu Lin"]
17 | __license__ = "MIT"
18 | __version__ = "1.1.0"
19 | __maintainer__ = "Runpeng Luo"
20 | __email__ = "John.Luo@anu.edu.au"
21 | __status__ = "Production"
22 |
23 |
24 | def run(args, logger):
25 | numpy.seterr(all="raise")
26 | RUNNER = {
27 | "spades": VStrains_SPAdes.run,
28 | }
29 | RUNNER[args.assembler](args, logger)
30 |
31 |
32 | def main():
33 | parser = argparse.ArgumentParser(
34 | prog="VStrains",
35 | description="""Construct full-length viral strains under de novo approach
36 | from contigs and assembly graph, currently supports SPAdes""",
37 | )
38 |
39 | parser.add_argument(
40 | "-a",
41 | "--assembler",
42 | dest="assembler",
43 | type=str,
44 | required=True,
45 | choices=["spades"],
46 | help="name of the assembler used. [spades]",
47 | )
48 |
49 | parser.add_argument(
50 | "-g",
51 | "--graph",
52 | dest="gfa_file",
53 | type=str,
54 | required=True,
55 | help="path to the assembly graph, (.gfa format)",
56 | )
57 |
58 | parser.add_argument(
59 | "-p",
60 | "--path",
61 | dest="path_file",
62 | type=str,
63 | required=False,
64 | help="contig file from SPAdes (.paths format), only required for SPAdes. e.g., contigs.paths",
65 | )
66 |
67 | parser.add_argument(
68 | "-mc",
69 | "--minimum_coverage",
70 | dest="min_cov",
71 | default=None,
72 | type=int,
73 | help=argparse.SUPPRESS,
74 | # (
75 | # "minimum node coverage cutoff [default: auto]"
76 | # ),
77 | )
78 |
79 | parser.add_argument(
80 | "-ml",
81 | "--minimum_contig_length",
82 | dest="min_len",
83 | default=None,
84 | type=int,
85 | help=argparse.SUPPRESS,
86 | # ("minimum initial contig length [default: 250]"),
87 | )
88 |
89 | parser.add_argument(
90 | "-r",
91 | "--reference_fa",
92 | dest="ref_file",
93 | default=None,
94 | type=str,
95 | help=argparse.SUPPRESS,
96 | )
97 |
98 | parser.add_argument(
99 | "-o",
100 | "--output_dir",
101 | dest="output_dir",
102 | default="acc/",
103 | type=str,
104 | help="path to the output directory [default: acc/]",
105 | )
106 |
107 | parser.add_argument(
108 | "-d",
109 | "--dev_mode",
110 | dest="dev",
111 | action="store_true",
112 | default=False,
113 | help=argparse.SUPPRESS,
114 | )
115 |
116 | parser.add_argument(
117 | "-fwd",
118 | "--fwd_file",
119 | dest="fwd",
120 | required=True,
121 | default=None,
122 | type=str,
123 | help="paired-end sequencing reads, forward strand (.fastq format)",
124 | )
125 |
126 | parser.add_argument(
127 | "-rve",
128 | "--rve_file",
129 | dest="rve",
130 | required=True,
131 | default=None,
132 | type=str,
133 | help="paired-end sequencing reads, reverse strand (.fastq format)",
134 | )
135 |
136 | args = parser.parse_args()
137 |
138 | # parsing arguments, sanity check
139 | if (not args.gfa_file) or (not os.path.exists(args.gfa_file)):
140 | print("\nPath to the assembly graph is required, (.gfa format)")
141 | print("Please ensure the path is correct")
142 | print("\nExiting...\n")
143 | sys.exit(1)
144 |
145 | args.assembler = args.assembler.lower()
146 |
147 | if args.assembler.lower() == "spades":
148 | if (not args.path_file) or (not os.path.exists(args.path_file)):
149 | print(
150 | "\nPath to Contig file from SPAdes (.paths format) is required for SPAdes assmbler option. e.g., contigs.paths"
151 | )
152 | print("\nExiting...\n")
153 | sys.exit(1)
154 | else:
155 | print("\nPlease make sure to provide the correct assembler type (SPAdes).")
156 | print("\nExiting...\n")
157 | sys.exit(1)
158 |
159 |     if args.min_len is not None:
160 |         if args.min_len < 0:
161 |             print(
162 |                 "\nPlease make sure to provide a valid option (invalid value for min_len)."
163 |             )
164 |             print("\nExiting...\n")
165 |             sys.exit(1)
166 |     else:
167 |         args.min_len = 250
168 |
169 |     if args.min_cov is not None:
170 |         if args.min_cov < 0:
171 |             print(
172 |                 "\nPlease make sure to provide a valid option (invalid value for min_cov)."
173 |             )
174 |             print("\nExiting...\n")
175 |             sys.exit(1)
176 |
177 | if args.output_dir[-1] == "/":
178 | args.output_dir = args.output_dir[:-1]
179 |
180 | # initialize output directory
181 | os.makedirs(args.output_dir, exist_ok=True)
182 | try:
183 | os.makedirs(args.output_dir + "/gfa/")
184 | os.makedirs(args.output_dir + "/tmp/")
185 | os.makedirs(args.output_dir + "/paf/")
186 | os.makedirs(args.output_dir + "/aln/")
187 |     except OSError:
188 | print("\nCurrent output directory is not empty")
189 | print("Please empty/re-create the output directory: " + str(args.output_dir))
190 | print("\nExiting...\n")
191 | sys.exit(1)
192 |
193 | if os.path.exists(args.output_dir + "/vstrains.log"):
194 | os.remove(args.output + "/vstrains.log")
195 |
196 | # Setup logger
197 | # -----------------------
198 | logger = logging.getLogger("VStrains %s" % __version__)
199 | logger.setLevel(logging.DEBUG if args.dev else logging.INFO)
200 |
201 | consoleHeader = logging.StreamHandler()
202 | consoleHeader.setLevel(logging.INFO)
203 | consoleHeader.setFormatter(logging.Formatter("%(message)s"))
204 | logger.addHandler(consoleHeader)
205 |
206 | fileHandler = logging.FileHandler(args.output_dir + "/vstrains.log")
207 | fileHandler.setLevel(logging.DEBUG if args.dev else logging.INFO)
208 | fileHandler.setFormatter(logging.Formatter("%(message)s"))
209 | logger.addHandler(fileHandler)
210 |
211 | logger.info("Welcome to VStrains!")
212 | logger.info(
213 | "VStrains is a strain-aware assembly tools, which constructs full-length "
214 | )
215 | logger.info("virus strain with aid from de Bruijn assembly graph and contigs.")
216 | logger.info("")
217 | logger.info("System information:")
218 | try:
219 | logger.info(" VStrains version: " + str(__version__).strip())
220 | logger.info(" Python version: " + ".".join(map(str, sys.version_info[0:3])))
221 | logger.info(" OS: " + platform.platform())
222 | except Exception:
223 | logger.info(" Problem occurred when getting system information")
224 |
225 | logger.info("")
226 | start_time = time.time()
227 |
228 | logger.info("Input arguments:")
229 | logger.info("Assembly type: " + args.assembler)
230 | logger.info("Assembly graph file: " + args.gfa_file)
231 | logger.info("Forward read file: " + args.fwd)
232 | logger.info("Reverse read file: " + args.rve)
233 | if args.assembler == "spades":
234 | logger.info("Contig paths file: " + args.path_file)
235 | logger.info("Output directory: " + os.path.abspath(args.output_dir))
236 | if args.dev:
237 | logger.info("*DEBUG MODE is turned ON")
238 | logger.info("\n\n")
239 | logger.info(
240 | "======= VStrains pipeline started. Log can be found here: "
241 | + os.path.abspath(args.output_dir)
242 | + "/vstrains.log\n"
243 | )
244 |
245 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
246 | consoleHeader.setFormatter(formatter)
247 | fileHandler.setFormatter(formatter)
248 |
249 | # all good
250 | run(args, logger)
251 |
252 | elapsed = time.time() - start_time
253 |
254 | consoleHeader.setFormatter(logging.Formatter("%(message)s"))
255 | fileHandler.setFormatter(logging.Formatter("%(message)s"))
256 |
257 | logger.info("")
258 | logger.info("Thanks for using VStrains")
259 | logger.info(
260 | "Result is stored in {0}/strain.fasta".format(os.path.abspath(args.output_dir))
261 | )
262 | logger.info(
263 | "You can visualise the path stored in {0}/strain.paths via {0}/gfa/graph_L0.gfa".format(
264 | os.path.abspath(args.output_dir)
265 | )
266 | )
267 | logger.info("Finished: {0}".format(date.today().strftime("%B %d, %Y")))
268 | logger.info("Elapsed time: {0}".format(elapsed))
269 | logger.info("Exiting...")
270 | logger.removeHandler(fileHandler)
271 | logger.removeHandler(consoleHeader)
272 |
273 | return 0
274 |
275 |
276 | if __name__ == "__main__":
277 | main()
278 |
--------------------------------------------------------------------------------
/utils/VStrains_SPAdes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from utils.VStrains_Utilities import *
4 | from utils.VStrains_Preprocess import (
5 | graph_simplification,
6 | reindexing,
7 | threshold_estimation,
8 | )
9 | from utils.VStrains_IO import (
10 | graph_to_gfa,
11 | flipped_gfa_to_graph,
12 | gfa_to_graph,
13 | contig_dict_to_path,
14 | contig_dict_to_fasta,
15 | spades_paths_parser,
16 | process_pe_info,
17 | store_reinit_graph,
18 | )
19 | from utils.VStrains_Decomposition import *
20 | from utils.VStrains_Extension import path_extension, best_matching
21 | import os
22 | import sys
23 |
24 |
25 | def run(args, logger):
26 | TEMP_DIR = args.output_dir
27 |
28 | logger.info("VStrains-SPAdes started")
29 |
30 | logger.info(">>>STAGE: parsing graph and contigs")
31 | graph, simp_node_dict, simp_edge_dict = gfa_to_graph(args.gfa_file, logger)
32 | graph_to_gfa(
33 | graph,
34 | simp_node_dict,
35 | simp_edge_dict,
36 | logger,
37 | "{0}/gfa/graph_L0.gfa".format(TEMP_DIR),
38 | )
39 | graph0, simp_node_dict0, simp_edge_dict0 = flipped_gfa_to_graph(
40 | "{0}/gfa/graph_L0.gfa".format(TEMP_DIR), logger
41 | )
42 | graph0, simp_node_dict0, simp_edge_dict0, idx_mapping = reindexing(
43 | graph0, simp_node_dict0, simp_edge_dict0
44 | )
45 | graph_to_gfa(
46 | graph0,
47 | simp_node_dict0,
48 | simp_edge_dict0,
49 | logger,
50 | "{0}/gfa/graph_L0r.gfa".format(TEMP_DIR),
51 | )
52 |
53 | # cut-off coverage, graph preprocess parameter
54 | THRESHOLD = 0
55 | if args.min_cov != None:
56 | THRESHOLD = args.min_cov
57 | logger.info("user-defined node minimum coverage: {0}".format(THRESHOLD))
58 | else:
59 | THRESHOLD = threshold_estimation(graph0, logger, TEMP_DIR)
60 | logger.info("computed node minimum coverage: {0}".format(THRESHOLD))
61 |
62 | contig_dict, contig_info = spades_paths_parser(
63 | graph0,
64 | simp_node_dict0,
65 | simp_edge_dict0,
66 | idx_mapping,
67 | logger,
68 | args.path_file,
69 | args.min_len,
70 | THRESHOLD,
71 | )
72 | copy_contig_dict = {}
73 | for cno, [contig, clen, ccov] in contig_dict.items():
74 | copy_contig_dict[cno] = [list(contig), clen, ccov]
75 | # debug only
76 | contig_dict_to_path(contig_dict, "{0}/tmp/init_contigs.paths".format(TEMP_DIR))
77 | contig_dict_to_fasta(
78 | graph0,
79 | simp_node_dict0,
80 | contig_dict,
81 | "{0}/tmp/init_contigs.fasta".format(TEMP_DIR),
82 | )
83 | if args.ref_file:
84 | minimap_api(
85 | args.ref_file,
86 | "{0}/tmp/init_contigs.fasta".format(TEMP_DIR),
87 | "{0}/paf/init_contigs_to_strain.paf".format(TEMP_DIR),
88 | )
89 | # debug only
90 | logger.info(">>>STAGE: preprocess")
91 | graph_simplification(
92 | graph0, simp_node_dict0, simp_edge_dict0, None, logger, THRESHOLD
93 | )
94 | graph_to_gfa(
95 | graph0,
96 | simp_node_dict0,
97 | simp_edge_dict0,
98 | logger,
99 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR),
100 | )
101 | graph1, simp_node_dict1, simp_edge_dict1 = flipped_gfa_to_graph(
102 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR), logger
103 | )
104 |
105 |     # filter out contigs that contain erroneous (low-coverage) nodes
106 | for cno, [contig, _, _] in list(contig_dict.items()):
107 | if any([c not in simp_node_dict1 for c in contig]):
108 | contig_dict.pop(cno)
109 | logger.debug("unreliable contig with low coverage: {0}".format(cno))
110 |
111 | # get graph kmer size
112 | ksize = graph1.ep.overlap[list(graph1.edges())[0]] if graph1.num_edges() > 0 else 0
113 | logger.info("graph kmer size: {0}".format(ksize))
114 | if ksize <= 0:
115 | logger.error("invalid kmer-size, the graph does not contain any edges, exit..")
116 | sys.exit(1)
117 |
118 | # obtain paired end information
119 | script_path = "{0}/VStrains_PE_Inference.py".format(
120 | os.path.abspath(os.path.dirname(__file__))
121 | )
122 | subprocess.check_call(
123 | "python {0} -g {1} -o {2} -f {3} -r {4} -k {5}".format(
124 | script_path,
125 | "{0}/gfa/s_graph_L1.gfa".format(TEMP_DIR),
126 | "{0}/aln".format(TEMP_DIR),
127 | args.fwd,
128 | args.rve,
129 | ksize,
130 | ),
131 | shell=True,
132 | )
133 | logger.info("paired end information stored")
134 | pe_info_file = "{0}/aln/pe_info".format(TEMP_DIR)
135 | st_info_file = "{0}/aln/st_info".format(TEMP_DIR)
136 | pe_info, dcpy_pe_info = process_pe_info(
137 | simp_node_dict1.keys(), pe_info_file, st_info_file
138 | )
139 |
140 | edge_cleaning(graph1, simp_edge_dict1, contig_dict, pe_info, logger)
141 |
142 | graph2, simp_node_dict2, simp_edge_dict2 = store_reinit_graph(
143 | graph1,
144 | simp_node_dict1,
145 | simp_edge_dict1,
146 | logger,
147 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR),
148 | )
149 |
150 | contig_dict_to_path(contig_dict, "{0}/tmp/pre_contigs.paths".format(TEMP_DIR))
151 | contig_dict_to_fasta(
152 | graph2,
153 | simp_node_dict2,
154 | contig_dict,
155 | "{0}/tmp/pre_contigs.fasta".format(TEMP_DIR),
156 | )
157 | # stat evaluation
158 | if args.ref_file:
159 | map_ref_to_graph(
160 | args.ref_file,
161 | simp_node_dict2,
162 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR),
163 | logger,
164 | True,
165 | "{0}/paf/node_to_ref.paf".format(TEMP_DIR),
166 | "{0}/tmp/temp_gfa_to_fasta_pre.fasta".format(TEMP_DIR),
167 | )
168 | minimap_api(
169 | args.ref_file,
170 | "{0}/tmp/pre_contigs.fasta".format(TEMP_DIR),
171 | "{0}/paf/pre_contigs_to_strain.paf".format(TEMP_DIR),
172 | )
173 | map_ref_to_contig(
174 | contig_dict, logger, "{0}/paf/pre_contigs_to_strain.paf".format(TEMP_DIR)
175 | )
176 | # end stat
177 |
178 | # split the branches using link information
179 | graphf, simp_node_dictf, simp_edge_dictf = iter_graph_disentanglement(
180 | graph2,
181 | simp_node_dict2,
182 | simp_edge_dict2,
183 | contig_dict,
184 | pe_info,
185 | args.ref_file,
186 | logger,
187 | 0.05 * numpy.median([graph2.vp.dp[node] for node in graph2.vertices()]),
188 | TEMP_DIR,
189 | )
190 |
191 | contig_dict_to_path(contig_dict, "{0}/tmp/post_contigs.paths".format(TEMP_DIR))
192 | contig_dict_to_fasta(
193 | graphf,
194 | simp_node_dictf,
195 | contig_dict,
196 | "{0}/tmp/post_contigs.fasta".format(TEMP_DIR),
197 | )
198 | # stat evaluation
199 | if args.ref_file:
200 | map_ref_to_graph(
201 | args.ref_file,
202 | simp_node_dictf,
203 | "{0}/gfa/split_graph_final.gfa".format(TEMP_DIR),
204 | logger,
205 | True,
206 | "{0}/paf/node_to_ref_final.paf".format(TEMP_DIR),
207 | "{0}/tmp/temp_gfa_to_fasta_post.fasta".format(TEMP_DIR),
208 | )
209 | minimap_api(
210 | args.ref_file,
211 | "{0}/tmp/post_contigs.fasta".format(TEMP_DIR),
212 | "{0}/paf/post_contigs_to_strain.paf".format(TEMP_DIR),
213 | )
214 | map_ref_to_contig(
215 | contig_dict, logger, "{0}/paf/post_contigs_to_strain.paf".format(TEMP_DIR)
216 | )
217 | # end stat
218 | logger.info(">>>STAGE: contig path extension")
219 |
220 | # refine partial links using best match
221 | full_link = best_matching(
222 | graphf, simp_node_dictf, simp_edge_dictf, contig_dict, pe_info, logger
223 | )
224 |
225 | # update graph coverage on non-trivial branch, maximize
226 | increment_nt_branch_coverage(graphf, simp_node_dictf, logger)
227 |
228 | graph_to_gfa(
229 | graphf,
230 | simp_node_dictf,
231 | simp_edge_dictf,
232 | logger,
233 | "{0}/gfa/split_graph_final.gfa".format(TEMP_DIR),
234 | )
235 |
236 | # extend the graph
237 | p_delta = 0.05 * numpy.median([graphf.vp.dp[node] for node in graphf.vertices()])
238 | strain_dict, usages = path_extension(
239 | graphf,
240 | simp_node_dictf,
241 | simp_edge_dictf,
242 | contig_dict,
243 | full_link,
244 | dcpy_pe_info,
245 | logger,
246 | p_delta,
247 | TEMP_DIR,
248 | )
249 |
250 | logger.info(">>>STAGE: final process")
251 | contig_resolve(strain_dict)
252 | graphl, simp_node_dictl, simp_edge_dictl = flipped_gfa_to_graph(
253 | "{0}/gfa/es_graph_L2.gfa".format(TEMP_DIR), logger
254 | )
255 | trim_contig_dict(graphl, simp_node_dictl, strain_dict, logger)
256 | contig_dup_removed_s(strain_dict, logger)
257 | contig_dict_to_path(
258 | strain_dict, "{0}/tmp/tmp_strain.paths".format(TEMP_DIR), None, False
259 | )
260 |
261 | # recover repeat nodes back to contig
262 | strain_repeat_resol(
263 | graph0, simp_node_dict0, strain_dict, contig_info, copy_contig_dict, logger
264 | )
265 |
266 | logger.info(">>>STAGE: generate result")
267 | contig_dict_to_fasta(
268 | graph0, simp_node_dict0, strain_dict, "{0}/strain.fasta".format(TEMP_DIR)
269 | )
270 | contig_dict_to_path(
271 | strain_dict, "{0}/strain.paths".format(TEMP_DIR), idx_mapping, True
272 | )
273 | if args.ref_file:
274 | minimap_api(
275 | args.ref_file,
276 | "{0}/strain.fasta".format(TEMP_DIR),
277 | "{0}/paf/strain_to_ref.paf".format(TEMP_DIR),
278 | )
279 | logger.info("VStrains-SPAdes finished")
280 | return 0
281 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs
6 |
7 | 
8 | [](https://github.com/psf/black)
9 |
10 | Manual
11 | ===========
12 |
13 | Table of Contents
14 | -----------------
15 |
16 | 1. [About VStrains](#sec1)
17 | 2. [Updates](#sec2)
18 | 3. [Installation](#sec3)
19 | 3.1. [Option 1. Quick Install](#sec3.1)
20 | 3.2. [Option 2. Manual Install](#sec3.2)
21 | 3.3. [Download & Install VStrains](#sec3.3)
22 | 4. [Running VStrains](#sec4)
23 | 4.1. [Quick Usage](#sec4.1)
24 | 4.2. [Support SPAdes](#sec4.2)
25 | 4.3. [Output](#sec4.3)
26 | 5. [Stand-alone binaries](#sec5)
27 | 6. [Experiment](#sec6)
28 | 7. [Citation](#sec7)
29 | 8. [Feedback and bug reports](#sec8)
30 |
31 |
32 | # About VStrains
33 |
34 | VStrains is a de novo approach for reconstructing strains from viral quasispecies.
35 |
36 |
37 |
38 |
39 | # Updates
40 |
41 | ## VStrains 1.1.0 Release (03 Feb 2023)
42 | * Replace the PE link inference module `VStrains_Alignment.py` with `VStrains_PE_Inference.py`
43 |
44 | `VStrains_PE_Inference.py` implements a hash-table approach that provides efficient perfect-match lookup. The new module leads to consistent evaluation results and substantially decreases runtime and memory usage compared to the previous alignment approach. A minimal sketch of the idea is shown below.
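
A simplified illustration of the hash-table lookup (hypothetical code, not the actual module; function names are made up): every k-length substring of each graph node is indexed in a dictionary, and read k-mers are then matched exactly against that index.

```python
# Hypothetical sketch of the perfect-match lookup idea behind
# VStrains_PE_Inference.py; simplified for illustration.
def build_kmer_table(node_seqs, k):
    table = {}  # k-mer -> list of (node_index, offset) occurrences
    for idx, seq in enumerate(node_seqs):
        for off in range(len(seq) - k + 1):
            table.setdefault(seq[off : off + k], []).append((idx, off))
    return table

def nodes_hit_by_read(read, table, k):
    # exact k-mer lookups; each hit votes for the node it came from
    hits = set()
    for off in range(len(read) - k + 1):
        for idx, _ in table.get(read[off : off + k], []):
            hits.add(idx)
    return hits
```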
45 |
46 |
47 |
48 |
49 | # Installation
50 |
51 | VStrains requires a 64-bit Linux system or macOS, and Python 3 (version 3.2 or higher).
52 |
53 |
54 | ## Option 1. Quick Install (**recommended**)
55 |
56 | Install [(mini)conda](https://conda.io/miniconda.html) as a lightweight package manager. Run the following commands to initialize and set up the conda environment for VStrains:
57 |
58 | ```bash
59 | # add channels
60 | conda config --add channels defaults
61 | conda config --add channels bioconda
62 | conda config --add channels conda-forge
63 |
64 | # create conda environment
65 | conda create --name VStrains-env
66 |
67 | # activate conda environment
68 | conda activate VStrains-env
69 |
70 | conda install -c bioconda -c conda-forge python=3 graph-tool minimap2 numpy gfapy matplotlib
71 | ```
72 |
73 |
74 | ## Option 2. Manual Install
75 |
76 | Manually install the dependencies listed below (an example install sequence follows the lists):
77 | - [minimap2](https://github.com/lh3/minimap2)
78 |
79 | And python modules:
80 | - [graph-tool](https://graph-tool.skewed.de)
81 | - [numpy](https://numpy.org)
82 | - [gfapy](https://github.com/ggonnella/gfapy)
83 | - [matplotlib](https://matplotlib.org)
84 |
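The Python modules published on PyPI can be installed with pip, while graph-tool and minimap2 are not distributed via pip (see the comments in `setup.py`) and should come from conda or your system package manager. One possible sequence, assuming a conda environment is active:

```bash
# graph-tool and minimap2 are not on PyPI; install them via conda channels
conda install -c conda-forge graph-tool
conda install -c bioconda minimap2

# the remaining Python modules are available via pip
pip install numpy gfapy matplotlib
```
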
85 |
86 | ## Download & Install VStrains
87 |
88 | After successfully setting up the environment and dependencies, clone VStrains to your desired location.
89 |
90 | ```bash
91 | git clone https://github.com/metagentools/VStrains.git
92 | ```
93 |
94 | Install VStrains via `pip`:
95 |
96 | ```bash
97 | cd VStrains; pip install .
98 | ```
99 |
100 | Run the following command to ensure VStrains is correctly set up and installed:
101 |
102 | ```bash
103 | vstrains -h
104 | ```
105 |
106 |
107 | # Running VStrains
108 |
109 | VStrains supports assembly results from [SPAdes](https://github.com/ablab/spades) (including metaSPAdes and metaviralSPAdes) and may support other graph-based assemblers in the future.
110 |
111 |
112 | ## Quick Usage
113 |
114 | ```
115 | usage: VStrains [-h] -a {spades} -g GFA_FILE [-p PATH_FILE] [-o OUTPUT_DIR] -fwd FWD -rve RVE
116 |
117 | Construct full-length viral strains under de novo approach from contigs and assembly graph, currently supports
118 | SPAdes
119 |
120 | optional arguments:
121 | -h, --help show this help message and exit
122 | -a {spades}, --assembler {spades}
123 | name of the assembler used. [spades]
124 | -g GFA_FILE, --graph GFA_FILE
125 | path to the assembly graph, (.gfa format)
126 | -p PATH_FILE, --path PATH_FILE
127 | contig file from SPAdes (.paths format), only required for SPAdes. e.g., contigs.paths
128 | -o OUTPUT_DIR, --output_dir OUTPUT_DIR
129 | path to the output directory [default: acc/]
130 | -fwd FWD, --fwd_file FWD
131 | paired-end sequencing reads, forward strand (.fastq format)
132 | -rve RVE, --rve_file RVE
133 | paired-end sequencing reads, reverse strand (.fastq format)
134 | ```
135 |
136 | VStrains takes as input an assembly graph in Graphical Fragment Assembly (GFA) format and the associated contig information, together with the raw reads in paired-end format (e.g., forward.fastq, reverse.fastq).
137 |
138 |
139 | ## Support SPAdes
140 |
141 | When running SPAdes, we recommend using the `--careful` option for more accurate assembly results. For consistency, do not modify any contig/node names in the SPAdes assembly results. Please refer to [SPAdes](https://github.com/ablab/spades) for further guidance. Example usage:
142 |
143 | ```bash
144 | # SPAdes assembler example, paired-end reads
145 | python spades.py -1 forward.fastq -2 reverse.fastq --careful -t 16 -o output_dir
146 | ```
147 |
148 | Both the assembly graph (`assembly_graph_after_simplification.gfa`) and the contig information (`contigs.paths`) can be found in the output directory after running the SPAdes assembler. Please use them together with the raw reads as inputs for VStrains, and set the `-a` flag to `spades`. Example usage:
149 |
150 | ```bash
151 | vstrains -a spades -g assembly_graph_after_simplification.gfa -p contigs.paths -o output_dir -fwd forward.fastq -rve reverse.fastq
152 | ```
153 |
154 |
155 | ## Output
156 |
157 |
158 | VStrains stores all output files in `<output_dir>`, which is set by the user.
159 |
160 | * `<output_dir>/aln/` directory contains paired-end (PE) linkage information, which is stored in `pe_info` and `st_info` (a format example is shown after this list).
161 | * `<output_dir>/gfa/` directory contains iteratively simplified assembly graphs, where `graph_L0.gfa` contains the assembly graph produced by SPAdes after Strandedness Canonization, `split_graph_final.gfa` contains the assembly graph after Graph Disentanglement, and `graph_S_final.gfa` contains the assembly graph after Contig-Based Path Extraction; the rest are intermediate results. All the assembly graphs are in [GFA 1.0 format](https://github.com/GFA-spec/GFA-spec/blob/master/GFA1.md).
162 | * `<output_dir>/paf/` and `<output_dir>/tmp/` are temporary directories; feel free to ignore them.
163 | * `<output_dir>/strain.fasta` contains the resulting strains in `.fasta` format; the header for each strain has the form `NODE___`, which is compatible with the SPAdes contig format.
164 | * `<output_dir>/strain.paths` contains the paths in the assembly graph (input `GFA_FILE`) corresponding to `strain.fasta`; these can be visualized with [Bandage](https://github.com/rrwick/Bandage) for further downstream analysis.
165 | * `<output_dir>/vstrains.log` contains the VStrains log.
166 |
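For reference, `pe_info` and `st_info` are plain-text files with one `node_id:node_id:count` entry per line, where `count` is the number of read pairs (for `pe_info`) or same-read co-occurrences (for `st_info`) linking the two nodes. A hypothetical excerpt (node IDs and counts are made up):

```
0:1:27
0:2:0
1:2:35
```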
167 |
168 |
169 |
176 |
177 |
178 | # Stand-alone binaries
179 |
180 | `evals/quast_evaluation.py` is a wrapper script for strain-level experimental result analysis using [MetaQUAST](https://github.com/ablab/quast).
181 |
182 | ```
183 | usage: quast_evaluation.py [-h] -quast QUAST [-cs FILES [FILES ...]] [-d IDIR] -ref REF_FILE -o OUTPUT_DIR
184 |
185 | Use MetaQUAST to evaluate assembly result
186 |
187 | options:
188 | -h, --help show this help message and exit
189 | -quast QUAST, --path_to_quast QUAST
190 | path to MetaQuast python script, version >= 5.2.0
191 | -cs FILES [FILES ...], --contig_files FILES [FILES ...]
192 | contig files from different tools, separated by space
193 | -d IDIR, --contig_dir IDIR
194 | contig files from different tools, stored in the directory, .fasta format
195 | -ref REF_FILE, --ref_file REF_FILE
196 | ref file (single)
197 | -o OUTPUT_DIR, --output_dir OUTPUT_DIR
198 | output directory
199 | ```
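
For example, to evaluate two assemblies against a multi-strain reference (all paths below are placeholders):

```bash
python evals/quast_evaluation.py \
    -quast /path/to/quast/metaquast.py \
    -cs vstrains_out/strain.fasta other_tool/contigs.fasta \
    -ref references.fasta \
    -o quast_out
```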
200 |
201 |
202 | # Experiment
203 |
204 | VStrains is evaluated on both simulated and real datasets under default settings; the sources of the datasets can be found at the links listed below:
205 | 1. Simulated Dataset, can be found at [savage-benchmark](https://bitbucket.org/jbaaijens/savage-benchmarks/src/master/) (No preprocessing is required)
206 | - 6 Poliovirus (20,000x)
207 | - 10 HCV (20,000x)
208 | - 15 ZIKV (20,000x)
209 | 2. Real Dataset (please refer to [Supplementary Material](https://www.biorxiv.org/content/10.1101/2022.10.21.513181v3.supplementary-material) for preprocessing the real datasets)
210 | - 5 HIV labmix (20,000x) [SRR961514](https://www.ncbi.nlm.nih.gov/sra/?term=SRR961514), reference genome sequences are available at [5 HIV References](https://github.com/cbg-ethz/5-virus-mix/blob/master/data/REF.fasta)
211 |    - 2 SARS-COV-2 (4,000x) [SRR18009684](https://www.ncbi.nlm.nih.gov/sra/?term=SRR18009684), [SRR18009686](https://www.ncbi.nlm.nih.gov/sra/?term=SRR18009686); pre-processed reads and individually assembled ground-truth reference sequences can be found at [2 SARS-COV-2 Dataset](https://github.com/RunpengLuo/sarscov2-4000x)
212 |
213 |
214 | # Citation
215 | VStrains has been accepted at [RECOMB 2023](http://recomb2023.bilkent.edu.tr/program.html), and the manuscript is publicly available [here](https://link.springer.com/chapter/10.1007/978-3-031-29119-7_1).
216 |
217 | If you use VStrains in your work, please cite the following publications.
218 |
219 | Runpeng Luo and Yu Lin, VStrains: De Novo Reconstruction of Viral Strains via Iterative Path Extraction From Assembly Graphs
220 |
221 |
222 | # Feedback and bug reports
223 |
224 | Thanks for using VStrains. If you encounter any bugs during execution, please re-run the program with the additional `-d` flag and provide `vstrains.log` together with your use case via `Issues`.
225 |
--------------------------------------------------------------------------------
/utils/VStrains_Preprocess.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from logging import Logger
4 | import subprocess
5 | from graph_tool.all import Graph
6 |
7 | import numpy
8 | import matplotlib.pyplot as plt
9 |
10 | from utils.VStrains_Utilities import *
11 |
12 |
13 | def reindexing(graph: Graph, simp_node_dict: dict, simp_edge_dict: dict):
14 | """
15 | Reindex the nodes, with idx-node_id mappings
16 | """
17 | idx_mapping = {}
18 | idx_node_dict = {}
19 | idx_edge_dict = {}
20 | idx = 0
21 | for no, node in simp_node_dict.items():
22 | if graph.vp.color[node] == "black":
23 | idx_mapping[no] = str(idx)
24 | graph.vp.id[node] = str(idx)
25 | idx_node_dict[str(idx)] = node
26 | idx += 1
27 | for (u, v), e in simp_edge_dict.items():
28 | if (
29 | graph.ep.color[e] == "black"
30 | and graph.vp.color[e.source()] == "black"
31 | and graph.vp.color[e.target()] == "black"
32 | ):
33 | idx_edge_dict[(idx_mapping[u], idx_mapping[v])] = e
34 | return graph, idx_node_dict, idx_edge_dict, idx_mapping
35 |
36 |
37 | def threshold_estimation(graph: Graph, logger: Logger, temp_dir):
38 | dps = [graph.vp.dp[node] for node in graph.vertices()]
39 | # handle edge case, when the graph contains uniform coverage
40 | if max(dps) == min(dps):
41 | return 0.00
42 | regions, bins = numpy.histogram(
43 | dps, bins=int((max(dps) - min(dps)) // (0.05 * numpy.median(dps)))
44 | )
45 | pidx, _ = max(list(enumerate(regions)), key=lambda p: p[1])
46 | ratio = 0.00
47 | if pidx == 0:
48 | ratio = 0.05
49 | # global peak belongs to first filter region, find maximum peak range, bound by 25% Median
50 | for i in range(0, 4):
51 |             if i + 1 >= len(regions):
52 | logger.warning(
53 | "histogram is not properly set, reset cutoff to default (0.05*M)"
54 | )
55 | ratio = 0.05
56 | break
57 | if regions[i] > regions[i + 1]:
58 | ratio += 0.05
59 | else:
60 | break
61 | threshold = ratio * numpy.median(dps)
62 | plt.figure(figsize=(128, 64))
63 | for b in bins:
64 | plt.axvline(b, color="blue")
65 | plt.hist(x=dps, bins=len(dps))
66 | plt.axvline(threshold, color="r")
67 | plt.title("node coverage bar plot")
68 | plt.xticks(numpy.arange(min(dps), max(dps) + 1, 50.0))
69 | plt.savefig("{0}{1}".format(temp_dir, "/tmp/bar_plot.png"))
70 | return threshold
71 |
72 |
73 | def graph_simplification(
74 | graph: Graph,
75 | simp_node_dict: dict,
76 | simp_edge_dict: dict,
77 | contig_dict: dict,
78 | logger: Logger,
79 | min_cov,
80 | ):
81 | """
82 |     Directly remove all vertices with coverage at or below the minimum coverage,
83 |     together with their incident edges.
84 |
85 |     Nodes that belong to any contig are not removed.
86 |     Modifies graph, simp_node_dict, and simp_edge_dict in place.
87 |
88 | """
89 | logger.info("graph simplification")
90 | logger.debug(
91 | "Total nodes: "
92 | + str(len(simp_node_dict))
93 | + " Total edges: "
94 | + str(len(simp_edge_dict))
95 | )
96 | node_to_contig_dict = {}
97 | edge_to_contig_dict = {}
98 | if contig_dict != None:
99 | node_to_contig_dict, edge_to_contig_dict = contig_map_node(contig_dict)
100 |     # single pass over nodes; contig nodes are protected from removal
101 | for id, node in list(simp_node_dict.items()):
102 | if graph.vp.dp[node] <= min_cov:
103 | if id in node_to_contig_dict:
104 | continue
105 |
106 | graph_remove_vertex(graph, simp_node_dict, id, printout=False)
107 |
108 | for e in set(node.all_edges()):
109 | uid = graph.vp.id[e.source()]
110 | vid = graph.vp.id[e.target()]
111 | if (uid, vid) in edge_to_contig_dict:
112 | continue
113 | if (uid, vid) in simp_edge_dict:
114 | graph_remove_edge(graph, simp_edge_dict, uid, vid, printout=False)
115 |
116 | logger.debug(
117 | "Remain nodes: "
118 | + str(len(simp_node_dict))
119 | + " Total edges: "
120 | + str(len(simp_edge_dict))
121 | )
122 | logger.info("done")
123 | return
124 |
125 |
126 | # ------------------------------------LEGACY------------------------------------#
127 | def paths_from_src(graph: Graph, simp_node_dict: dict, self_node, src, maxlen):
128 | """
129 | retrieve all the path from src node to any node
130 | within maxlen restriction, in straight direction
131 | """
132 |
133 | def dfs_rev(graph: Graph, u, curr_path: list, maxlen, visited, all_path):
134 | visited[u] = True
135 | curr_path.append(u)
136 | curr_len = path_len(graph, curr_path)
137 | if curr_len >= maxlen:
138 | all_path.append(list(curr_path))
139 | else:
140 | for v in u.out_neighbors():
141 | if not visited[v]:
142 | dfs_rev(graph, v, curr_path, maxlen, visited, all_path)
143 | curr_path.pop(-1)
144 | visited[u] = False
145 | return
146 |
147 | visited = {}
148 | for u in graph.vertices():
149 | if graph.vp.id[u] not in simp_node_dict:
150 | visited[u] = True
151 | else:
152 | visited[u] = False
153 | visited[self_node] = True
154 | all_path = []
155 | dfs_rev(graph, src, [], maxlen, visited, all_path)
156 | return all_path
157 |
158 |
159 | def paths_to_tgt(graph: Graph, simp_node_dict: dict, self_node, tgt, maxlen):
160 | """
161 | retrieve all the path from any node to tgt node
162 | within maxlen restriction, in reverse direction
163 | """
164 |
165 | def dfs_rev(graph: Graph, v, curr_path: list, maxlen, visited, all_path):
166 | visited[v] = True
167 | curr_path.insert(0, v)
168 | curr_len = path_len(graph, curr_path)
169 | if curr_len >= maxlen:
170 | all_path.append(list(curr_path))
171 | else:
172 | for u in v.in_neighbors():
173 | if not visited[u]:
174 | dfs_rev(graph, u, curr_path, maxlen, visited, all_path)
175 | curr_path.pop(0)
176 | visited[v] = False
177 | return
178 |
179 | visited = {}
180 | for u in graph.vertices():
181 | if graph.vp.id[u] not in simp_node_dict:
182 | visited[u] = True
183 | else:
184 | visited[u] = False
185 | visited[self_node] = True
186 | all_path = []
187 | dfs_rev(graph, tgt, [], maxlen, visited, all_path)
188 | return all_path
189 |
190 |
191 | def tip_removal_s(
192 | graph: Graph,
193 | simp_node_dict: dict,
194 | contig_dict: dict,
195 | logger: Logger,
196 | tempdir,
197 | accept_rate=0.99,
198 | ):
199 | if not graph_is_DAG(graph, simp_node_dict):
200 | logger.info("Graph is Cyclic, tip removal start..")
201 | tip_removed = False
202 | while not tip_removed:
203 | tip_removed = tip_removal(
204 | graph, simp_node_dict, logger, tempdir, accept_rate
205 | )
206 | for cno, [contig, _, ccov] in list(contig_dict.items()):
207 | if not all([no in simp_node_dict for no in contig]):
208 | subcontigs = []
209 | curr_contig = []
210 | addLast = False
211 | for no in contig:
212 | if no in simp_node_dict:
213 | addLast = True
214 | curr_contig.append(no)
215 | else:
216 | addLast = False
217 | if curr_contig != []:
218 | subcontigs.append(curr_contig[:])
219 | curr_contig = []
220 | if addLast:
221 | subcontigs.append(curr_contig[:])
222 |
223 | contig_dict.pop(cno)
224 | for i, subc in enumerate(subcontigs):
225 | sublen = path_len(graph, [simp_node_dict[c] for c in subc])
226 | contig_dict[cno + "^" + str(i)] = [subc, sublen, ccov]
227 | else:
228 | logger.info("Graph is DAG, tip removal skipped.")
229 | logger.info("done")
230 | return
231 |
232 |
233 | def tip_removal(
234 | graph: Graph, simp_node_dict: dict, logger: Logger, tempdir, accept_rate
235 | ):
236 | """
237 |     retrieve all the source/tail simple paths, and merge them into an adjacent neighbor path if possible
238 |
239 |     the collapse step can be done before node depth rebalancing, since it only depends on the
240 |     matching score within the node sequence length
241 |
242 |     if that is the case, then SPAdes contigs may also be modified.
243 | """
244 |
245 | def remove_tip(graph: Graph, simp_node_dict: dict, from_node, to_path):
246 | """
247 | collapse the node with the given path, increment given path depth, remove related information
248 | about the node.
249 | """
250 | graph.vp.color[from_node] = "gray"
251 | pending_dp = graph.vp.dp[from_node]
252 | for node in to_path:
253 | graph.vp.dp[node] += pending_dp
254 | simp_node_dict.pop(graph.vp.id[from_node])
255 | for e in from_node.all_edges():
256 | graph.ep.color[e] = "gray"
257 | logger.debug(
258 | path_to_id_string(
259 | graph,
260 | to_path,
261 | "Tip Node {0} collapsed to path".format(graph.vp.id[from_node]),
262 | )
263 | )
264 | return
265 |
266 | def cand_collapse_path(graph: Graph, from_node, to_paths, temp_dir):
267 |         """
268 |         use minimap2 -c to evaluate the node-path similarity, sorted by matching score in DESC order
269 |
270 |         return: the most similar path if there exists a path with score >= accept_rate, else None
271 |         """
272 | ref_loc = "{0}/ref.fa".format(temp_dir)
273 | query_loc = "{0}/query.fa".format(temp_dir)
274 | overlap_loc = "{0}/overlap.paf".format(temp_dir)
275 | subprocess.check_call(
276 | "touch {0}; echo > {0}; touch {1}; echo > {1}".format(ref_loc, query_loc),
277 | shell=True,
278 | )
279 |
280 | id_path_dict = {}
281 | for id, path in list(enumerate(to_paths)):
282 | id_path_dict[id] = path
283 |
284 | # retrieve all the path information and save into ref.fa
285 | with open(ref_loc, "w") as ref_file:
286 | for id, path in id_path_dict.items():
287 | name = ">" + str(id) + "\n"
288 | seq = path_to_seq(graph, path, id) + "\n"
289 | ref_file.write(name)
290 | ref_file.write(seq)
291 | ref_file.close()
292 |
293 | # save from node info to query.fa
294 | with open(query_loc, "w") as query_file:
295 | name = ">" + graph.vp.id[from_node] + "\n"
296 | seq = path_to_seq(graph, [from_node], name) + "\n"
297 | query_file.write(name)
298 | query_file.write(seq)
299 | query_file.close()
300 |
301 | # minimap to obtain matching score for all node-path
302 | id_evalscore = {}
303 | minimap_api(ref_loc, query_loc, overlap_loc)
304 | with open(overlap_loc, "r") as overlap_file:
305 | for Line in overlap_file:
306 | splited = (Line[:-1]).split("\t")
307 | path_no = int(splited[5])
308 | nmatch = int(splited[9])
309 | nblock = int(splited[10])
310 | if path_no not in id_evalscore:
311 | id_evalscore[path_no] = [nmatch / nblock]
312 | else:
313 | id_evalscore[path_no].append(nmatch / nblock)
314 | overlap_file.close()
315 |
316 | # remove temp file
317 | subprocess.check_call(
318 | "rm {0}; rm {1}; rm {2}".format(ref_loc, query_loc, overlap_loc), shell=True
319 | )
320 |
321 | id_evalscore_sum = []
322 | for id, scores in id_evalscore.items():
323 | mean_score = numpy.mean(scores) if len(scores) != 0 else 0
324 | id_evalscore_sum.append((id, mean_score))
325 |
326 | best_match = sorted(id_evalscore_sum, key=lambda t: t[1], reverse=True)
327 | logger.debug("Tip Node: " + str(graph.vp.id[from_node]) + str(best_match))
328 | if len(best_match) == 0:
329 | return None
330 | elif best_match[0][1] >= accept_rate:
331 | return id_path_dict[best_match[0][0]]
332 | else:
333 | return None
334 |
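    # Score sketch (assumed PAF values): PAF columns 10 and 11 hold the
    # residue-match count and the alignment block length, so a record with
    # nmatch = 980 and nblock = 1000 scores 980 / 1000 = 0.98; the mean score
    # over all alignments to a candidate path must reach accept_rate
    # (0.99 by default) before the tip is collapsed into that path.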
335 | is_removed = True
336 | # get all the source simple path
337 | src_nodes = []
338 | tgt_nodes = []
339 | isolated_node = []
340 | for node in simp_node_dict.values():
341 | if node.in_degree() + node.out_degree() == 0:
342 | isolated_node.append(node)
343 | elif node.in_degree() == 0:
344 | src_nodes.append(node)
345 | elif node.out_degree() == 0:
346 | tgt_nodes.append(node)
347 | else:
348 |             pass
349 |
350 | # src node collapse
351 | src_nodes = sorted(src_nodes, key=lambda x: graph.vp.dp[x])
352 | for src in src_nodes:
353 | src_len = path_len(graph, [src])
354 | potential_paths = []
355 | # path retrieve
356 | for out_branch in src.out_neighbors():
357 | if graph.vp.id[out_branch] not in simp_node_dict:
358 | continue
359 | # print("current out branch: ", graph.vp.id[out_branch])
360 | for in_tgt in out_branch.in_neighbors():
361 | if graph.vp.id[in_tgt] == graph.vp.id[src]:
362 | # coincidence path
363 | continue
364 | if graph.vp.id[in_tgt] not in simp_node_dict:
365 | # collapsed path in previous iteration
366 | continue
367 | # print("current in tgt: ", graph.vp.id[in_tgt])
368 | potential_paths.extend(
369 | paths_to_tgt(graph, simp_node_dict, src, in_tgt, src_len)
370 | )
371 | cand_path = cand_collapse_path(graph, src, potential_paths, tempdir)
372 | if cand_path != None:
373 | remove_tip(graph, simp_node_dict, src, cand_path)
374 | is_removed = False
375 |
376 | # target node collapse
377 | tgt_nodes = sorted(tgt_nodes, key=lambda x: graph.vp.dp[x])
378 | for tgt in tgt_nodes:
379 | tgt_len = path_len(graph, [tgt])
380 | potential_paths = []
381 | # path retrieve
382 | for in_branch in tgt.in_neighbors():
383 | if graph.vp.id[in_branch] not in simp_node_dict:
384 | continue
385 | # print("current in branch: ", graph.vp.id[in_branch])
386 | for out_src in in_branch.out_neighbors():
387 | if graph.vp.id[out_src] == graph.vp.id[tgt]:
388 | # coincidence path
389 | continue
390 | if graph.vp.id[out_src] not in simp_node_dict:
391 | # collapsed path in previous iteration
392 | continue
393 | # print("current out src: ", graph.vp.id[out_src])
394 | potential_paths.extend(
395 | paths_from_src(graph, simp_node_dict, tgt, out_src, tgt_len)
396 | )
397 | cand_path = cand_collapse_path(graph, tgt, potential_paths, tempdir)
398 | if cand_path != None:
399 | remove_tip(graph, simp_node_dict, tgt, cand_path)
400 | is_removed = False
401 | return is_removed
402 |
--------------------------------------------------------------------------------
/utils/VStrains_Alignment.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import os
4 | import time
5 | import subprocess
6 | import numpy
7 | import sys
8 |
9 |
10 | def process_paf_file(
11 | index2id,
12 | index2reflen,
13 | len_index2id,
14 | read_ids,
15 | fwd_paf_file,
16 | rve_paf_file,
17 | split_len,
18 | tid,
19 | ):
20 | print("Batch {0} start".format(tid))
21 | print("current pid: {0}".format(os.getpid()))
22 | start = time.time()
23 |
24 | node_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
25 | short_mat = numpy.zeros((len_index2id, len_index2id), dtype=int)
26 |
27 | id2index = {}
28 | for i in range(len_index2id):
29 | id2index[index2id[i]] = i
30 |
31 | read2index = {}
32 | index2read = numpy.array(
33 | [(k, fwdlen, revlen) for (k, _, _, fwdlen, revlen) in read_ids], dtype=int
34 | )
35 |
36 | conf_alns_f = [None for _ in index2read]
37 | # numpy.array([None for _ in index2read], dtype=object)
38 | conf_cords_f = [None for _ in index2read]
39 | # numpy.array([None for _ in index2read], dtype=object)
40 |
41 | conf_alns_r = [None for _ in index2read]
42 | # numpy.array([None for _ in index2read], dtype=object)
43 | conf_cords_r = [None for _ in index2read]
44 | # numpy.array([None for _ in index2read], dtype=object)
45 |
46 | for i, (glb_index, f_local_inds, r_local_inds, _, _) in enumerate(read_ids):
47 | read2index[glb_index] = i
48 | conf_alns_f[i] = [[] for _ in range(f_local_inds)]
49 | # numpy.array([[] for _ in range(f_local_inds)], dtype=object)
50 | conf_cords_f[i] = [[] for _ in range(f_local_inds)]
51 | # numpy.array([[] for _ in range(f_local_inds)], dtype=object)
52 | conf_alns_r[i] = [[] for _ in range(r_local_inds)]
53 | # numpy.array([[] for _ in range(r_local_inds)], dtype=object)
54 | conf_cords_r[i] = [[] for _ in range(r_local_inds)]
55 | # numpy.array([[] for _ in range(r_local_inds)], dtype=object)
56 |
57 | for file in [fwd_paf_file, rve_paf_file]:
58 | with open(file, "r") as fwd_paf:
59 | file_count = 0
60 | for line in fwd_paf:
61 | if line == "\n":
62 | break
63 | splited = (line[:-1]).split("\t")
64 | seg_no = splited[0]
65 | [glb_seg_no, sub_no] = seg_no.split("_")
66 | ref_no = str(splited[5])
67 | ref_start_coord = int(splited[7]) # 0-based
68 | nm = int(splited[10]) - int(splited[9])
69 | if nm == 0 and int(splited[10]) == split_len:
70 | if file == fwd_paf_file:
71 | conf_alns_f[read2index[int(glb_seg_no)]][int(sub_no)].append(
72 | id2index[ref_no]
73 | )
74 | conf_cords_f[read2index[int(glb_seg_no)]][int(sub_no)].append(
75 | ref_start_coord
76 | )
77 | else:
78 | conf_alns_r[read2index[int(glb_seg_no)]][int(sub_no)].append(
79 | id2index[ref_no]
80 | )
81 | conf_cords_r[read2index[int(glb_seg_no)]][int(sub_no)].append(
82 | ref_start_coord
83 | )
84 | file_count += 1
85 | fwd_paf.close()
86 | # print("Batch {0} finished alignment file parsing".format(tid))
87 | subprocess.check_call("rm {0}".format(fwd_paf_file), shell=True)
88 | subprocess.check_call("rm {0}".format(rve_paf_file), shell=True)
89 | # nonunique_counter = 0
90 |
91 | def retrieve_single_end_saturation(glb_index, conf_alns, conf_cords, rlen, ks):
92 | nodes = numpy.zeros(len_index2id, dtype=int)
93 | coords = [None for _ in range(len_index2id)]
94 | kindices = [None for _ in range(len_index2id)]
95 | for i, sub_aln_statuses in enumerate(conf_alns[glb_index]):
96 | # if len(sub_aln_statuses) > 1:
97 | # nonunique_counter += 1
98 | for j, sub_aln_status in enumerate(sub_aln_statuses):
99 | nodes[sub_aln_status] += 1
100 | if coords[sub_aln_status] == None:
101 | coords[sub_aln_status] = conf_cords[glb_index][i][j]
102 | else:
103 | coords[sub_aln_status] = min(
104 | coords[sub_aln_status], conf_cords[glb_index][i][j]
105 | )
106 | if kindices[sub_aln_status] == None:
107 | kindices[sub_aln_status] = i
108 | else:
109 | kindices[sub_aln_status] = min(kindices[sub_aln_status], i)
110 | saturates = []
111 | L = 0
112 | R = 0
113 | for i, v in enumerate(nodes):
114 | if coords[i] == None or kindices[i] == None:
115 | continue
116 | L = max(coords[i], coords[i] - kindices[i])
117 | R = min(coords[i] + index2reflen[i] - 1, coords[i] - kindices[i] + rlen - 1)
118 | saturate = R - L - (split_len - 1) + 1
119 | expected = (min(rlen, index2reflen[i]) - ks + 1) * (rlen - ks) / rlen
120 | if v >= max(min(saturate, expected), 1):
121 | # print(i,v,"passed")
122 | saturates.append(i)
123 | return saturates
124 |
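    # Worked example (made-up numbers): with rlen = 150 and ks = split_len = 129
    # a read yields 150 - 129 + 1 = 22 sub-reads. If its leftmost hit on node i
    # starts at ref coord 100 via sub-read 0 and the node is 500 bp long, then
    # L = 100, R = min(100 + 500 - 1, 100 + 150 - 1) = 249,
    # saturate = 249 - 100 - 128 + 1 = 22 and
    # expected = (150 - 129 + 1) * (150 - 129) / 150 = 3.08, so the read counts
    # for node i only if at least 4 of its 22 sub-reads aligned there perfectly
    # (v >= max(min(22, 3.08), 1)).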
125 | for glb_id, fwdlen, revlen in index2read:
126 | glb_index = read2index[glb_id]
127 | lefts = retrieve_single_end_saturation(
128 | glb_index, conf_alns_f, conf_cords_f, fwdlen, split_len
129 | )
130 | rights = retrieve_single_end_saturation(
131 | glb_index, conf_alns_r, conf_cords_r, revlen, split_len
132 | )
133 |
134 | k = 0
135 | for i in lefts:
136 | for i2 in lefts[k:]:
137 | short_mat[i][i2] += 1
138 | k += 1
139 | k = 0
140 | for j in rights:
141 | for j2 in rights[k:]:
142 | short_mat[j][j2] += 1
143 | k += 1
144 |
145 | for i in lefts:
146 | for j in rights:
147 | node_mat[i][j] += 1
148 |
149 | # free up memory
150 | conf_alns_f[glb_index] = None
151 | conf_alns_r[glb_index] = None
152 |
153 | elapsed = time.time() - start
154 | print("Batch {0} finished".format(tid))
155 | # print("Batch: {0} found non unique kmer count: {1}".format(tid, nonunique_counter))
156 | print("Batch: {0} time spent for processing paf file: {1}".format(tid, elapsed))
157 | return node_mat, short_mat
158 |
159 |
160 | def batch_split(
161 | fwd_file: str,
162 | rve_file: str,
163 | temp_dir: str,
164 | batch_size: int,
165 | do_split: bool,
166 | split_len,
167 | ):
168 |     """split the paired read files into several batches
169 |     Args:
170 |         fwd_file (str): forward read file, FASTQ format
171 |         rve_file (str): reverse read file, FASTQ format
172 |         batch_size (int): maximum number of read pairs per batch
173 |     Returns:
174 |         tuple: per-batch read id summaries and the list of batch file pairs
175 |     """
176 | n_reads = 0
177 | short_reads = 0
178 | used_reads = 0
179 | fkmer = 0
180 | rkmer = 0
181 |
182 | temp_file_fwd = None
183 | temp_file_rve = None
184 | local_reads = 0
185 | local_list = []
186 | batch_count = 0
187 | read_summary = []
188 | sub_files = []
189 | # forward reverse read processing
190 | with open(fwd_file, "r") as fwd:
191 | with open(rve_file, "r") as rve:
192 | fwd_reads = fwd.readlines()
193 | rev_reads = rve.readlines()
194 | total_size = min(len(fwd_reads) // 4, len(rev_reads) // 4)
195 | # marker_test = 1
196 | # total_size = min(marker_test, total_size)
197 | for i in range(total_size):
198 | # if i % batch_size == 0:
199 | # print("Processed {0} reads up to now.".format(i))
200 | [_, fseq, _, feval] = [s[:-1] for s in fwd_reads[i * 4 : (i + 1) * 4]]
201 | [_, rseq, _, reval] = [s[:-1] for s in rev_reads[i * 4 : (i + 1) * 4]]
202 | if fseq.count("N") or rseq.count("N"):
203 | n_reads += 1
204 | elif len(fseq) < split_len or len(rseq) < split_len:
205 | short_reads += 1
206 | else:
207 | used_reads += 1
208 | local_reads += 1
209 | local_list.append((fseq, feval, rseq, reval))
210 | if local_reads == batch_size or (
211 | local_reads > 0 and i == total_size - 1
212 | ):
213 | # file creation
214 | sub_fwd_filename = "{0}/temp_forward_{1}.fastq".format(
215 | temp_dir, batch_count
216 | )
217 | sub_rve_filename = "{0}/temp_reverse_{1}.fastq".format(
218 | temp_dir, batch_count
219 | )
220 |                     subprocess.check_call(
221 |                         "touch {0}; echo > {0}".format(sub_fwd_filename), shell=True
222 |                     )
223 |                     subprocess.check_call(
224 |                         "touch {0}; echo > {0}".format(sub_rve_filename), shell=True
225 |                     )
226 | temp_file_fwd = open(sub_fwd_filename, "w")
227 | temp_file_rve = open(sub_rve_filename, "w")
228 |
229 | read_ids = []
230 | if do_split:
231 | for j, (fseq, feval, rseq, reval) in enumerate(local_list):
232 | fread_id_subs = len(fseq) - split_len + 1
233 | rread_id_subs = len(rseq) - split_len + 1
234 | prefix_name = "@{0}_".format(j)
235 | # forward
236 | for sub_i in range(len(fseq) - split_len + 1):
237 | subfread = fseq[sub_i : sub_i + split_len]
238 | subfeval = feval[sub_i : sub_i + split_len]
239 | temp_file_fwd.write(
240 | prefix_name + "{0} /1\n".format(sub_i)
241 | )
242 | temp_file_fwd.write(subfread + "\n")
243 | temp_file_fwd.write("+\n")
244 | temp_file_fwd.write(subfeval + "\n")
245 | fkmer += len(fseq) - split_len + 1
246 | # reverse
247 | for sub_i in range(len(rseq) - split_len + 1):
248 | subrread = rseq[sub_i : sub_i + split_len]
249 | subreval = reval[sub_i : sub_i + split_len]
250 | temp_file_rve.write(
251 | prefix_name + "{0} /2\n".format(sub_i)
252 | )
253 | temp_file_rve.write(subrread + "\n")
254 | temp_file_rve.write("+\n")
255 | temp_file_rve.write(subreval + "\n")
256 | rkmer += len(rseq) - split_len + 1
257 | read_ids.append(
258 | (j, fread_id_subs, rread_id_subs, len(fseq), len(rseq))
259 | )
260 | else:
261 | for j, (fseq, feval, rseq, reval) in enumerate(local_list):
262 | prefix_name = "@{0}_".format(j)
263 | temp_file_fwd.write(prefix_name + "{0} /1\n".format(0))
264 | temp_file_fwd.write(fseq + "\n")
265 | temp_file_fwd.write("+\n")
266 | temp_file_fwd.write(feval + "\n")
267 |
268 | temp_file_rve.write(prefix_name + "{0} /2\n".format(0))
269 | temp_file_rve.write(rseq + "\n")
270 | temp_file_rve.write("+\n")
271 | temp_file_rve.write(reval + "\n")
272 | read_ids.append((j, 1, 1, len(fseq), len(rseq)))
273 | temp_file_fwd.close()
274 | temp_file_rve.close()
275 | read_summary.append(read_ids)
276 | sub_files.append((sub_fwd_filename, sub_rve_filename))
277 | local_reads = 0
278 | local_list = []
279 | batch_count += 1
280 | fwd.close()
281 | rve.close()
282 |
283 | print("total number of reads (before): ", total_size)
284 | print("total reads containing N: ", n_reads)
285 | print("total reads too short [<{0}]: ".format(split_len), short_reads)
286 | print("total number of reads (used): ", used_reads)
287 | print("total number of forward reads kmer: ", fkmer)
288 | print("total number of reverse reads kmer: ", rkmer)
289 | return read_summary, sub_files
290 |
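# Naming sketch: read pair j of a batch is written out as sub-reads
# "@j_0 /1", "@j_1 /1", ... (forward) and "@j_0 /2", ... (reverse), which is
# how process_paf_file recovers (glb_seg_no, sub_no) by splitting the PAF
# query name on "_"; e.g. a query named "12_3" maps back to read 12,
# sub-read 3 of its batch.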
291 |
292 | def minimap_alignment(fasta_file, sub_files, temp_dir):
293 | paf_files = []
294 | for i, (sub_fwd_filename, sub_rve_filename) in enumerate(sub_files):
295 | print(
296 | "minimap reads {0},{1} to graph..".format(
297 | sub_fwd_filename, sub_rve_filename
298 | )
299 | )
300 | start = time.time()
301 | sub_fwd_paf = "{0}/temp_fwd_aln_{1}.paf".format(temp_dir, i)
302 | subprocess.check_call(
303 | "minimap2 -c -t 16 {0} {1} > {2}".format(
304 | fasta_file, sub_fwd_filename, sub_fwd_paf
305 | ),
306 | shell=True,
307 | )
308 | # -B 40 -O 20,50 -E 30,10 -z 1,1 -k 27 -w 18 -s 256
309 | subprocess.check_call("rm {0}".format(sub_fwd_filename), shell=True)
310 |
311 | sub_rve_paf = "{0}/temp_rve_aln_{1}.paf".format(temp_dir, i)
312 | subprocess.check_call(
313 | "minimap2 -c -t 16 {0} {1} > {2}".format(
314 | fasta_file, sub_rve_filename, sub_rve_paf
315 | ),
316 | shell=True,
317 | )
318 | subprocess.check_call("rm {0}".format(sub_rve_filename), shell=True)
319 |
320 | paf_files.append((sub_fwd_paf, sub_rve_paf))
321 | elapsed = time.time() - start
322 | print("Time spent for minimap2: ", elapsed)
323 | return paf_files
324 |
325 |
326 | def main():
327 | print(
328 | "----------------------Paired-End Information Alignment----------------------"
329 | )
330 | parser = argparse.ArgumentParser(
331 | prog="pe_info",
332 | description="""Align Paired-End reads to nodes in graph to obtain strong links""",
333 | )
334 |
335 |     parser.add_argument(
336 |         "-g", "--gfa", dest="gfa", type=str, required=True, help="graph, .gfa format"
337 |     )
338 |
339 | parser.add_argument(
340 | "-o",
341 | "--output_dir",
342 | dest="dir",
343 | type=str,
344 | required=True,
345 | help="output directory",
346 | )
347 |
348 | parser.add_argument(
349 | "-f", "--forward", dest="fwd", required=True, help="forward read, .fastq"
350 | )
351 |
352 | parser.add_argument(
353 | "-r", "--reverse", dest="rve", required=True, help="reverse read, .fastq"
354 | )
355 |
356 | parser.add_argument(
357 | "-k",
358 | "--kmer_size",
359 | dest="kmer_size",
360 | type=int,
361 | default=128,
362 | help="unique kmer size",
363 | )
364 |
365 | args = parser.parse_args()
366 |
367 | # initialize output directory
368 | if args.dir[-1] == "/":
369 | args.dir = args.dir[:-1]
370 | subprocess.check_call("rm -rf {0}".format(args.dir), shell=True)
371 | os.makedirs(args.dir, exist_ok=True)
372 |
373 | glb_start = time.time()
374 | tmp_g2s_file = "{0}/temp_graph_seq.fasta".format(args.dir)
375 |
376 | # convert gfa to fasta file
377 | index2id = []
378 | index2reflen = []
379 | with open(args.gfa, "r") as gfa:
380 | with open(tmp_g2s_file, "w") as fasta:
381 | for Line in gfa:
382 | splited = (Line[:-1]).split("\t")
383 | if splited[0] == "S":
384 | fasta.write(">{0}\n{1}\n".format(splited[1], splited[2]))
385 | index2id.append(splited[1])
386 | index2reflen.append(len(splited[2]))
387 | fasta.close()
388 | gfa.close()
389 |
390 | split_len = args.kmer_size + 1
391 | # split reads to several batches
392 | read_summary, sub_files = batch_split(
393 | args.fwd, args.rve, args.dir, 40000, True, split_len
394 | )
395 | # minimap2 reads to fasta file
396 | paf_files = minimap_alignment(tmp_g2s_file, sub_files, args.dir)
397 |
398 | len_index2id = len(index2id)
399 | node_mats = []
400 | strand_mats = []
401 |
402 | for i in range(len(paf_files)):
403 | (node_mat, strand_mat) = process_paf_file(
404 | index2id,
405 | index2reflen,
406 | len_index2id,
407 | read_summary[i],
408 | paf_files[i][0],
409 | paf_files[i][1],
410 | split_len,
411 | i,
412 | )
413 | node_mats.append(node_mat)
414 | strand_mats.append(strand_mat)
415 |
416 | print("All processes have finished their job, combine the result.")
417 | # combine all the outputs
418 | glb_node_mat = numpy.sum(numpy.array(node_mats), axis=0)
419 | glb_strand_mat = numpy.sum(numpy.array(strand_mats), axis=0)
420 | out_file = "{0}/pe_info".format(args.dir)
421 | out_file2 = "{0}/st_info".format(args.dir)
422 |     subprocess.check_call("touch {0}; echo > {0}".format(out_file), shell=True)
423 | with open(out_file, "w") as outfile:
424 | with open(out_file2, "w") as outfile2:
425 | for i in range(len_index2id):
426 | for j in range(len_index2id):
427 | outfile.write(
428 | "{0}:{1}:{2}\n".format(
429 | index2id[i], index2id[j], glb_node_mat[i][j]
430 | )
431 | )
432 | outfile2.write(
433 | "{0}:{1}:{2}\n".format(
434 | index2id[i], index2id[j], glb_strand_mat[i][j]
435 | )
436 | )
437 | outfile2.close()
438 | outfile.close()
439 |
440 | glb_elapsed = time.time() - glb_start
441 | print("Global time elapsed: ", glb_elapsed)
442 | print("result stored in: ", out_file)
443 |
444 |
445 | if __name__ == "__main__":
446 | main()
447 | sys.exit(0)
448 |
--------------------------------------------------------------------------------
/utils/VStrains_IO.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from logging import Logger
4 | from graph_tool.all import Graph
5 | import gfapy
6 | import subprocess
7 | import sys
8 | import re
9 |
10 | from utils.VStrains_Utilities import *
11 |
12 |
13 | def init_graph():
14 | graph = Graph(directed=True)
15 | graph.vp.seq = graph.new_vertex_property("string", val="")
16 | graph.vp.dp = graph.new_vertex_property("double")
17 | graph.vp.id = graph.new_vertex_property("string", val="UD")
18 | graph.vp.color = graph.new_vertex_property("string")
19 |
20 | graph.ep.overlap = graph.new_edge_property("int", val=0)
21 | graph.ep.flow = graph.new_edge_property("double", val=0.0)
22 | graph.ep.color = graph.new_edge_property("string")
23 |
24 | return graph
25 |
26 |
27 | def gfa_to_graph(gfa_file, logger: Logger, init_ori=1):
28 |     """
29 |     Convert an assembly graph gfa file to a graph
30 |     Nodes: each segment yields two vertices, one per orientation (+/-)
31 |     """
32 |
33 | logger.info("Parsing GFA format graph")
34 | gfa = gfapy.Gfa().from_file(filename=gfa_file)
35 | logger.info(
36 | "Parsed gfa file length: {0}, version: {1}".format(len(gfa.lines), gfa.version)
37 | )
38 |
39 | graph = init_graph()
40 | graph.vp.visited = graph.new_vertex_property("int16_t", val=0)
41 | graph.vp.ori = graph.new_vertex_property("int16_t") # 1 = +, -1 = -
42 |
43 | graph.ep.visited = graph.new_edge_property("int", val=0)
44 |
45 | # S
46 | node_dict = {}
47 | dp_dict = {}
48 | edge_dict = {}
49 | for line in gfa.segments:
50 | # segment, convert into Node^- and Node^+
51 | [t, seg_no, seg] = (str(line).split("\t"))[:3]
52 | tags = (str(line).split("\t"))[3:]
53 | dp_float = 0
54 | ln = 0
55 | kc = 0
56 | for tag in tags:
57 | if tag.startswith("dp") or tag.startswith("DP"):
58 | dp_float = float(tag.split(":")[2])
59 | break
60 | if tag.startswith("ln") or tag.startswith("LN"):
61 | ln = int(tag.split(":")[2])
62 | if tag.startswith("kc") or tag.startswith("KC"):
63 | kc = int(tag.split(":")[2])
64 | if ln != 0 and kc != 0:
65 | break
66 |
67 | # gfa format check
68 | if t != "S" or (dp_float == 0 and (ln == 0 or kc == 0)):
69 | logger.error(
70 | "file: {0}, Illegal graph format, please double check if the graph has been contaminated".format(
71 | gfa_file
72 | )
73 | )
74 | sys.exit(1)
75 |
76 | if dp_float == 0:
77 | dp_float = kc / ln
78 |
79 | v_pos = graph.add_vertex()
80 | graph.vp.seq[v_pos] = seg
81 | graph.vp.dp[v_pos] = dp_float
82 | graph.vp.id[v_pos] = seg_no
83 | graph.vp.ori[v_pos] = 1
84 | graph.vp.visited[v_pos] = -1
85 | graph.vp.color[v_pos] = "black"
86 |
87 | v_neg = graph.add_vertex()
88 | graph.vp.seq[v_neg] = reverse_seq(seg)
89 | graph.vp.dp[v_neg] = dp_float
90 | graph.vp.id[v_neg] = seg_no
91 | graph.vp.ori[v_neg] = -1
92 | graph.vp.visited[v_neg] = -1
93 | graph.vp.color[v_neg] = "black"
94 |
95 | node_dict[seg_no] = (v_pos, v_neg)
96 | dp_dict[seg_no] = dp_float
97 | # L
98 | for edge in gfa.edges:
99 | [t, seg_no_l, ori_l, seg_no_r, ori_r] = (str(edge).split("\t"))[:5]
100 | tags = (str(edge).split("\t"))[5:]
101 | overlap_len = [tag for tag in tags if tag.endswith("m") or tag.endswith("M")][0]
102 | # gfa format check
103 | assert t == "L" and overlap_len[-1] == "M"
104 |
105 | u_pos, u_neg = node_dict[seg_no_l]
106 | v_pos, v_neg = node_dict[seg_no_r]
107 | u = u_pos if ori_l == "+" else u_neg
108 | v = v_pos if ori_r == "+" else v_neg
109 |
110 | if (seg_no_l, graph.vp.ori[u], seg_no_r, graph.vp.ori[v]) in edge_dict:
111 | logger.error(
112 | "parallel edge found, invalid case in assembly graph, please double-check the assembly graph format"
113 | )
114 | logger.error("Pipeline aborted")
115 | sys.exit(1)
116 |
117 | if seg_no_l == seg_no_r:
118 | graph.vp.seq[u] = str.lower(graph.vp.seq[u])
119 | graph.vp.seq[v] = str.lower(graph.vp.seq[v])
120 | continue
121 |
122 | e = graph.add_edge(source=u, target=v)
123 | graph.ep.overlap[e] = int(overlap_len[:-1])
124 | graph.ep.color[e] = "black"
125 |
126 | edge_dict[(seg_no_l, graph.vp.ori[u], seg_no_r, graph.vp.ori[v])] = e
127 |
128 | graph, simp_node_dict, simp_edge_dict = flip_graph_bfs(
129 | graph, node_dict, edge_dict, dp_dict, logger, init_ori
130 | )
131 | red_graph, red_node_dict, red_edge_dict = reduce_graph(
132 | graph, simp_node_dict, simp_edge_dict
133 | )
134 | return red_graph, red_node_dict, red_edge_dict
135 |
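# Depth fallback sketch (made-up S-line): a segment carrying "LN:i:1200" and
# "KC:i:36000" but no DP/dp tag is assigned depth kc / ln = 36000 / 1200 = 30.0
# in the loop above, and both of its orientation vertices share that depth.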
136 |
137 | def flip_graph_bfs(
138 | graph: Graph,
139 | node_dict: dict,
140 | edge_dict: dict,
141 | dp_dict: dict,
142 | logger: Logger,
143 | init_ori=1,
144 | ):
145 |     """
146 |     Flip all node orientations via BFS.
147 |
148 |     return a node_dict which keeps only one orientation per node for simplicity:
149 |     the picked orientation is renamed to positive and its opposite is forbidden.
150 |     """
151 |
152 | def source_node_via_dp(dp_dict: dict):
153 | """
154 | return the pos-neg node with maximum depth
155 | """
156 | return max(dp_dict, key=dp_dict.get)
157 |
158 | def reverse_edge(graph: Graph, edge, node_dict: dict, edge_dict: dict):
159 | """
160 | reverse an edge with altered orientation and direction.
161 | """
162 | tmp_s = edge.source()
163 | tmp_t = edge.target()
164 |
165 | edge_dict.pop(
166 | (
167 | graph.vp.id[tmp_s],
168 | graph.vp.ori[tmp_s],
169 | graph.vp.id[tmp_t],
170 | graph.vp.ori[tmp_t],
171 | )
172 | )
173 |
174 | tmp_s_pos, tmp_s_neg = node_dict[graph.vp.id[tmp_s]]
175 | tmp_t_pos, tmp_t_neg = node_dict[graph.vp.id[tmp_t]]
176 | s = tmp_t_pos if graph.vp.ori[tmp_t] == -1 else tmp_t_neg
177 | t = tmp_s_pos if graph.vp.ori[tmp_s] == -1 else tmp_s_neg
178 |
179 | o = graph.ep.overlap[edge]
180 | graph.remove_edge(edge)
181 | e = graph.add_edge(s, t)
182 | graph.ep.overlap[e] = o
183 | edge_dict[
184 | (graph.vp.id[s], graph.vp.ori[s], graph.vp.id[t], graph.vp.ori[t])
185 | ] = e
186 |
187 | return graph, e, edge_dict
188 |
189 | logger.info("flip graph orientation..")
190 | pick_dict = {}
191 |     while dp_dict:
192 | seg_no = source_node_via_dp(dp_dict)
193 | source_pos, source_neg = node_dict[seg_no]
194 | graph.vp.visited[source_pos] = 0
195 | graph.vp.visited[source_neg] = 0
196 | fifo_queue = [[node_dict[seg_no], init_ori]]
197 |
198 | while fifo_queue:
199 | (v_pos, v_neg), ori = fifo_queue.pop()
200 | dp_dict.pop(graph.vp.id[v_pos])
201 |
202 | u = None
203 | if ori == 1:
204 | u = v_pos
205 | pick_dict[graph.vp.id[u]] = "+"
206 | # print_vertex(graph, v_neg, "node to reverse pos")
207 | for e in set(v_neg.all_edges()):
208 | graph, r_e, edge_dict = reverse_edge(graph, e, node_dict, edge_dict)
209 | # print_edge(graph, r_e, "after reverse: ")
210 | else:
211 | u = v_neg
212 | pick_dict[graph.vp.id[u]] = "-"
213 | # print_vertex(graph, v_pos, "node to reverse neg")
214 | for e in set(v_pos.all_edges()):
215 | graph, r_e, edge_dict = reverse_edge(graph, e, node_dict, edge_dict)
216 | # print_edge(graph, r_e, "after reverse: ")
217 |
218 | graph.vp.visited[v_pos] = 1
219 | graph.vp.visited[v_neg] = 1
220 | # add further nodes into the fifo_queue
221 | for adj_node in u.all_neighbors():
222 | if graph.vp.visited[adj_node] == -1:
223 | vpos, vneg = node_dict[graph.vp.id[adj_node]]
224 | graph.vp.visited[vpos] = 0
225 | graph.vp.visited[vneg] = 0
226 | # print("appending node {0} to queue".format(graph.vp.id[adj_node]))
227 | fifo_queue.append(
228 | [node_dict[graph.vp.id[adj_node]], graph.vp.ori[adj_node]]
229 | )
230 |
231 | # verify sorted graph
232 | logger.info("final verifying graph..")
233 | assert len(pick_dict) == len(node_dict)
234 | for key, item in list(pick_dict.items()):
235 | v_pos, v_neg = node_dict[key]
236 | if item == "+":
237 | # FIXME split v_neg to a new node
238 | if v_neg.in_degree() + v_neg.out_degree() > 0:
239 | print_vertex(
240 | graph, v_neg, logger, "pick ambiguous found, pick both, split node"
241 | )
242 | pick_dict[key] = "t"
243 | else:
244 | # FIXME split v_neg to a new node
245 | if v_pos.in_degree() + v_pos.out_degree() > 0:
246 | print_vertex(
247 | graph, v_pos, logger, "pick ambiguous found, pick both, split node"
248 | )
249 | pick_dict[key] = "t"
250 | logger.info("Graph is verified")
251 |
252 | simp_node_dict = {}
253 | for seg_no, pick in pick_dict.items():
254 | if pick == "+":
255 | simp_node_dict[seg_no] = node_dict[seg_no][0]
256 | elif pick == "-":
257 | simp_node_dict["-" + seg_no] = node_dict[seg_no][1]
258 | graph.vp.id[node_dict[seg_no][1]] = "-" + seg_no
259 | else:
260 | simp_node_dict[seg_no] = node_dict[seg_no][0]
261 | graph.vp.id[node_dict[seg_no][0]] = seg_no
262 | simp_node_dict["-" + seg_no] = node_dict[seg_no][1]
263 | graph.vp.id[node_dict[seg_no][1]] = "-" + seg_no
264 |
265 | simp_edge_dict = {}
266 | for e in edge_dict.values():
267 | simp_edge_dict[(graph.vp.id[e.source()], graph.vp.id[e.target()])] = e
268 | logger.info("done")
269 | return graph, simp_node_dict, simp_edge_dict
270 |
271 |
272 | def reduce_graph(unsimp_graph: Graph, simp_node_dict: dict, simp_edge_dict: dict):
273 | graph = init_graph()
274 | red_node_dict = {}
275 | red_edge_dict = {}
276 |
277 | for no, node in simp_node_dict.items():
278 | v = graph.add_vertex()
279 | graph.vp.seq[v] = unsimp_graph.vp.seq[node]
280 | graph.vp.dp[v] = unsimp_graph.vp.dp[node]
281 | graph.vp.id[v] = unsimp_graph.vp.id[node]
282 | graph.vp.color[v] = "black"
283 | red_node_dict[no] = v
284 |
285 | for (u, v), e in simp_edge_dict.items():
286 | source = red_node_dict[u]
287 | sink = red_node_dict[v]
288 |
289 | re = graph.add_edge(source, sink)
290 | graph.ep.overlap[re] = unsimp_graph.ep.overlap[e]
291 | graph.ep.flow[re] = unsimp_graph.ep.flow[e]
292 | graph.ep.color[re] = "black"
293 | red_edge_dict[(u, v)] = re
294 |
295 | return graph, red_node_dict, red_edge_dict
296 |
297 |
298 | def flipped_gfa_to_graph(gfa_file, logger: Logger):
299 | """
300 |     read a flipped gfa format graph in.
301 | """
302 | logger.debug("Parsing GFA format graph")
303 | gfa = gfapy.Gfa().from_file(filename=gfa_file)
304 | logger.debug(
305 | "Parsed gfa file length: {0}, version: {1}".format(len(gfa.lines), gfa.version)
306 | )
307 |
308 | graph = init_graph()
309 | red_node_dict = {}
310 | red_edge_dict = {}
311 |
312 | # S
313 | for line in gfa.segments:
314 | [_, seg_no, seg, dp] = str(line).split("\t")
315 | dp_float = float(dp.split(":")[2])
316 | v = graph.add_vertex()
317 | graph.vp.seq[v] = seg
318 | graph.vp.dp[v] = dp_float
319 | graph.vp.id[v] = seg_no
320 | graph.vp.color[v] = "black"
321 | red_node_dict[seg_no] = v
322 | # L
323 | for edge in gfa.edges:
324 | [_, seg_no_l, ori_l, seg_no_r, ori_r, overlap_len] = str(edge).split("\t")
325 | source = red_node_dict[seg_no_l]
326 | sink = red_node_dict[seg_no_r]
327 |
328 | assert overlap_len[-1] == "M" and ori_l == ori_r
329 | re = graph.add_edge(source, sink)
330 | graph.ep.overlap[re] = int(overlap_len[:-1])
331 | graph.ep.color[re] = "black"
332 | red_edge_dict[(seg_no_l, seg_no_r)] = re
333 |
334 | return graph, red_node_dict, red_edge_dict
335 |
336 |
337 | def graph_to_gfa(
338 | graph: Graph, simp_node_dict: dict, simp_edge_dict: dict, logger: Logger, filename
339 | ):
340 | """
341 |     store the flipped graph as a simplified gfa file.
342 | """
343 | subprocess.check_call("touch {0}; echo > {0}".format(filename), shell=True)
344 |
345 | with open(filename, "w") as gfa:
346 | for v in simp_node_dict.values():
347 | if graph.vp.color[v] == "black":
348 | name = graph.vp.id[v]
349 | gfa.write(
350 | "S\t{0}\t{1}\tDP:f:{2}\n".format(
351 | name, graph.vp.seq[v], graph.vp.dp[v]
352 | )
353 | )
354 |
355 | for (u, v), e in simp_edge_dict.items():
356 | node_u = simp_node_dict[u] if u in simp_node_dict else None
357 | node_v = simp_node_dict[v] if v in simp_node_dict else None
358 |
359 | if node_u == None or node_v == None:
360 | continue
361 | if graph.vp.color[node_u] != "black" or graph.vp.color[node_v] != "black":
362 | continue
363 | if graph.ep.color[e] != "black":
364 | continue
365 | gfa.write(
366 | "L\t{0}\t{1}\t{2}\t{3}\t{4}M\n".format(
367 | u, "+", v, "+", graph.ep.overlap[e]
368 | )
369 | )
370 | gfa.close()
371 | logger.info(filename + " is stored..")
372 | return 0
373 |
374 |
375 | def is_valid(p: list, idx_mapping: dict, simp_node_dict: dict, simp_edge_dict: dict):
376 | if len(p) == 0:
377 | return False
378 | if len(p) == 1:
379 | if p[0] not in idx_mapping:
380 | return False
381 | if idx_mapping[p[0]] not in simp_node_dict:
382 | return False
383 | return True
384 | for i in range(len(p) - 1):
385 | if p[i] not in idx_mapping or p[i + 1] not in idx_mapping:
386 | return False
387 | mu = idx_mapping[p[i]]
388 | mv = idx_mapping[p[i + 1]]
389 | if mu not in simp_node_dict:
390 | return False
391 | if mv not in simp_node_dict:
392 | return False
393 | if (mu, mv) not in simp_edge_dict:
394 | return False
395 | return True
396 |
397 |
398 | def spades_paths_parser(
399 | graph: Graph,
400 | simp_node_dict: dict,
401 | simp_edge_dict: dict,
402 | idx_mapping: dict,
403 | logger: Logger,
404 | path_file,
405 | min_len=250,
406 | min_cov=0,
407 | ):
408 | """
409 |     Map SPAdes contigs onto the graph and return all suitable contigs.
410 | """
411 |
412 | def get_paths(fd, path):
413 | subpaths = []
414 | total_nodes = 0
415 | while path.endswith(";\n"):
416 | subpath = str(path[:-2]).split(",")
417 | subpath = list(
418 | map(
419 | lambda v: str(v[:-1]) if v[-1] == "+" else "-" + str(v[:-1]),
420 | subpath,
421 | )
422 | )
423 | subpathred = list(dict.fromkeys(subpath))
424 | # validity check
425 | if is_valid(subpathred, idx_mapping, simp_node_dict, simp_edge_dict):
426 | subpath = list(map(lambda v: idx_mapping[v], subpath))
427 | subpaths.append(subpath)
428 | total_nodes += len(subpath)
429 | path = fd.readline()
430 |
431 | subpath = path.rstrip().split(",")
432 | subpath = list(
433 | map(lambda v: str(v[:-1]) if v[-1] == "+" else "-" + str(v[:-1]), subpath)
434 | )
435 | subpathred = list(dict.fromkeys(subpath))
436 | # validity check
437 | if is_valid(subpathred, idx_mapping, simp_node_dict, simp_edge_dict):
438 | subpath = list(map(lambda v: idx_mapping[v], subpath))
439 | subpaths.append(subpath)
440 | total_nodes += len(subpath)
441 |
442 | return subpaths, total_nodes
443 |
444 | logger.info("parsing SPAdes .paths file..")
445 | contig_dict = {}
446 | contig_info = {}
447 | try:
448 | with open(path_file, "r") as contigs_file:
449 | name = contigs_file.readline()
450 | path = contigs_file.readline()
451 |
452 | while name != "" and path != "":
453 | (cno, clen, ccov) = re.search(
454 | "%s(.*)%s(.*)%s(.*)" % ("NODE_", "_length_", "_cov_"), name.strip()
455 | ).group(1, 2, 3)
456 | subpaths, total_nodes = get_paths(contigs_file, path)
457 |
458 | name_r = contigs_file.readline()
459 | path_r = contigs_file.readline()
460 | (cno_r, clen_r, ccov_r) = re.search(
461 | "%s(.*)%s(.*)%s(.*)%s" % ("NODE_", "_length_", "_cov_", "'"),
462 | name_r.strip(),
463 | ).group(1, 2, 3)
464 | subpaths_r, total_nodes_r = get_paths(contigs_file, path_r)
465 |
466 | if not (cno == cno_r and clen == clen_r and ccov == ccov_r):
467 | raise BaseException
468 |
469 | # next contig group
470 | name = contigs_file.readline()
471 | path = contigs_file.readline()
472 |
473 | # pick one direction only
474 | (segments, total_n) = max(
475 | [(subpaths, total_nodes), (subpaths_r, total_nodes_r)],
476 | key=lambda t: t[1],
477 | )
478 |
479 | # filter contig
480 | if segments == []:
481 | continue
482 | if total_n < 2 and (float(ccov) <= min_cov or int(clen) < min_len):
483 | continue
484 | for i, subpath in enumerate(segments):
485 | repeat_dict = {}
486 | for k in subpath:
487 | if k not in repeat_dict:
488 | repeat_dict[k] = 1
489 | else:
490 | repeat_dict[k] += 1
491 | subpath = list(dict.fromkeys(subpath))
492 |
493 | if len(segments) != 1:
494 | contig_dict[cno + "$" + str(i)] = [
495 | subpath,
496 | path_len(graph, [simp_node_dict[id] for id in subpath]),
497 | float(ccov),
498 | ]
499 | contig_info[cno + "$" + str(i)] = (None, repeat_dict)
500 | else:
501 | contig_dict[cno] = [subpath, int(clen), float(ccov)]
502 | contig_info[cno] = (None, repeat_dict)
503 |
504 | contigs_file.close()
505 | except BaseException as err:
506 |         logger.error(
507 |             str(err)
508 |             + "\nPlease make sure the correct SPAdes contigs .paths file is provided."
509 |         )
510 | logger.error("Pipeline aborted")
511 | sys.exit(1)
512 | logger.debug(str(contig_dict))
513 | logger.debug(str(contig_info))
514 | logger.info("done")
515 | return contig_dict, contig_info
516 |
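# Parsing sketch (made-up .paths entry): a header line
# "NODE_3_length_2403_cov_30.5" yields (cno, clen, ccov) = ("3", "2403", "30.5");
# its path line "5+,7-,2+" becomes the id list ["5", "-7", "2"], and a path
# spread over several ";"-terminated lines is stored as one subpath per line
# under the keys "3$0", "3$1", ...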
517 |
518 | def contig_dict_to_fasta(
519 | graph: Graph, simp_node_dict: dict, contig_dict: dict, output_file
520 | ):
521 | """
522 |     Store contig dict into a fasta file
523 | """
524 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True)
525 |
526 | with open(output_file, "w") as fasta:
527 | for cno, (contig, clen, ccov) in sorted(
528 | contig_dict.items(), key=lambda x: x[1][1], reverse=True
529 | ):
530 | contig_name = (
531 | ">" + str(cno) + "_" + str(clen) + "_" + str(round(ccov, 2)) + "\n"
532 | )
533 | seq = path_ids_to_seq(graph, contig, contig_name, simp_node_dict) + "\n"
534 | fasta.write(contig_name)
535 | fasta.write(seq)
536 | fasta.close()
537 |
538 |
539 | def strain_dict_to_fasta(strain_dict: dict, output_file):
540 | """
541 |     Store strain dict into a fasta file
542 | """
543 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True)
544 |
545 | with open(output_file, "w") as fasta:
546 | for cno, (sseq, clen, ccov) in sorted(
547 | strain_dict.items(), key=lambda x: x[1][1], reverse=True
548 | ):
549 | contig_name = (
550 | ">" + str(cno) + "_" + str(clen) + "_" + str(round(ccov, 2)) + "\n"
551 | )
552 | seq = sseq + "\n"
553 | fasta.write(contig_name)
554 | fasta.write(seq)
555 | fasta.close()
556 |
557 |
558 | def contig_dict_to_path(
559 | contig_dict: dict, output_file, id_mapping: dict = None, keep_original=False
560 | ):
561 | """
562 |     Store contig dict into a SPAdes-style .paths file
563 | """
564 | subprocess.check_call("touch {0}; echo > {0}".format(output_file), shell=True)
565 | rev_id_mapping = {}
566 | if id_mapping != None:
567 | for id, map in id_mapping.items():
568 | rev_id_mapping[map] = id
569 | with open(output_file, "w") as paths:
570 | for cno, (contig, clen, ccov) in sorted(
571 | contig_dict.items(), key=lambda x: x[1][1], reverse=True
572 | ):
573 | contig_name = "NODE_" + str(cno) + "_" + str(clen) + "_" + str(ccov) + "\n"
574 | path_ids = ""
575 | for id in contig:
576 | if keep_original:
577 | for iid in str(id).split("&"):
578 | if iid.find("*") != -1:
579 | rid = rev_id_mapping[iid[: iid.find("*")]]
580 | else:
581 | rid = rev_id_mapping[iid]
582 | if rid[0] == "-":
583 | rid = rid[1:] + "-"
584 | path_ids += rid + ","
585 | else:
586 | for iid in str(id).split("&"):
587 | if iid.find("*") != -1:
588 | rid = iid[: iid.find("*")]
589 | else:
590 | rid = iid
591 | path_ids += str(rid) + ","
592 | path_ids = path_ids[:-1] + "\n"
593 | paths.write(contig_name)
594 | paths.write(path_ids)
595 | paths.close()
596 |
597 |
598 | def process_pe_info(node_ids, pe_info_file, st_info_file):
599 | pe_info = {}
600 | for u in node_ids:
601 | for v in node_ids:
602 | pe_info[(min(u, v), max(u, v))] = 0
603 | with open(pe_info_file, "r") as file:
604 | for line in file:
605 | if line == "\n":
606 | break
607 | [u, v, mark] = line[:-1].split(":")[:3]
608 | # bidirection
609 | key = (min(u, v), max(u, v))
610 | if pe_info.get(key) != None:
611 | pe_info[key] += int(mark)
612 | file.close()
613 |
614 | with open(st_info_file, "r") as file:
615 | for line in file:
616 | if line == "\n":
617 | break
618 | [u, v, mark] = line[:-1].split(":")[:3]
619 | # bidirection
620 | key = (min(u, v), max(u, v))
621 | if pe_info.get(key) != None:
622 | pe_info[key] += int(mark)
623 | file.close()
624 | dcpy_pe_info = {}
625 | for (uid, wid), u in pe_info.items():
626 | dcpy_pe_info[(uid, wid)] = u
627 | return pe_info, dcpy_pe_info
628 |
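# Format sketch: pe_info and st_info lines read "u:v:count"; counts for (u, v)
# and (v, u) are folded into one symmetric key, e.g. the lines "2:5:3" and
# "5:2:1" both accumulate into pe_info[("2", "5")], giving 4.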
629 |
630 | def store_reinit_graph(
631 | graph: Graph,
632 | simp_node_dict: dict,
633 | simp_edge_dict: dict,
634 | logger: Logger,
635 | opt_filename,
636 | ):
637 | graph_to_gfa(graph, simp_node_dict, simp_edge_dict, logger, opt_filename)
638 | grapho, simp_node_dicto, simp_edge_dicto = flipped_gfa_to_graph(
639 | opt_filename, logger
640 | )
641 | assign_edge_flow(grapho, simp_node_dicto, simp_edge_dicto)
642 | return grapho, simp_node_dicto, simp_edge_dicto
643 |
--------------------------------------------------------------------------------
/utils/VStrains_Extension.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 |
4 | from graph_tool.all import Graph
5 | from utils.VStrains_Utilities import *
6 | from utils.VStrains_Decomposition import get_non_trivial_branches, global_trivial_split
7 | from utils.VStrains_IO import store_reinit_graph
8 |
9 |
10 | def best_matching(
11 | graph: Graph,
12 | simp_node_dict: dict,
13 | simp_edge_dict: dict,
14 | contig_dict: dict,
15 | pe_info: dict,
16 | logger: Logger,
17 | ):
18 | full_link = {}
19 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
20 | node_to_contig_dict, _ = contig_map_node(contig_dict)
21 | for no, node in non_trivial_branches.items():
22 | us = [graph.vp.id[src] for src in node.in_neighbors()]
23 | ws = [graph.vp.id[tgt] for tgt in node.out_neighbors()]
24 | logger.debug("---------------------------------------------")
25 | logger.debug(
26 | "current non trivial branch: {0}, in-degree: {1}, out-degree: {2}".format(
27 | no, len(us), len(ws)
28 | )
29 | )
30 | # add contig supports
31 | support_contigs = node_to_contig_dict.get(no, [])
32 | con_info = {}
33 | for cno in support_contigs:
34 | [contig, clen, ccov] = contig_dict[cno]
35 | loc = contig.index(no)
36 | if loc > 0 and loc < len(contig) - 1:
37 | con_info[(contig[loc - 1], contig[loc + 1])] = con_info.get(
38 | (contig[loc - 1], contig[loc + 1]), []
39 | )
40 | con_info[(contig[loc - 1], contig[loc + 1])].append((cno, clen, ccov))
41 | print_contig(
42 | cno,
43 | clen,
44 | round(ccov, 2),
45 | contig[max(loc - 1, 0) : loc + 2],
46 | logger,
47 | "support contig",
48 | )
49 | kept_link = {}
50 | sec_comb = []
51 | # init node usage for current branch
52 | in_usage = dict.fromkeys(us, 0)
53 | out_usage = dict.fromkeys(ws, 0)
54 |
55 | # align contig link first, and update status
56 | logger.debug("align contig link first")
57 | for uid in us:
58 | for wid in ws:
59 | logger.debug("---------------------")
60 | u = simp_node_dict[uid]
61 | w = simp_node_dict[wid]
62 | curr_pe = pe_info[(min(uid, wid), max(uid, wid))]
63 |
64 | logger.debug("{0} -> {1} PE: {2}".format(uid, wid, curr_pe))
65 | logger.debug(
66 | "cov info: {0}[{1}] -> {2}[{3}]".format(
67 | graph.ep.flow[graph.edge(u, node)],
68 | pe_info[(min(uid, no), max(uid, no))],
69 | graph.ep.flow[graph.edge(node, w)],
70 | pe_info[(min(no, wid), max(no, wid))],
71 | )
72 | )
73 | accept = False
74 | if (uid, wid) in con_info:
75 | logger.debug(
76 | "current link supported by contig: {0}, added".format(
77 | con_info[(uid, wid)]
78 | )
79 | )
80 | accept = True
81 | if uid == wid:
82 | logger.debug(
83 | "current link is a self link: {0}, potential cyclic strain, added".format(
84 | uid
85 | )
86 | )
87 | accept = True
88 |
89 | if accept:
90 | in_usage[uid] += 1
91 | out_usage[wid] += 1
92 | kept_link[(uid, wid)] = curr_pe
93 | else:
94 | logger.debug("current link is secondary choice, process later")
95 | sec_comb.append((uid, wid, curr_pe))
96 |
97 |         logger.debug(
98 |             "then align paired-end/single-end support (if any) to links without contig support"
99 |         )
100 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True)
101 | for uid, wid, pe in sorted_sec_comb:
102 | if pe > 0:
103 | logger.debug(
104 | "-----SEC LINK {0} -> {1} PE: {2}-----".format(uid, wid, pe)
105 | )
106 | logger.debug("- link [ > 0] supported case, added")
107 | in_usage[uid] += 1
108 | out_usage[wid] += 1
109 | kept_link[(uid, wid)] = pe
110 | full_link[no] = kept_link
111 | return full_link
112 |
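# Shape sketch (hypothetical ids): full_link maps each non-trivial branch id
# to its kept links, e.g. {"12": {("3", "7"): 15, ("4", "9"): 8}} says that at
# branch 12 the in-neighbor 3 pairs with out-neighbor 7 (15 paired-end hits)
# and 4 pairs with 9; the extension routines below only cross a branch through
# one of these kept links.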
113 |
114 | # extend contigs at both ends, until the extension is no longer distinct
115 | def contig_extension(
116 | graph: Graph,
117 | simp_node_dict: dict,
118 | contig: list,
119 | ccov,
120 | full_link: dict,
121 | logger: Logger,
122 | threshold,
123 | ):
124 | visited = dict.fromkeys(simp_node_dict.keys(), False)
125 | for no in contig[1:-1]:
126 | visited[no] = True
127 | final_path = []
128 | final_path.extend([simp_node_dict[no] for no in contig][1:-1])
129 |
130 | curr = simp_node_dict[contig[-1]]
131 | logger.debug("c-t extension")
132 | while curr != None and not visited[graph.vp.id[curr]]:
133 | visited[graph.vp.id[curr]] = True
134 | final_path.append(curr)
135 | out_branches = list([n for n in curr.out_neighbors()])
136 | if len(out_branches) == 0:
137 | curr = None
138 | logger.debug("Reach the end")
139 | elif len(out_branches) == 1:
140 | curr = out_branches[0]
141 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
142 | else:
143 | f_assigned = False
144 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
145 | logger.debug("Curr is Branch")
146 | curr_links = [
147 | simp_node_dict[wid]
148 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
149 | if uid == graph.vp.id[final_path[-2]]
150 | ]
151 | if len(curr_links) == 1:
152 | # curr = curr_links[0]
153 | # logger.debug("single link next: {0}".format(graph.vp.id[curr]))
154 | if graph.vp.dp[curr_links[0]] - ccov <= -2 * threshold:
155 | curr = None
156 | logger.debug(
157 | "{0} single link < 2delta, use coverage".format(
158 | graph.vp.id[curr_links[0]]
159 | )
160 | )
161 | else:
162 | curr = curr_links[0]
163 | logger.debug("single link next: {0}".format(graph.vp.id[curr]))
164 | elif len(curr_links) > 1:
165 | logger.debug("Ambiguous, stop extension")
166 | curr = None
167 | else:
168 | logger.debug("No link in here, use coverage information")
169 | f_assigned = True
170 | else:
171 | curr = None
172 | logger.debug("Not in full link or len of path <= 1")
173 | if f_assigned:
174 | in_branches = list([n for n in curr.in_neighbors()])
175 | if len(final_path) > 1 and len(in_branches) > 0:
176 | curru = final_path[-2]
177 | opt_ws = sorted(
178 | out_branches,
179 | key=lambda ww: abs(graph.vp.dp[curru] - graph.vp.dp[ww]),
180 | )
181 | bestw = opt_ws[0]
182 | opt_us = sorted(
183 | in_branches,
184 | key=lambda uu: abs(graph.vp.dp[bestw] - graph.vp.dp[uu]),
185 | )
186 | if opt_us[0] == curru:
187 | delta = max(
188 | 2 * abs(graph.vp.dp[curru] - graph.vp.dp[bestw]), threshold
189 | )
190 | if (
191 | len(opt_us) > 1
192 | and abs(graph.vp.dp[opt_us[1]] - graph.vp.dp[bestw])
193 | <= delta
194 | ):
195 | logger.debug("ambiguous best matching, stop extension")
196 | continue
197 | if (
198 | len(opt_ws) > 1
199 | and abs(graph.vp.dp[curru] - graph.vp.dp[opt_ws[1]])
200 | <= delta
201 | ):
202 | logger.debug("ambiguous best matching, stop extension")
203 | continue
204 | logger.debug("best matching")
205 | curr = bestw
206 | else:
207 | logger.debug("Not best match")
208 | curr = None
209 | else:
210 | curr = None
211 | logger.debug("No Link + Not trivial, stop extension")
212 | if curr == None:
213 | single_bests = sorted(
214 | [(onode, graph.vp.dp[onode]) for onode in out_branches],
215 | key=lambda tp: tp[1],
216 | reverse=True,
217 | )
218 | logger.debug(
219 | "Try last bit: 1st: {0}, 2nd: {1}, delta: {2}, cov: {3}".format(
220 | (graph.vp.id[single_bests[0][0]], single_bests[0][1]),
221 | (graph.vp.id[single_bests[1][0]], single_bests[1][1]),
222 | threshold,
223 | ccov,
224 | )
225 | )
226 | if (
227 | single_bests[0][1] - ccov > -threshold
228 | and single_bests[1][1] - ccov <= -threshold
229 | ):
230 | logger.debug("Last bit succ")
231 | curr = single_bests[0][0]
232 | else:
233 | logger.debug("Last bit fail")
234 | unode = simp_node_dict[contig[0]]
235 | if len(contig) == 1 and final_path[-1] not in unode.in_neighbors():
236 | visited[contig[0]] = False
237 | final_path.pop(0)
238 | curr = unode
239 | logger.debug("s-c extension")
240 | while curr != None and not visited[graph.vp.id[curr]]:
241 | visited[graph.vp.id[curr]] = True
242 | final_path.insert(0, curr)
243 | in_branches = list([n for n in curr.in_neighbors()])
244 | if len(in_branches) == 0:
245 | curr = None
246 | logger.debug("Reach the end")
247 | elif len(in_branches) == 1:
248 | curr = in_branches[0]
249 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
250 | else:
251 | f_assigned = False
252 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
253 | logger.debug("Curr is Branch")
254 | curr_links = [
255 | simp_node_dict[uid]
256 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
257 | if wid == graph.vp.id[final_path[1]]
258 | ]
259 | if len(curr_links) == 1:
260 | # curr = curr_links[0]
261 | # logger.debug("single link next: {0}".format(graph.vp.id[curr]))
262 | if graph.vp.dp[curr_links[0]] - ccov <= -2 * threshold:
263 | curr = None
264 | logger.debug(
265 | "{0} single link < 2delta, use coverage".format(
266 | graph.vp.id[curr_links[0]]
267 | )
268 | )
269 | else:
270 | curr = curr_links[0]
271 | logger.debug("prev: {0}".format(graph.vp.id[curr]))
272 | elif len(curr_links) > 1:
273 | logger.debug("Ambiguous, stop extension")
274 | curr = None
275 | else:
276 | logger.debug("No link in here, use coverage information")
277 | f_assigned = True
278 | else:
279 | curr = None
280 | logger.debug("Not in full link or len of path <= 1")
281 | if f_assigned:
282 | out_branches = list([n for n in curr.out_neighbors()])
283 | if len(final_path) > 1 and len(out_branches) > 0:
284 | currw = final_path[1]
285 | opt_us = sorted(
286 | in_branches,
287 | key=lambda uu: abs(graph.vp.dp[currw] - graph.vp.dp[uu]),
288 | )
289 | bestu = opt_us[0]
290 | opt_ws = sorted(
291 | out_branches,
292 | key=lambda ww: abs(graph.vp.dp[bestu] - graph.vp.dp[ww]),
293 | )
294 | if opt_ws[0] == currw:
295 | delta = max(
296 | 2 * abs(graph.vp.dp[currw] - graph.vp.dp[bestu]), threshold
297 | )
298 | if (
299 | len(opt_us) > 1
300 | and abs(graph.vp.dp[opt_us[1]] - graph.vp.dp[currw])
301 | <= delta
302 | ):
303 | logger.debug("ambiguous best matching, stop extension")
304 | continue
305 | if (
306 | len(opt_ws) > 1
307 | and abs(graph.vp.dp[bestu] - graph.vp.dp[opt_ws[1]])
308 | <= delta
309 | ):
310 | logger.debug("ambiguous best matching, stop extension")
311 | continue
312 | logger.debug("best matching")
313 | curr = bestu
314 | else:
315 | logger.debug("Not best match")
316 | curr = None
317 | else:
318 | logger.debug("No Link + Not trivial, stop extension")
319 | curr = None
320 | if curr == None:
321 | single_bests = sorted(
322 | [(inode, graph.vp.dp[inode]) for inode in in_branches],
323 | key=lambda tp: tp[1],
324 | reverse=True,
325 | )
326 | logger.debug(
327 | "Try last bit: 1st: {0}, 2nd: {1}, delta: {2}, cov: {3}".format(
328 | (graph.vp.id[single_bests[0][0]], single_bests[0][1]),
329 | (graph.vp.id[single_bests[1][0]], single_bests[1][1]),
330 | threshold,
331 | ccov,
332 | )
333 | )
334 | if (
335 | single_bests[0][1] - ccov > -threshold
336 | and single_bests[1][1] - ccov <= -threshold
337 | ):
338 | logger.debug("Last bit succ")
339 | curr = single_bests[0][0]
340 | else:
341 | logger.debug("Last bit fail")
342 | return final_path
343 |
344 |
345 | def final_extension(
346 | graph: Graph, simp_node_dict: dict, contig: list, full_link: dict, logger: Logger
347 | ):
348 | visited = dict.fromkeys(simp_node_dict.keys(), False)
349 | for no in contig[1:-1]:
350 | visited[no] = True
351 | curr = simp_node_dict[contig[-1]]
352 | final_path = []
353 | final_path.extend([simp_node_dict[no] for no in contig][1:-1])
354 | # from curr to the tail, or to the non-extendable end
355 | logger.debug("c-t extension")
356 | while curr != None and not visited[graph.vp.id[curr]]:
357 | visited[graph.vp.id[curr]] = True
358 | final_path.append(curr)
359 | out_branches = list([n for n in curr.out_neighbors()])
360 | if len(out_branches) == 0:
361 | curr = None
362 | logger.debug("Reach the end")
363 | elif len(out_branches) == 1:
364 | curr = out_branches[0]
365 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
366 | else:
367 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
368 | logger.debug("Curr is Branch")
369 | curr_links = [
370 | simp_node_dict[wid]
371 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
372 | if uid == graph.vp.id[final_path[-2]]
373 | ]
374 | if len(curr_links) == 1:
375 | curr = curr_links[0]
376 | logger.debug("single link next: {0}".format(graph.vp.id[curr]))
377 | else:
378 |                     logger.debug("No/more link in here, end extension")
379 | curr = None
380 | else:
381 | curr = None
382 | logger.debug("Not in full link or len of path <= 1")
383 |
384 | unode = simp_node_dict[contig[0]]
385 | if len(contig) == 1 and final_path[-1] not in unode.in_neighbors():
386 | visited[contig[0]] = False
387 | final_path.pop(0)
388 | curr = unode
389 | # from head to the curr, or to the non-extendable end
390 | logger.debug("s-c extension")
391 | while curr != None and not visited[graph.vp.id[curr]]:
392 | visited[graph.vp.id[curr]] = True
393 | final_path.insert(0, curr)
394 | in_branches = list([n for n in curr.in_neighbors()])
395 | if len(in_branches) == 0:
396 | curr = None
397 | logger.debug("Reach the end")
398 | elif len(in_branches) == 1:
399 | curr = in_branches[0]
400 | logger.debug("direct extending.. {0}".format(graph.vp.id[curr]))
401 | else:
402 | if graph.vp.id[curr] in full_link and len(final_path) > 1:
403 | logger.debug("Curr is Branch")
404 | curr_links = [
405 | simp_node_dict[uid]
406 | for (uid, wid) in full_link[graph.vp.id[curr]].keys()
407 | if wid == graph.vp.id[final_path[1]]
408 | ]
409 | if len(curr_links) == 1:
410 | curr = curr_links[0]
411 | logger.debug("single link next: {0}".format(graph.vp.id[curr]))
412 | else:
413 | logger.debug("No/more link in here, end extension")
414 | curr = None
415 | else:
416 | curr = None
417 | logger.debug("Not in full link or len of path <= 1")
418 | return final_path
419 |
420 |
421 | def get_bubble_nodes(simp_node_dict: dict, contig: list):
422 | bubbles = []
423 | for no in contig:
424 | if simp_node_dict[no].in_degree() == 1 and simp_node_dict[no].out_degree() == 1:
425 | bubbles.append(simp_node_dict[no])
426 | return bubbles
427 |
428 |
429 | def reduce_graph(
430 | graph: Graph,
431 | simp_node_dict: dict,
432 | usages: dict,
433 | full_link: dict,
434 | logger: Logger,
435 | path,
436 | pcov,
437 | threshold,
438 | ):
439 | del_nodes_ids = []
440 | for node in path:
441 | usages[graph.vp.id[node]] += 1
442 | graph.vp.dp[node] -= pcov
443 | if graph.vp.dp[node] <= threshold:
444 | del_nodes_ids.append(graph.vp.id[node])
445 | graph.vp.color[node] = "gray"
446 | usages.pop(graph.vp.id[node])
447 | logger.debug(list_to_string(del_nodes_ids, "invalid nodes"))
448 | for links in full_link.values():
449 | for uid, wid in list(links.keys()):
450 | if (
451 | graph.vp.color[simp_node_dict[uid]] != "black"
452 | or graph.vp.color[simp_node_dict[wid]] != "black"
453 | ):
454 | links.pop((uid, wid))
455 | logger.debug("[D]{0}, {1}".format(uid, wid))
456 |
457 |
458 | def reduce_id_simple(id_l: list):
459 | ids = []
460 | for id in id_l:
461 | for iid in id.split("&"):
462 | if iid.find("*") != -1:
463 | ids.append(iid[: iid.find("*")])
464 | else:
465 | ids.append(iid)
466 | return ids
467 |
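# Id sketch: split/merged node ids are composites such as "3&5*2&-7", where
# "&" joins the merged original ids and the suffix after "*" (assumed here to
# tag a split copy) is dropped, so reduce_id_simple(["3&5*2&-7"]) returns
# ["3", "5", "-7"].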
468 |
469 | def reduce_Anode(id: str, sno2ids: dict):
470 | ids = [id]
471 | while any([iid.startswith("A") for iid in ids]):
472 | len_ids = len(ids)
473 | for i in range(len_ids):
474 | if ids[i].startswith("A"):
475 | id_v = ids.pop(i).split("*")[0]
476 | j = i
477 | for subid in sno2ids[id_v]:
478 | ids.insert(j, subid)
479 | j += 1
480 | break
481 | return ids
482 |
483 |
484 | def path_extension(
485 | graph: Graph,
486 | simp_node_dict: dict,
487 | simp_edge_dict: dict,
488 | contig_dict: dict,
489 | full_link: dict,
490 | pe_info: dict,
491 | logger: Logger,
492 | threshold,
493 | temp_dir,
494 | ):
495 | logger.debug(
496 | "-------------------------PATH Extension, delta: {0}".format(threshold)
497 | )
498 | usages = dict.fromkeys(simp_node_dict.keys(), 0) # record the usage of each nodes
499 | strain_dict = {}
500 | rid = 1
501 | sno2ids = dict()
502 | while len(contig_dict) > 0:
503 | # perform trivial split
504 | prev_ids = list(simp_node_dict.keys())
505 | trivial_split_count, id_mapping = global_trivial_split(
506 | graph, simp_node_dict, simp_edge_dict, logger
507 | )
508 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
509 | graph,
510 | simp_node_dict,
511 | simp_edge_dict,
512 | logger,
513 | "{0}/gfa/graph_S{1}.gfa".format(temp_dir, rid),
514 | )
515 | red_id_mapping = contig_dict_remapping(
516 | graph,
517 | simp_node_dict,
518 | simp_edge_dict,
519 | contig_dict,
520 | id_mapping,
521 | prev_ids,
522 | logger,
523 | )
524 | # update links
525 | for no in list(full_link.keys()):
526 | if no not in simp_node_dict:
527 | full_link.pop(no)
528 | else:
529 | kept_link = full_link.pop(no)
530 | node = simp_node_dict[no]
531 | for (uid, wid), pe in list(kept_link.items()):
532 |                     # drop the stale link, then re-insert it for every
533 |                     # (uuid, wwid) pair that is still adjacent to this
534 |                     # node, provided at least one side of the pair was
535 |                     # remapped uniquely
536 | kept_link.pop((uid, wid))
537 | if len(red_id_mapping[uid]) == 1 or len(red_id_mapping[wid]) == 1:
538 | for uuid in red_id_mapping[uid]:
539 | for wwid in red_id_mapping[wid]:
540 | if (
541 | (uuid, wwid) not in kept_link
542 | and (simp_node_dict[uuid] in node.in_neighbors())
543 | and (simp_node_dict[wwid] in node.out_neighbors())
544 | ):
545 | kept_link[(uuid, wwid)] = pe
546 | full_link[no] = kept_link
547 | # update usages
548 | for no, u in list(usages.items()):
549 | usages.pop(no)
550 | for new_no in red_id_mapping[no]:
551 | usages[new_no] = u
552 | ############################
553 | # get longest contig
554 | (longest_cno, [contig, clen, ccov]) = max(
555 | contig_dict.items(), key=lambda tp: tp[1][1]
556 | )
557 | contig_dict.pop(longest_cno)
558 | if all(usages[cn] > 0 for cn in contig):
559 | print_contig(
560 | longest_cno, clen, ccov, contig, logger, "-----> Used previously"
561 | )
562 | continue
563 | if any(graph.vp.color[simp_node_dict[no]] == "gray" for no in contig):
564 | print_contig(
565 | longest_cno,
566 | clen,
567 | ccov,
568 | contig,
569 | logger,
570 | "-----> Some node low cov, skip",
571 | )
572 | continue
573 |
574 | cbubbles = get_bubble_nodes(simp_node_dict, contig)
575 | bbl_cov = (
576 | numpy.median([graph.vp.dp[node] for node in cbubbles])
577 | if len(cbubbles) != 0
578 | else ccov
579 | )
580 | print_contig(
581 | longest_cno,
582 | clen,
583 | bbl_cov,
584 | contig,
585 | logger,
586 |             "-----> Current extending contig: orig ccov: {0}, using min {1}".format(
587 | ccov, min(ccov, bbl_cov)
588 | ),
589 | )
590 |
591 | path = contig_extension(
592 | graph,
593 | simp_node_dict,
594 | contig,
595 | min(ccov, bbl_cov),
596 | full_link,
597 | logger,
598 | threshold,
599 | )
600 | pno = "A" + str(rid)
601 | plen = path_len(graph, path)
602 | path_ids = [graph.vp.id[n] for n in path]
603 | sno2ids[pno] = []
604 | for pid in path_ids:
605 | if pid in sno2ids:
606 | sno2ids[pno].extend(sno2ids[pid])
607 | else:
608 | sno2ids[pno].append(pid)
609 | pbubbles = get_bubble_nodes(simp_node_dict, path_ids)
610 | bbl_pcov = (
611 | numpy.median([graph.vp.dp[node] for node in pbubbles])
612 | if len(pbubbles) != 0
613 | else ccov
614 | )
615 | pcov = min([ccov, bbl_pcov, bbl_cov])
616 | logger.debug(
617 | path_to_id_string(
618 | graph, path, "---*extended from contig {0}".format(longest_cno)
619 | )
620 | )
621 | logger.debug(
622 | "name: {0}, plen: {1}, pcov: {2}, bubble cov: {3}".format(
623 | pno, plen, pcov, bbl_pcov
624 | )
625 | )
626 | strain_dict[pno] = [sno2ids[pno], plen, pcov]
627 | for pid in path_ids:
628 | if pid in strain_dict:
629 | strain_dict.pop(pid)
630 | path_ins = [n for n in path[0].in_neighbors()]
631 | path_outs = [n for n in path[-1].out_neighbors()]
632 | if len(path_ins) == 0 and len(path_outs) == 0:
633 |             # both ends are tips: the path is fully isolated
634 |             logger.debug("isolated at both ends, add to strain")
635 | reduce_graph(
636 | graph, simp_node_dict, usages, full_link, logger, path, pcov, threshold
637 | )
638 | elif len(path_ins) != 0 and len(path_outs) == 0:
639 | if len(path) > 1:
640 | logger.debug("left connected, wait")
641 | reduce_graph(
642 | graph,
643 | simp_node_dict,
644 | usages,
645 | full_link,
646 | logger,
647 | path[1:],
648 | pcov,
649 | threshold,
650 | )
651 | pnode = graph_add_vertex(
652 | graph, simp_node_dict, pno, pcov, path_to_seq(graph, path[1:], pno)
653 | )
654 | graph_add_edge(
655 | graph,
656 | simp_edge_dict,
657 | path[0],
658 | pnode,
659 | graph.ep.overlap[graph.edge(path[0], path[1])],
660 | pcov,
661 | )
662 | usages[pno] = 0
663 | elif len(path_ins) == 0 and len(path_outs) != 0:
664 | if len(path) > 1:
665 | logger.debug("right connected, wait")
666 | reduce_graph(
667 | graph,
668 | simp_node_dict,
669 | usages,
670 | full_link,
671 | logger,
672 | path[:-1],
673 | pcov,
674 | threshold,
675 | )
676 | pnode = graph_add_vertex(
677 | graph, simp_node_dict, pno, pcov, path_to_seq(graph, path[:-1], pno)
678 | )
679 | graph_add_edge(
680 | graph,
681 | simp_edge_dict,
682 | pnode,
683 | path[-1],
684 | graph.ep.overlap[graph.edge(path[-2], path[-1])],
685 | pcov,
686 | )
687 | usages[pno] = 0
688 | else:
689 | if len(path) > 1:
690 | logger.debug("both connected, wait")
691 | reduce_graph(
692 | graph,
693 | simp_node_dict,
694 | usages,
695 | full_link,
696 | logger,
697 | path[1:-1],
698 | pcov,
699 | threshold,
700 | )
701 | if len(path[1:-1]) > 0:
702 | pnode = graph_add_vertex(
703 | graph,
704 | simp_node_dict,
705 | pno,
706 | pcov,
707 | path_to_seq(graph, path[1:-1], pno),
708 | )
709 | graph_add_edge(
710 | graph,
711 | simp_edge_dict,
712 | path[0],
713 | pnode,
714 | graph.ep.overlap[graph.edge(path[0], path[1])],
715 | pcov,
716 | )
717 | graph_add_edge(
718 | graph,
719 | simp_edge_dict,
720 | pnode,
721 | path[-1],
722 | graph.ep.overlap[graph.edge(path[-2], path[-1])],
723 | pcov,
724 | )
725 | usages[pno] = 0
726 |
727 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
728 | graph,
729 | simp_node_dict,
730 | simp_edge_dict,
731 | logger,
732 | "{0}/gfa/graph_S{1}post.gfa".format(temp_dir, rid),
733 | )
734 | for cno in list(contig_dict.keys()):
735 | delete = False
736 | for no in contig_dict[cno][0]:
737 | if no not in simp_node_dict:
738 | delete = True
739 | if delete:
740 | contig_dict.pop(cno)
741 | rid += 1
742 |
743 |     # remove duplicate nodes (identical sequence) left over from trivial splits
744 | seq_dict = {}
745 | for node in graph.vertices():
746 | if graph.vp.seq[node] not in seq_dict:
747 | seq_dict[graph.vp.seq[node]] = []
748 | seq_dict[graph.vp.seq[node]].append(node)
749 |
750 | for _, sp_nodes in seq_dict.items():
751 | if len(sp_nodes) > 1:
752 | sorted_sp_nodes = sorted(
753 | sp_nodes, key=lambda vnode: graph.vp.dp[vnode], reverse=True
754 | )
755 | for vnode in sorted_sp_nodes[1:]:
756 | graph_remove_vertex(graph, simp_node_dict, graph.vp.id[vnode])
757 | usages.pop(graph.vp.id[vnode])
758 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
759 | graph,
760 | simp_node_dict,
761 | simp_edge_dict,
762 | logger,
763 | "{0}/gfa/graph_S_final.gfa".format(temp_dir),
764 | )
765 |     # aggregate paired-end link counts for every pair of final nodes
766 | final_link_info = {}
767 | for node in graph.vertices():
768 | for node2 in graph.vertices():
769 | if node > node2:
770 | continue
771 |
772 |             # reduce_Anode unrolls "A"-strain ids into their member ids;
773 |             # reduce_id_simple then strips "&" and "*" so pe_info can be
774 |             # indexed by original node ids
775 |             nid1s = reduce_id_simple(reduce_Anode(graph.vp.id[node], sno2ids))
776 |             nid2s = reduce_id_simple(reduce_Anode(graph.vp.id[node2], sno2ids))
784 | kpair = (
785 | min(graph.vp.id[node], graph.vp.id[node2]),
786 | max(graph.vp.id[node], graph.vp.id[node2]),
787 | )
788 |
789 | logger.debug("nid1s: {0}, nid2s: {1}".format(nid1s, nid2s))
790 | logger.debug(
791 | "node1id: {0}, node2id: {1}".format(
792 | graph.vp.id[node], graph.vp.id[node2]
793 | )
794 | )
795 | final_link_info[kpair] = 0
796 | for id1 in nid1s:
797 | for id2 in nid2s:
798 | inner_kpair = (min(id1, id2), max(id1, id2))
799 | final_link_info[kpair] += pe_info[inner_kpair]
800 |
801 | nt_branches = get_non_trivial_branches(graph, simp_node_dict)
802 | final_links = {}
803 | for no, node in nt_branches.items():
804 | final_links[no] = {}
805 | us = [graph.vp.id[src] for src in node.in_neighbors()]
806 | ws = [graph.vp.id[tgt] for tgt in node.out_neighbors()]
807 | logger.debug("---------------------------------------------")
808 | logger.debug(
809 | "current non trivial branch: {0}, in-degree: {1}, out-degree: {2}".format(
810 | no, len(us), len(ws)
811 | )
812 | )
813 | combs = []
814 | in_usage = dict.fromkeys(us, 0)
815 |
816 | out_usage = dict.fromkeys(ws, 0)
817 | for uid in us:
818 | for wid in ws:
819 | combs.append(
820 | (uid, wid, final_link_info[(min(uid, wid), max(uid, wid))])
821 | )
822 | sorted_comb = sorted(combs, key=lambda x: x[2], reverse=True)
823 | for uid, wid, lf in sorted_comb:
824 | logger.debug("---------------------")
825 | if lf > 0 and in_usage[uid] == 0 and out_usage[wid] == 0:
826 | logger.debug(
827 | "-----SEC LINK {0} -> {1} LINK: {2}-----".format(uid, wid, lf)
828 | )
829 | logger.debug("- unique link [ > 0] supported case, added")
830 | final_links[no][(uid, wid)] = lf
831 | in_usage[uid] += 1
832 | out_usage[wid] += 1
833 |
834 |     # extend from nodes unused during contig extension and add the results to the final strain set
835 | for node in sorted(
836 | graph.vertices(), key=lambda nd: len(graph.vp.seq[nd]), reverse=True
837 | ):
838 |         if len(graph.vp.seq[node]) <= 600:  # nodes are sorted by length; stop at short nodes
839 | break
840 | if usages[graph.vp.id[node]] == 0:
841 | logger.debug("Extend from free node: {0}".format(graph.vp.id[node]))
842 | ccov = graph.vp.dp[node]
843 | path = final_extension(
844 | graph, simp_node_dict, [graph.vp.id[node]], final_links, logger
845 | )
846 | pno = "N" + str(rid)
847 | plen = path_len(graph, path)
848 | path_ids = [graph.vp.id[n] for n in path]
849 | pids = []
850 | for pid in path_ids:
851 | if pid in sno2ids:
852 | pids.extend(sno2ids[pid])
853 | else:
854 | pids.append(pid)
855 | for pid in path_ids:
856 | if pid in strain_dict:
857 | strain_dict.pop(pid)
858 | pbubbles = get_bubble_nodes(simp_node_dict, path_ids)
859 | pcov = (
860 | numpy.median([graph.vp.dp[node] for node in pbubbles])
861 | if len(pbubbles) != 0
862 | else graph.vp.dp[node]
863 | )
864 | logger.debug(
865 | path_to_id_string(
866 | graph,
867 | path,
868 | "---*extended from free node {0}".format(graph.vp.id[node]),
869 | )
870 | )
871 | logger.debug("name: {0}, plen: {1}, pcov: {2}".format(pno, plen, pcov))
872 | strain_dict[pno] = [pids, plen, pcov]
873 | for node in path:
874 | usages[graph.vp.id[node]] += 1
875 | rid += 1
876 | for sno, [_, _, scov] in list(strain_dict.items()):
877 | if scov <= 2 * threshold:
878 | strain_dict.pop(sno)
879 |
880 |     # expand zipped ("&") and split ("*") vertex ids in each strain back
881 |     # to the original node ids
882 | for cno in strain_dict.keys():
883 | [contig, clen, ccov] = strain_dict[cno]
884 | rcontig = []
885 | for id in contig:
886 | rcontig.extend(reduce_id_simple(reduce_Anode(id, sno2ids)))
897 | strain_dict[cno] = [rcontig, clen, ccov]
898 |
899 | return strain_dict, usages
900 |
--------------------------------------------------------------------------------
/utils/VStrains_Decomposition.py:
--------------------------------------------------------------------------------
1 | from utils.VStrains_Utilities import *
2 | from utils.VStrains_IO import store_reinit_graph
3 | import matplotlib.pyplot as plt
4 | import numpy
5 |
6 |
7 | def link_split(
8 | sec_comb: list,
9 | kept_link: dict,
10 | in_usage: dict,
11 | in_capacity: dict,
12 | out_usage: dict,
13 | out_capacity: dict,
14 | logger,
15 | ):
16 | """update split plan using paired end & single end information"""
17 | logger.debug("attempt to split via paired end information")
18 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True)
19 | for uid, wid, pe in sorted_sec_comb:
20 | if pe <= 0:
21 | break
22 | logger.debug("-----SEC LINK {0} -> {1} PE: {2}".format(uid, wid, pe))
23 | logger.debug("Capacity: {0} -> {1}".format(in_capacity[uid], out_capacity[wid]))
24 |         logger.debug("- distinct compatible case, added")
25 | in_usage[uid] += 1
26 | out_usage[wid] += 1
27 | kept_link[(uid, wid)] = ((in_capacity[uid] + out_capacity[wid]) / 2, pe)
28 | return
29 |
30 |
31 | def cov_split(
32 | us: list,
33 | ws: list,
34 | pe_info: dict,
35 | sec_comb: list,
36 | kept_link: dict,
37 | in_usage: dict,
38 | in_capacity: dict,
39 | out_usage: dict,
40 | out_capacity: dict,
41 | logger,
42 | ):
43 | """update split plan using coverage information"""
44 | logger.debug("attempt to split via coverage information")
45 | logger.debug(
46 | "align paired end/single end information first (if any) to isolated nodes"
47 | )
48 | sorted_sec_comb = sorted(sec_comb, key=lambda x: x[2], reverse=True)
49 | for uid, wid, pe in sorted_sec_comb:
50 | if pe <= 0:
51 | break
52 | if in_usage[uid] > 0 or out_usage[wid] > 0:
53 | continue
54 | logger.debug("-----SEC LINK {0} -> {1} PE: {2}-----".format(uid, wid, pe))
55 | logger.debug("Capacity: {0} -> {1}".format(in_capacity[uid], out_capacity[wid]))
56 | logger.debug("- link [ > 0] supported case, added")
57 | in_usage[uid] += 1
58 | out_usage[wid] += 1
59 | kept_link[(uid, wid)] = ((in_capacity[uid] + out_capacity[wid]) / 2, pe)
60 |
61 | logger.debug("obtain best match via coverage similarity")
62 | for uid in us:
63 | if in_usage[uid] > 0:
64 | continue
65 | opt_ws = sorted(ws, key=lambda wwid: abs(in_capacity[uid] - out_capacity[wwid]))
66 | wid = opt_ws[0]
67 | opt_us = sorted(us, key=lambda uuid: abs(in_capacity[uuid] - out_capacity[wid]))
68 | if opt_us[0] == uid and out_usage[wid] == 0 and (uid, wid) not in kept_link:
69 | delta = 2 * abs(in_capacity[uid] - out_capacity[wid])
70 | logger.debug(
71 | "Found coverage best match: {0} -> {1} with cov: {2}, {3}, checking delta bound: {4}".format(
72 | uid, wid, in_capacity[uid], out_capacity[wid], delta
73 | )
74 | )
75 | if (
76 | abs(in_capacity[opt_us[1]] - out_capacity[wid]) <= delta
77 | or abs(in_capacity[uid] - out_capacity[opt_ws[1]]) <= delta
78 | ):
79 | logger.debug("ambiguous matching, skip")
80 | else:
81 | logger.debug("added")
82 | in_usage[uid] += 1
83 | out_usage[wid] += 1
84 | kept_link[(uid, wid)] = (
85 | (in_capacity[uid] + out_capacity[wid]) / 2,
86 | pe_info[(min(uid, wid), max(uid, wid))],
87 | )
88 | return
89 |
90 |
91 | def balance_split(
92 | graph: Graph,
93 | simp_node_dict: dict,
94 | simp_edge_dict: dict,
95 | contig_dict: dict,
96 | pe_info: dict,
97 | logger: Logger,
98 | ref_file: str,
99 | temp_dir: str,
100 | count_id: int,
101 | threshold,
102 | is_prim: bool,
103 | ):
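    """Split N-N non-trivial branches using contig, paired-end link and
    coverage evidence.

    Contig-supported (and self-loop) links are accepted first; remaining
    combinations go through link_split in primary rounds (is_prim=True)
    or cov_split in secondary rounds. A branch is split only when the
    kept links form a perfect N-N matching and the worst in/out flow
    difference stays within 4 * threshold. `ref_file` is debug-only:
    leaves are aligned to the reference to label each kept link as
    correct, false-positive or error in a scatter plot.
    """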
104 | logger.info(
105 |         "balance split using contigs, paired-end links and coverage information, is_prim: {0}".format(
106 | is_prim
107 | )
108 | )
109 | correct_X = []
110 | correct_Y = []
111 | false_error_X = []
112 | false_error_Y = []
113 | error_X = []
114 | error_Y = []
115 | error_text = []
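    # only combinations with PE support <= cut are recorded for the
    # debug scatter plot below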
116 | cut = 100
117 |
118 | # detect all non-trivial branches right now
119 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
120 | split_branches = []
121 | node_to_contig_dict, _ = contig_map_node(contig_dict)
122 | for no, node in non_trivial_branches.items():
123 | us = [
124 | graph.vp.id[e.source()]
125 | for e in node.in_edges()
126 | if graph.ep.color[e] == "black"
127 | ]
128 | ws = [
129 | graph.vp.id[e.target()]
130 | for e in node.out_edges()
131 | if graph.ep.color[e] == "black"
132 | ]
133 | logger.debug("---------------------------------------------")
134 | logger.debug(
135 | "current non trivial branch: {0}, in-degree: {1}, out-degree: {2}".format(
136 | no, len(us), len(ws)
137 | )
138 | )
139 |
140 |         # check whether this branch is splittable
141 |         if any([pe_info[(uid, uid)] is None for uid in us]) or any(
142 |             [pe_info[(wid, wid)] is None for wid in ws]
143 | ):
144 | logger.debug(
145 | "current non-trivial branch: {0} is related to current iteration, split later".format(
146 | no
147 | )
148 | )
149 | continue
150 | if not is_non_trivial(graph, node):
151 | logger.debug(
152 | "current non-trivial branch: {0} is not non-trivial, potential bug".format(
153 | no
154 | )
155 | )
156 | continue
157 | if len(us) != len(ws):
158 | logger.debug("Not N-N split, skip")
159 | continue
160 |
161 | # check if link-split
162 | split_via_link = True
163 |
164 |         # skip link-split if any leaf comes from previously split nodes
165 | for id in us + ws:
166 | singles = id.split("&")
167 | if all([single.count("*") > 0 for single in singles]):
168 | logger.debug(
169 |                     "leaf {0} consists only of split branch nodes, no link information, skip link split".format(
170 | id
171 | )
172 | )
173 | split_via_link = False
174 | break
175 |
176 |         # skip link-split if no combination has link information
177 | if all(
178 | [pe_info[(min(uid, wid), max(uid, wid))] == 0 for uid in us for wid in ws]
179 | ):
180 | logger.debug(
181 |                 "current branch node too long to be spanned by read pairs, no link information, skip link split"
182 | )
183 | split_via_link = False
184 |
185 | # add contig supports
186 | support_contigs = node_to_contig_dict.get(no, [])
187 | con_info = {}
188 | for cno in support_contigs:
189 | [contig, clen, ccov] = contig_dict[cno]
190 | loc = contig.index(no)
191 | if loc > 0 and loc < len(contig) - 1:
192 | con_info[(contig[loc - 1], contig[loc + 1])] = con_info.get(
193 | (contig[loc - 1], contig[loc + 1]), []
194 | )
195 | con_info[(contig[loc - 1], contig[loc + 1])].append((cno, clen, ccov))
196 | print_contig(
197 | cno,
198 | clen,
199 | round(ccov, 2),
200 | contig[max(loc - 1, 0) : loc + 2],
201 | logger,
202 | "support contig",
203 | )
204 |
205 | # debug only
206 | # obtain perfect split via reference
207 | expect_link = []
208 | ref_pair_dict = {}
209 | ref_all_dict = {}
210 | if ref_file:
211 | lrefs = set()
212 | rrefs = set()
213 | error_nos = set()
214 | for uid in us:
215 | for wid in ws:
216 | u = simp_node_dict[uid]
217 | w = simp_node_dict[wid]
218 | ref_l = best_aln_score(graph, "L", [u], ref_file, temp_dir)
219 | best_ref_l = [
220 | ref
221 | for [_, l, ref, nm] in ref_l
222 | if nm == 0 and l == len(graph.vp.seq[u])
223 | ]
224 | ref_r = best_aln_score(graph, "R", [w], ref_file, temp_dir)
225 | best_ref_r = [
226 | ref
227 | for [_, l, ref, nm] in ref_r
228 | if nm == 0 and l == len(graph.vp.seq[w])
229 | ]
230 | lrefs = lrefs.union(best_ref_l)
231 | rrefs = rrefs.union(best_ref_r)
232 | ref_pair_dict[(uid, wid)] = set(best_ref_l).intersection(
233 | set(best_ref_r)
234 | )
235 | ref_all_dict[(uid, wid)] = set(
236 | [ref for [_, _, ref, nm] in ref_l if nm < 5]
237 | ).union(set([ref for [_, _, ref, nm] in ref_r if nm < 5]))
238 | if len(ref_pair_dict[(uid, wid)]) > 0:
239 | expect_link.append((uid, wid))
240 | if len(best_ref_l) == 0:
241 | error_nos.add(uid)
242 | if len(best_ref_r) == 0:
243 | error_nos.add(wid)
244 | sym_diff = lrefs.symmetric_difference(rrefs)
245 | if len(sym_diff) > 0:
246 | logger.debug(
247 | "Current branch have force mismatch connection for following strains: {0}".format(
248 | sym_diff
249 | )
250 | )
251 | # debug only
252 |
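        # kept_link maps (uid, wid) -> (assigned sub-flow, PE support);
        # sec_comb collects undecided (uid, wid, pe) candidates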
253 | kept_link = {}
254 | sec_comb = []
255 | # init node usage for current branch
256 | in_usage = dict.fromkeys(us, 0)
257 | in_capacity = {}
258 | for uid in us:
259 | in_capacity[uid] = graph.ep.flow[simp_edge_dict[(uid, no)]]
260 |
261 | out_usage = dict.fromkeys(ws, 0)
262 | out_capacity = {}
263 | for wid in ws:
264 | out_capacity[wid] = graph.ep.flow[simp_edge_dict[(no, wid)]]
265 |
266 | # align contig link first, and update status
267 | logger.debug("align contig link first")
268 | for uid in us:
269 | for wid in ws:
270 | logger.debug("---------------------")
271 | u = simp_node_dict[uid]
272 | w = simp_node_dict[wid]
273 | curr_pe = pe_info[(min(uid, wid), max(uid, wid))]
274 |
275 | logger.debug("{0} -> {1} PE: {2}".format(uid, wid, curr_pe))
276 | logger.debug(
277 | "cov info: {0}[{1}] -> {2}[{3}]".format(
278 | graph.ep.flow[graph.edge(u, node)],
279 | pe_info[(min(uid, no), max(uid, no))],
280 | graph.ep.flow[graph.edge(node, w)],
281 | pe_info[(min(no, wid), max(no, wid))],
282 | )
283 | )
284 | if ref_file:
285 | logger.debug(
286 | "intersect reference: {0}".format(ref_pair_dict[(uid, wid)])
287 | )
288 | # potential incorrect matching, but supported by links
289 | if len(ref_pair_dict[(uid, wid)]) == 0 and curr_pe > 0:
290 | logger.debug("False Positive case, WARN")
291 | accept = False
292 | if (uid, wid) in con_info:
293 | logger.debug(
294 | "current link supported by contig: {0}, added".format(
295 | con_info[(uid, wid)]
296 | )
297 | )
298 | accept = True
299 | if uid == wid:
300 | logger.debug(
301 | "current link is a self link: {0}, potential cyclic strain, added".format(
302 | uid
303 | )
304 | )
305 | accept = True
306 |
307 | if accept:
308 | in_usage[uid] += 1
309 | out_usage[wid] += 1
310 | kept_link[(uid, wid)] = (
311 | (in_capacity[uid] + out_capacity[wid]) / 2,
312 | curr_pe,
313 | )
314 | else:
315 | logger.debug("current link is secondary choice, process later")
316 | sec_comb.append((uid, wid, curr_pe))
317 | if is_prim:
318 | if split_via_link:
319 | link_split(
320 | sec_comb,
321 | kept_link,
322 | in_usage,
323 | in_capacity,
324 | out_usage,
325 | out_capacity,
326 | logger,
327 | )
328 | else:
329 | # secondary split, via link first, then coverage
330 | cov_split(
331 | us,
332 | ws,
333 | pe_info,
334 | sec_comb,
335 | kept_link,
336 | in_usage,
337 | in_capacity,
338 | out_usage,
339 | out_capacity,
340 | logger,
341 | )
342 | if not (
343 | all([u == 1 for u in in_usage.values()])
344 | and all([v == 1 for v in out_usage.values()])
345 | ):
346 | logger.debug("->Not satisfy N-N split, skip: {0}".format(kept_link))
347 | continue
348 | worst_pair_diff = max(
349 | [
350 | abs(in_capacity[uid] - out_capacity[wid])
351 | for (uid, wid) in kept_link.keys()
352 | ]
353 | )
354 | if worst_pair_diff > 4 * threshold:
355 | logger.debug(
356 | "worst pair coverage diff greater than 4 delta: {0} > {1}, too uneven, skip: {2}".format(
357 | worst_pair_diff, 4 * threshold, kept_link
358 | )
359 | )
360 | continue
361 | logger.debug("->perform split, all kept links: {0}".format(kept_link))
362 | if ref_file:
363 | logger.debug("->expected links: {0}".format(expect_link))
364 | if set(kept_link) != set(expect_link):
365 | logger.debug("Incorrect split")
366 | else:
367 | logger.debug("Correct split")
368 |
369 | split_branches.append(no)
370 | link2subs = {}
371 | counter = 0
372 | for (uid, wid), (sub_flow, pe) in kept_link.items():
373 | logger.debug("--------> {0} - {1}".format(uid, wid))
374 | # debug only
375 | if ref_file:
376 | if len(ref_pair_dict[(uid, wid)]) != 0:
377 | logger.debug("best pair")
378 | if pe <= cut:
379 | correct_X.append(pe)
380 | correct_Y.append(sub_flow)
381 | if pe < 5:
382 | logger.debug(
383 |                                 "correct pair with near-zero PE support {0}->{1}->{2}, branch size: {3}".format(
384 | uid, no, wid, len(graph.vp.seq[node])
385 | )
386 | )
387 | else:
388 | is_graph_error = False
389 | if uid in error_nos:
390 | logger.debug(
391 |                                     "src: {0} is an erroneous graph node, no optimal ref".format(
392 | uid
393 | )
394 | )
395 | is_graph_error = True
396 | if wid in error_nos:
397 | logger.debug(
398 |                                     "tgt: {0} is an erroneous graph node, no optimal ref".format(
399 | wid
400 | )
401 | )
402 | is_graph_error = True
403 | if len(ref_all_dict[(uid, wid)].intersection(sym_diff)) > 0:
404 | is_graph_error = True
405 | if is_graph_error:
406 | if pe <= cut:
407 | false_error_X.append(pe)
408 | false_error_Y.append(sub_flow)
409 | logger.debug("false positive error pair")
410 | else:
411 | if pe <= cut:
412 | error_X.append(pe)
413 | error_Y.append(sub_flow)
414 | error_text.append("{0}:{1}:{2}".format(uid, wid, pe))
415 | logger.debug("error pair")
416 | # debug only
417 | # perform split
418 | sub_id = no + "*" + str(counter)
419 | counter += 1
420 | sub_node = graph_add_vertex(
421 | graph, simp_node_dict, sub_id, sub_flow, graph.vp.seq[node]
422 | )
423 |
424 | graph_add_edge(
425 | graph,
426 | simp_edge_dict,
427 | simp_node_dict[uid],
428 | sub_node,
429 | graph.ep.overlap[simp_edge_dict[(uid, no)]],
430 | sub_flow,
431 | )
432 |
433 | graph_add_edge(
434 | graph,
435 | simp_edge_dict,
436 | sub_node,
437 | simp_node_dict[wid],
438 | graph.ep.overlap[simp_edge_dict[(no, wid)]],
439 | sub_flow,
440 | )
441 | link2subs[(uid, wid)] = sub_id
442 |
443 | # keep track of related contig record
444 | for cno in support_contigs:
445 | curr_contig, clen, ccov = contig_dict.pop(cno)
446 | branch_ind = curr_contig.index(no)
447 | uid = curr_contig[branch_ind - 1] if branch_ind > 0 else None
448 | wid = (
449 | curr_contig[branch_ind + 1]
450 | if branch_ind < len(curr_contig) - 1
451 | else None
452 | )
453 |                 if uid is not None and wid is not None:
454 | # unique mapping
455 | curr_contig[branch_ind] = link2subs[(uid, wid)]
456 | contig_dict[cno] = [curr_contig, clen, ccov]
457 |                 elif uid is None and wid is None:
458 | for sub_id in link2subs.values():
459 | # all possible contigs
460 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [
461 | [sub_id],
462 | len(graph.vp.seq[simp_node_dict[sub_id]]),
463 | graph.vp.dp[simp_node_dict[sub_id]],
464 | ]
465 |                 elif uid is not None and wid is None:
466 | for (uid2, _), sub_id in link2subs.items():
467 | if uid == uid2:
468 | curr_contig[branch_ind] = sub_id
469 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [
470 | list(curr_contig),
471 | clen,
472 | ccov,
473 | ]
474 | else:
475 | for (_, wid2), sub_id in link2subs.items():
476 | if wid == wid2:
477 | curr_contig[branch_ind] = sub_id
478 | contig_dict[cno + "$" + str(sub_id.split("*")[-1])] = [
479 | list(curr_contig),
480 | clen,
481 | ccov,
482 | ]
483 |
484 | # remove related edges and vertex, update contig tracker
485 | for uid in us:
486 | graph_remove_edge(graph, simp_edge_dict, uid, no)
487 | for wid in ws:
488 | graph_remove_edge(graph, simp_edge_dict, no, wid)
489 | graph_remove_vertex(graph, simp_node_dict, no)
490 | node_to_contig_dict, _ = contig_map_node(contig_dict)
491 |
492 | # update link info
493 | for (uid, wid), sub_id in link2subs.items():
494 | for nno in simp_node_dict.keys():
495 | pe_info[(min(sub_id, nno), max(sub_id, nno))] = None
496 | for pu, pv in list(pe_info.keys()):
497 | if pu == no or pv == no:
498 | # out of date
499 | pe_info.pop((min(pu, pv), max(pu, pv)))
500 | # final step, assign all the none val pe link to 0
501 | for k in pe_info.keys():
502 |         if pe_info[k] is None:
503 | pe_info[k] = 0
504 |     logger.debug("Number of branches removed: " + str(len(set(split_branches))))
505 | logger.debug("Split branches: " + list_to_string(set(split_branches)))
506 | logger.info("done")
507 |
508 | # plot the data
509 | if ref_file:
510 | _, (ax1) = plt.subplots(1, 1, figsize=(32, 32))
511 | ax1.scatter(correct_X, correct_Y, color="red", s=100, label="Correct")
512 | ax1.scatter(
513 | false_error_X, false_error_Y, color="blue", s=100, label="False-Positive"
514 | )
515 | ax1.scatter(error_X, error_Y, color="green", marker="^", s=100, label="Error")
516 |
517 | for index in range(len(error_X)):
518 | ax1.text(error_X[index], error_Y[index], error_text[index], size=10)
519 |
520 | ax1.set_xlabel("PE")
521 | ax1.set_ylabel("FLOW")
522 | ax1.set_title("Scatter Plot - flow vs pe")
523 | ax1.legend()
524 | plt.yticks(numpy.arange(0, 500, 10))
525 | plt.xticks(numpy.arange(0, cut + 1, 1))
526 | plt.savefig(
527 | "{0}{1}".format(temp_dir, "/tmp/scatter_plot_pest_{0}.png".format(count_id))
528 | )
529 |
530 | return len(set(split_branches))
531 |
532 |
533 | def trivial_split(
534 | graph: Graph,
535 | simp_node_dict: dict,
536 | simp_edge_dict: dict,
537 | pe_info: dict,
538 | logger: Logger,
539 | ):
540 | """
541 |     Split the graph: for any (0|1)->N or N->(0|1) branch adjacent to a non-trivial branch, fork the single edge into N edges.
542 | """
543 | logger.info("graph trivial split on NT related vertices..")
544 | # detect all non-trivial branches right now
545 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
546 | trivial_split_count = 0
547 | id_mapping = {}
548 | for id in simp_node_dict.keys():
549 | id_mapping[id] = set()
550 |
551 | for ntno, ntnode in non_trivial_branches.items():
552 | if graph.vp.color[ntnode] != "black":
553 | continue
554 | logger.debug("Current involving NT branch: {0}".format(ntno))
555 | for inode in set(ntnode.in_neighbors()):
556 | if graph.vp.color[inode] != "black":
557 | continue
558 | ino = graph.vp.id[inode]
559 | if ino not in id_mapping:
560 | id_mapping[ino] = set()
561 | ines = [ue for ue in inode.in_edges() if graph.ep.color[ue] == "black"]
562 | outes = [ve for ve in inode.out_edges() if graph.ep.color[ve] == "black"]
563 | if len(ines) > 1 and len(outes) == 1:
564 | # n to 1
565 | logger.debug("{0}, n->1 split right".format(ino))
566 | graph.vp.color[inode] = "gray"
567 | graph.ep.color[graph.edge(inode, ntnode)] = "gray"
568 | s = "A"
569 | for i in range(len(ines)):
570 | ine = ines[i]
571 | src = ine.source()
572 | snode = graph_add_vertex(
573 | graph,
574 | simp_node_dict,
575 | ino + "*" + chr(ord(s) + i),
576 | graph.ep.flow[ine],
577 | graph.vp.seq[inode],
578 | )
579 | graph.ep.color[ine] = "gray"
580 | sedge_in = graph_add_edge(
581 | graph,
582 | simp_edge_dict,
583 | src,
584 | snode,
585 | graph.ep.overlap[ine],
586 | graph.ep.flow[ine],
587 | )
588 | simp_node_dict[graph.vp.id[snode]] = snode
589 | simp_edge_dict[
590 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
591 | ] = sedge_in
592 |
593 | sedge_out = graph_add_edge(
594 | graph,
595 | simp_edge_dict,
596 | snode,
597 | ntnode,
598 | graph.ep.overlap[graph.edge(inode, ntnode)],
599 | graph.ep.flow[ine],
600 | )
601 | simp_edge_dict[
602 | (
603 | graph.vp.id[sedge_out.source()],
604 | graph.vp.id[sedge_out.target()],
605 | )
606 | ] = sedge_out
607 | id_mapping[ino].add(graph.vp.id[snode])
608 | for nno in simp_node_dict.keys():
609 | pe_info[
610 | (min(graph.vp.id[snode], nno), max(graph.vp.id[snode], nno))
611 | ] = None
612 | trivial_split_count += 1
613 | # update link information
614 | for pu, pv in list(pe_info.keys()):
615 | if pu == ino or pv == ino:
616 | # out of date
617 | pe_info.pop((min(pu, pv), max(pu, pv)))
618 |
619 | for onode in set(ntnode.out_neighbors()):
620 | if graph.vp.color[onode] != "black":
621 | continue
622 | ono = graph.vp.id[onode]
623 | if ono not in id_mapping:
624 | id_mapping[ono] = set()
625 | ines = [ue for ue in onode.in_edges() if graph.ep.color[ue] == "black"]
626 | outes = [ve for ve in onode.out_edges() if graph.ep.color[ve] == "black"]
627 | if len(ines) == 1 and len(outes) > 1:
628 | # 1 to n
629 | logger.debug("{0}, 1->n split left".format(ono))
630 | graph.vp.color[onode] = "gray"
631 | graph.ep.color[graph.edge(ntnode, onode)] = "gray"
632 | s = "A"
633 | for i in range(len(outes)):
634 | oute = outes[i]
635 | tgt = oute.target()
636 | snode = graph_add_vertex(
637 | graph,
638 | simp_node_dict,
639 | ono + "*" + chr(ord(s) + i),
640 | graph.ep.flow[oute],
641 | graph.vp.seq[onode],
642 | )
643 | graph.ep.color[oute] = "gray"
644 | sedge_out = graph_add_edge(
645 | graph,
646 | simp_edge_dict,
647 | snode,
648 | tgt,
649 | graph.ep.overlap[oute],
650 | graph.ep.flow[oute],
651 | )
652 | simp_node_dict[graph.vp.id[snode]] = snode
653 | simp_edge_dict[
654 | (
655 | graph.vp.id[sedge_out.source()],
656 | graph.vp.id[sedge_out.target()],
657 | )
658 | ] = sedge_out
659 |
660 | sedge_in = graph_add_edge(
661 | graph,
662 | simp_edge_dict,
663 | ntnode,
664 | snode,
665 | graph.ep.overlap[graph.edge(ntnode, onode)],
666 | graph.ep.flow[oute],
667 | )
668 | simp_edge_dict[
669 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
670 | ] = sedge_in
671 | id_mapping[ono].add(graph.vp.id[snode])
672 | for nno in simp_node_dict.keys():
673 | pe_info[
674 | (min(graph.vp.id[snode], nno), max(graph.vp.id[snode], nno))
675 | ] = None
676 | trivial_split_count += 1
677 | # update link information
678 | for pu, pv in list(pe_info.keys()):
679 | if pu == ono or pv == ono:
680 | # out of date
681 | pe_info.pop((min(pu, pv), max(pu, pv)))
682 | for k in pe_info.keys():
683 |         if pe_info[k] is None:
684 | pe_info[k] = 0
685 | logger.debug(
686 |         "Total trivial branches split: {0}".format(trivial_split_count)
687 | )
688 | return trivial_split_count, id_mapping
689 |
690 |
691 | def global_trivial_split(
692 | graph: Graph, simp_node_dict: dict, simp_edge_dict: dict, logger: Logger
693 | ):
694 | """
695 |     Split the graph: for any (0|1)->N or N->(0|1) branch, fork the single edge into N edges.
696 | """
697 | logger.info("graph trivial split..")
698 |
699 | BOUND_ITER = len(simp_node_dict) ** 2
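    # safety bound on total splits, quadratic in node count, guaranteeing termination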
700 | has_split = True
701 | trivial_split_count = 0
702 | id_mapping = {}
703 | for id in simp_node_dict.keys():
704 | id_mapping[id] = set()
705 | while has_split and trivial_split_count < BOUND_ITER:
706 | has_split = False
707 | for id in list(simp_node_dict.keys()):
708 | node = simp_node_dict[id]
709 | if graph.vp.color[node] != "black":
710 | continue
711 | if id not in id_mapping:
712 | id_mapping[id] = set()
713 | ines = [ue for ue in node.in_edges() if graph.ep.color[ue] == "black"]
714 | outes = [ve for ve in node.out_edges() if graph.ep.color[ve] == "black"]
715 | if len(ines) == 1 and len(outes) > 1:
716 | logger.debug(id + " split left")
717 | graph.vp.color[node] = "gray"
718 | ine = ines[0]
719 | src = ine.source()
720 | graph.ep.color[ine] = "gray"
721 | s = "A"
722 | for i in range(len(outes)):
723 | oute = outes[i]
724 | tgt = oute.target()
725 | snode = graph_add_vertex(
726 | graph,
727 | simp_node_dict,
728 | id + "*" + chr(ord(s) + i),
729 | graph.ep.flow[oute],
730 | graph.vp.seq[node],
731 | )
732 | graph.ep.color[oute] = "gray"
733 | sedge_out = graph_add_edge(
734 | graph,
735 | simp_edge_dict,
736 | snode,
737 | tgt,
738 | graph.ep.overlap[oute],
739 | graph.ep.flow[oute],
740 | )
741 | simp_node_dict[graph.vp.id[snode]] = snode
742 | simp_edge_dict[
743 | (
744 | graph.vp.id[sedge_out.source()],
745 | graph.vp.id[sedge_out.target()],
746 | )
747 | ] = sedge_out
748 |
749 | sedge_in = graph_add_edge(
750 | graph,
751 | simp_edge_dict,
752 | src,
753 | snode,
754 | graph.ep.overlap[ine],
755 | graph.ep.flow[oute],
756 | )
757 | simp_edge_dict[
758 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
759 | ] = sedge_in
760 | id_mapping[id].add(graph.vp.id[snode])
761 | has_split = True
762 | trivial_split_count += 1
763 | elif len(ines) > 1 and len(outes) == 1:
764 | logger.debug(id + " split right")
765 | graph.vp.color[node] = "gray"
766 | oute = outes[0]
767 | tgt = oute.target()
768 | graph.ep.color[oute] = "gray"
769 | s = "A"
770 | for i in range(len(ines)):
771 | ine = ines[i]
772 | src = ine.source()
773 | snode = graph_add_vertex(
774 | graph,
775 | simp_node_dict,
776 | id + "*" + chr(ord(s) + i),
777 | graph.ep.flow[ine],
778 | graph.vp.seq[node],
779 | )
780 | graph.ep.color[ine] = "gray"
781 | sedge_in = graph_add_edge(
782 | graph,
783 | simp_edge_dict,
784 | src,
785 | snode,
786 | graph.ep.overlap[ine],
787 | graph.ep.flow[ine],
788 | )
789 | simp_node_dict[graph.vp.id[snode]] = snode
790 | simp_edge_dict[
791 | (graph.vp.id[sedge_in.source()], graph.vp.id[sedge_in.target()])
792 | ] = sedge_in
793 |
794 | sedge_out = graph_add_edge(
795 | graph,
796 | simp_edge_dict,
797 | snode,
798 | tgt,
799 | graph.ep.overlap[oute],
800 | graph.ep.flow[ine],
801 | )
802 | simp_edge_dict[
803 | (
804 | graph.vp.id[sedge_out.source()],
805 | graph.vp.id[sedge_out.target()],
806 | )
807 | ] = sedge_out
808 | id_mapping[id].add(graph.vp.id[snode])
809 | has_split = True
810 | trivial_split_count += 1
811 |                 else:
812 |                     pass  # neither 1->n nor n->1: not a trivial branch
813 | if trivial_split_count >= BOUND_ITER:
814 | logger.warning("Strange topology detected, exit trivial split immediately")
815 | return None, id_mapping
816 | else:
817 |         logger.debug("Number of trivial branches removed: " + str(trivial_split_count))
818 | logger.info("done")
819 | return trivial_split_count, id_mapping
820 |
821 |
822 | def edge_cleaning(
823 | graph: Graph, simp_edge_dict: dict, contig_dict: dict, pe_info: dict, logger: Logger
824 | ):
825 | """
826 | Detect the crossing edges and select the confident edges only.
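
    An edge becomes confident when it is the only unassigned in- or
    out-edge at one of its endpoints, propagated to a fixed point.
    Remaining cross edges are force-assigned when a contig supports
    them, removed when a confident edge already shares their source or
    target, and kept otherwise.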
827 | """
828 | un_assigned_edge = graph.num_edges()
829 | assigned = dict.fromkeys(
830 | [(graph.vp.id[e.source()], graph.vp.id[e.target()]) for e in graph.edges()],
831 | False,
832 | )
833 | _, edge_to_contig_dict = contig_map_node(contig_dict)
834 | logger.debug("Total edges: " + str(un_assigned_edge))
835 |     # iterate until the number of unassigned edges converges
836 |     convergence_flag = 0
837 | while True:
838 | for node in graph.vertices():
839 | in_d = node.in_degree()
840 | in_e = []
841 | for e in node.in_edges():
842 | if assigned[(graph.vp.id[e.source()], graph.vp.id[e.target()])]:
843 | in_d = in_d - 1
844 | else:
845 | in_e.append(e)
846 |
847 | out_d = node.out_degree()
848 | out_e = []
849 | for e in node.out_edges():
850 | if assigned[(graph.vp.id[e.source()], graph.vp.id[e.target()])]:
851 | out_d = out_d - 1
852 | else:
853 | out_e.append(e)
854 |
855 | if in_d == 1:
856 | assigned[
857 | (graph.vp.id[in_e[0].source()], graph.vp.id[in_e[0].target()])
858 | ] = True
859 | un_assigned_edge = un_assigned_edge - 1
860 | if out_d == 1:
861 | assigned[
862 | (graph.vp.id[out_e[0].source()], graph.vp.id[out_e[0].target()])
863 | ] = True
864 | un_assigned_edge = un_assigned_edge - 1
865 |         if convergence_flag == un_assigned_edge:
866 | break
867 | else:
868 |             convergence_flag = un_assigned_edge
869 |
870 | logger.debug(
871 |         "un-assigned edges after the assignment iteration converged: {0}".format(
872 | un_assigned_edge
873 | )
874 | )
875 | for u, v in assigned.keys():
876 | if not assigned[(u, v)]:
877 | logger.debug(
878 | "***cross un-assigned edge: {0} -> {1}, with paired end link {2}".format(
879 | u, v, pe_info[(min(u, v), max(u, v))]
880 | )
881 | )
882 | if (u, v) in edge_to_contig_dict:
883 | logger.debug(
884 | "support contig: {0}, force assign".format(
885 | edge_to_contig_dict[(u, v)]
886 | )
887 | )
888 | assigned[(u, v)] = True
889 | else:
890 | logger.debug("support contig: None")
891 | for u, v in assigned.keys():
892 | if not assigned[(u, v)]:
893 | force_assign = True
894 | for w, z in assigned.keys():
895 | if (u == w or v == z) and assigned[(w, z)]:
896 | force_assign = False
897 | break
898 | if not force_assign:
899 | graph.remove_edge(simp_edge_dict.pop((u, v)))
900 | logger.debug(
901 | "intersect unsupported edge: {0} -> {1}, removed".format(u, v)
902 | )
903 | else:
904 | logger.debug("disjoint unsupported edge: {0} -> {1}, kept".format(u, v))
905 | return assigned
906 |
907 |
908 | def iter_graph_disentanglement(
909 | graph: Graph,
910 | simp_node_dict: dict,
911 | simp_edge_dict: dict,
912 | contig_dict: dict,
913 | pe_info: dict,
914 | ref_file: str,
915 | logger: Logger,
916 | threshold,
917 | temp_dir,
918 | ):
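    """Iteratively disentangle the graph via balanced N-N splits.

    Two phases (is_prim True, then False): each iteration runs
    balance_split followed by path compactification; when no balanced
    split succeeds, one round of trivial_split is attempted before the
    phase ends. Intermediate GFA snapshots are written per iteration.
    """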
919 | BOUND_ITER = len(simp_node_dict) ** 2
920 | it = 0
921 | total_removed_branch = 0
922 | num_split = 0
923 | iterCount = "A"
924 |     for is_prim in [True, False]:
925 | do_trivial_split = True
926 | while it < BOUND_ITER:
927 | num_split = balance_split(
928 | graph,
929 | simp_node_dict,
930 | simp_edge_dict,
931 | contig_dict,
932 | pe_info,
933 | logger,
934 | ref_file,
935 | temp_dir,
936 | it,
937 | threshold,
938 | is_prim,
939 | )
940 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
941 | graph,
942 | simp_node_dict,
943 | simp_edge_dict,
944 | logger,
945 | "{0}/gfa/split_graph_L{1}d.gfa".format(temp_dir, iterCount),
946 | )
947 | simp_path_compactification(
948 | graph, simp_node_dict, simp_edge_dict, contig_dict, pe_info, logger
949 | )
950 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
951 | graph,
952 | simp_node_dict,
953 | simp_edge_dict,
954 | logger,
955 | "{0}/gfa/split_graph_L{1}dc.gfa".format(temp_dir, iterCount),
956 | )
957 |
958 | if num_split > 0:
959 | do_trivial_split = True
960 | else:
961 | if do_trivial_split:
962 |                     # trivial split of NT-branch-related cases (FIXME)
963 | prev_ids = list(simp_node_dict.keys())
964 | trivial_split_count, id_mapping = trivial_split(
965 | graph, simp_node_dict, simp_edge_dict, pe_info, logger
966 | )
967 | logger.debug("my id mapping: {0}".format(id_mapping))
968 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
969 | graph,
970 | simp_node_dict,
971 | simp_edge_dict,
972 | logger,
973 | "{0}/gfa/split_graph_L{1}dct.gfa".format(temp_dir, iterCount),
974 | )
975 |
976 | contig_dict_remapping(
977 | graph,
978 | simp_node_dict,
979 | simp_edge_dict,
980 | contig_dict,
981 | id_mapping,
982 | prev_ids,
983 | logger,
984 | )
985 | simp_path_compactification(
986 | graph,
987 | simp_node_dict,
988 | simp_edge_dict,
989 | contig_dict,
990 | pe_info,
991 | logger,
992 | )
993 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
994 | graph,
995 | simp_node_dict,
996 | simp_edge_dict,
997 | logger,
998 | "{0}/gfa/split_graph_L{1}dctd.gfa".format(temp_dir, iterCount),
999 | )
1000 |
1001 | contig_dup_removed_s(contig_dict, logger)
1002 | trim_contig_dict(graph, simp_node_dict, contig_dict, logger)
1003 | # analysis
1004 | if ref_file:
1005 | map_ref_to_graph(
1006 | ref_file,
1007 | simp_node_dict,
1008 | "{0}/gfa/split_graph_L{1}dc.gfa".format(temp_dir, iterCount),
1009 | logger,
1010 | True,
1011 | "{0}/paf/node_to_ref_{1}.paf".format(temp_dir, iterCount),
1012 | "{0}/tmp/temp_gfa_to_fasta_{1}.fasta".format(temp_dir, iterCount),
1013 | )
1014 | # analysis
1015 | total_removed_branch += num_split
1016 | it += 1
1017 | iterCount = chr(ord(iterCount) + 1)
1018 | if num_split == 0:
1019 | if do_trivial_split:
1020 | do_trivial_split = False
1021 | else:
1022 | break
1023 |
1024 | logger.debug("Total non-trivial branches removed: " + str(total_removed_branch))
1025 | non_trivial_branches = get_non_trivial_branches(graph, simp_node_dict)
1026 | logger.debug(
1027 | list_to_string(
1028 | non_trivial_branches.keys(),
1029 |             "non-trivial branches ({0}) left after paired-end & single-end link splitting".format(
1030 | len(non_trivial_branches)
1031 | ),
1032 | )
1033 | )
1034 |
1035 | graph, simp_node_dict, simp_edge_dict = store_reinit_graph(
1036 | graph,
1037 | simp_node_dict,
1038 | simp_edge_dict,
1039 | logger,
1040 | "{0}/gfa/split_graph_final.gfa".format(temp_dir),
1041 | )
1042 | return graph, simp_node_dict, simp_edge_dict
1043 |
1044 |
1045 | def best_aln_score(graph: Graph, ori, strain, ref_file, temp_dir):
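    """Align a path's sequence against the reference set with minimap2.

    Returns one [query, block_len, ref_name, mismatch_estimate] record
    per PAF line; per the PAF format, column 10 holds the number of
    residue matches and column 11 the alignment block length, so their
    difference approximates the number of mismatched/unaligned bases.
    """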
1046 | fname = "{0}/temp_{1}.fa".format(temp_dir, ori)
1047 | pafname = "{0}/temp_{1}_aln.paf".format(temp_dir, ori)
1048 |     # the open(..., "w") below creates/truncates the query file
1049 | with open(fname, "w") as f:
1050 | f.write(">{0}\n".format(ori))
1051 | f.write("{0}\n".format(path_to_seq(graph, strain, "")))
1052 | f.close()
1053 | minimap_api(ref_file, fname, pafname)
1054 | subprocess.check_call("rm {0}".format(fname), shell=True)
1055 | best_aln = []
1056 | with open(pafname, "r") as paf:
1057 | for line in paf.readlines():
1058 |             fields = line[:-1].split("\t")
1059 |             if len(fields) < 12:
1060 |                 continue
1061 |             best_aln.append(
1062 |                 [
1063 |                     fields[0],
1064 |                     int(fields[10]),
1065 |                     fields[5],
1066 |                     int(fields[10]) - int(fields[9]),
1067 |                 ]
1068 |             )
1069 | paf.close()
1070 | subprocess.check_call("rm {0}".format(pafname), shell=True)
1071 | return best_aln
1072 |
--------------------------------------------------------------------------------