├── pictures ├── leia.jpg ├── start.txt ├── fail.txt └── success.txt ├── envs ├── clair3.yaml ├── minimap2.yaml ├── sniffles.yaml ├── clair3_no_depend.yaml ├── variant_tools.yaml ├── env_tools.txt ├── util.yaml ├── minimap_full.yaml ├── sniffles.yaml_back ├── whatshap.yaml ├── pythonRun.yaml ├── princess_env.yaml ├── clair3.yaml_back └── run_princess_env.yaml ├── cluster ├── config.yaml ├── lsf_status.py ├── pbs_status.py ├── key_mapping.yaml ├── slurm_status.py ├── scheduler.py └── cluster_config.yaml ├── LICENSE ├── scripts ├── process.py ├── update_meth_hp_ps.py ├── rawcoverage.py ├── update_sv_hp_ps.py └── phasing_report_update_vcf.py ├── config.yaml ├── .gitignore ├── modules ├── stat.smk ├── methylation.smk ├── phasing.smk ├── sv.smk ├── align.smk ├── snp.smk └── output.smk ├── Snakefile ├── README.md └── princess /pictures/leia.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MeHelmy/princess/HEAD/pictures/leia.jpg -------------------------------------------------------------------------------- /envs/clair3.yaml: -------------------------------------------------------------------------------- 1 | name: clair3.0.1.11 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - clair3=0.1.11 6 | -------------------------------------------------------------------------------- /pictures/start.txt: -------------------------------------------------------------------------------- 1 | ########################### 2 | ### Start analysis ### 3 | ########################### 4 | -------------------------------------------------------------------------------- /envs/minimap2.yaml: -------------------------------------------------------------------------------- 1 | name: Minimap2 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - minimap2=2.24 6 | - samtools=1.15.1 7 | -------------------------------------------------------------------------------- /envs/sniffles.yaml: -------------------------------------------------------------------------------- 1 | name: Sniffles2 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - sniffles=2.0.5 6 | - pysam=0.18.0=py39h5030a8b_2 7 | -------------------------------------------------------------------------------- /envs/clair3_no_depend.yaml: -------------------------------------------------------------------------------- 1 | name: clair3.0.1.11 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - clair3=1.0.3 #0.1.11 -------------------------------------------------------------------------------- /envs/variant_tools.yaml: -------------------------------------------------------------------------------- 1 | name: VariantTools 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - bcftools=1.9 6 | - bedtools=2.29.2 7 | - vcflib=1.0.0 8 | - tabix=0.2.6 9 | - survivor=1.0.6 -------------------------------------------------------------------------------- /envs/env_tools.txt: -------------------------------------------------------------------------------- 1 | conda create --no-default-packages -n princess_env ngmlr samtools minimap2 sniffles bcftools whatshap vcflib survivor tabix pandas numpy seaborn matplotlib biopython nanopolish pyfadix 2 | -------------------------------------------------------------------------------- /envs/util.yaml: -------------------------------------------------------------------------------- 1 | name: Utils 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - bcftools=1.15.1 6 | - bedtools=2.30.0 7 | - samtools=1.15.1 8 | - 
survivor=1.0.7 9 | - vcflib=1.0.3 10 | -------------------------------------------------------------------------------- /pictures/fail.txt: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ### Sorry, unsuccessful run ### 3 | ####################################### 4 | 5 | Please contact helmy dot medhat [@] gmail for more information 6 | -------------------------------------------------------------------------------- /pictures/success.txt: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### Successfully finished the analysis!! ### 3 | ################################################ 4 | 5 | Please contact helmy dot medhat [@] gmail for more information 6 | -------------------------------------------------------------------------------- /cluster/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 3 2 | latency-wait: 1200 3 | cluster-config: "cluster/cluster_config.yaml" #abs path 4 | cluster: "scheduler.py" # 5 | #cluster-status: "pbs_status.py" # 6 | cluster-status: "slurm_status.py" # 7 | max-jobs-per-second: 30 8 | max-status-checks-per-second: 10 9 | cores: 99 # how many jobs you want to submit to your cluster queue 10 | local-cores: 1 11 | rerun-incomplete: true # recommended for cluster submissions 12 | keep-going: true 13 | -------------------------------------------------------------------------------- /cluster/lsf_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import os 5 | import sys 6 | import warnings 7 | import subprocess 8 | 9 | 10 | jobid = sys.argv[1] 11 | 12 | out= subprocess.run(['bjobs','-noheader',jobid],stdout=subprocess.PIPE).stdout.decode('utf-8') 13 | 14 | state = out.strip().split()[2] 15 | 16 | 17 | map_state={"PEND":'running', 18 | "RUN":'running', 19 | "PROV":"running", 20 | "WAIT":'running', 21 | "DONE":'success', 22 | "":'success'} 23 | 24 | print(map_state.get(state,'failed')) 25 | -------------------------------------------------------------------------------- /cluster/pbs_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import subprocess 5 | import xml.etree.cElementTree as ET 6 | 7 | jobid = sys.argv[1] 8 | 9 | try: 10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 11 | 12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() 13 | job_state = xmldoc.findall('.//job_state')[0].text 14 | 15 | if job_state == "C": 16 | exit_status = xmldoc.findall('.//exit_status')[0].text 17 | if exit_status == '0': 18 | print("success") 19 | else: 20 | print("failed") 21 | else: 22 | print("running") 23 | 24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: 25 | print("failed") 26 | -------------------------------------------------------------------------------- /envs/minimap_full.yaml: -------------------------------------------------------------------------------- 1 | name: Minimap2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - bzip2=1.0.8=h7f98852_4 11 | - c-ares=1.18.1=h7f98852_0 12 | - ca-certificates=2022.6.15=ha878542_0 13 
| - htslib=1.15.1=h9753748_0 14 | - k8=0.2.5=hd03093a_2 15 | - keyutils=1.6.1=h166bdaf_0 16 | - krb5=1.19.3=h3790be6_0 17 | - libcurl=7.83.1=h7bff187_0 18 | - libdeflate=1.10=h7f98852_0 19 | - libedit=3.1.20191231=he28a2e2_2 20 | - libev=4.33=h516909a_1 21 | - libgcc-ng=12.1.0=h8d9b700_16 22 | - libgomp=12.1.0=h8d9b700_16 23 | - libnghttp2=1.47.0=h727a467_0 24 | - libssh2=1.10.0=ha56f1ee_2 25 | - libstdcxx-ng=12.1.0=ha89aaad_16 26 | - libzlib=1.2.12=h166bdaf_2 27 | - minimap2=2.24=h7132678_1 28 | - ncurses=6.3=h27087fc_1 29 | - openssl=1.1.1q=h166bdaf_0 30 | - samtools=1.15.1=h1170115_0 31 | - xz=5.2.5=h516909a_1 32 | - zlib=1.2.12=h166bdaf_2 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Medhat 4 | ======= 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /cluster/key_mapping.yaml: -------------------------------------------------------------------------------- 1 | # only parameters defined in key_mapping (see below) are passed to the command in the order specified. 
2 | #system: "pbs" #check if system is defined below 3 | system: "slurm" #check if system is defined below 4 | 5 | slurm: 6 | command: "sbatch --parsable" 7 | key_mapping: 8 | name: "--job-name={}" 9 | threads: "-n {}" 10 | mem: "--mem={}" 11 | account: "--account={}" 12 | queue: "--partition={}" 13 | time: "--time={}" 14 | nodes: "-N {}" 15 | pbs: 16 | command: "qsub" 17 | key_mapping: 18 | name: "-N {}" 19 | account: "-A {}" 20 | queue: "-q {}" 21 | threads: "-l nodes=1:ppn={}" # always use 1 node 22 | mem: "-l mem={}" 23 | time: "-l walltime={}" #min= seconds x 100 24 | output: "-o {}" 25 | error: "-e {}" 26 | host: "-l select=1:{}" 27 | lsf: 28 | command: "bsub -e lsf_%J.log -o lsf_%J.log" 29 | key_mapping: 30 | queue: "-q {}" 31 | name: "-J {}" 32 | threads: "-n {}" 33 | mem: '-R "rusage[mem={}000]"' 34 | account: "-P {}" 35 | nodes: "-C {}" 36 | 37 | 38 | 39 | # for other cluster systems see: https://slurm.schedmd.com/rosetta.pdf 40 | # cluster = "qsub -A {cluster.account} -l walltime={cluster.time} -q \ 41 | # {cluster.queue} -l nodes=1:ppn={cluster.nCPUs} -l mem={cluster.memory}" 42 | -------------------------------------------------------------------------------- /envs/sniffles.yaml_back: -------------------------------------------------------------------------------- 1 | name: sniffles2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=1_gnu 10 | - bzip2=1.0.8=h7f98852_4 11 | - c-ares=1.18.1=h7f98852_0 12 | - ca-certificates=2021.10.8=ha878542_0 13 | - keyutils=1.6.1=h166bdaf_0 14 | - krb5=1.19.3=h3790be6_0 15 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 16 | - libcurl=7.82.0=h7bff187_0 17 | - libdeflate=1.10=h7f98852_0 18 | - libedit=3.1.20191231=he28a2e2_2 19 | - libev=4.33=h516909a_1 20 | - libffi=3.4.2=h7f98852_5 21 | - libgcc-ng=11.2.0=h1d223b6_14 22 | - libgomp=11.2.0=h1d223b6_14 23 | - libnghttp2=1.47.0=h727a467_0 24 | - libnsl=2.0.0=h7f98852_0 25 | - libssh2=1.10.0=ha56f1ee_2 26 | - libstdcxx-ng=11.2.0=he4da1e4_14 27 | - libuuid=2.32.1=h7f98852_1000 28 | - libzlib=1.2.11=h36c2ea0_1013 29 | - ncurses=6.3=h9c3ff4c_0 30 | - openssl=1.1.1l=h7f98852_0 31 | - pip=22.0.4=pyhd8ed1ab_0 32 | - pysam=0.18.0=py39h5030a8b_2 33 | - python=3.9.10=h85951f9_2_cpython 34 | - python_abi=3.9=2_cp39 35 | - readline=8.1=h46c0cb4_0 36 | - setuptools=60.10.0=py39hf3d152e_0 37 | - sniffles=2.0.5=pyhdfd78af_0 38 | - sqlite=3.37.1=h4ff8645_0 39 | - tk=8.6.12=h27826a3_0 40 | - tzdata=2022a=h191b570_0 41 | - wheel=0.37.1=pyhd8ed1ab_0 42 | - xz=5.2.5=h516909a_1 43 | - zlib=1.2.11=h36c2ea0_1013 44 | -------------------------------------------------------------------------------- /scripts/process.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Processing variant file to identifie the passed variant", usage="%(prog)s [options]", 5 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, argument_default=argparse.SUPPRESS) 6 | 7 | parser.add_argument("input", help="Input file from clariovant", nargs='?', type=argparse.FileType('r'), default=sys.stdin) 8 | parser.add_argument("output", help="The output file from clariovant", nargs='?', type=argparse.FileType('w'), default=sys.stdout) 9 | parser.add_argument("-f", "--filter", help="Minimum threshold for variant to be passed (default: %(default)s)", type=int, default=200 ) 10 | 11 | args = parser.parse_args() 12 | 13 | myFile = 
args.input 14 | dataOut = args.output 15 | threshold = args.filter 16 | 17 | 18 | # myFile = sys.argv[1] 19 | # with open(myFile, "r") as dataIn, open(myFile+"_filter.vcf", 'w') as dataOut: 20 | 21 | for line in myFile: 22 | lineSplit = line.split() 23 | if line.startswith("#"): 24 | dataOut.write(line) 25 | elif lineSplit[4].startswith("<"): 26 | pass 27 | else: 28 | if int(float(lineSplit[5])) >= threshold: 29 | lineSplit[6] = 'PASS' 30 | lineSplit[5] = str(int(float(lineSplit[5]))) 31 | dataOut.write("{}\n".format("\t".join(lineSplit))) 32 | 33 | myFile.close() 34 | dataOut.close() 35 | -------------------------------------------------------------------------------- /envs/whatshap.yaml: -------------------------------------------------------------------------------- 1 | name: Whatshap 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1 9 | - _openmp_mutex=4.5 10 | - biopython=1.79 11 | - bzip2=1.0.8 12 | - c-ares=1.18.1 13 | - ca-certificates=2022.6.15 14 | - htslib=1.15.1 15 | - isa-l=2.30.0 16 | - keyutils=1.6.1 17 | - krb5=1.19.3 18 | - ld_impl_linux-64=2.36.1 19 | - libblas=3.9.0 20 | - libcblas=3.9.0 21 | - libcurl=7.83.1 22 | - libdeflate=1.10 23 | - libedit=3.1.20191231 24 | - libev=4.33 25 | - libffi=3.4.2 26 | - libgcc-ng=12.1.0 27 | - libgfortran-ng=12.1.0 28 | - libgfortran5=12.1.0 29 | - libgomp=12.1.0 30 | - liblapack=3.9.0 31 | - libnghttp2=1.47.0 32 | - libnsl=2.0.0 33 | - libopenblas=0.3.21 34 | - libsqlite=3.39.2 35 | - libssh2=1.10.0 36 | - libstdcxx-ng=12.1.0 37 | - libuuid=2.32.1 38 | - libzlib=1.2.12 39 | - ncurses=6.3 40 | - networkx=2.8.6 41 | - numpy=1.23.2 42 | - openssl=1.1.1q 43 | - packaging=21.3 44 | - pbzip2=1.1.13 45 | - pigz=2.6 46 | - pip=22.2.2 47 | - pyfaidx=0.7.1 48 | - pyparsing=3.0.9 49 | - pysam=0.19.1 50 | - python=3.10.6 51 | - python-isal=1.0.1 52 | - python_abi=3.10 53 | - pyvcf3=1.0.3 54 | - readline=8.1.2 55 | - scipy=1.9.0 56 | - setuptools=65.2.0 57 | - six=1.16.0 58 | - tk=8.6.12 59 | - tzdata=2022c 60 | - whatshap=1.4 61 | - wheel=0.37.1 62 | - xopen=1.6.0 63 | - xz=5.2.6 64 | - zlib=1.2.12 65 | -------------------------------------------------------------------------------- /cluster/slurm_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | logger = logging.getLogger("__name__") 9 | 10 | STATUS_ATTEMPTS = 20 11 | 12 | jobid = sys.argv[1] 13 | 14 | for i in range(STATUS_ATTEMPTS): 15 | try: 16 | sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid))) 17 | res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} 18 | break 19 | except sp.CalledProcessError as e: 20 | logger.error("sacct process error") 21 | logger.error(e) 22 | except IndexError as e: 23 | pass 24 | # Try getting job with scontrol instead in case sacct is misconfigured 25 | try: 26 | sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid))) 27 | m = re.search("JobState=(\w+)", sctrl_res.decode()) 28 | res = {jobid: m.group(1)} 29 | break 30 | except sp.CalledProcessError as e: 31 | logger.error("scontrol process error") 32 | logger.error(e) 33 | if i >= STATUS_ATTEMPTS - 1: 34 | print("failed") 35 | exit(0) 36 | else: 37 | time.sleep(1) 38 | 39 | status = res[jobid] 40 | 41 | if (status == "BOOT_FAIL"): 42 | print("failed") 43 | elif (status == 
"OUT_OF_MEMORY"): 44 | print("failed") 45 | elif (status.startswith("CANCELLED")): 46 | print("failed") 47 | elif (status == "COMPLETED"): 48 | print("success") 49 | elif (status == "DEADLINE"): 50 | print("failed") 51 | elif (status == "FAILED"): 52 | print("failed") 53 | elif (status == "NODE_FAIL"): 54 | print("failed") 55 | elif (status == "PREEMPTED"): 56 | print("failed") 57 | elif (status == "TIMEOUT"): 58 | print("failed") 59 | # Unclear whether SUSPENDED should be treated as running or failed 60 | elif (status == "SUSPENDED"): 61 | print("failed") 62 | else: 63 | print("running") 64 | -------------------------------------------------------------------------------- /envs/pythonRun.yaml: -------------------------------------------------------------------------------- 1 | name: PythonRun 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1 9 | - _openmp_mutex=4.5 10 | - biopython=1.74 11 | - ca-certificates=2022.12.7 12 | - certifi=2022.12.7 13 | - cycler=0.11.0 14 | - dbus=1.13.6 15 | - expat=2.5.0 16 | - fontconfig=2.14.2 17 | - freetype=2.12.1 18 | - gettext=0.21.1 19 | - glib=2.66.3 20 | - gst-plugins-base=1.14.5 21 | - gstreamer=1.14.5 22 | - icu=64.2 23 | - jpeg=9e 24 | - kiwisolver=1.4.4 25 | - ld_impl_linux-64=2.40 26 | - libblas=3.9.0 27 | - libcblas=3.9.0 28 | - libclang=9.0.1 29 | - libffi=3.2.1 30 | - libgcc-ng=12.2.0 31 | - libgfortran-ng=12.2.0 32 | - libgfortran5=12.2.0 33 | - libglib=2.66.3 34 | - libgomp=12.2.0 35 | - libiconv=1.16 36 | - liblapack=3.9.0 37 | - libllvm9=9.0.1 38 | - libopenblas=0.3.21 39 | - libpng=1.6.39 40 | - libsqlite=3.40.0 41 | - libstdcxx-ng=12.2.0 42 | - libuuid=2.32.1 43 | - libxcb=1.13 44 | - libxkbcommon=0.10.0 45 | - libxml2=2.9.10 46 | - libzlib=1.2.13 47 | - matplotlib=3.1.1 48 | - matplotlib-base=3.1.1 49 | - ncurses=6.3 50 | - nspr=4.35 51 | - nss=3.82 52 | - numpy=1.17.2 53 | - openssl=1.1.1t 54 | - packaging=23.0 55 | - pandas=1.2.3 56 | - patsy=0.5.3 57 | - pcre=8.45 58 | - pip=23.0.1 59 | - pthread-stubs=0.4 60 | - pyfaidx=0.5.5.2 61 | - pyparsing=3.0.9 62 | - pyqt=5.12.3 63 | - python=3.7.8 64 | - python-dateutil=2.8.2 65 | - python_abi=3.7 66 | - pytz=2022.7.1 67 | - qt=5.12.5 68 | - readline=8.1.2 69 | - scipy=1.5.3 70 | - seaborn=0.12.2 71 | - seaborn-base=0.12.2 72 | - setuptools=59.8.0 73 | - six=1.16.0 74 | - sqlite=3.40.0 75 | - statsmodels=0.13.5 76 | - tk=8.6.12 77 | - tornado=6.2 78 | - typing-extensions=4.5.0 79 | - typing_extensions=4.5.0 80 | - wheel=0.38.4 81 | - xorg-libxau=1.0.9 82 | - xorg-libxdmcp=1.1.3 83 | - xz=5.2.6 84 | - zlib=1.2.13 85 | - pip: 86 | - pyqt5-sip==4.19.18 87 | - pyqtchart==5.12 88 | - pyqtwebengine==5.12.1 -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | # Information about samples 4 | ########################### 5 | sample_directory: "" # samples by default should exist in this directory unless otherwise. 6 | read_type: "ont" # ont,clr or ccs 7 | sample_extension: "gz" # please add sample extension ex: fastq , fasta or gz # default gz 8 | # sample list If specified this will overlap the default behavior which is all samples in directory specified above. 9 | # If you leave empty the input will be all the samples in directory. 
10 | sample_list: [] 11 | sample_name: "SAMPLE" 12 | delete_files: "" 13 | delete_samples: "" 14 | ########################### 15 | 16 | # Information about reference 17 | ############################### 18 | reference: "/reference/GRCh38-2.1.0/genome.fa" 19 | chrs: [] 20 | 21 | 22 | # Samtools Parameters 23 | ##################### 24 | samtools_threads: 5 25 | ##################### 26 | 27 | 28 | # Aligner 29 | ######### 30 | aligner: "minimap" # minimap or ngmlr 31 | aligner_threads: 10 32 | #minimap_other_tags: "" 33 | minimap_other_tags: "-y" 34 | ######### 35 | 36 | 37 | # Structural Variant Parameters 38 | ############################### 39 | min_sv_len: 50 40 | sv_threads: 5 41 | phase_sv: 'False' 42 | mosaic_sv: 'False' 43 | 44 | 45 | 46 | # Calling Variant Parameters 47 | ############################ 48 | clair_location: "bin/Clair/clair.py" # not used anymore 49 | clair_coverage: 2 50 | clair_threads: 5 51 | # chr_split: 24925062 52 | chr_split: 29925062 53 | filter_chrs: True # Case sensitive options [True, Fasle]. 54 | clair_pypy: "/home/source/Clair/pypy3/pypy3.5-7.0.0-linux_x86_64-portable/bin/pypy" 55 | clair_model: "" 56 | gvcf_snv: 'False' 57 | #tmp_directory: "/tmp" 58 | tmp_directory: "" 59 | 60 | 61 | 62 | # Update SNPs 63 | ############# 64 | update_snps: False 65 | paternal_snps: "" 66 | maternal_snps: "" 67 | 68 | 69 | # Methylation 70 | ############# 71 | methylation: False 72 | fast5_dir: False 73 | methylation_threads: 8 74 | 75 | # Zipping Parameters 76 | #################### 77 | bgzip_threads: 5 78 | 79 | 80 | # Scripts 81 | ########## 82 | read_raw_coverage: "scripts/rawcoverage.py" 83 | read_raw_coverage_threads: 5 84 | 85 | updat_snps_script: "scripts/phasing_report_update_vcf.py" 86 | updat_sv: "scripts/update_sv_hp_ps.py" 87 | hap_methylation: "scripts/update_meth_hp_ps.py" 88 | 89 | 90 | # Cluster 91 | ########## 92 | cluster_jobs: 10 93 | number_of_tries: 0 94 | 95 | ... 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | ======= 131 | # Ignore directories 132 | samples/ 133 | .snakemake/ 134 | -------------------------------------------------------------------------------- /modules/stat.smk: -------------------------------------------------------------------------------- 1 | ############################### 2 | ###### STATISTICS RULES ###### 3 | ############################### 4 | 5 | 6 | 7 | #### RAW READS STATISTICS #### 8 | ############################## 9 | 10 | rule readsStat: 11 | """ 12 | Input is the reads in directory output is info about reads 13 | """ 14 | input: expand(data_dir + "/{sample}", sample=sample_list) 15 | output:data_dir + "/statistics/raw_reads/reads_stat.txt", 16 | message: "Calculating read coverage statistics for: {input}", 17 | params: 18 | read_stat_script = rawcoverage_script, 19 | threads: config['read_raw_coverage_threads'] 20 | log: data_dir + "/statistics/raw_reads/reads_stat.log", 21 | benchmark: data_dir + "/benchmark/raw_reads/stat.benchmark.txt" 22 | conda: READ_STAT_ENV 23 | shell: 24 | """ 25 | python {params.read_stat_script} -i {input} -o {output} -t {threads} 2>{log} 26 | """ 27 | 28 | #### BAM STATISTICS #### 29 | ######################## 30 | 31 | rule bamStatistics: 32 | """ 33 | Calculate statistics from merged bam file 34 | """ 35 | input:data_dir + "/align/{aligner}/data.bam" 36 | output:data_dir + "/statistics/{aligner}/data.stat" 37 | message:"Calculating aligned reads statistics from bam file" 38 | benchmark: data_dir + "/benchmark/align/{aligner}/stat.benchmark.txt" 39 | conda: MINIMAP2_ENV 40 | shell:""" 41 | samtools stats {input} > {output} 42 | """ 43 | 44 | #### SV STATISTICS #### 45 | ####################### 46 | 47 | rule svStat: 48 | input: expand(data_dir + 
"/sv/{aligner}/sniffles.vcf", aligner=config['aligner']) 49 | output: data_dir + "/statistics/sv/data.stat" 50 | message: "calculating statistics for structural variant" 51 | benchmark: data_dir + "/benchmark/sv/stat.benchmark.txt" 52 | conda: VARIANT_ENV 53 | shell:""" 54 | SURVIVOR stats {input} -1 -1 -1 {output} 55 | """ 56 | 57 | #### SNPs STATISTICS #### 58 | ######################### 59 | 60 | rule snpStat: 61 | input: 62 | snp_file = expand(data_dir + "/phased/{aligner}/data.vcf.gz", aligner=config['aligner']) , 63 | snp_file_index = expand(data_dir + "/phased/{aligner}/data.vcf.gz.tbi", aligner=config['aligner']) , 64 | output: data_dir + "/statistics/snp/snp.txt", 65 | message: "Calculate SNPs statistics" 66 | benchmark: data_dir + "/benchmark/snp/stat.benchmark.txt" 67 | conda: VARIANT_ENV 68 | shell:""" 69 | bcftools stats {input.snp_file} > {output} 70 | """ 71 | #### ALL STATISTICS No READs #### 72 | ################################# 73 | 74 | rule statNoReads: 75 | input: 76 | expand(data_dir + "/statistics/{aligner}/data.stat", aligner=config['aligner']), 77 | data_dir + "/statistics/sv/data.stat", 78 | data_dir + "/statistics/snp/snp.txt", 79 | output: data_dir + "/stat.NoReads.txt" 80 | shell: "touch {output}" 81 | 82 | #### ALL STATISTICS #### 83 | ####################### 84 | 85 | rule stat: 86 | input: 87 | expand(data_dir + "/statistics/{aligner}/data.stat", aligner=config['aligner']), 88 | data_dir + "/statistics/raw_reads/reads_stat.txt", 89 | data_dir + "/statistics/sv/data.stat", 90 | data_dir + "/statistics/snp/snp.txt" 91 | output: data_dir + "/stat.txt" 92 | shell: "touch {output}" 93 | -------------------------------------------------------------------------------- /scripts/update_meth_hp_ps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This script update Methylation file to add both HP haplotag and PS phasing block, It takes as input meth file, hp, ps. 5 | """ 6 | import argparse 7 | import sys, os 8 | from operator import itemgetter 9 | from collections import Counter 10 | 11 | # Python program to print 12 | # green text with red background 13 | # 14 | # from colorama import init 15 | # from termcolor import colored 16 | # 17 | # init() 18 | 19 | 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. use command -h for info.", 24 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 25 | description='Produce phasing report for Methylation', 26 | add_help=True, ) 27 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 28 | # parser.add_argument('input', help='Input file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) 29 | # parser.add_argument('output', help='Output file', nargs="?", type=argparse.FileType('w'), default=sys.stdout) 30 | 31 | parser.add_argument('input', nargs='?', help="Methylation file", 32 | type=argparse.FileType('r'), 33 | default=sys.stdin) 34 | parser.add_argument('hp', nargs='?', help="tab delimeted read\thp\tps file", 35 | type=argparse.FileType('r')) 36 | parser.add_argument('output', nargs='?', help="Output file, PS and HP will be added.", 37 | type=argparse.FileType('w+'), 38 | default=sys.stdout) 39 | 40 | parser.set_defaults(func=update_meth) 41 | 42 | # if not argument print help. 
43 | if len(sys.argv) == 1 and sys.stdin.isatty(): # sys.stdin.isatty() returns false if there's something in stdin 44 | parser.print_help(sys.stderr) 45 | sys.exit(1) 46 | 47 | args = parser.parse_args() 48 | 49 | 50 | if 'func' in args: 51 | args.func(args) 52 | else: 53 | parser.print_help() 54 | 55 | def update_meth(args): 56 | # check if the input from stdin 57 | if not sys.stdin.isatty(): # there is nothing in the stdin 58 | if args.input.name.endswith("gz"): 59 | import gzip 60 | myfile = gzip.open(args.input.name, 'rt') # t is not a must normally it is default. 61 | else: 62 | myfile = args.input 63 | else: 64 | myfile = args.input 65 | 66 | # read the Haplotype file as dictionary 67 | hp_dic = {} 68 | with args.hp as hp_in: 69 | for line in hp_in: 70 | id, hp, ps = line.split() 71 | hp_dic[id] = [hp.rsplit(":", 1)[-1], ps.rsplit(":", 1)[-1]] # read hp, ps 72 | 73 | 74 | with myfile as data_in, args.output as data_out: 75 | first = True 76 | n = 0 77 | for line in data_in: 78 | n+=1 79 | if first: 80 | first = False 81 | data_out.write(line.strip()+"\tHP\tPS\n") 82 | continue 83 | line_split = line.split() 84 | read = line_split[4] 85 | hp, ps = hp_dic.get(read, ['.', '.']) # In case f the read have not been haplotyped. 86 | data_out.write("{}\t{}\t{}\n".format(line.strip(), hp, ps)) 87 | 88 | def main(): 89 | args = get_args() 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /envs/princess_env.yaml: -------------------------------------------------------------------------------- 1 | name: princess_env 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - bcftools=1.9=h68d8f2e_7 9 | - bedtools=2.29.2=hc088bd4_0 10 | - biopython=1.74=py37h516909a_0 11 | - bwa=0.7.17=hed695b0_6 12 | - bzip2=1.0.8=h516909a_1 13 | - ca-certificates=2020.12.5=ha878542_0 14 | - certifi=2020.12.5=py37h89c1867_1 15 | - curl=7.65.3=hf8cf82a_0 16 | - cycler=0.10.0=py_1 17 | - dbus=1.13.6=he372182_0 18 | - decorator=4.4.0=py_0 19 | - eigen=3.3.7=h6bb024c_1000 20 | - expat=2.2.5=he1b5a44_1004 21 | - fontconfig=2.13.1=h86ecdb6_1001 22 | - freetype=2.10.0=he983fc9_1 23 | - gettext=0.19.8.1=hc5be6a0_1002 24 | - glib=2.58.3=h6f030ca_1002 25 | - gsl=2.5=h294904e_1 26 | - gst-plugins-base=1.14.5=h0935bb2_0 27 | - gstreamer=1.14.5=h36ae1b5_0 28 | - hdf5=1.10.5=nompi_h3c11f04_1104 29 | - htslib=1.9=ha228f0b_7 30 | - icu=64.2=he1b5a44_1 31 | - jpeg=9c=h14c3975_1001 32 | - kiwisolver=1.1.0=py37hc9558a2_0 33 | - krb5=1.16.3=h05b26f9_1001 34 | - libblas=3.8.0=14_openblas 35 | - libcblas=3.8.0=14_openblas 36 | - libclang=9.0.0=hc9558a2_1 37 | - libcurl=7.65.3=hda55be3_0 38 | - libdeflate=1.0=h14c3975_1 39 | - libedit=3.1.20170329=hf8c457e_1001 40 | - libffi=3.2.1=he1b5a44_1006 41 | - libgcc-ng=9.1.0=hdf63c60_0 42 | - libgfortran-ng=7.3.0=hdf63c60_2 43 | - libiconv=1.15=h516909a_1005 44 | - liblapack=3.8.0=14_openblas 45 | - libllvm9=9.0.0=hc9558a2_2 46 | - libopenblas=0.3.7=h6e990d7_2 47 | - libpng=1.6.37=hed695b0_0 48 | - libssh2=1.8.2=h22169c7_2 49 | - libstdcxx-ng=9.1.0=hdf63c60_0 50 | - libuuid=2.32.1=h14c3975_1000 51 | - libxcb=1.13=h14c3975_1002 52 | - libxkbcommon=0.9.1=hebb1f50_0 53 | - libxml2=2.9.9=hee79883_5 54 | - llvm-openmp=8.0.1=hc9558a2_0 55 | - matplotlib=3.1.1=py37_1 56 | - matplotlib-base=3.1.1=py37he7580a8_1 57 | - minimap2=2.17=h8b12597_1 58 | - nanopolish=0.11.2=h705302d_0 59 | - ncurses=6.1=hf484d3e_1002 60 | - networkx=2.4=py_0 61 | - ngmlr=0.2.7=he860b03_1 
62 | - nspr=4.23=he1b5a44_0 63 | - nss=3.47=he751ad9_0 64 | - numpy=1.17.2=py37h95a1406_0 65 | - openmp=8.0.1=0 66 | - openssl=1.1.1c=h516909a_0 67 | - pandas=0.25.2=py37hb3f55d8_0 68 | - patsy=0.5.1=py_0 69 | - pcre=8.43=he1b5a44_0 70 | - perl=5.26.2=h516909a_1006 71 | - pip=19.3.1=py37_0 72 | - pthread-stubs=0.4=h14c3975_1001 73 | - pyfaidx=0.5.5.2=py_1 74 | - pyparsing=2.4.2=py_0 75 | - pyqt=5.12.3=py37hcca6a23_0 76 | - pysam=0.15.3=py37hda2845c_1 77 | - python=3.7.3=h33d41f4_1 78 | - python-dateutil=2.8.0=py_0 79 | - python_abi=3.7=1_cp37m 80 | - pytz=2019.3=py_0 81 | - pyvcf=0.6.8=py37_1000 82 | - qt=5.12.5=h0c104cb_0 83 | - readline=8.0=hf8c457e_0 84 | - samtools=1.9=h10a08f8_12 85 | - scipy=1.3.1=py37h921218d_2 86 | - seaborn=0.9.0=py_1 87 | - setuptools=41.4.0=py37_0 88 | - six=1.12.0=py37_1000 89 | - sniffles=1.0.12=h8b12597_1 90 | - sqlite=3.30.1=hcee41ef_0 91 | - statsmodels=0.10.1=py37hc1659b7_0 92 | - survivor=1.0.6=h6bb024c_0 93 | - tabix=0.2.6=ha92aebf_0 94 | - tclap=1.2.1=h470a237_1 95 | - tk=8.6.9=hed695b0_1003 96 | - tornado=6.0.3=py37h516909a_0 97 | - vcflib=1.0.0_rc3=py37hc088bd4_0 98 | - whatshap=0.18=py37h6bb024c_0 99 | - wheel=0.33.6=py37_0 100 | - xopen=0.8.3=py37_0 101 | - xorg-libxau=1.0.9=h14c3975_0 102 | - xorg-libxdmcp=1.1.3=h516909a_0 103 | - xz=5.2.4=h14c3975_1001 104 | - zlib=1.2.11=h516909a_1006 105 | - pip: 106 | - pyqt5-sip==4.19.18 107 | - pyqtwebengine==5.12.1 108 | -------------------------------------------------------------------------------- /modules/methylation.smk: -------------------------------------------------------------------------------- 1 | ################################# 2 | ###### METHYLATION RULES ####### 3 | ################################# 4 | 5 | 6 | 7 | #### NANOPOLISH INDEX #### 8 | ########################## 9 | 10 | rule nanoIndex: 11 | """ 12 | Preparing index to links read ids with their signal-level data in the FAST5 files 13 | """ 14 | input: 15 | fastq_file=data_dir + "/{sample}", 16 | output: data_dir + "/{sample}.index.readdb" 17 | message: "Input file is {wildcards.sample}" 18 | params: 19 | fast5_dir = config['fast5_dir']#lambda wildcards: ont_sample_dir[wildcards.sample] 20 | benchmark: data_dir + "/benchmark/methylation/index.{sample}.benchmark.txt" 21 | conda: PRINCESS_ENV 22 | shell:""" 23 | nanopolish index -d {params.fast5_dir} {input.fastq_file} 24 | """ 25 | 26 | #### NANOPOLISH METHYLATION #### 27 | ################################ 28 | 29 | rule callMeth: 30 | """ 31 | Calling Methylation 32 | """ 33 | input: 34 | fastq_file=data_dir + "/{sample}", 35 | bam_file=data_dir + "/align/{aligner}/{sample}.bam", 36 | bam_index=data_dir + "/align/{aligner}/{sample}.bam.bai", 37 | fastq_index=data_dir + "/{sample}.index.readdb", 38 | output: data_dir + "/meth/{aligner}/{sample}.methylation_calls.tsv" 39 | params: 40 | ref = REFERENCES, 41 | threads: config['methylation_threads'] 42 | message: "Calling Methylation for sample: {wildcards.sample}" 43 | benchmark: data_dir + "/benchmark/methylation/{aligner}/call_methylation.{sample}.benchmark.txt" 44 | conda: PRINCESS_ENV 45 | shell:""" 46 | nanopolish call-methylation -t 8 -r {input.fastq_file} -b {input.bam_file} -g {params.ref} > {output} 47 | """ 48 | 49 | #### NANOPOLISH METHYLATION Haplotype #### 50 | ########################################## 51 | 52 | rule callMethHap: 53 | """ 54 | Haplotype Methylation 55 | """ 56 | input: 57 | meth = data_dir + "/meth/{aligner}/{sample}.methylation_calls.tsv", 58 | bam = data_dir + "/align/{aligner}/data_hap.tab", 59 | 
output: data_dir + "/meth/{aligner}/{sample}.methylation_calls_hap.tsv" 60 | params: 61 | update_script = config['hap_methylation'], 62 | message: "Updating Methylation for {wildcards.sample} using align/{wildcards.aligner}/data_hap.tab" 63 | benchmark: data_dir + "/benchmark/methylation/{aligner}/call_methylation.{sample}.hap.benchmark.txt" 64 | shell:""" 65 | python {params.update_script} {input.meth} {input.bam} {output} 66 | """ 67 | 68 | #### CALL ALL METHYLATION #### 69 | ############################## 70 | 71 | rule allMethylation: 72 | """ 73 | Call all methylation samples. 74 | """ 75 | input: lambda wildcards: expand(data_dir + "/meth/{aligner}/{sample}.methylation_calls.tsv", aligner=wildcards.aligner, sample=config['sample_list'].split()) 76 | output: data_dir + "/meth/{aligner}/methylation_calls.tsv", 77 | message: "Collecting all methylation samples {input}" 78 | shell:""" 79 | touch {output} 80 | """ 81 | 82 | #### CALL ALL METHYLATION PHASED & HAPLOTYPED #### 83 | ################################################# 84 | 85 | rule allMethylationHap: 86 | """ 87 | Call all methylation samples phased. 88 | """ 89 | input: lambda wildcards: expand(data_dir + "/meth/{aligner}/{sample}.methylation_calls_hap.tsv", aligner=wildcards.aligner, sample=config['sample_list'].split()) 90 | output: data_dir + "/meth/{aligner}/methylation_calls_hap.tsv", 91 | message: "Collecting all methylation samples {input}" 92 | shell:""" 93 | touch {output} 94 | """ 95 | -------------------------------------------------------------------------------- /modules/phasing.smk: -------------------------------------------------------------------------------- 1 | 2 | ############################ 3 | ###### PHASING RULES ###### 4 | ########################### 5 | 6 | 7 | 8 | 9 | #### GENOTYPING #### 10 | #################### 11 | 12 | rule gt: 13 | """ 14 | Genotype SNPs one chromosome per time. 15 | """ 16 | input: 17 | bam=data_dir + "/align/{aligner}/data.bam", 18 | bam_index=data_dir + "/align/{aligner}/data.bam.bai", 19 | snps=data_dir + "/snp/{aligner}/data.{chr}.vcf", 20 | output: 21 | data_dir + "/gt/{aligner}/data.{chr}.vcf" 22 | params: 23 | reference=REFERENCES, 24 | conda: WHATSHAP_ENV 25 | log: 26 | data_dir + "/gt/{aligner}/data.{chr}.log" 27 | benchmark: data_dir + "/benchmark/gt/{aligner}/{chr}.benchmark.txt" 28 | shell:""" 29 | whatshap genotype --reference {params.reference} \ 30 | --ignore-read-groups \ 31 | --output {output} {input.snps} {input.bam} > {log} 2>&1 32 | """ 33 | 34 | #### PHASING #### 35 | ################# 36 | 37 | rule phasing: 38 | """ 39 | Phase SNPs one chromosome per time 40 | """ 41 | input: 42 | bam=data_dir + "/align/{aligner}/data.bam", 43 | bam_index=data_dir + "/align/{aligner}/data.bam.bai", 44 | snps=data_dir + "/snp/{aligner}/data.{chr}.vcf", 45 | output: 46 | phased=temp(data_dir + "/phased/{aligner}/data.{chr}.vcf"), 47 | params: 48 | reference=REFERENCES, 49 | read_list=data_dir + "/phased/{aligner}/data.{chr}.reads", 50 | log: 51 | data_dir + "/phased/{aligner}/data.{chr}.log" 52 | conda: WHATSHAP_ENV 53 | benchmark: data_dir + "/benchmark/phase/{aligner}/{chr}.benchmark.txt" 54 | shell:""" 55 | whatshap phase --reference {params.reference} \ 56 | --output {output.phased} {input.snps} {input.bam} \ 57 | --ignore-read-groups \ 58 | --output-read-list {params.read_list} > {log} 2>&1 59 | """ 60 | 61 | #### CONCAT PHASING #### 62 | ######################## 63 | 64 | rule allPhased: 65 | """ 66 | Concat all the phased SNPs into one file. 
67 | """ 68 | input:lambda wildcards: expand(data_dir + "/phased/{aligner}/data.{chr}.vcf", aligner=wildcards.aligner, chr=chr_list), 69 | output: temp(data_dir + "/phased/{aligner}/data.vcf") 70 | conda: VARIANT_ENV 71 | params: 72 | sample_name = SAMPLE_NAME, 73 | benchmark: data_dir + "/benchmark/phase/{aligner}/concat_phased.benchmark.txt" 74 | shell:""" 75 | echo "{params.sample_name}" > sample_name.txt && vcfcat {input} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output} 76 | """ 77 | 78 | #### HAPLOTYPE BAM FILE #### 79 | ############################ 80 | 81 | rule partionBam: 82 | """ 83 | Partion a bam file based on the phased SNPs, 84 | It will use the updated SNPs if the parental SNPs were provided. 85 | """ 86 | input: 87 | bam = data_dir + "/align/{aligner}/data.bam", # SM filed must be set to the sample name in vcf file 88 | bam_index = data_dir + "/align/{aligner}/data.bam.bai", 89 | snp = lambda wildcards: data_dir + "/phased/{aligner}/data_updated.vcf.gz" if config['update_snps'] else data_dir + "/phased/{aligner}/data.vcf.gz", 90 | snp_index = lambda wildcards: data_dir + "/phased/{aligner}/data_updated.vcf.gz.tbi" if config['update_snps'] else data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 91 | output: 92 | hap_bam = data_dir + "/align/{aligner}/data_hap.bam" 93 | message: "Partitioning bam file" 94 | conda: WHATSHAP_ENV 95 | params: 96 | ref = REFERENCES 97 | shell:""" 98 | whatshap haplotag --ignore-read-groups -o {output.hap_bam} -r {params.ref} {input.snp} {input.bam} 99 | """ 100 | -------------------------------------------------------------------------------- /cluster/scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import sys, os 5 | import subprocess 6 | from subprocess import Popen, PIPE 7 | import yaml 8 | 9 | 10 | def eprint(*args, **kwargs): 11 | print(*args, file=sys.stderr, **kwargs) 12 | 13 | 14 | def run_cmd(cmd): 15 | eprint("Running from subprocess" + str(cmd)) 16 | try: 17 | subprocess.run(cmd, check=True, universal_newlines=True) 18 | except subprocess.CalledProcessError as e: 19 | logger.error("Error in subprocess:\n{}".format(e.returncode)) 20 | 21 | 22 | def convert_time_to_seconds(run_time): 23 | # Number of : in input: 24 | colons = run_time.count(":") 25 | if colons == 3: 26 | # dd:hh:mm:ss 27 | d, h, m, s = run_time.split(":") 28 | return str(day2sec(int(d)) + hours2sec(int(h)) + minutes2sec(int(m)) + int(s)) 29 | elif colons == 2: 30 | # hh:mm:ss 31 | h, m, s = run_time.split(":") 32 | return str(hours2sec(int(h)) + minutes2sec(int(m)) + int(s)) 33 | elif colons == 1: 34 | # mm:ss 35 | m, s = run_time.split(":") 36 | return str(minutes2sec(int(m)) + int(s)) 37 | else: 38 | return run_time 39 | 40 | 41 | def day2sec(days): 42 | return days * 24 * 60 * 60 43 | 44 | 45 | def hours2sec(hours): 46 | return hours * 60 * 60 47 | 48 | 49 | def minutes2sec(minutes): 50 | return minutes * 60 51 | 52 | 53 | def qsub_to_slurm_time(qsub_time): 54 | qsub_time = qsub_time.split(":") 55 | slurm_time = "" 56 | if len(qsub_time) == 4: 57 | slurm_time = "{}-{}:{}:{}".format( 58 | qsub_time[0], qsub_time[1], qsub_time[2], qsub_time[3] 59 | ) 60 | elif len(qsub_time) == 3: 61 | slurm_time = "{}:{}:{}".format(qsub_time[0], qsub_time[1], qsub_time[2]) 62 | return slurm_time 63 | 64 | 65 | # let snakemake read job_properties 66 | from snakemake.utils import read_job_properties 67 | 68 | 69 | jobscript = sys.argv[1] 70 | 71 | job_properties = 
read_job_properties(jobscript) 72 | 73 | 74 | # default parameters defined in cluster_spec (accessed via snakemake read_job_properties) 75 | cluster_param = job_properties["cluster"] 76 | 77 | 78 | if job_properties["type"] == "single": 79 | cluster_param["name"] = "snakejob.{}".format(job_properties["rule"]) 80 | elif job_properties["type"] == "group": 81 | cluster_param["name"] = job_properties["groupid"] 82 | else: 83 | raise NotImplementedError( 84 | f"Don't know what to do with job_properties['type']=={job_properties['type']}" 85 | ) 86 | 87 | 88 | # don't overwrite default parameters if defined in rule (or config file) 89 | if ("threads" in job_properties) and ("threads" not in cluster_param): 90 | cluster_param["threads"] = job_properties["threads"] 91 | for res in ["time", "mem"]: 92 | if (res in job_properties["resources"]) and (res not in cluster_param): 93 | cluster_param[res] = job_properties["resources"][res] 94 | 95 | # check which system you are on and load command command_options 96 | key_mapping_file = os.path.join(os.path.dirname(__file__), "key_mapping.yaml") 97 | command_options = yaml.load(open(key_mapping_file), Loader=yaml.BaseLoader) 98 | system = command_options["system"] 99 | command = command_options[system]["command"] 100 | key_mapping = command_options[system]["key_mapping"] 101 | 102 | 103 | ## TODO: Comment this line || test while using normal time 01:00:00:00 104 | # time in hours 105 | if "time" in cluster_param: 106 | # cluster_param["time"]=int(cluster_param["time"])*60 107 | # cluster_param["time"]=convert_time_to_seconds(cluster_param["time"]) 108 | if system == "pbs": 109 | cluster_param["time"] = cluster_param["time"] 110 | elif system == "slurm": 111 | cluster_param["time"] = qsub_to_slurm_time(cluster_param["time"]) 112 | 113 | 114 | # construct command: 115 | for key in key_mapping: 116 | if key in cluster_param: 117 | command += " " 118 | command += key_mapping[key].format(cluster_param[key]) 119 | 120 | command += " {}".format(jobscript) 121 | 122 | eprint("submit command: " + command) 123 | 124 | # run_cmd(command.split(' ')) 125 | p = Popen(command.split(" "), stdout=PIPE, stderr=PIPE) 126 | output, error = p.communicate() 127 | if p.returncode != 0: 128 | raise Exception( 129 | "Job can't be submitted\n" + output.decode("utf-8") + error.decode("utf-8") 130 | ) 131 | else: 132 | res = output.decode("utf-8") 133 | 134 | if system == "lsf": 135 | import re 136 | 137 | match = re.search(r"Job <(\d+)> is submitted", res) 138 | jobid = match.group(1) 139 | 140 | elif system == "pbs": 141 | jobid = res.strip().split(".")[0] 142 | 143 | else: 144 | jobid = int(res.strip().split()[-1]) 145 | 146 | print(jobid) 147 | -------------------------------------------------------------------------------- /envs/clair3.yaml_back: -------------------------------------------------------------------------------- 1 | name: clair3.0.1.11 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - _tflow_select=2.3.0=mkl 11 | - absl-py=1.0.0=pyhd8ed1ab_0 12 | - astor=0.8.1=pyh9f0ad1d_0 13 | - astunparse=1.6.3=pyhd8ed1ab_0 14 | - biopython=1.79=py36h8f6f2f9_0 15 | - blinker=1.4=py_1 16 | - blosc=1.21.1=hd32f23e_0 17 | - brotlipy=0.7.0=py36h8f6f2f9_1001 18 | - bzip2=1.0.8=h7f98852_4 19 | - c-ares=1.18.1=h7f98852_0 20 | - ca-certificates=2022.5.18.1=ha878542_0 21 | - cachetools=2.1.0=py_0 22 | - certifi=2021.5.30=py36h5fab9bb_0 23 | - cffi=1.14.4=py36h211aa47_0 24 | 
- charset-normalizer=2.0.12=pyhd8ed1ab_0 25 | - clair3=0.1.11=py36hb9dc472_5 26 | - click=8.0.1=py36h5fab9bb_0 27 | - cryptography=35.0.0=py36hb60f036_0 28 | - cycler=0.11.0=pyhd8ed1ab_0 29 | - dataclasses=0.8=pyh787bdff_2 30 | - expat=2.4.8=h27087fc_0 31 | - freetype=2.10.4=h0708190_1 32 | - gast=0.3.3=py_0 33 | - gdbm=1.18=h0a1914f_2 34 | - google-auth=1.2.1=py_0 35 | - google-auth-oauthlib=0.4.1=py_2 36 | - google-pasta=0.2.0=pyh8c360ce_0 37 | - grpcio=1.38.1=py36h8e87921_0 38 | - h5py=2.10.0=nompi_py36h4510012_106 39 | - hdf5=1.10.6=nompi_h6a2412b_1114 40 | - htslib=1.10.2=hd3b49d5_1 41 | - idna=3.3=pyhd8ed1ab_0 42 | - importlib-metadata=4.8.1=py36h5fab9bb_0 43 | - isa-l=2.30.0=ha770c72_4 44 | - jpeg=9e=h166bdaf_1 45 | - keras-preprocessing=1.1.2=pyhd8ed1ab_0 46 | - keyutils=1.6.1=h166bdaf_0 47 | - kiwisolver=1.3.1=py36h605e78d_1 48 | - krb5=1.19.3=h3790be6_0 49 | - lcms2=2.11=hcbb858e_1 50 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 51 | - libblas=3.9.0=14_linux64_openblas 52 | - libcblas=3.9.0=14_linux64_openblas 53 | - libcurl=7.83.1=h7bff187_0 54 | - libdeflate=1.6=h516909a_0 55 | - libedit=3.1.20191231=he28a2e2_2 56 | - libev=4.33=h516909a_1 57 | - libffi=3.2.1=he1b5a44_1007 58 | - libgcc-ng=12.1.0=h8d9b700_16 59 | - libgfortran-ng=12.1.0=h69a702a_16 60 | - libgfortran5=12.1.0=hdcd56e2_16 61 | - libgomp=12.1.0=h8d9b700_16 62 | - liblapack=3.9.0=14_linux64_openblas 63 | - libnghttp2=1.47.0=h727a467_0 64 | - libnsl=2.0.0=h7f98852_0 65 | - libopenblas=0.3.20=pthreads_h78a6416_0 66 | - libpng=1.6.37=h21135ba_2 67 | - libprotobuf=3.18.0=h780b84a_1 68 | - libssh2=1.10.0=ha56f1ee_2 69 | - libstdcxx-ng=12.1.0=ha89aaad_16 70 | - libtiff=4.1.0=hc3755c2_3 71 | - libzlib=1.2.12=h166bdaf_0 72 | - lz4-c=1.9.2=he1b5a44_3 73 | - lzo=2.10=h516909a_1000 74 | - markdown=3.3.7=pyhd8ed1ab_0 75 | - matplotlib-base=3.3.4=py36hd391965_0 76 | - mock=4.0.3=py36h5fab9bb_1 77 | - ncurses=6.2=h58526e2_4 78 | - networkx=2.7.1=pyhd8ed1ab_0 79 | - numexpr=2.7.3=py36h0cdc3f0_0 80 | - numpy=1.19.5=py36hfc0c790_2 81 | - oauthlib=3.2.0=pyhd8ed1ab_0 82 | - olefile=0.46=pyh9f0ad1d_1 83 | - openssl=1.1.1o=h166bdaf_0 84 | - opt_einsum=3.3.0=pyhd8ed1ab_1 85 | - pandas=1.1.5=py36h284efc9_0 86 | - parallel=20191122=0 87 | - pbzip2=1.1.13=0 88 | - perl=5.32.1=2_h7f98852_perl5 89 | - pigz=2.4=h84994c4_0 90 | - pillow=8.1.0=py36h4f9996e_1 91 | - pip=21.3.1=pyhd8ed1ab_0 92 | - protobuf=3.18.0=py36hc4f0c31_0 93 | - pyasn1=0.4.8=py_0 94 | - pyasn1-modules=0.0.5=py36_0 95 | - pycparser=2.21=pyhd8ed1ab_0 96 | - pyfaidx=0.6.4=pyh5e36f6f_0 97 | - pyjwt=2.4.0=pyhd8ed1ab_0 98 | - pyopenssl=22.0.0=pyhd8ed1ab_0 99 | - pyparsing=3.0.9=pyhd8ed1ab_0 100 | - pypy3.6=7.3.2=h45e8706_2 101 | - pysam=0.16.0.1=py36h4c34d4e_1 102 | - pysocks=1.7.1=py36h5fab9bb_3 103 | - pytables=3.6.1=py36hb7ec5aa_3 104 | - python=3.6.10=h8356626_1011_cpython 105 | - python-dateutil=2.8.2=pyhd8ed1ab_0 106 | - python-isal=0.11.1=py36h8f6f2f9_0 107 | - python_abi=3.6=2_cp36m 108 | - pytz=2022.1=pyhd8ed1ab_0 109 | - readline=8.1=h46c0cb4_0 110 | - requests=2.27.1=pyhd8ed1ab_0 111 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 112 | - rsa=3.1.4=py36_0 113 | - samtools=1.10=h2e538c0_3 114 | - scipy=1.5.3=py36h81d768a_1 115 | - setuptools=58.0.4=py36h5fab9bb_2 116 | - six=1.16.0=pyh6c4a22f_0 117 | - snappy=1.1.9=hbd366e4_1 118 | - sqlite=3.37.0=h9cd32fc_0 119 | - tensorboard=2.3.0=py_0 120 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 121 | - tensorflow=2.2.0=mkl_py36h5a57954_0 122 | - tensorflow-base=2.2.0=mkl_py36hd506778_0 123 | - tensorflow-estimator=2.6.0=py36hc4f0c31_0 124 
| - termcolor=1.1.0=py_2 125 | - tk=8.6.12=h27826a3_0 126 | - tornado=6.1=py36h8f6f2f9_1 127 | - typing_extensions=4.1.1=pyha770c72_0 128 | - urllib3=1.26.9=pyhd8ed1ab_0 129 | - werkzeug=2.0.2=pyhd8ed1ab_0 130 | - whatshap=1.0=py36hf1ae8f4_1 131 | - wheel=0.37.1=pyhd8ed1ab_0 132 | - wrapt=1.13.1=py36h8f6f2f9_0 133 | - xopen=1.2.0=py36h5fab9bb_0 134 | - xz=5.2.5=h516909a_1 135 | - zipp=3.6.0=pyhd8ed1ab_0 136 | - zlib=1.2.12=h166bdaf_0 137 | - zstd=1.4.4=h6597ccf_3 138 | -------------------------------------------------------------------------------- /modules/sv.smk: -------------------------------------------------------------------------------- 1 | 2 | ###################### 3 | ###### SV RULES ###### 4 | ##################### 5 | 6 | 7 | #### SNIFFLES #### 8 | ################## 9 | rule sniffles: 10 | """ 11 | Identify structural variants using Sniffles2. 12 | """ 13 | input: 14 | datain=data_dir + "/align/{aligner}/data_hap.bam" if config['phase_sv'] else data_dir + "/align/{aligner}/data.bam", 15 | data_index=data_dir + "/align/{aligner}/data_hap.bam.bai" if config['phase_sv'] else data_dir + "/align/{aligner}/data.bam.bai", 16 | output: 17 | dataout=data_dir + "/sv/{aligner}/sniffles.vcf", 18 | dataout_snf=data_dir + "/sv/{aligner}/sniffles.snf" 19 | message: "Running Sniffles in rule: {rule}\nUsing {input.datain} output:{output.dataout}" 20 | params: 21 | min_sv_len=config['min_sv_len'], 22 | sv_threads=config['sv_threads'], 23 | sample_name = SAMPLE_NAME, 24 | phase = "--phase" if config['phase_sv'] else "", 25 | mosaic = "--non-germline" if config['mosaic_sv'] else "", 26 | conda: SNIFFLES_ENV 27 | priority: 2 28 | log: data_dir + "/sv/{aligner}/sniffles.log" 29 | benchmark: data_dir + "/benchmark/sv/{aligner}/sv.benchmark.txt" 30 | shell:""" 31 | sniffles --minsvlen {params.min_sv_len} --sample-id {params.sample_name} -t {params.sv_threads} --input {input.datain} --vcf {output.dataout} --snf {output.dataout_snf} {params.phase} {params.mosaic} > {log} 2>&1 32 | """ 33 | 34 | #### HAPLOTYPE SVs #### 35 | ####################### 36 | # TODO: orphan code # # # # # # # # # # # # # # 37 | rule phaseSVs: 38 | """ 39 | This rules takes as input a taged tabed bam file 40 | from whatshap and vcf file contains SVs and update 41 | the SVs to add haplotype HP and phase blocks PS. 42 | """ 43 | input: 44 | bam = data_dir + "/align/{aligner}/data_hap.tab", 45 | sv = data_dir + "/sv/{aligner}/sniffles.vcf", 46 | output:data_dir + "/sv/{aligner}/sniffles_hp_updated.vcf" 47 | message: "Updating SVs using align/{aligner}/data_hap.tab" 48 | params: 49 | update_script = updat_sv, 50 | shell:""" 51 | python {params.update_script} {input.sv} {input.bam} {output} -c {params.min_conflict} 52 | """ 53 | 54 | #### SORTING SVs #### 55 | ##################### 56 | 57 | rule vcfSort: 58 | """ 59 | To concat the haplotype SVs with SNVs, SVs needs to be sorted first. 
60 | """ 61 | input: 62 | vcffile = data_dir + "/{sample}.vcf.gz", 63 | ref = REFERENCES, 64 | output:data_dir + "/{sample}.sorted.vcf.gz" 65 | conda: PRINCESS_ENV 66 | shell:""" 67 | zcat {input.vcffile} | awk 'BEGIN{{OFS="\t";}} /^#/{{print $0}} !/^#/{{ if ($2==0){{$2=1;print}} else {{print $0}} }}' | bedtools sort -header -faidx {input.ref}.fai -i - | bgzip > {output} 68 | """ 69 | 70 | #### BGZIP SVs #### 71 | ################### 72 | 73 | rule bgzipFile: 74 | """ 75 | General rule to bgzip files 76 | """ 77 | input:data_dir + "/{name}.vcf" 78 | output:data_dir + "/{name}.vcf.gz" 79 | threads: config['bgzip_threads'] 80 | conda: VARIANT_ENV 81 | shell:""" 82 | bgzip -c -@ {threads} {input} > {output} 83 | """ 84 | 85 | #### CHANGE SVs SAMPLE NAME #### 86 | ############################### 87 | 88 | rule changeSampleName: 89 | """ 90 | Sniffles name the sample as the bam, but Clair call it SAMPLE 91 | This rule will change the sample name in the SV file. 92 | """ 93 | input:data_dir + "/{sample}.sorted.vcf.gz" 94 | output:data_dir + "/{sample}.sorted.namechnage.vcf.gz" 95 | conda: PRINCESS_ENV 96 | shell:""" 97 | echo SAMPLE > sample.name && bcftools reheader -s sample.name -o {output} {input} && rm sample.name 98 | """ 99 | 100 | #### CONCAT SVs WITH SNPs #### 101 | ############################## 102 | 103 | rule SVsSNPsCombined: 104 | """ 105 | Concat haplotype SNPs with haplotype and Genotype SVs. 106 | """ 107 | input: 108 | sv = data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz", 109 | sv_index =data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz.tbi", 110 | snp = lambda wildcards: data_dir + "/phased/{aligner}/data_updated.vcf.gz" if config['update_snps'] else data_dir + "/phased/{aligner}/data.vcf.gz", 111 | ref = REFERENCES, 112 | output: data_dir + "/sv/{aligner}/sv_snp.vcf.gz" 113 | params: 114 | extension = "z" # output a compressed file. 
115 | message: "Concat sv with SNPs" 116 | log: data_dir + "/sv/{aligner}/sv_snp.log" 117 | threads: config['samtools_threads'] 118 | conda: PRINCESS_ENV 119 | shell:""" 120 | vcfcat {input.sv} {input.snp}| bedtools sort -header -faidx {input.ref}.fai -i - | bgzip > {output} 2> {log} 121 | """ 122 | # shell:""" 123 | # bcftools concat -a -O {params.extension} -o {output} --threads {threads} {input.sv} {input.snp} > {log} 2>&1 124 | # """ 125 | -------------------------------------------------------------------------------- /cluster/cluster_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | long: &long_queue ill-prod #scavenger 4 | short: &short_queue ill-prod #medium 5 | 6 | __default__: 7 | "nCPUs" : "16" 8 | "mem" : 20000 9 | "queue" : *long_queue #"analysis" 10 | "name" : "JOBNAME.{rule}.{wildcards}" 11 | "time" : "01:00:00:00" 12 | # "resources" : "\"select[mem>20000] rusage[mem=20000] span[hosts=1]\"" 13 | #"output" : "log/{rule}.PBS_JOBID.out" 14 | #"error" : "log/{rule}.PBS_JOBID.err" 15 | 16 | 17 | 18 | ########################## 19 | ###### ALIGN RULES ###### 20 | ######################### 21 | 22 | minimap2: 23 | queue: *long_queue 24 | time: "72:00:00" 25 | nCPUs: "12" 26 | mem: 20G 27 | 28 | indexBam: 29 | queue: *short_queue 30 | time: "10:00:00" 31 | nCPUs: "4" 32 | mem: 10G 33 | 34 | mergeAlign: 35 | queue: *long_queue 36 | time: "01:00:00:00" 37 | nCPUs: "8" 38 | mem: 10G 39 | 40 | sam2bam: 41 | queue: *long_queue 42 | time: "01:00:00:00" 43 | nCPUs: "2" 44 | mem: 10G 45 | 46 | addRG: 47 | queue: *long_queue 48 | time: "01:00:00:00" 49 | nCPUs: "2" 50 | mem: 10G 51 | 52 | bam2tab: 53 | queue: *short_queue 54 | time: "01:00:00:00" 55 | nCPUs: "2" 56 | mem: 10G 57 | 58 | mvAlign: 59 | queue: *short_queue 60 | time: "01:00:00" 61 | nCPUs: "2" 62 | mem: 1G 63 | 64 | #<*><*><*><*><*><*><*><*><*> 65 | 66 | 67 | ###################### 68 | ###### SV RULES ###### 69 | ##################### 70 | 71 | sniffles: 72 | time: "10:00:00" 73 | nCPUs: "16" 74 | mem: 30G 75 | 76 | phaseSVs: 77 | queue: *long_queue 78 | time: "24:00:00" 79 | nCPUs: "8" 80 | mem: 20G 81 | 82 | vcfSort: 83 | queue: *short_queue 84 | time: "10:00:00" 85 | nCPUs: "8" 86 | mem: 10G 87 | 88 | bgzipFile: 89 | queue: *long_queue 90 | time: "01:00:00:00" 91 | nCPUs: "2" 92 | mem: 10G 93 | 94 | changeSampleName: 95 | queue: *long_queue 96 | time: "01:00:00:00" 97 | nCPUs: "2" 98 | mem: 10G 99 | 100 | SVsSNPsCombined: 101 | queue: *long_queue 102 | time: "01:00:00:00" 103 | nCPUs: "8" 104 | mem: 10G 105 | 106 | #<*><*><*><*><*><*><*><*><*> 107 | 108 | 109 | 110 | ######################### 111 | ###### SNPs RULES ###### 112 | ######################### 113 | 114 | concatChromosome: 115 | queue: *long_queue 116 | time: "04:00:00:00" 117 | nCPUs: "2" 118 | mem: 10G 119 | 120 | concactSNPs: 121 | queue: *long_queue 122 | time: "01:00:00:00" 123 | nCPUs: "2" 124 | mem: 10G 125 | 126 | callSNVsChunk: 127 | queue: *long_queue 128 | time: "07:00:00:00" 129 | nCPUs: "5" 130 | mem: 30G 131 | host: "host=c86q-23+1:host=c86q-22+1:host=c86q-21+1:host=c86q-20+1:host=c86q-19+1:host=c86q-18+1:host=c86q-17+1:host=c86q-16+1:host=c86q-15+1:host=c86q-14+1:host=c86q-13+1:host=c86q-12+1:host=c86q-11" 132 | 133 | updateHeader: 134 | queue: *long_queue 135 | time: "01:00:00:00" 136 | nCPUs: "2" 137 | mem: 4G 138 | 139 | vcfIndex: 140 | queue: *long_queue 141 | time: "01:00:00:00" 142 | nCPUs: "2" 143 | mem: 4G 144 | 145 | mergeParentalSNPs: 146 | queue: *long_queue 147 | time: 
"01:00:00:00" 148 | nCPUs: "8" 149 | mem: 16G 150 | 151 | updateSNPs: 152 | queue: *long_queue 153 | time: "01:00:00:00" 154 | nCPUs: "4" 155 | mem: 20G 156 | 157 | #<*><*><*><*><*><*><*><*><*> 158 | 159 | 160 | 161 | ################################# 162 | ###### METHYLATION RULES ####### 163 | ################################# 164 | 165 | nanoIndex: 166 | queue: *long_queue 167 | nCPUs: "5" 168 | mem: 50G 169 | 170 | callMeth: 171 | queue: *long_queue 172 | time: "07:00:00:00" 173 | nCPUs: "8" 174 | mem: 50G 175 | 176 | allMethylation: 177 | queue: *long_queue 178 | nCPUs: "1" 179 | mem: 2G 180 | time: "00:40:00" 181 | 182 | #<*><*><*><*><*><*><*><*><*> 183 | 184 | 185 | 186 | ############################ 187 | ###### PHASING RULES ###### 188 | ########################### 189 | 190 | gt: 191 | queue: *long_queue 192 | nCPUs: "1" 193 | mem: 50G 194 | time: "72:00:00" 195 | 196 | phasing: 197 | queue: *long_queue 198 | nCPUs: "4" 199 | mem: 50G 200 | time: "05:00:00:00" 201 | 202 | allPhased: 203 | queue: *long_queue 204 | time: "01:00:00:00" 205 | nCPUs: "2" 206 | mem: 10G 207 | 208 | partionBam: 209 | queue: *long_queue 210 | nCPUs: "4" 211 | mem: 50G 212 | time: "05:00:00:00" 213 | 214 | #<*><*><*><*><*><*><*><*><*> 215 | 216 | 217 | 218 | ############################### 219 | ###### STATISTICS RULES ###### 220 | ############################### 221 | 222 | readsStat: 223 | queue: *long_queue 224 | nCPUs: "8" 225 | mem: 20G 226 | time: "05:00:00:00" 227 | 228 | bamStatistics: 229 | queue: *long_queue 230 | time: "01:00:00:00" 231 | nCPUs: "4" 232 | mem: 10G 233 | 234 | svStat: 235 | queue: *long_queue 236 | time: "01:00:00:00" 237 | nCPUs: "4" 238 | mem: 10G 239 | 240 | snpStat: 241 | queue: *long_queue 242 | time: "01:00:00:00" 243 | nCPUs: "4" 244 | mem: 10G 245 | 246 | stat: 247 | queue: *short_queue 248 | time: "00:00:10:00" 249 | nCPUs: "1" 250 | mem: 1G 251 | 252 | 253 | #<*><*><*><*><*><*><*><*><*> 254 | 255 | ... 256 | -------------------------------------------------------------------------------- /scripts/rawcoverage.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import concurrent.futures as cf 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | # import multiprocessing as mp 7 | import pandas as pd 8 | import seaborn as sns 9 | 10 | plt.switch_backend('agg') 11 | from functools import partial 12 | from os import path as opath 13 | import ntpath 14 | import sys 15 | from Bio import SeqIO 16 | 17 | 18 | def main(): 19 | args = get_args() 20 | files = flat_list(args.input) 21 | nfiles = len(files) 22 | nworkers = min(nfiles, args.threads) 23 | # cpus = mp.cpu_count() 24 | with cf.ProcessPoolExecutor(max_workers=nworkers) as executor, open(args.output, 'w') as data_out: 25 | # return pd.concat([i for i in executor.map(process_reads, files)], ignore_index=True) 26 | df = pd.concat([i for i in executor.map(process_reads, files)], ignore_index=True) 27 | data_out.write("Reads: {length}\n" 28 | "Bases: {nbases}\n" 29 | "Mean read length: {rmean}\n" 30 | "Median: {rmdeian}\n" 31 | "Max: {rmax}\n" 32 | "Min: {rmin}\n" 33 | "N50: {n50}". 
\ 34 | format(length=len(df), nbases=np.sum(df["lengths"]), rmean=np.mean(df["lengths"]), 35 | rmdeian=np.median(df["lengths"]), 36 | rmax=np.max(df["lengths"]), 37 | rmin=np.min(df["lengths"]), 38 | n50=get_N50(np.sort(df['lengths'])) 39 | )) 40 | 41 | plot_output = opath.join(opath.dirname(args.output), ntpath.basename(args.output).rsplit(".", 1)[0] + ".png") 42 | sns.set() 43 | myplot = sns.distplot(np.log(df['lengths'])) 44 | myplot.set(xlabel='Log Read Length') 45 | myplot.get_figure().savefig(plot_output) 46 | 47 | # if i want to count reads c = s.groupby(['length']).size().reset_index(name='count') 48 | 49 | 50 | def get_args(): 51 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. use command -h for more info.", 52 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 53 | description='Calulate statistics form fasta, fastq, fasta.gz and fastq.gz files ', 54 | add_help=True, ) 55 | 56 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 57 | 58 | parser.add_argument("-i", "--input", nargs="+", 59 | help=" one or more reads file ex: -i 1.fasta -i 2.fasta .... or -i 1.fasta 2.fasta", 60 | action="append", required=True, metavar="FOO.fasta/q/gz") 61 | parser.add_argument("-o", "--output", help=" output statistics file", metavar="FOO.txt") 62 | 63 | parser.add_argument("-t", "--threads", type=int, metavar='N', default=1, 64 | help=" Number of threads default %(default)d") 65 | 66 | args = parser.parse_args() 67 | 68 | return args 69 | 70 | 71 | def flat_list(my_list): 72 | """ 73 | Transform list of lists to flat list 74 | :param my_list: list of lists ex: [[1],[1, 2], [a,v]] 75 | :return: [1, 1, 2, a, v] 76 | """ 77 | return [element for each_list in my_list for element in each_list] 78 | 79 | 80 | def process_reads(read_file): 81 | file_handle, file_type = open_handle(read_file) 82 | return (pd.DataFrame( 83 | data=[len(rec) for rec in SeqIO.parse(file_handle, file_type)], 84 | columns=["lengths"]).dropna()) 85 | 86 | 87 | def open_handle(myfile): 88 | if opath.isfile(myfile): 89 | if myfile.endswith(('fastq.gz', 'fq.gz')): 90 | import gzip 91 | return gzip.open(myfile, 'rt'), "fastq" 92 | elif myfile.endswith('fasta.gz'): 93 | import gzip 94 | return gzip.open(myfile, 'rt'), "fasta" 95 | elif myfile.endswith('.fasta', ): 96 | return open(myfile, 'r'), 'fasta' 97 | elif myfile.endswith('.fastq'): 98 | return open(myfile, 'r'), 'fastq' 99 | # elif myfile.endswith("fastq.tar.gz"): 100 | # import tarfile 101 | # tar = tarfile.open(myfile, 'r:gz')#, 'fasta' 102 | # for member in tar.getmembers(): 103 | # f = tar.extractfile(member) 104 | # if f is not None: 105 | # print(type(f)) 106 | # return open(f, 'r'), 'fastq' 107 | # elif myfile.endswith("fasta.tar.gz"): 108 | # import tarfile 109 | # tar = tarfile.open(myfile, 'r:gz')#, 'fasta' 110 | # for member in tar.getmembers(): 111 | # f = tar.extractfile(member) 112 | # if f is not None: 113 | # return open(f, 'r'), 'fasta' 114 | else: 115 | sys.exit("This file {} is of unknown extension!".format(myfile)) 116 | else: 117 | sys.exit("This file {} does not exist.".format(myfile)) 118 | 119 | 120 | def get_N50(read_lengths): 121 | return read_lengths[np.where(np.cumsum(read_lengths) >= 0.5 * np.sum(read_lengths))[0][0]] 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /modules/align.smk: -------------------------------------------------------------------------------- 1 | ########################## 2 
| ###### ALIGN RULES ###### 3 | ######################### 4 | 5 | 6 | 7 | #### MINIMAP2 #### 8 | ################## 9 | 10 | # Minimap2 Parameters 11 | #==================== 12 | 13 | if config["read_type"].lower() in ["clr", "ccs"]: 14 | minimap2_read_type = "-H" 15 | x_param = "-ax map-pb" 16 | elif config["read_type"].lower() == "ont": 17 | x_param = "-ax map-ont" 18 | minimap2_read_type = "" 19 | else: 20 | minimap2_read_type = "" 21 | x_param = "" 22 | 23 | rule minimap2: 24 | """ 25 | Using Minimap2 to align reads 26 | """ 27 | input: 28 | datain=data_dir + "/{sample}" 29 | output: 30 | dataout=temp(data_dir + "/align/minimap/{sample}.bam") 31 | params: 32 | reference=REFERENCES, 33 | h = minimap2_read_type, 34 | md = "--MD", 35 | x = x_param, 36 | sample_name = SAMPLE_NAME, 37 | # rg = "@RG\\tSM:SAMPLE\\tID:LONG", should be used like -R {params.rg} 38 | minimap_other_tags = config['minimap_other_tags'], 39 | log: 40 | data_dir + "/align/minimap/{sample}.log" 41 | message: 42 | "Running minimap2 , sample is: {wildcards.sample} in rule {rule}" 43 | threads: config['aligner_threads'] 44 | benchmark: data_dir + "/benchmark/align/{sample}.minimap.benchmark.txt" 45 | conda: MINIMAP2_ENV 46 | shell:""" 47 | if [[ ! -z "{params.minimap_other_tags}" ]]; then 48 | minimap2 -Y -R '@RG\\tSM:{params.sample_name}\\tID:{params.sample_name}' {params.x} "{params.reference}" "{input.datain}" {params.h} "{params.md}" -t "{threads}" "{params.minimap_other_tags}" 2>{log} | samtools sort -@ {threads} - > "{output.dataout}" 2>>{log} 49 | else 50 | minimap2 -Y -R '@RG\\tSM:{params.sample_name}\\tID:{params.sample_name}' {params.x} "{params.reference}" "{input.datain}" {params.h} "{params.md}" -t "{threads}" 2>{log} | samtools sort -@ {threads} - > "{output.dataout}" 2>>{log} 51 | fi 52 | """ 53 | #### NGMLR #### 54 | ############### 55 | 56 | rule ngmlr: 57 | """ 58 | Using ngmlr to align reads 59 | """ 60 | input: 61 | datain=data_dir + "/{sample}" 62 | output: 63 | dataout=temp(data_dir + "/align/ngmlr/{sample}.sam") 64 | params: 65 | reference=REFERENCES, 66 | platform="pacbio" if config["read_type"] in ["clr", "ccs"] else "ont" if config["read_type"] == "ont" else "" 67 | log: 68 | data_dir + "/align/ngmlr/{sample}.log" 69 | message: 70 | "Running ngmlr , sample is: {wildcards.sample}" 71 | threads: config['aligner_threads'] 72 | benchmark: data_dir + "/benchmark/align/{sample}.ngmlr.benchmark.txt" 73 | conda: PRINCESS_ENV 74 | shell:""" 75 | ngmlr -r "{params.reference}" -q "{input.datain}" --rg-sm SAMPLE -o "{output.dataout}" -t "{threads}" -x "{params.platform}" --bam-fix > {log} 2>&1 76 | """ 77 | 78 | #### SAM2BAM #### 79 | ################ 80 | 81 | rule sam2bam: 82 | input: data_dir + "/align/ngmlr/{sample}.sam" 83 | output: temp(data_dir + "/align/ngmlr/{sample}.bam") 84 | message: "Covert SAM to sorted BAM" 85 | threads: config['aligner_threads'] 86 | benchmark: data_dir + "/benchmark/align/{sample}.sam2bam.benchmark.txt" 87 | conda: PRINCESS_ENV 88 | shell:""" 89 | samtools view -bhS {input} | samtools sort -@ {threads} - > {output} 90 | """ 91 | 92 | #### INDEX BAM #### 93 | ################### 94 | 95 | rule indexBam: 96 | """ 97 | Indexing bam file. 
98 | """ 99 | input: 100 | data_dir + "/{sample}.bam" 101 | output: 102 | temp(data_dir + "/{sample}.bam.bai") 103 | benchmark: data_dir + "/benchmark/align/{sample}.index.benchmark.txt" 104 | message: "Indexing {input}" 105 | conda: MINIMAP2_ENV 106 | shell: 107 | "samtools index {input}" 108 | 109 | #### MERGE BAM FILES #### 110 | ######################## 111 | 112 | rule mergeAlign: 113 | input: 114 | bams=lambda wildcards: expand(data_dir + "/align/{aligner}/{sample}.bam", aligner=wildcards.aligner, sample=sample_list), 115 | index_bams=lambda wildcards: expand(data_dir + "/align/{aligner}/{sample}.bam.bai", aligner=wildcards.aligner, sample=sample_list), 116 | output: 117 | file_name=temp(data_dir + "/align/{aligner}/data.bam") 118 | message:"Mergeing data" 119 | threads: config['samtools_threads'] 120 | benchmark: data_dir + "/benchmark/align/{aligner}.merging.benchmark.txt" 121 | log: 122 | data_dir + "/align/{aligner}/merge.log" 123 | conda: MINIMAP2_ENV 124 | threads: config['aligner_threads'] 125 | shell:""" 126 | samtools merge -@ {threads} {output} {input.bams} > {log} 2>&1 127 | """ 128 | 129 | #### ADD RG TO BAM FILE #### 130 | ############################ 131 | 132 | rule addRG: 133 | input:data_dir + "/{sample}.bam" 134 | output:temp(data_dir + "/{sample}_rg.bam") 135 | params: 136 | rg = "@RG\\tSM:SAMPLE\\tID:LONG", 137 | conda: PRINCESS_ENV 138 | shell:""" 139 | samtools addreplacerg -r "{params.rg}" -o {output} {input} 140 | """ 141 | 142 | 143 | 144 | #### CONVERT BAM FILE TO TAB #### 145 | ################################ 146 | 147 | rule bam2tab: 148 | """ 149 | This rules takes bam file and extract to tab delimeted file: reads HP PS. 150 | """ 151 | input: 152 | bam_file = data_dir + "/align/{aligner}/data_hap.bam", 153 | output: data_dir + "/align/{aligner}/data_hap.tab", 154 | message: "Extracting read hp and ps info from tagged bam file." 
155 | conda: PRINCESS_ENV 156 | benchmark: data_dir + "/benchmark/align/{aligner}.bam2tab.benchmark.txt" 157 | shell:""" 158 | samtools index {input} && samtools view {input.bam_file} | grep "PS:i:" | awk 'BEGIN{{OFS="\\t";}}{{print $1,$(NF-2), $(NF)}}' > {output} 159 | """ 160 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | 2 | # import Lib 3 | ############ 4 | import os, glob, ntpath, math, shutil 5 | from snakemake.utils import min_version 6 | 7 | ############################ 8 | 9 | # Snake Version 10 | ############### 11 | min_version("5.7.1") 12 | 13 | 14 | 15 | # Config File 16 | ############# 17 | # # if len(config) == 0: 18 | if os.path.isfile("config.yaml"): 19 | configfile: "config.yaml" 20 | else: 21 | sys.exit("Looks like there is no config.yaml file in " + os.getcwd() + " make sure there is one or at least specify one with the --configfile commandline parameter.") 22 | ############# 23 | 24 | 25 | 26 | # Listing samples 27 | ################# 28 | # GET SAMPLES EXTENSION 29 | sample_extension = config['sample_extension'] if config['sample_extension'] else "gz" 30 | 31 | # GET WORKING DIRECTORY DEFAULT IS CURRENT DIRECTORY 32 | data_dir = config["sample_directory"] if config['sample_directory'] else os.getcwd() 33 | 34 | # GET SAMPLES LIST 35 | sample_list = config['sample_list'] 36 | if not isinstance(sample_list, list): 37 | sample_list = sample_list.split() 38 | ############# 39 | 40 | 41 | 42 | # Output sample name 43 | ################### 44 | SAMPLE_NAME = config['sample_name'] 45 | ############# 46 | 47 | 48 | # Clean after success 49 | #################### 50 | source_dir = config['delete_files'] 51 | samples_names = config['delete_samples'] 52 | def clean(source_dir, data_dir, samples_names): 53 | file_list = os.listdir(source_dir) 54 | if samples_names: 55 | for sample in samples_names: os.remove(os.path.join(data_dir, os.path.basename(sample))) 56 | for sample in file_list: 57 | if os.path.isfile(sample): 58 | os.remove(sample) 59 | else: 60 | shutil.rmtree(sample) 61 | ############# 62 | 63 | 64 | 65 | # Config reference and chromosomes list 66 | ####################################### 67 | REFERENCES = config["reference"] 68 | chr_list = config['chrs'] 69 | 70 | # chromosomes List split to chunks 71 | split_size = config['chr_split'] if config['chr_split'] and (config['chr_split'] >= 1000000) else 1000000 72 | ref_index_file = REFERENCES+".fai" 73 | chr_range = {} 74 | with open(ref_index_file, 'r') as data_in: 75 | for line in data_in: 76 | chr, length = line.split()[0:2] 77 | if chr in chr_list: 78 | # Identify number of splits 79 | chr_split = int(length) // split_size 80 | chr_split = chr_split if chr_split > 1 else 1 81 | # step_value = int(length)//chr_split if chr_split > 0 else int(length) 82 | step_value = int(length)//chr_split 83 | ranges = list(range(0, int(length), step_value)) 84 | if len(ranges) == chr_split + 1: 85 | ranges[-1] = int(length) 86 | else: 87 | ranges.append(int(length)) 88 | ranges[0] = 1 89 | chr_range[chr] = ranges 90 | ############# 91 | 92 | 93 | 94 | # Declare aligner 95 | ################# 96 | aligner = config["aligner"] 97 | ############# 98 | 99 | 100 | 101 | # Methylation variables 102 | ####################### 103 | # ont_sample_dir = config['fast5_dir'] 104 | ############# 105 | 106 | 107 | 108 | # Preparing conda environments. 
109 | ############################### 110 | PRINCESS_ENV=os.getcwd()+"/envs/princess_env.yaml" 111 | SNIFFLES_ENV=os.getcwd()+"/envs/sniffles.yaml" 112 | #CLAIR_ENV=os.getcwd()+"/envs/clair3.yaml" 113 | CLAIR_ENV=os.getcwd()+"/envs/clair3_no_depend.yaml" 114 | MINIMAP2_ENV=os.getcwd()+"/envs/minimap2.yaml" 115 | WHATSHAP_ENV=os.getcwd()+"/envs/whatshap.yaml" 116 | VARIANT_ENV=os.getcwd()+"/envs/variant_tools.yaml" 117 | READ_STAT_ENV=os.getcwd()+"/envs/pythonRun.yaml" 118 | ############# 119 | 120 | 121 | 122 | # Importing scripts 123 | ################### 124 | rawcoverage_script = config['read_raw_coverage'] 125 | updat_sv = config['updat_sv'] 126 | ############# 127 | 128 | 129 | 130 | # Include all snakemake files sub-modules 131 | ######################################## 132 | prefixed = ["./modules/"+filename for filename in os.listdir('./modules') if filename.endswith(".smk")] 133 | for f in prefixed: 134 | include: f 135 | ################################### 136 | 137 | 138 | 139 | # Building output 140 | ################## 141 | final_output = [] 142 | 143 | if config['sample_list']: 144 | if not config['methylation']: 145 | pass 146 | # elif config['methylation'] and all(value for value in ont_sample_dir.values()): 147 | elif config['methylation'] and config['fast5_dir']: 148 | final_output.append(data_dir + "/result" + "/methylation.{}_calls_hap.tsv".format(aligner)) # DONE 149 | else: 150 | sys.exit("Every ONT sample should have corresponding fast5 directory, please correct fast5_dir files in config.yaml or use -md option") 151 | 152 | if config['update_snps'] and config['paternal_snps'] and config['maternal_snps']: 153 | final_output.extend([data_dir + "/result/.allReadsparental.{aligner}.txt".format(aligner=aligner)]) 154 | else: 155 | final_output.extend([data_dir + "/result/.all.Reads.{}.txt".format(aligner)]) 156 | else: 157 | if config['update_snps'] and config['paternal_snps'] and config['maternal_snps']: 158 | final_output.extend([data_dir + "/result/.allReadsparental.{aligner}.txt".format(aligner=aligner)]) 159 | else: 160 | final_output.extend([data_dir + "/result/.all.noReads.{}.txt".format(aligner)]) 161 | 162 | 163 | ############## 164 | 165 | 166 | # RULES 167 | ####### 168 | onstart: 169 | shell("cat pictures/start.txt") 170 | 171 | rule all: 172 | input: final_output 173 | 174 | ## ------------------------------------------------------------------------------------ ## 175 | ## Success and failure messages 176 | ## ------------------------------------------------------------------------------------ ## 177 | onsuccess: 178 | clean(source_dir, data_dir, samples_names) 179 | if os.path.exists(os.path.join(data_dir, ".snakemake")): 180 | import shutil 181 | shutil.rmtree(os.path.join(data_dir, ".snakemake"), ignore_errors=True) 182 | shell("mkdir -p {data_dir}/snake_log &&\ 183 | find . -maxdepth 1 \( -name 'snakejob*' -or -name 'slurm*' \) -type f -exec mv -t {data_dir}/snake_log {{}} \; &&\ 184 | cat {source_dir}/pictures/success.txt") 185 | 186 | 187 | onerror: 188 | shell("mkdir -p {data_dir}/snake_log &&\ 189 | find . 
-maxdepth 1 \( -name 'snakejob*' -or -name 'slurm*' \) -type f -exec mv -t {data_dir}/snake_log {{}} \; &&\ 190 | cat {source_dir}/pictures/fail.txt") 191 | -------------------------------------------------------------------------------- /envs/run_princess_env.yaml: -------------------------------------------------------------------------------- 1 | name: princess_env2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - aioeasywebdav=2.4.0=py37h89c1867_1001 11 | - aiohttp=3.8.1=py37h540881e_1 12 | - aiosignal=1.2.0=pyhd8ed1ab_0 13 | - amply=0.1.5=pyhd8ed1ab_0 14 | - appdirs=1.4.4=pyh9f0ad1d_0 15 | - async-timeout=4.0.2=pyhd8ed1ab_0 16 | - asynctest=0.13.0=py_0 17 | - atk-1.0=2.36.0=h3371d22_4 18 | - attmap=0.13.2=pyhd8ed1ab_0 19 | - attrs=21.4.0=pyhd8ed1ab_0 20 | - backports=1.0=py_2 21 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 22 | - bcrypt=3.2.2=py37h540881e_0 23 | - boto3=1.23.9=pyhd8ed1ab_0 24 | - botocore=1.26.10=pyhd8ed1ab_0 25 | - bottleneck=1.3.4=py37hda87dfa_1 26 | - brotli=1.0.9=h166bdaf_7 27 | - brotli-bin=1.0.9=h166bdaf_7 28 | - brotlipy=0.7.0=py37h540881e_1004 29 | - bzip2=1.0.8=h7f98852_4 30 | - c-ares=1.18.1=h7f98852_0 31 | - ca-certificates=2022.5.18.1=ha878542_0 32 | - cachetools=5.0.0=pyhd8ed1ab_0 33 | - cairo=1.16.0=ha61ee94_1011 34 | - certifi=2022.5.18.1=py37h89c1867_0 35 | - cffi=1.15.0=py37h036bc23_0 36 | - charset-normalizer=2.0.12=pyhd8ed1ab_0 37 | - coincbc=2.10.5=hcee13e7_1 38 | - configargparse=1.5.3=pyhd8ed1ab_0 39 | - connection_pool=0.0.3=pyhd3deb0d_0 40 | - cryptography=37.0.1=py37h9ce1e76_0 41 | - cycler=0.11.0=pyhd8ed1ab_0 42 | - datrie=0.8.2=py37h5e8e339_3 43 | - decorator=5.1.1=pyhd8ed1ab_0 44 | - defusedxml=0.7.1=pyhd8ed1ab_0 45 | - docutils=0.18.1=py37h89c1867_1 46 | - dpath=2.0.6=py37h89c1867_1 47 | - dropbox=11.31.0=pyhd8ed1ab_0 48 | - expat=2.4.8=h27087fc_0 49 | - fftw=3.3.10=nompi_h77c792f_102 50 | - filechunkio=1.8=py_2 51 | - filelock=3.7.0=pyhd8ed1ab_0 52 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 53 | - font-ttf-inconsolata=3.000=h77eed37_0 54 | - font-ttf-source-code-pro=2.038=h77eed37_0 55 | - font-ttf-ubuntu=0.83=hab24e00_0 56 | - fontconfig=2.14.0=h8e229c2_0 57 | - fonts-conda-ecosystem=1=0 58 | - fonts-conda-forge=1=0 59 | - fonttools=4.33.3=py37h540881e_0 60 | - freetype=2.10.4=h0708190_1 61 | - fribidi=1.0.10=h36c2ea0_0 62 | - frozenlist=1.3.0=py37h540881e_1 63 | - ftputil=5.0.4=pyhd8ed1ab_0 64 | - gdk-pixbuf=2.42.8=hff1cb4f_0 65 | - gettext=0.19.8.1=h73d1719_1008 66 | - ghostscript=9.54.0=h27087fc_2 67 | - giflib=5.2.1=h36c2ea0_2 68 | - gitdb=4.0.9=pyhd8ed1ab_0 69 | - gitpython=3.1.27=pyhd8ed1ab_0 70 | - google-api-core=2.8.0=pyhd8ed1ab_1 71 | - google-api-python-client=2.49.0=pyhd8ed1ab_0 72 | - google-auth=2.6.6=pyh6c4a22f_0 73 | - google-auth-httplib2=0.1.0=pyhd8ed1ab_0 74 | - google-cloud-core=2.2.2=pyh6c4a22f_0 75 | - google-cloud-storage=2.1.0=pyh6c4a22f_0 76 | - google-crc32c=1.1.2=py37h5d4fa31_3 77 | - google-resumable-media=2.1.0=pyh6c4a22f_0 78 | - googleapis-common-protos=1.56.2=py37h89c1867_0 79 | - graphite2=1.3.13=h58526e2_1001 80 | - graphviz=3.0.0=h5abf519_1 81 | - grpcio=1.46.3=py37h0327239_0 82 | - gtk2=2.24.33=h90689f9_2 83 | - gts=0.7.6=h64030ff_2 84 | - harfbuzz=4.3.0=hf9f4e7c_0 85 | - httplib2=0.20.4=pyhd8ed1ab_0 86 | - icu=70.1=h27087fc_0 87 | - idna=3.3=pyhd8ed1ab_0 88 | - imagemagick=7.1.0_35=pl5321heb7c40d_0 89 | - importlib-metadata=4.11.4=py37h89c1867_0 90 | - 
importlib_metadata=4.11.4=hd8ed1ab_0 91 | - importlib_resources=5.7.1=pyhd8ed1ab_1 92 | - iniconfig=1.1.1=pyh9f0ad1d_0 93 | - jbig=2.1=h7f98852_2003 94 | - jinja2=3.1.2=pyhd8ed1ab_0 95 | - jmespath=1.0.0=pyhd8ed1ab_0 96 | - jpeg=9e=h166bdaf_1 97 | - jsonschema=4.5.1=pyhd8ed1ab_0 98 | - jupyter_core=4.10.0=py37h89c1867_0 99 | - kiwisolver=1.4.2=py37h7cecad7_1 100 | - lcms2=2.12=hddcbb42_0 101 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 102 | - lerc=3.0=h9c3ff4c_0 103 | - libblas=3.9.0=14_linux64_openblas 104 | - libbrotlicommon=1.0.9=h166bdaf_7 105 | - libbrotlidec=1.0.9=h166bdaf_7 106 | - libbrotlienc=1.0.9=h166bdaf_7 107 | - libcblas=3.9.0=14_linux64_openblas 108 | - libcrc32c=1.1.2=h9c3ff4c_0 109 | - libdeflate=1.10=h7f98852_0 110 | - libffi=3.4.2=h7f98852_5 111 | - libgcc-ng=12.1.0=h8d9b700_16 112 | - libgd=2.3.3=h18fbbfe_3 113 | - libgfortran-ng=12.1.0=h69a702a_16 114 | - libgfortran5=12.1.0=hdcd56e2_16 115 | - libglib=2.70.2=h174f98d_4 116 | - libgomp=12.1.0=h8d9b700_16 117 | - libiconv=1.16=h516909a_0 118 | - liblapack=3.9.0=14_linux64_openblas 119 | - libnsl=2.0.0=h7f98852_0 120 | - libopenblas=0.3.20=pthreads_h78a6416_0 121 | - libpng=1.6.37=h21135ba_2 122 | - libprotobuf=3.20.1=h6239696_0 123 | - librsvg=2.54.3=h7abd40a_0 124 | - libsodium=1.0.18=h36c2ea0_1 125 | - libstdcxx-ng=12.1.0=ha89aaad_16 126 | - libtiff=4.3.0=h0fcbabc_4 127 | - libtool=2.4.6=h9c3ff4c_1008 128 | - libuuid=2.32.1=h7f98852_1000 129 | - libwebp=1.2.2=h3452ae3_0 130 | - libwebp-base=1.2.2=h7f98852_1 131 | - libxcb=1.13=h7f98852_1004 132 | - libxml2=2.9.14=h22db469_0 133 | - libzlib=1.2.12=h166bdaf_0 134 | - logmuse=0.2.6=pyh8c360ce_0 135 | - lz4-c=1.9.3=h9c3ff4c_1 136 | - markupsafe=2.1.1=py37h540881e_1 137 | - matplotlib-base=3.5.2=py37hc347a89_0 138 | - multidict=6.0.2=py37h540881e_1 139 | - munkres=1.1.4=pyh9f0ad1d_0 140 | - nbformat=5.4.0=pyhd8ed1ab_0 141 | - ncurses=6.3=h27087fc_1 142 | - networkx=2.7.1=pyhd8ed1ab_0 143 | - nomkl=1.0=h5ca1d4c_0 144 | - numexpr=2.8.0=py37h85a3170_102 145 | - numpy=1.21.6=py37h976b520_0 146 | - oauth2client=4.1.3=py_0 147 | - openjpeg=2.4.0=hb52868f_1 148 | - openssl=3.0.3=h166bdaf_0 149 | - packaging=21.3=pyhd8ed1ab_0 150 | - pandas=1.3.5=py37h8c16a72_0 151 | - pango=1.50.7=hbd2fdc8_0 152 | - paramiko=2.11.0=pyhd8ed1ab_0 153 | - pcre=8.45=h9c3ff4c_0 154 | - peppy=0.31.2=pyhd8ed1ab_2 155 | - perl=5.32.1=2_h7f98852_perl5 156 | - pillow=9.1.1=py37h44f0d7a_0 157 | - pip=22.1.1=pyhd8ed1ab_0 158 | - pixman=0.40.0=h36c2ea0_0 159 | - pkg-config=0.29.2=h36c2ea0_1008 160 | - plac=1.3.5=pyhd8ed1ab_0 161 | - pluggy=1.0.0=py37h89c1867_3 162 | - ply=3.11=py_1 163 | - prettytable=3.3.0=pyhd8ed1ab_0 164 | - protobuf=3.20.1=py37hd23a5d3_0 165 | - psutil=5.9.1=py37h540881e_0 166 | - pthread-stubs=0.4=h36c2ea0_1001 167 | - pulp=2.6.0=py37h89c1867_1 168 | - py=1.11.0=pyh6c4a22f_0 169 | - pyasn1=0.4.8=py_0 170 | - pyasn1-modules=0.2.7=py_0 171 | - pycparser=2.21=pyhd8ed1ab_0 172 | - pygments=2.12.0=pyhd8ed1ab_0 173 | - pygraphviz=1.6=py37h8f50634_0 174 | - pynacl=1.5.0=py37h540881e_1 175 | - pyopenssl=22.0.0=pyhd8ed1ab_0 176 | - pyparsing=3.0.9=pyhd8ed1ab_0 177 | - pyrsistent=0.18.1=py37h540881e_1 178 | - pysftp=0.2.9=py_1 179 | - pysocks=1.7.1=py37h89c1867_5 180 | - pytest=7.1.2=py37h89c1867_0 181 | - python=3.7.12=hf930737_100_cpython 182 | - python-dateutil=2.8.2=pyhd8ed1ab_0 183 | - python-fastjsonschema=2.15.3=pyhd8ed1ab_0 184 | - python-irodsclient=1.1.3=pyhd8ed1ab_0 185 | - python_abi=3.7=2_cp37m 186 | - pytz=2022.1=pyhd8ed1ab_0 187 | - pyu2f=0.1.5=pyhd8ed1ab_0 188 | - 
pyyaml=6.0=py37h540881e_4 189 | - ratelimiter=1.2.0=py_1002 190 | - readline=8.1=h46c0cb4_0 191 | - requests=2.27.1=pyhd8ed1ab_0 192 | - retry=0.9.2=py_0 193 | - rsa=4.8=pyhd8ed1ab_0 194 | - s3transfer=0.5.2=pyhd8ed1ab_0 195 | - scipy=1.7.3=py37hf2a6cf1_0 196 | - setuptools=62.3.2=py37h89c1867_0 197 | - six=1.16.0=pyh6c4a22f_0 198 | - slacker=0.14.0=py_0 199 | - smart_open=6.0.0=pyhd8ed1ab_0 200 | - smmap=3.0.5=pyh44b312d_0 201 | - snakemake=6.15.5=hdfd78af_0 202 | - snakemake-minimal=6.15.5=pyhdfd78af_0 203 | - sqlite=3.38.5=h4ff8645_0 204 | - stone=3.3.1=pyhd8ed1ab_0 205 | - stopit=1.1.2=py_0 206 | - tabulate=0.8.9=pyhd8ed1ab_0 207 | - tk=8.6.12=h27826a3_0 208 | - tomli=2.0.1=pyhd8ed1ab_0 209 | - toposort=1.7=pyhd8ed1ab_0 210 | - traitlets=5.2.1.post0=pyhd8ed1ab_0 211 | - typing-extensions=4.2.0=hd8ed1ab_1 212 | - typing_extensions=4.2.0=pyha770c72_1 213 | - ubiquerg=0.6.1=pyh9f0ad1d_0 214 | - unicodedata2=14.0.0=py37h540881e_1 215 | - uritemplate=4.1.1=pyhd8ed1ab_0 216 | - urllib3=1.26.9=pyhd8ed1ab_0 217 | - veracitools=0.1.3=py_0 218 | - wcwidth=0.2.5=pyh9f0ad1d_2 219 | - wheel=0.37.1=pyhd8ed1ab_0 220 | - wrapt=1.14.1=py37h540881e_0 221 | - xorg-kbproto=1.0.7=h7f98852_1002 222 | - xorg-libice=1.0.10=h7f98852_0 223 | - xorg-libsm=1.2.3=hd9c2040_1000 224 | - xorg-libx11=1.7.2=h7f98852_0 225 | - xorg-libxau=1.0.9=h7f98852_0 226 | - xorg-libxdmcp=1.1.3=h7f98852_0 227 | - xorg-libxext=1.3.4=h7f98852_1 228 | - xorg-libxrender=0.9.10=h7f98852_1003 229 | - xorg-libxt=1.2.1=h7f98852_2 230 | - xorg-renderproto=0.11.1=h7f98852_1002 231 | - xorg-xextproto=7.3.0=h7f98852_1002 232 | - xorg-xproto=7.0.31=h7f98852_1007 233 | - xz=5.2.5=h516909a_1 234 | - yaml=0.2.5=h7f98852_2 235 | - yarl=1.7.2=py37h540881e_2 236 | - yte=1.4.0=py37h89c1867_0 237 | - zipp=3.8.0=pyhd8ed1ab_0 238 | - zlib=1.2.12=h166bdaf_0 239 | - zstd=1.5.2=h8a70e8d_1 240 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Princess 2 | [![GitHub](https://img.shields.io/github/license/MeHelmy/princess)](https://opensource.org/licenses/MIT) ![GitHub last commit](https://img.shields.io/github/last-commit/MeHelmy/princess) 3 | --- 4 | ## What is new? 5 | - Clair3 for calling single nucleotide polymorphisms (SNPs) and insertions/deletions (Indels) 6 | - Ability to use different models than the default one that comes with Clair3, which can be helpful in cases where there is new kit/training dataset or when working with data other than the human genome. 7 | - Sniffles2 for detecting structural variants (SVs) 8 | - Generation of a gVCF file for cohort analysis 9 | - Generation of an SNF file for cohort structural variant analysis 10 | - The pipeline has been fully tested on both PBS and Slurm systems with easy configuration 11 | - The main conda environment has been updated for improved granularity. 12 | --- 13 | 14 | Princess is a fast and scalable framework to detect and report haplotype resolved Single Nucleotide Variants (SNV) and Structural Variations (SVs) at scale. It can leverage your cluster environment to speed up the detection which starts with one or many fasta or fastq files. 
15 | Publication: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02486-w 16 | 17 | 18 | 19 | 20 | ![princess](./pictures/leia.jpg) 21 | 22 | ## Princess 23 | 24 | * __Mapping__: Minimap2 or NGMLR 25 | * __SNVs__: Clair3 26 | * __SVs__: Sniffles2 27 | * __Phasing SNVs__: WhatsHap 28 | * __Phasing SVs__: Sniffles2 29 | * __Extend Phasing__: PRINCESS-subtool 30 | * __Phased Methylation__: Nanopolish + PRINCESS-subtool 31 | * __QC Statistics__ for each step 32 | 33 | --- 34 | 35 | ## Installation 36 | Princess was tested on CentOS release 6.7 with Conda version 4.7.12 installed; 37 | for more information about installing Conda, [press here](https://bioconda.github.io/user/install.html#install-conda "Install Conda"). 38 | To download the same Conda version, [press here](https://repo.continuum.io/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh "Conda 4.7.12"). 39 | 40 | 1. After Conda is installed, install Snakemake and PyYAML: 41 | ~~~ 42 | conda install snakemake=5.7.1 43 | conda install pyyaml 44 | ~~~ 45 | 2. Downloading PRINCESS 46 | ~~~ 47 | git clone https://github.com/MeHelmy/princess.git 48 | ~~~ 49 | 50 | --- 51 | 52 | ## Tutorial 53 | 54 | To get an overview of Princess, run the command `princess -h`. 55 | You will see the following list of sub-commands that can be used in Princess. 56 | 57 | ~~~ 58 | usage: princess [-h] {all,align,sv,snv,variant,phase,overview} ... 59 | 60 | Princess A framework for long-reads analysis. 61 | 62 | optional arguments: 63 | -h, --help show this help message and exit 64 | 65 | Sub-commands: 66 | Valid sub-commands 67 | 68 | {all,align,sv,snv,variant,phase,overview} 69 | all This command will run the following: Align the reads. 70 | Identify SVs Identify SNVs Phase both SNVs and SVs 71 | align This command will use the input sequence files and 72 | align them against the reference using either Minimap2 73 | or NGMLR use -a to choose aligner otherwise Minimap2 74 | will be used by default. 75 | sv This command will use bam file to identify SV using 76 | Sniffles. 77 | snv This command will use bam file to identify SNVs using 78 | Clair3. 79 | variant This command will use bam file to identify SVs and 80 | SNVs. 81 | phase This command will use use reads to identify SNVs by 82 | Clair and Phase them. 83 | overview This command will show what steps will run. 84 | 85 | princess version 0.01. use command -h for info. 86 | ~~~ 87 | 88 | 89 | Assume that we only want to run the `snv` command; to learn more about its options: 90 | 91 | `princess snv -h` 92 | 93 | 94 | ~~~ 95 | usage: princess snv [-h] [-v] -d Working directory -r {ont,clr,ccs} [-l] [-u] 96 | [-e] [-a {minimap,ngmlr}] 97 | [-s sampleFiles [sampleFiles ...]] -f REF [-j JOBS] 98 | [-g LOG_FILE] [-c CHRS [CHRS ...]] [-t] 99 | 100 | optional arguments: 101 | -h, --help show this help message and exit 102 | -v, --version show program's version number and exit 103 | -d Working directory, --directory Working directory 104 | Working directory.
105 | -r {ont,clr,ccs}, --ReadType {ont,clr,ccs} 106 | Read technology (Note: clr is not supported anymore by clair3) 107 | -l, --removeFiles remove princess source script after running default: 108 | False) 109 | -u, --UseConda Use conda for running default: True) 110 | -e, --Cluster Use cluster while running default: True) 111 | -a {minimap,ngmlr}, --Aligner {minimap,ngmlr} 112 | In case if you want to choose specific aligner 113 | otherwise default will be used default: minimap) 114 | -s sampleFiles [sampleFiles ...], --sampleFiles sampleFiles [sampleFiles ...] 115 | list of fatsa, fastq, or gz files. 116 | -f REF, --ref REF The reference file will be used to align reads to. 117 | -j JOBS, --jobs JOBS Number of running jobs default: 200 ) 118 | -g LOG_FILE, --log LOG_FILE 119 | Log file: PrincessLog.txt ) 120 | -c CHRS [CHRS ...], --chr CHRS [CHRS ...] 121 | Chromosomes list, if not specified we will use all 122 | Chromosomes. 123 | -t, --filter Filter identified SNVs using Princess algorithm 124 | default: True) 125 | ~~~ 126 | 127 | 128 | ~~~ 129 | princess all -d ./princess_all -r ont -s reads.split00.fastq.gz reads.split01.fastq.gz -f hs37d5_mainchr.fa 130 | ~~~ 131 | 132 | `-r` defines the read type. 133 | `-s` the samples that we would like to analyze. 134 | `-f` **full path** to the reference. 135 | 136 | *__Note__* 137 | I am assuming that the reference file is indexed; if not, please use the following command: 138 | `samtools faidx hs37d5_mainchr.fa`; as a result you will have `hs37d5_mainchr.fa.fai`. 139 | 140 | Done!! 141 | 142 | ### For methylation calling 143 | Methylation calling is part of the `all` option. 144 | 145 | ``` 146 | optional arguments: 147 | -h, --help show this help message and exit 148 | -v, --version show program's version number and exit 149 | -d Working directory, --directory Working directory 150 | Working directory. 151 | -r {ont,clr,ccs}, --ReadType {ont,clr,ccs} 152 | Read technology 153 | -l, --removeFiles remove princess source script after running default: False) 154 | -u, --UseConda Use conda for running default: True) 155 | -e, --Cluster Use cluster while running default: True) 156 | -a {minimap,ngmlr}, --Aligner {minimap,ngmlr} 157 | In case if you want to choose specific aligner otherwise default will be used default: minimap) 158 | -s sampleFiles [sampleFiles ...], --sampleFiles sampleFiles [sampleFiles ...] 159 | list of fatsa, fastq, or gz files. 160 | -f REF, --ref REF The reference file will be used to align reads to. 161 | -j JOBS, --jobs JOBS Number of running jobs default: 200 ) 162 | -g LOG_FILE, --log LOG_FILE 163 | Log file: PrincessLog.txt ) 164 | -c CHRS [CHRS ...], --chr CHRS [CHRS ...] 165 | Chromosomes list, if not specified we will use all Chromosomes. 166 | -t, --filter Filter identified SNVs using Princess algorithm default: True) 167 | -m, --methylation Identify methylation, mutually inclusive with -md default: False) 168 | -md Fast5 Directory, --methylationDirectory Fast5 Directory 169 | Fast5 directory will be used to identify methylation mutually inclusive with option -m default: False) 170 | ``` 171 | By choosing the flag __`--methylation`__, Princess will call methylation on the input data (ONT data). This option is mutually inclusive with the option __`--methylationDirectory`__, which requires the fast5 directory. 172 | 173 | ## Test case 174 | 175 | We uploaded a HiFi compressed data file from the publicly available HG002 data set.
176 | The complete data set (High-fidelity 15kb long-read dataset of HG002, Ashkenazim Son) is available [Here](https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/PacBio_CCS_15kb/) 177 | 178 | To download the test data, run the following command: 179 | ``` 180 | wget https://bcm.box.com/shared/static/sdml5d7csxprgu3cl5cve0lgv5jnrrlv --output-document HiFi.fastq.gz 181 | ``` 182 | After the download is finished you will have a HiFi fastq file called `HiFi.fastq.gz`. To run the test analysis, run the following command: 183 | ``` 184 | Full/Path/To/princess all --directory $PWD/analysis --ReadType ccs --ref Path/To/Reference/genome.fa --jobs 7 --sampleFiles $PWD/HiFi.fastq.gz --latency-wait 200 -p 185 | ``` 186 | all: The command to run the full analysis; for other options please run `princess -h`. 187 | --directory: The output directory; it can have any name, but use the full path. In this example the output is under the current directory. 188 | --ReadType: Read type; the supported read types are clr, ccs, and ont. 189 | --ref: Path to the reference; please run samtools faidx on the reference before running Princess. 190 | --jobs: Number of jobs to run on the cluster. 191 | --sampleFiles: The sample fastq file we downloaded; more than one file can be given, compressed or not. 192 | --latency-wait 200 -p: Additional Snakemake options to wait 200 seconds before collecting output. 193 | 194 | 195 | 196 | 197 | 198 | 199 | ## Output 200 | 201 | Princess will create these directories: 202 | - align contains a directory [minimap or ngmlr] based on the aligner that was specified. 203 | - sv contains the structural variant file sv/minimap/sniffles.vcf 204 | - snp contains single nucleotide variant calls per chromosome 205 | - phased contains phased variants 206 | - stat contains statistics 207 | - meth contains methylation info (if the user chooses to run methylation) 208 | 209 | ## Collect benchmark Statistics 210 | ``` 211 | cd benchmark # The benchmark directory contains timings for all the analyses that were done by PRINCESS 212 | find "$PWD" -type f | grep -v "myBenchMark.txt" > myBenchMark.txt 213 | while read -r line; do n=$(echo $line | awk -v FS=/ '{print $(NF-1)"-"$(NF)}'); awk -v f=$line -v o=$n 'NR!=1 {print o"\t"$(NF)}' $line ;done < myBenchMark.txt 214 | ``` 215 | 216 | 217 | --- 218 | 219 | ## Converting from PBS to Slurm 220 | 1- Please ensure that you modify the `cluster/cluster_config.yaml` to specify the appropriate long-running queue. For example, you can set the long queue as follows: 221 | `long: &long_queue long_queue` 222 | where `long_queue` is the name of a queue that can run jobs for a long time. Similarly, you can set the short queue in the following way: 223 | `short: &short_queue short_queue`. Please refer to your cluster system administrator for more details. 224 | 2- Please ensure that you change `cluster/config.yaml` from `cluster-status: "pbs_status.py"` to `cluster-status: "slurm_status.py"`. 225 | 3- In the `cluster/key_mapping.yaml` file, please change `system: "pbs"` to `system: "slurm"`. 226 | 4- Finally, in the `cluster/cluster_config.yaml` file, I set the CPUs and memory for each job to suit my cluster. 227 | E.g. 228 | ``` 229 | minimap2: 230 | queue: *long_queue 231 | time: "72:00:00" 232 | nCPUs: "12" 233 | mem: 20G 234 | ``` 235 | Here, I am using 12 CPUs, 20G memory, and the job running time is "72:00:00" maximum (three days).
You may need to use a different configuration based on the resource availability in your cluster. Please refer to your system administrator for more details. 236 | 237 | -------------------------------------------------------------------------------- /scripts/update_sv_hp_ps.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python3 3 | 4 | """ 5 | This script updates a VCF file to add both the HP haplotag and PS phasing block info fields. It takes as input a VCF file, HP, and PS. 6 | """ 7 | import argparse 8 | import sys, os 9 | from operator import itemgetter 10 | from collections import Counter 11 | 12 | # Python program to print 13 | # green text with red background 14 | # 15 | # from colorama import init 16 | # from termcolor import colored 17 | # 18 | # init() 19 | 20 | 21 | 22 | 23 | def get_args(): 24 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. use command -h for info.", 25 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 26 | description='Phase SVs using haplotyped reads in tab format', 27 | add_help=True, ) 28 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 29 | # parser.add_argument('input', help='Input file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) 30 | # parser.add_argument('output', help='Output file', nargs="?", type=argparse.FileType('w'), default=sys.stdout) 31 | 32 | parser.add_argument('input', nargs='?', help="Structural variant vcf file", 33 | type=argparse.FileType('r'), 34 | default=sys.stdin) 35 | parser.add_argument('hp', nargs='?', help="tab delimited read\thp\tps file", 36 | type=argparse.FileType('r')) 37 | parser.add_argument('output', nargs='?', help="Output file, PS and HP will be added.", 38 | type=argparse.FileType('w+'), 39 | default=sys.stdout) 40 | parser.add_argument('-c', '--conflict', dest='ignore_conflict', metavar='Max Conflict Reads', type=int, help='Minimum number of conflict reads to ignore', default=0) 41 | 42 | parser.set_defaults(func=update_vcf) 43 | 44 | # if no argument is given, print help. 45 | if len(sys.argv) == 1 and sys.stdin.isatty(): # sys.stdin.isatty() returns false if there's something in stdin 46 | parser.print_help(sys.stderr) 47 | sys.exit(1) 48 | 49 | args = parser.parse_args() 50 | 51 | 52 | if 'func' in args: 53 | args.func(args) 54 | else: 55 | parser.print_help() 56 | 57 | def update_vcf(args): 58 | # check if the input comes from stdin 59 | if not sys.stdin.isatty(): # something is being piped through stdin 60 | if args.input.name.endswith("gz"): 61 | import gzip 62 | myfile = gzip.open(args.input.name, 'rt') # 't' (text mode) is optional; it is the default.
63 | else: 64 | myfile = args.input 65 | else: 66 | myfile = args.input 67 | 68 | # read the Haplotyped reads file as dictionary 69 | hp_dic = {} 70 | with args.hp as hp_in: 71 | for line in hp_in: 72 | id, hp, ps = line.split() 73 | hp_dic[id] = [hp.rsplit(":", 1)[-1], ps.rsplit(":", 1)[-1]] # read -> [hp, ps] 74 | 75 | 76 | with myfile as data_in, args.output as data_out: 77 | for line in data_in: 78 | reads = [] 79 | if line.startswith('##'): 80 | data_out.write(line) 81 | elif line.startswith("#"): 82 | # data_out.write("##INFO=\n") 83 | data_out.write("##INFO=\n") 84 | data_out.write("##FORMAT=\n") 85 | data_out.write(line) 86 | else: 87 | line_split = line.split() 88 | if line_split[-1].split(":", 1)[0] == "1/1" or line_split[-1].split(":", 1)[0] == "0/0" or line_split[-1].split(":", 1)[0] == "./.": # no gt to phase 89 | data_out.write("{}\n".format("\t".join(line_split))) 90 | elif line_split[-1].split(":", 1)[0] == "0/1" or line_split[-1].split(":", 1)[0] == "1/0": 91 | reads = [i for i in line_split[7].split(";") if i.startswith("RNAMES")][0].split("=",1)[-1].split(",") 92 | svtype = [i for i in line_split[7].split(";") if i.startswith("SVTYPE")][0].split("=",1)[-1].split(",") 93 | svlen = [i for i in line_split[7].split(";") if i.startswith("SVLEN")][0].split("=",1)[-1].split(",") 94 | #reads = line_split[7].split(";")[10].split(",") #info field -> reads 95 | #reads[0] = reads[0].split("=")[-1] 96 | myvalues = list(map(hp_dic.get, reads)) # list of lists first element id hp second is ps or None on case there are no reads with hp and ps to support this sv 97 | id = line_split[2] 98 | # If any value not None 99 | # print(f'{line_split[1]}\t{id}\t{svtype[0]}\t{svlen[0]}\t{myvalues}') 100 | if any(myvalues): # any value is not none 101 | # print(f'{id}\t{svtype[0]}\t{svlen[0]}\t{myvalues}') 102 | ps_dict = categorize_ps_up(myvalues, args.ignore_conflict) 103 | if 0 in list(ps_dict.values()): # means that the hp is conflicting do not update anything and add flag that is is conflicting. 
104 | line_split[7] = "{info};CONFLICT={conflict}".format(info=line_split[7], conflict=1) 105 | line_split[-2] = "{}:{}".format(line_split[-2], "PS") 106 | line_split[-1] = "{}:{}".format(line_split[-1], ",".join(ps_dict.keys())) 107 | data_out.write("{}\n".format("\t".join(line_split))) 108 | else: # update the gt field and ps to sv 109 | line_split[7] = "{info};CONFLICT={conflict}".format(info=line_split[7], conflict=0) 110 | line_split[-2] = "{}:{}".format(line_split[-2], "PS") 111 | # if values are negative then it is hp=1 1|0 else it is hp2 0|1 112 | # line_split[-1] = line_split[-1].replace("/", "|") 113 | hp_new_value = line_split[-1].split(':') 114 | try: 115 | if list(ps_dict.values())[0] < 1: # haplotype 1 116 | hp_new_value[0] = "1|0" 117 | else: 118 | hp_new_value[0] = "0|1" 119 | except Exception as e: 120 | print(e) 121 | 122 | 123 | hp_new_value = ":".join(hp_new_value) 124 | line_split[-1] = "{}:{}".format(hp_new_value, ",".join(ps_dict.keys())) 125 | data_out.write("{}\n".format("\t".join(line_split))) 126 | else: # all are none 127 | line_split[7] = "{info};CONFLICT=2".format(info=line_split[7]) 128 | line_split[-2] = "{}:{}".format(line_split[-2], "PS") 129 | line_split[-1] = "{}:{}".format(line_split[-1], ".") 130 | data_out.write("{}\n".format("\t".join(line_split))) 131 | 132 | 133 | # Test case [['1', '23200'], ['2', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['1', '23200'], ['1', '23200'], ['2', '23200'], ['2', '23200'], ['2', '23200']] 134 | # 135 | # [['1', '13164067'], ['1', '13164067'], ['1', '13164067'], ['1', '13164067'], ['1', '13164067'], ['2', '12948612'], ['1', '13164067'], ['1', '13164067'], ['2', '12948612'], ['1', '13164067'], ['2', '12948612']] 136 | def categorize_ps(myvalues): 137 | myvalues = [i for i in myvalues if i is not None] # remove None 138 | ps_dict = {} 139 | for i in myvalues: 140 | hp = int(i[0]) 141 | ps = i[1] 142 | if ps in ps_dict: 143 | if hp == 1: 144 | if ps_dict[ps] < 0: 145 | ps_dict[ps] = ps_dict[ps] - 1 146 | else: #conflict 147 | ps_dict[ps] = 0 148 | 149 | else: # means that it is haplotype 2 hp=2 150 | if ps_dict[ps] > 0: 151 | ps_dict[ps] = ps_dict[ps] + 1 152 | else: #conflict 153 | ps_dict[ps] = 0 154 | else: 155 | if hp == 1: 156 | ps_dict[ps] = -1 157 | else: 158 | ps_dict[ps] = 1 159 | return ps_dict 160 | 161 | 162 | def most_frequent(List): 163 | return max(set(List), key = List.count) 164 | 165 | 166 | def categorize_ps_conflict(myvalues, max_conflict): 167 | ps_dict = {} 168 | myvalues = [i for i in myvalues if i is not None] # remove None 169 | hp = [i[0] for i in myvalues] 170 | hp_count = {i: hp.count(i) for i in hp} # i.e {'1': 3, '2': 2} or {'1': 3} 171 | # TODO: chek if we have two hap with differnt phase block they should not be counted as conflict 172 | if len(hp_count) > 1: # they are conflicting 173 | if min(hp_count.values()) <= max_conflict: # we are less than or equal the minium accepted number of conflict reads 174 | # calculate PS and HP 175 | for i in myvalues: 176 | # get the hp based on the hoghest number 177 | hp = max(hp_count, key = hp_count.get) # either 1 or 2 178 | ps = most_frequent([i[1] for i in myvalues if i[0] == hp]) 179 | ps_dict[ps] = int(hp) if int(hp) > 1 else -1 180 | else: # Number of reads conflicting are higher than user suggestion 181 | ps = most_frequent([i[1] for i in myvalues]) 182 | ps_dict[ps] = 0 # ['ps', 0] 183 | else: 184 | # Data are not conflict calculate normally: 185 | 
ps_dict = categorize_ps(myvalues) 186 | return ps_dict 187 | 188 | 189 | def categorize_ps_up(myvalues: 'list', min_conflict: 'int') -> {}: 190 | myvalues = [i for i in myvalues if i is not None] # remove None 191 | ps_dict = {} 192 | if min_conflict > 0: 193 | min_conflict += 1 194 | for i in myvalues: 195 | hp = int(i[0]) 196 | ps = i[1] 197 | if ps in ps_dict: 198 | if hp == 1: 199 | if ps_dict[ps] < 0: 200 | ps_dict[ps] = ps_dict[ps] - 1 201 | else: #conflict 202 | if min_conflict == 0: 203 | ps_dict[ps] = 0 204 | else: 205 | min_conflict -= 1 206 | ps_dict[ps] -= 1 207 | else: # means that it is haplotype 2 hp=2 208 | if ps_dict[ps] > 0: 209 | ps_dict[ps] = ps_dict[ps] + 1 210 | else: #conflict 211 | if min_conflict == 0: 212 | ps_dict[ps] = 0 213 | else: 214 | min_conflict -= 1 215 | ps_dict[ps] += 1 216 | else: 217 | if hp == 1: 218 | ps_dict[ps] = -1 219 | else: 220 | ps_dict[ps] = 1 221 | return ps_dict 222 | 223 | def main(): 224 | args = get_args() 225 | 226 | 227 | 228 | if __name__ == "__main__": 229 | # del_test_840 = [['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175']] 230 | # 231 | # hp = [i[0] for i in del_test_840] 232 | # hp_count = {i: hp.count(i) for i in hp} 233 | # 234 | # print(f'Number of reads supports HP {hp_count}') 235 | # 236 | # print(f'CONF 19 DEL 840 {categorize_ps_up(del_test_840, 19)}') 237 | # exit(1) 238 | main() 239 | -------------------------------------------------------------------------------- /scripts/phasing_report_update_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | bamtools merge should be used before this script. where the vcf file should be merged with both paternal and maternal, respectively. 5 | """ 6 | import argparse 7 | import sys, re 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. 
use command -h for info.", 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 12 | description='Produce phasing report', 13 | add_help=True, ) 14 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 15 | # parser.add_argument('input', help='Input file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) 16 | # parser.add_argument('output', help='Output file', nargs="?", type=argparse.FileType('w'), default=sys.stdout) 17 | 18 | # parser.add_argument('input', nargs='?', help="Phased vcf file", 19 | # type=argparse.FileType('r'), 20 | # default=sys.stdin) 21 | # parser.add_argument('output', nargs='?', help="Output file if no file result will be directed to stander output", 22 | # type=argparse.FileType('w+'), 23 | # default=sys.stdout) 24 | parser.add_argument('-i', '--input', nargs='?', help="Phased vcf file", required=True) 25 | parser.add_argument('-o', '--output', nargs='?', help="Output file for blocks", required=True) 26 | parser.add_argument('-s', '--stat', nargs='?', help="Output statistics file for phased datat", required=True) 27 | parser.add_argument("-u", '--update_snps', help="Output updated snp file", required=True) 28 | parser.add_argument('-t', '--tolerance', help="Percent of tolerance.", type=int, action='store', default=5) 29 | parser.add_argument('-n', '--min_snps', help="Minimum Number of SNPs per block.", type=int, action='store', default=10) 30 | 31 | parser.set_defaults(func=phase_filtering) 32 | args = parser.parse_args() 33 | if 'func' in args: 34 | args.func(args) 35 | else: 36 | parser.print_help() 37 | 38 | 39 | 40 | def phase_filtering(args): 41 | # check if the input from stdin 42 | # if sys.stdin.isatty(): 43 | # if args.input.name.endswith("gz"): 44 | # import gzip 45 | # myfile = gzip.open(args.input.name, 'rt') # t is not a must normally it is default. 
46 | # else: 47 | # myfile = args.input 48 | # else: 49 | myfile = args.input 50 | if myfile.endswith("gz"): 51 | import gzip 52 | myfile_open = gzip.open(myfile, 'rt', encoding='utf-8') 53 | #myfile_open = gzip.open(myfile, 'rt', encoding='utf-16') 54 | else: 55 | myfile_open = open(myfile, 'r') 56 | 57 | phasing_dictionary = {} 58 | 59 | 60 | with myfile_open as data_in, open(args.output, 'w') as data_out: 61 | nonphased_hetero = 0 62 | snp_number = 0 63 | maternal = 0 64 | paternal = 0 65 | homo_number = 0 66 | unknown_phased = 0 67 | non_sample_snp = 0 68 | for line in data_in: 69 | if line.startswith('#'): 70 | pass # chnage to print in output file 71 | else: 72 | snp_number += 1 73 | line_split = line.split() 74 | gt_flag, sample, father, mother = line_split[8:12] 75 | # gt_flag, sample, father, mother = line_split[8:12] 76 | if "1/1" in sample: 77 | homo_number +=1 78 | # 9 is first sample followed by father and mother 79 | # ID f0/1 f1/0 m0/1 m1/0 80 | # paternal 81 | if "|" in sample and bool(re.search(r'\d',father)) and not bool(re.search(r'\d', mother)): 82 | paternal += 1 83 | gt_index = gt_flag.split(":").index("PS") 84 | id = sample.split(":")[gt_index] 85 | 86 | f0_1 = 0 87 | f1_0 = 0 88 | if sample[0] == "0": 89 | f0_1 = 1 90 | else: 91 | f1_0 = 1 92 | 93 | if id not in phasing_dictionary: 94 | phasing_dictionary[id] = [f0_1, f1_0 , 0, 0] 95 | else: 96 | phasing_dictionary[id][0] += f0_1 97 | phasing_dictionary[id][1] += f1_0 98 | # maternal 99 | elif "|" in sample and bool(re.search(r'\d', mother)) and not bool(re.search(r'\d',father)): 100 | maternal += 1 101 | gt_index = gt_flag.split(":").index("PS") 102 | id = sample.split(":")[gt_index] 103 | 104 | m0_1 = 0 105 | m1_0 = 0 106 | if sample[0] == "0": 107 | m0_1 = 1 108 | else: 109 | m1_0 = 1 110 | 111 | if id not in phasing_dictionary: 112 | phasing_dictionary[id] = [0, 0, m0_1, m1_0] 113 | else: 114 | phasing_dictionary[id][2] += m0_1 115 | phasing_dictionary[id][3] += m1_0 116 | # Unknown if it is right or wrong cause no equivliant in mother or father 117 | elif "|" in sample: 118 | unknown_phased += 1 119 | elif "1/0" in sample or "0/1" in sample: 120 | #elif "1/1" not in sample and "." 
not in sample: 121 | nonphased_hetero += 1 # is it hetero or homo zygot 122 | elif sample.startswith("."): 123 | non_sample_snp += 1 124 | 125 | 126 | for k, v in phasing_dictionary.items(): 127 | data_out.write("{}\t{}\n".format(str(k), "\t".join(map(str, v)))) 128 | # print("Number SNPs: {snp}\nUnknown phased case: {unknown}\n \ 129 | # Number of non-phased Hetero: {not_phased_hetero}\n \ 130 | # Maternal phased: {mother}\nPaternal phased: \ 131 | # {father}\nDone".format(unknown=unknown_phased, not_phased_hetero=nonphased_hetero, snp=snp_number, mother=maternal, father=paternal)) 132 | with open(args.stat, 'w') as stat_out: 133 | stat_out.write("\ 134 | Number SNPs: {snp}\n\ 135 | Homozygot number 1/1: {homo}\n\ 136 | Unknown phased case: {unknown_cases}\n\ 137 | Number of non-phased Hetero: {not_phased_hetero}\n\ 138 | Total number of Phased SNPs: {total}\n\ 139 | Maternal phased: {mother}\n\ 140 | Paternal phased:{father}\n\ 141 | SNP only in paternal: {no_snp}".format(unknown_cases=unknown_phased, homo=homo_number, not_phased_hetero=nonphased_hetero, snp=snp_number, mother=maternal, father=paternal, total=str(maternal+paternal), no_snp = non_sample_snp)) 142 | 143 | 144 | # updating vcf phased snps 145 | chr = "" 146 | new_block_value = "" 147 | if args.update_snps: 148 | if myfile.endswith("gz"): 149 | import gzip 150 | myfile_open = gzip.open(myfile, 'rt', encoding='utf-8') 151 | #myfile_open = gzip.open(myfile, 'rt', encoding='utf-16') 152 | else: 153 | myfile_open = open(myfile, 'r') 154 | 155 | with myfile_open as data_in, open(args.update_snps , "w") as output: 156 | for line in data_in: 157 | if line.startswith('##'): 158 | output.write(line) 159 | elif line.startswith("#"): 160 | output.write("##INFO=\n") 161 | output.write("{}\n".format("\t".join(line.split()[:-2]))) 162 | else: 163 | line_split = line.split() 164 | chr_value = line_split[0] 165 | # identify wich chromosome we are using. 166 | if chr_value != chr: 167 | chr = chr_value 168 | first_block = True 169 | snp_format, format_value = line_split[8:10] 170 | format_value_split = format_value.split(':') 171 | if "PS" in snp_format and "|" in format_value.split(":")[0]: # It is phased 172 | # pritn(line) 173 | # gt_value = format_value.split(":")[0] 174 | block_value = format_value.split(":")[snp_format.split(":").index("PS")] 175 | block_not_conflict = False 176 | if block_value in phasing_dictionary: # this snp have a similr one in parents vcf file. 177 | block_not_conflict, gt_value = not_conflecting(phasing_dictionary[block_value], args) 178 | # update the PS value. 
179 | if block_not_conflict: 180 | # add +N of snps supoorting the block to p-snp 181 | # udate the PS tag for each chromsome to be the first value in the first block (ps) 182 | # write the updated line 183 | # Update gt 184 | if first_block: 185 | first_block = False 186 | new_block_value = block_value 187 | 188 | format_value_split[0] = gt_value 189 | # Update PS 190 | format_value_split[snp_format.split(":").index("PS")] = new_block_value 191 | line_split[9] = ":".join(format_value_split) 192 | line_split[7] = line_split[7] + ";parental-snps=" + str( 193 | sum(phasing_dictionary[block_value])) 194 | output.write("{}\n".format("\t".join(line_split[:-2]))) 195 | # print("block --> "+ new_block_value) 196 | else: 197 | # add -1 of snps supoorting the block to p-snp 198 | # udate the PS tag for each chromsome to be the first value in the first block (ps) 199 | # write the updated line 200 | # Update gt 201 | format_value_split[0] = gt_value 202 | # Update PS 203 | # format_value_split[snp_format.split(":").index("PS")] = new_block_value 204 | line_split[9] = ":".join(format_value_split) 205 | line_split[7] = line_split[7] + ";parental-snps=-" + str( 206 | sum(phasing_dictionary[block_value])) # add sum 207 | output.write("{}\n".format("\t".join(line_split[:-2]))) 208 | # print("block --> "+ new_block_value) 209 | else: 210 | # No information form parental about it 211 | # keep it the same add 0 to p-snp flag 212 | line_split[7] = line_split[7] + ";parental-snps=0" 213 | output.write("{}\n".format("\t".join(line_split[:-2]))) 214 | elif bool(re.search(r'\d',format_value)): 215 | # exit(format_value.split(":")) 216 | # elif "/" in format_value.split(":")[0]: 217 | # Write the line without change 218 | line_split[7] = line_split[7] + ";parental-snps=." 219 | output.write("{}\n".format("\t".join(line_split[:-2]))) 220 | # else: 221 | # print(line) 222 | 223 | 224 | 225 | 226 | 227 | def hasNumbers(inputString): 228 | return any(char.isdigit() for char in inputString) 229 | 230 | def is_not_conflict(block_snp_list, args): 231 | all_snps_in_block = sum(block_snp_list) 232 | if all_snps_in_block >= args.min_snps: 233 | # assuming that the list is formed like that F0|1 F1|0 M0|1 M1|0 234 | tolerance_percentage = args.tolerance * all_snps_in_block/100 # tolerance is 5% 235 | index_m, value_m = max(enumerate(block_snp_list[:2]), key=operator.itemgetter(1)) 236 | index_f, value_f = max(enumerate(block_snp_list[2:]), key=operator.itemgetter(1)) 237 | 238 | if block_snp_list.count(0) == 3 or ( ( (index_m == 0 and index_f == 1) or (index_m == 1 and index_f == 0) ) and ( any(i <= 5*sum(block_snp_list[:2])/100 for i in block_snp_list[:2]) and any(i <= 5*sum(block_snp_list[2:])/100 for i in block_snp_list[2:])) ): 239 | return (True, "{}|{}".format(index_m, index_f)) # it means 0|1 or 1|0 240 | 241 | def not_conflecting(block, args): 242 | tolerance = args.tolerance / sum(block) * 100 243 | max_index = block.index(max(block)) # bigest value in snps 244 | # assuming that the list is formed like that F0|1 F1|0 M0|1 M1|0 245 | # if sum(block) >= args.min_snps: 246 | if (max_index == 0 or max_index==3): 247 | non_conflict = block[0] + block[3] 248 | conflict = block[1] + block[2] 249 | gt = "0|1" # parental|maternal 250 | elif (max_index == 1 or max_index ==2): 251 | non_conflict = block[1] + block[2] 252 | conflict = block[0] + block[3] 253 | gt = "1|0" 254 | return (conflict / (conflict + non_conflict) * 100 <= tolerance, gt) 255 | # else: 256 | # return(False, "") 257 | 258 | 259 | 260 | 261 | def main(): 
262 | args = get_args() 263 | 264 | 265 | 266 | if __name__ == "__main__": 267 | main() 268 | -------------------------------------------------------------------------------- /modules/snp.smk: -------------------------------------------------------------------------------- 1 | ######################### 2 | ###### SNPs RULES ###### 3 | ######################### 4 | 5 | 6 | #### CLAIR ####### 7 | ################## 8 | 9 | # CLAIR Parameters 10 | #================= 11 | 12 | # if config["clair_model"]: 13 | # training_data=config["clair_model"] 14 | def platform(wildcards): 15 | if config['read_type'] == "ccs": 16 | return "hifi" 17 | elif config['read_type'] == "ont": 18 | return "ont" 19 | elif config['read_type'] == "clr": 20 | return "hifi" 21 | else: 22 | print("Unknow data type, supported format are: ont, ccs, and clr") 23 | exit(1) 24 | 25 | def get_model(conda_dir): 26 | training_data = "" 27 | if config['read_type'] == "ccs": 28 | training_data=config["clair_model"] if config["clair_model"] else None 29 | elif config['read_type'] == "ont": 30 | training_data=config["clair_model"] if config["clair_model"] else None 31 | elif config['read_type'] == "clr": 32 | training_data=config["clair_model"] if config["clair_model"] else None 33 | else: 34 | print("Unknown data type, supported format are: ont, ccs, and clr") 35 | exit(1) 36 | return training_data 37 | # if config['read_type'] == "ccs": 38 | # # training_data=config["training_data_ccs"] 39 | # platform="hifi" 40 | # training_data=config["clair_model"] if config["clair_model"] else os.path.join(os.environ['CONDA_PREFIX'], "bin/models/hifi") 41 | # elif config['read_type'] == "ont": 42 | # # training_data=config["training_data_ont"] 43 | # platform="ont" 44 | # training_data=config["clair_model"] if config["clair_model"] else os.path.join(os.environ['CONDA_PREFIX'], "bin/models/ont") 45 | # # training_data="/bin/models/ont" 46 | # elif config['read_type'] == "clr": 47 | # platform="hifi" 48 | # # training_data=config["training_data_clr"] 49 | # training_data=config["clair_model"] if config["clair_model"] else os.path.join(os.environ['CONDA_PREFIX'], "bin/models/hifi") 50 | # # training_data="/bin/models/hifi" 51 | # else: 52 | # print("Unknow data type, supported format are: ont, ccs, and clr") 53 | # exit(1) 54 | 55 | 56 | # CLAIR RULE 57 | #=========== 58 | 59 | 60 | # CLAIR CHUNK RULE 61 | #================= 62 | 63 | # [ ! -f {output.gvcf} ] && cp {output.vcf} {output.gvcf} && cp {output.vcf}.tbi {output.gvcf}.tbi &&\ 64 | if config['gvcf_snv']: 65 | rule callSNVsChunk: 66 | """ 67 | Calling SNPs using clair in case gVCF file is required. 
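    If no model is supplied through the clair_model config value, the model bundled with the
    Clair3 conda environment ($CONDA_PREFIX/bin/models/<platform>) is used instead (see the shell block below).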
68 | """ 69 | input: 70 | bam=data_dir + "/align/{aligner}/data.bam", 71 | data_index=data_dir + "/align/{aligner}/data.bam.bai", 72 | reference=REFERENCES, 73 | output: 74 | vcf = temp(data_dir + "/snp/{aligner}/chr.split.{chr}_{region,\d+}/merge_output.vcf.gz"), 75 | gvcf = temp(data_dir + "/snp/{aligner}/chr.split.{chr}_{region,\d+}/merge_output.gvcf.gz") 76 | params: 77 | train_data = lambda wildcards: get_model(os.environ['CONDA_PREFIX']), 78 | platform = platform, 79 | start = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region)], 80 | end = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region) + 1], 81 | gvcf = "--gvcf", 82 | benchmark: data_dir + "/benchmark/snp/{aligner}/chr.split.{chr}_{region}/{chr}_{region}.benchmark.txt" 83 | conda: CLAIR_ENV 84 | log: data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/data.split.{chr}_{region}.log" 85 | threads: config['clair_threads'] 86 | shell: 87 | """ 88 | if [ {params.train_data} == "None" ] 89 | then 90 | model="$CONDA_PREFIX/bin/models/{params.platform}" 91 | else 92 | model="{params.train_data}" 93 | fi 94 | 95 | echo $'{wildcards.chr}\t{params.start}\t{params.end}' > {wildcards.chr}.{params.start}.{params.end}.bed &&\ 96 | run_clair3.sh \ 97 | --bam_fn {input.bam} \ 98 | --ref_fn {input.reference} \ 99 | --threads {threads} \ 100 | --platform {params.platform} \ 101 | --model_path $model \ 102 | --output $PWD/snp/{wildcards.aligner}/chr.split.{wildcards.chr}_{wildcards.region} \ 103 | --bed_fn={wildcards.chr}.{params.start}.{params.end}.bed \ 104 | {params.gvcf} > {log} 2>&1 \ 105 | &&\ 106 | if [ ! -f {output.gvcf} ]; then 107 | cp {output.vcf} {output.gvcf} 108 | fi &&\ 109 | rm {wildcards.chr}.{params.start}.{params.end}.bed 110 | """ 111 | else: 112 | rule callSNVsChunk: 113 | """ 114 | Calling SNPs using clair 115 | """ 116 | input: 117 | bam=data_dir + "/align/{aligner}/data.bam", 118 | data_index=data_dir + "/align/{aligner}/data.bam.bai", 119 | reference=REFERENCES, 120 | output: 121 | vcf = temp(data_dir + "/snp/{aligner}/chr.split.{chr}_{region,\d+}/merge_output.vcf.gz") 122 | params: 123 | train_data = lambda wildcards: get_model(os.environ['CONDA_PREFIX']), 124 | platform = platform, 125 | start = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region)], 126 | end = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region) + 1], 127 | benchmark: data_dir + "/benchmark/snp/{aligner}/chr.split.{chr}_{region}/{chr}_{region}.benchmark.txt" 128 | conda: CLAIR_ENV 129 | log: data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/data.split.{chr}_{region}.log" 130 | threads: config['clair_threads'] 131 | shell: 132 | """ 133 | if [ {params.train_data} == "None" ] 134 | then 135 | model="$CONDA_PREFIX/bin/models/{params.platform}" 136 | else 137 | model="{params.train_data}" 138 | fi 139 | 140 | echo $'{wildcards.chr}\t{params.start}\t{params.end}' > {wildcards.chr}.{params.start}.{params.end}.bed &&\ 141 | run_clair3.sh \ 142 | --bam_fn {input.bam} \ 143 | --ref_fn {input.reference} \ 144 | --threads {threads} \ 145 | --platform {params.platform} \ 146 | --model_path $model \ 147 | --output $PWD/snp/{wildcards.aligner}/chr.split.{wildcards.chr}_{wildcards.region} \ 148 | --bed_fn={wildcards.chr}.{params.start}.{params.end}.bed > {log} 2>&1 \ 149 | && rm {wildcards.chr}.{params.start}.{params.end}.bed 150 | """ 151 | # resources: 152 | # mem_mb=lambda wildcards, attempt: 1024 * (attempt + 1) if attempt < 3 153 | # --model_path $CONDA_PREFIX{params.train_data} \ 154 | # --model_path 
{params.train_data} \ 155 | 156 | #### CALL VARIANT BY CHUNKS ####### 157 | ################################### 158 | 159 | if config['gvcf_snv']: 160 | ## TODO: This function will raise an error if there is missing gvcf, missing gvcf results from calling varaint in alt contigs where there are no variants this no gVCF, solution is to check if the vcf file is empty then just copy it with gVCF name. 161 | # if $(head -n 1000 {input} | grep -q -v "#") ; then 162 | # vcfcat {input} | vcfstreamsort > {params.temp_chr}\ 163 | # && first_max=$(find_max {params.temp_chr} {params.read_type})\ 164 | # && threshold=$(filsn {params.temp_chr} $first_max)\ 165 | # && awk -v threshold=$threshold '/^#/{{print}} !/^#/{{if ( $6 >= threshold ) {{print $0}}}}' {params.temp_chr} | awk '/^#/ {{ print }} !/^#/ {{ if ($4 != $5 ) {{ print }} }}' > {output} 166 | # else 167 | # cp {input} {output} 168 | # fi 169 | rule concatChromosome: 170 | """ 171 | Concat split chromosomes regions in case of gVCF file is required. 172 | """ 173 | input: 174 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/merge_output.vcf.gz", aligner=wildcards.aligner, chr=wildcards.chr, region=list(range(0,len(chr_range[wildcards.chr]) - 1))), 175 | gvcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/merge_output.gvcf.gz", aligner=wildcards.aligner, chr=wildcards.chr, region=list(range(0,len(chr_range[wildcards.chr]) - 1))), 176 | output: 177 | vcf = temp(data_dir + "/snp/{aligner}/data.{chr}.vcf"), 178 | gvcf = temp(data_dir + "/snp/{aligner}/data.{chr}.gvcf") 179 | message: "Concat variant split per Chromosome" 180 | params: 181 | tmp_dir=config["tmp_directory"], 182 | conda: VARIANT_ENV 183 | benchmark: data_dir + "/benchmark/snp/{aligner}/{chr}.benchmark.txt" 184 | shell:""" 185 | if [[ ! -z "{params.tmp_dir}" ]]; then 186 | bcftools concat {input.vcf} | bcftools sort -T {params.tmp_dir} > {output.vcf} &&\ 187 | bcftools concat {input.gvcf} | bcftools sort -T {params.tmp_dir} > {output.gvcf} 188 | else 189 | bcftools concat {input.vcf} | bcftools sort > {output.vcf} &&\ 190 | bcftools concat {input.gvcf} | bcftools sort > {output.gvcf} 191 | fi 192 | """ 193 | else: 194 | rule concatChromosome: 195 | """ 196 | Concat split chromosomes regions 197 | """ 198 | input: 199 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/merge_output.vcf.gz", aligner=wildcards.aligner, chr=wildcards.chr, region=list(range(0,len(chr_range[wildcards.chr]) - 1))), 200 | output: 201 | vcf = temp(data_dir + "/snp/{aligner}/data.{chr}.vcf") 202 | message: "Concat variant split per Chromosome" 203 | params: 204 | tmp_dir=config["tmp_directory"], 205 | conda: VARIANT_ENV 206 | benchmark: data_dir + "/benchmark/snp/{aligner}/{chr}.benchmark.txt" 207 | shell:""" 208 | if [[ ! -z "{params.tmp_dir}" ]]; then 209 | bcftools concat {input.vcf} | bcftools sort -T {params.tmp_dir} > {output.vcf} 210 | else 211 | bcftools concat {input.vcf} | bcftools sort > {output.vcf} 212 | fi 213 | """ 214 | 215 | 216 | #### UPDATE HEADER ####### 217 | ########################## 218 | 219 | rule updateHeader: 220 | """ 221 | Update the phased SNPs in phased/aligner/data.vcf 222 | Where the PS in header defined as Integer where it should be String. 
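    For illustration (the Description wording here is hypothetical), a header line such as
    ##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set"> becomes
    ##FORMAT=<ID=PS,Number=1,Type=String,Description="Phase set"> after the sed command below.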
223 | Result: phased/aligner/data_update_header.vcf 224 | Will be used later in the mergeParentalSNPs rule 225 | """ 226 | input:data_dir + "/{sample}.vcf" 227 | output:data_dir + "/{sample}_update_header.vcf" 228 | message:"Update header: change the PS FORMAT type from Integer to String" 229 | shell:""" 230 | sed 's/ID=PS,Number=1,Type=Integer,Descri/ID=PS,Number=1,Type=String,Descri/' {input} > {output} 231 | """ 232 | 233 | 234 | #### INDEXING VCF FILE ######## 235 | ############################### 236 | 237 | rule vcfIndex: 238 | """ 239 | Index VCF file. 240 | """ 241 | input: data_dir + "/{sample}.vcf.gz" 242 | output: data_dir + "/{sample}.vcf.gz.tbi" 243 | message: "Indexing vcf file {input}" 244 | conda: VARIANT_ENV 245 | shell:""" 246 | tabix -p vcf {input} 247 | """ 248 | 249 | rule gvcfIndex: 250 | """ 251 | Index gVCF file. 252 | """ 253 | input: data_dir + "/{sample}.gvcf.gz" 254 | output: data_dir + "/{sample}.gvcf.gz.tbi" 255 | message: "Indexing gvcf file {input}" 256 | conda: VARIANT_ENV 257 | shell:""" 258 | tabix -p vcf {input} 259 | """ 260 | 261 | 262 | #### MERGING PHASED VCF FILE WITH PARENTAL SNPs ######## 263 | ######################################################## 264 | 265 | rule mergeParentalSNPs: 266 | """ 267 | If the user wants to update the identified SNVs, this will be the first rule in the sequence. 268 | Input: phased SNVs after updating the header with update_header, then bgzipped and indexed with 269 | bgzip_vcf and vcf_index respectively. 270 | """ 271 | input: 272 | sample_snps = data_dir + "/phased/{aligner}/data_update_header.vcf.gz", 273 | sample_snps_index = data_dir + "/phased/{aligner}/data_update_header.vcf.gz.tbi", 274 | maternal_snps = config['maternal_snps'], 275 | paternal_snps = config['paternal_snps'], 276 | output: data_dir + "/phased/{aligner}/data_paternal_maternal.vcf.gz" 277 | message: "Merging VCFs from sample, paternal, and maternal respectively" 278 | benchmark: data_dir + "/benchmark/snp/{aligner}/merge_parental.benchmark.txt" 279 | conda: VARIANT_ENV 280 | shell:""" 281 | bcftools merge {input.sample_snps} {input.paternal_snps} {input.maternal_snps} | bgzip > {output} 282 | """ 283 | 284 | #### UPDATING PHASED SNPs ######## 285 | ################################## 286 | 287 | rule updateSNPs: 288 | """ 289 | Here we take the input from mergeParentalSNPs; it needs to be unzipped first.
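    The update script (presumably scripts/phasing_report_update_vcf.py, shown earlier) rewrites GT and PS
    per phase block according to the parental genotype counts, appends a parental-snps INFO tag to each
    record, and writes per-block counts and a phasing summary to the paths passed via -o and -s.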
290 | """ 291 | input: data_dir + "/phased/{aligner}/data_paternal_maternal.vcf.gz" 292 | output: 293 | updated_vcf = data_dir + "/phased/{aligner}/data_updated.vcf", 294 | message: "Running update SNPs" 295 | params: 296 | update_script = config['updat_snps_script'], 297 | phased_stat = data_dir + "/statistics/phased/phasing_stat.txt", 298 | block_tsv = data_dir + "/statistics/phased/blocks.tsv", 299 | benchmark: data_dir + "/benchmark/snp/{aligner}/update_snps.benchmark.txt" 300 | conda: READ_STAT_ENV 301 | shell:""" 302 | mkdir -p statistics/phased && 303 | python {params.update_script} -i {input} -u {output.updated_vcf} -o {params.block_tsv} -s {params.phased_stat} 304 | """ 305 | 306 | 307 | #### CONCAT SNPs ######## 308 | ######################### 309 | 310 | if config['gvcf_snv']: 311 | rule concatSNPs: 312 | """ 313 | Rule to concat the identifed SNPs this will only be called by the user 314 | in case if he wanted to have only SNPs 315 | """ 316 | input: 317 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/data.{chr}.vcf", aligner=wildcards.aligner, chr=chr_list), 318 | gvcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/data.{chr}.gvcf", aligner=wildcards.aligner, chr=chr_list), 319 | output: 320 | vcf = data_dir + "/snp/{aligner}/data.vcf", 321 | gvcf = data_dir + "/snp/{aligner}/data.gvcf", 322 | message: "Concat SNP files" 323 | benchmark: data_dir + "/benchmark/snp/{aligner}/concat_snp.txt" 324 | params: 325 | sample_name = SAMPLE_NAME, 326 | conda: VARIANT_ENV 327 | shell:""" 328 | echo "{params.sample_name}" > sample_name.txt && vcfcat {input.vcf} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output.vcf} &&\ 329 | vcfcat {input.gvcf} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output.gvcf} 330 | """ 331 | else: 332 | rule concatSNPs: 333 | """ 334 | Rule to concat the identified SNPs this will only be called by the user 335 | in case if he wanted to have only SNPs and indels and no gVCF required. 
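    As in the gVCF branch above, the sample column of the concatenated VCF is renamed to the configured
    sample name with bcftools reheader (see the shell command below).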
336 | """ 337 | input: 338 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/data.{chr}.vcf", aligner=wildcards.aligner, chr=chr_list), 339 | output: 340 | vcf = data_dir + "/snp/{aligner}/data.vcf" 341 | message: "Concat SNP files" 342 | benchmark: data_dir + "/benchmark/snp/{aligner}/concat_snp.txt" 343 | params: 344 | sample_name = SAMPLE_NAME, 345 | conda: VARIANT_ENV 346 | shell:""" 347 | echo "{params.sample_name}" > sample_name.txt && vcfcat {input.vcf} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output.vcf} 348 | """ 349 | 350 | # #### Bgzip gVCF ######### 351 | # ######################### 352 | 353 | rule bgzipgVCFFile: 354 | """ 355 | General rule to bgzip gVCF files 356 | """ 357 | input:data_dir + "/{name}.gvcf" 358 | output:data_dir + "/{name}.gvcf.gz" 359 | threads: config['bgzip_threads'] 360 | conda: VARIANT_ENV 361 | shell:""" 362 | bgzip -c -@ {threads} {input} > {output} 363 | """ 364 | -------------------------------------------------------------------------------- /modules/output.smk: -------------------------------------------------------------------------------- 1 | 2 | ########################## 3 | ###### OUTPUT RULES #### 4 | ######################### 5 | 6 | #### Align Moving ######## 7 | ######################### 8 | 9 | rule mvAlign: 10 | input: 11 | bam = data_dir + "/align/{aligner}/data.bam", 12 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 13 | output: 14 | bam = data_dir +'/result' + '/.aligning.{aligner}.done', 15 | message: "Moving Aligned bam to result directory {input.bam}" 16 | params: 17 | bam = data_dir +'/result' + "/aligning.{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 18 | shell:""" 19 | function rm_last2() {{ 20 | d1=$(dirname $1) 21 | d2=$(dirname $d1) 22 | rm -rf $d2 23 | }} 24 | mv {input.bamindex} {params.bam}.bai && mv {input.bam} {params.bam} &&\ 25 | mkdir -p {data_dir}/log &&\ 26 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : &&\ 27 | rm_last2 {input.bam} &&\ 28 | touch {output} 29 | """ 30 | 31 | 32 | #### SVs Moving ######## 33 | ######################## 34 | 35 | rule mvSV: 36 | input: 37 | vcf = data_dir + "/sv/{aligner}/sniffles.vcf", 38 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 39 | bam = data_dir + "/align/{aligner}/data.bam", 40 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 41 | output: 42 | vcf = data_dir +'/result' + '/.SVs.{aligner}.done', 43 | params: 44 | vcf = data_dir +'/result' + "/{sample}.{{aligner}}.SVs.vcf".format(sample=SAMPLE_NAME), 45 | snf = data_dir +'/result' + "/{sample}.{{aligner}}.SVs.snf".format(sample=SAMPLE_NAME), 46 | bam = data_dir +'/result' + "/{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 47 | bamindex = data_dir +'/result' + "/{sample}.{{aligner}}.bam.bai".format(sample=SAMPLE_NAME), 48 | message: "Moving called SVs to result directory {input}" 49 | priority: 1 50 | shell:""" 51 | function rm_last2() {{ 52 | d1=$(dirname $1) 53 | d2=$(dirname $d1) 54 | rm -rf $d2 55 | }} 56 | mkdir -p {data_dir}/log &&\ 57 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 58 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 59 | fi &&\ 60 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 61 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 62 | fi &&\ 63 | mv {input.vcf} {params.vcf} &&\ 64 | if [ -f {input.snf} ]; then 65 | mv {input.snf} {params.snf} && rm_last2 {input.snf} || : 66 | else 67 | rm_last2 {input.vcf} || : 68 | fi &&\ 69 | mv {input.bam} {params.bam} && 
mv {input.bamindex} {params.bamindex} && rm_last2 {input.bam} &&\ 70 | touch {output.vcf} 71 | """ 72 | 73 | 74 | #### SNVs Moving ######## 75 | ######################### 76 | 77 | if config['gvcf_snv']: 78 | rule mvSNV: 79 | input: 80 | vcf=data_dir + "/snp/{aligner}/data.vcf.gz", 81 | gvcf=data_dir + "/snp/{aligner}/data.gvcf.gz", 82 | vcfindex=data_dir + "/snp/{aligner}/data.vcf.gz.tbi", 83 | gvcfindex=data_dir + "/snp/{aligner}/data.gvcf.gz.tbi", 84 | bam=data_dir + "/align/{aligner}/data.bam", 85 | bamindex=data_dir + "/align/{aligner}/data.bam.bai", 86 | output: 87 | data_dir + '/result' + '/.SNVs.{aligner}.done' 88 | params: 89 | vcf=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 90 | gvcf=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.gvcf.gz".format(sample=SAMPLE_NAME), 91 | vcfindex=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 92 | gvcfindex=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.gvcf.gz.tbi".format(sample=SAMPLE_NAME), 93 | bam=data_dir + '/result' + "/{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 94 | bamindex=data_dir + '/result' + "/{sample}.{{aligner}}.bam.bai".format(sample=SAMPLE_NAME), 95 | message: "Moving called SNVs to result directory {input}" 96 | priority: 1 97 | shell: """ 98 | function rm_last2() {{ 99 | d1=$(dirname $1) 100 | d2=$(dirname $d1) 101 | rm -rf $d2 102 | }} 103 | mkdir -p {data_dir}/log &&\ 104 | mv {input.vcf} {params.vcf} &&\ 105 | mv {input.gvcf} {params.gvcf} &&\ 106 | mv {input.vcfindex} {params.vcfindex} &&\ 107 | mv {input.gvcfindex} {params.gvcfindex} &&\ 108 | mv {input.bam} {params.bam} &&\ 109 | mv {input.bamindex} {params.bamindex} &&\ 110 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 111 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 112 | fi &&\ 113 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 114 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 115 | fi &&\ 116 | rm_last2 {input.vcf} && rm_last2 {input.bam} &&\ 117 | touch {output} 118 | """ 119 | else: 120 | rule mvSNV: 121 | input: 122 | vcf = data_dir + "/snp/{aligner}/data.vcf.gz", 123 | vcfindex = data_dir + "/snp/{aligner}/data.vcf.gz.tbi", 124 | bam = data_dir + "/align/{aligner}/data.bam", 125 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 126 | output: 127 | data_dir +'/result' + '/.SNVs.{aligner}.done' 128 | params: 129 | vcf = data_dir +'/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 130 | vcfindex = data_dir +'/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 131 | bam = data_dir +'/result' + "/{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 132 | bamindex = data_dir +'/result' + "/{sample}.{{aligner}}.bam.bai".format(sample=SAMPLE_NAME), 133 | message: "Moving called SNVs to result directory {input}" 134 | priority: 1 135 | shell:""" 136 | function rm_last2() {{ 137 | d1=$(dirname $1) 138 | d2=$(dirname $d1) 139 | rm -rf $d2 140 | }} 141 | mkdir -p {data_dir}/log &&\ 142 | mv {input.vcf} {params.vcf} &&\ 143 | mv {input.vcfindex} {params.vcfindex} &&\ 144 | mv {input.bam} {params.bam} &&\ 145 | mv {input.bamindex} {params.bamindex} &&\ 146 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 147 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 148 | fi &&\ 149 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 150 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 151 | fi &&\ 
152 | rm_last2 {input.vcf} && rm_last2 {input.bam} &&\ 153 | touch {output} 154 | """ 155 | 156 | #### Variants Moving ######## 157 | ############################ 158 | 159 | rule mvVariants: 160 | input: 161 | snv = data_dir + "/snp/{aligner}/data.vcf.gz", 162 | snvindex = data_dir + "/snp/{aligner}/data.vcf.gz.tbi", 163 | sv = data_dir + "/sv/{aligner}/sniffles.vcf", 164 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 165 | bam = data_dir + "/align/{aligner}/data.bam", 166 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 167 | output: 168 | data_dir + "/result" + "/.variant.{aligner}.done" 169 | message: "Moving called SNVs to result directory {input}" 170 | shell:""" 171 | function rm_last2() {{ 172 | d1=$(dirname $1) 173 | d2=$(dirname $d1) 174 | rm -rf $d2 175 | }} 176 | mkdir -p {data_dir}/log &&\ 177 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 178 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 179 | fi &&\ 180 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 181 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 182 | fi &&\ 183 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 184 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 185 | fi &&\ 186 | if [ -f {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log ]; then 187 | mv {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log {data_dir}/log || : 188 | fi &&\ 189 | mv {input.snv} {data_dir}/result/SNVs.{wildcards.aligner}.vcf.gz &&\ 190 | mv {input.snvindex} {data_dir}/result/SNVs.{wildcards.aligner}.vcf.gz.tbi &&\ 191 | rm_last2 {input.snv} &&\ 192 | mv {input.sv} {data_dir}/result/SVs.{wildcards.aligner}.vcf &&\ 193 | mv {input.snf} {data_dir}/result/SVs.{wildcards.aligner}.snf &&\ 194 | rm_last2 {input.sv} &&\ 195 | rm_last2 {input.snf} &&\ 196 | mv {input.bam} {data_dir}/result/align.{wildcards.aligner}.bam &&\ 197 | mv {input.bamindex} {data_dir}/result/align.{wildcards.aligner}.bam.bai &&\ 198 | rm_last2 {input.bam} &&\ 199 | touch {output} 200 | """ 201 | 202 | #### Phasing Moving ######## 203 | ############################ 204 | 205 | rule mvPhasing: 206 | input: 207 | snv = data_dir + "/phased/{aligner}/data.vcf.gz", 208 | snvindex = data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 209 | bam = data_dir + "/align/{aligner}/data.bam", 210 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 211 | output: 212 | snv = data_dir + "/result" + "/phased.SNVs.{aligner}.done", 213 | params: 214 | snv = data_dir + "/result" + "/phased.SNVs.{aligner}.vcf.gz", 215 | snvindex = data_dir + "/result" + "/phased.SNVs.{aligner}.vcf.gz.tbi", 216 | bam = data_dir +'/result' + '/aligning.{aligner}.bam', 217 | bamindex = data_dir +'/result' + '/aligning.{aligner}.bam.bai', 218 | message: "Moving called phased SNVs to result directory {input}" 219 | shell:""" 220 | function rm_last2() {{ 221 | d1=$(dirname $1) 222 | d2=$(dirname $d1) 223 | rm -rf $d2 ||: 224 | }} 225 | function rm_last1() {{ 226 | d1=$(dirname $1) 227 | rm -rf $d1 ||: 228 | }} 229 | mkdir -p {data_dir}/log &&\ 230 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 231 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 232 | fi &&\ 233 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 234 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 235 | fi &&\ 236 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 237 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 238 | fi &&\ 239 | if [ -f 
{data_dir}/phased/{wildcards.aligner}/*.txt ]; then 240 | mv {data_dir}/phased/{wildcards.aligner}/*.txt {data_dir}/log || : 241 | fi &&\ 242 | if [ -f {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log ]; then 243 | mv {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log {data_dir}/log || : 244 | fi &&\ 245 | mv {input.snv} {params.snv} &&\ 246 | mv {input.snvindex} {params.snvindex} &&\ 247 | mv {input.bam} {params.bam} && 248 | mv {input.bamindex} {params.bamindex} &&\ 249 | rm_last2 {input.snv} &&\ 250 | rm_last2 {input.bam} &&\ 251 | rm_last1 {data_dir}/snp/{wildcards.aligner} &&\ 252 | touch {output} 253 | """ 254 | 255 | #### All Moving ######## 256 | ######################## 257 | 258 | rule mvmethylation: 259 | input: 260 | methylation = data_dir + "/meth/"+ "{aligner}" + "/methylation_calls_hap.tsv", 261 | output: 262 | methylation = data_dir + "/result" + "/methylation.{aligner}_calls_hap.tsv" 263 | shell:""" 264 | function rm_last2() {{ 265 | d1=$(dirname $1) 266 | d2=$(dirname $d1) 267 | rm -rf $d2 ||: 268 | }} 269 | mv {input.methylation} {output.methylation} &&\ 270 | rm_last2 {input.methylation} 271 | """ 272 | 273 | rule mvParentalPhased: 274 | input: 275 | stat = data_dir + "/stat.txt" if config['sample_list'] else data_dir + "/stat.NoReads.txt", 276 | phasedSNVs = data_dir + "/phased/{aligner}/data_updated.vcf", 277 | # phasedSVs = data_dir + "/sv/{aligner}/sniffles_hp_updated.vcf", 278 | bam = data_dir + "/align/{aligner}/data_hap.bam", 279 | bamindex = data_dir + "/align/{aligner}/data_hap.bam.bai", 280 | output: 281 | stat = data_dir + "/result/.allReadsparental.{aligner}.txt" #if config['sample_list'] else data_dir + "/result/.allNoReadsparental.{aligner}.txt", 282 | params: 283 | stat = data_dir + "/result/stat.{aligner}.txt", 284 | phasedSNVs = data_dir + "/result/{aligner}.phased.SNVs.vcf", 285 | # phasedSVs = data_dir + "/result/{aligner}.phased.SVs.vcf", 286 | bam = data_dir + "/result/{aligner}.hap.bam", 287 | bamindex = data_dir + "/result/{aligner}.hap.bam.bai", 288 | shell:""" 289 | function rm_last2() {{ 290 | d1=$(dirname $1) 291 | d2=$(dirname $d1) 292 | rm -rf $d2 ||: 293 | }} 294 | function rm_last1() {{ 295 | d1=$(dirname $1) 296 | rm -rf $d1 ||: 297 | }} 298 | mkdir -p {data_dir}/log &&\ 299 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 300 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 301 | fi &&\ 302 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 303 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 304 | fi &&\ 305 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 306 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 307 | fi &&\ 308 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 309 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 310 | fi &&\ 311 | mv {input.stat} {params.stat} &&\ 312 | mv {data_dir}/statistics {data_dir}/result &&\ 313 | mv {input.phasedSNVs} {params.phasedSNVs} &&\ 314 | bgzip {params.phasedSNVs} &&\ 315 | tabix {params.phasedSNVs}.gz &&\ 316 | mv {input.bam} {params.bam} &&\ 317 | mv {input.bamindex} {params.bamindex} &&\ 318 | rm_last2 {input.phasedSNVs} &&\ 319 | rm_last2 {input.bam} &&\ 320 | touch {output} 321 | """ 322 | # mv {input.phasedSVs} {params.phasedSVs} &&\ 323 | # rm_last2 {input.phasedSVs} &&\ 324 | 325 | if config['gvcf_snv']: 326 | rule mvNoParentalPhased: 327 | """ 328 | This rule gathers the outputs of the all command when no parental comparison is requested, and afterwards deletes the intermediate data.
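        Because gvcf_snv is set, this branch also moves the bgzipped gVCF ({sample}.{aligner}.SNVs.gvcf.gz)
        to the result directory; the branch below omits it.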
329 | """ 330 | input: 331 | stat = data_dir + "/stat.txt" if config['sample_list'] else data_dir + "/stat.NoReads.txt", 332 | # phasedSvsSNVs = data_dir + "/sv/{aligner}/sv_snp.vcf.gz", 333 | phasedSNVs = data_dir + "/phased/{aligner}/data.vcf.gz", 334 | phasedSNVsindex = data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 335 | SVs = data_dir + "/sv/{aligner}/sniffles.vcf", 336 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 337 | # phasedSVs = data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz", 338 | bam = data_dir + "/align/{aligner}/data_hap.bam", 339 | bamindex = data_dir + "/align/{aligner}/data_hap.bam.bai", 340 | gvcf = data_dir + "/snp/{aligner}/data.gvcf.gz", 341 | output: 342 | stat = data_dir + "/result/.all.Reads.{aligner}.txt" if config['sample_list'] else data_dir + "/result/.all.noReads.{aligner}.txt", 343 | params: 344 | stat = data_dir + "/result/stat.{sample}.{{aligner}}.txt".format(sample=SAMPLE_NAME), 345 | # phasedSvsSNVs = data_dir + "/result/{aligner}.phased.sv_snp.vcf.gz", 346 | phasedSNVs = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 347 | phasedSNVsindex = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 348 | SVs = data_dir + "/result/{sample}.{{aligner}}.SVs.vcf".format(sample=SAMPLE_NAME), 349 | snf = data_dir + "/result/{sample}.{{aligner}}.SVs.snf".format(sample=SAMPLE_NAME), 350 | # phasedSVs = data_dir + "/result/{aligner}.SVs.phased.vcf.gz", 351 | bam = data_dir + "/result/{sample}.{{aligner}}.hap.bam".format(sample=SAMPLE_NAME), 352 | bamindex = data_dir + "/result/{sample}.{{aligner}}.hap.bam.bai".format(sample=SAMPLE_NAME), 353 | copy_gvcf = "True" if config['gvcf_snv'] else "False", 354 | gvcf = data_dir + "/result/{sample}.{{aligner}}.SNVs.gvcf.gz".format(sample=SAMPLE_NAME), 355 | shell:""" 356 | function rm_last2() {{ 357 | d1=$(dirname $1) 358 | d2=$(dirname $d1) 359 | rm -rf $d2 ||: 360 | }} 361 | function rm_last1() {{ 362 | d1=$(dirname $1) 363 | rm -rf $d1 ||: 364 | }} 365 | mkdir -p {data_dir}/log &&\ 366 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 367 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 368 | fi &&\ 369 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 370 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 371 | fi &&\ 372 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 373 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 374 | fi &&\ 375 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 376 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 377 | fi &&\ 378 | mv {input.gvcf} {params.gvcf} &&\ 379 | mv {input.stat} {params.stat} &&\ 380 | mv {input.phasedSNVs} {params.phasedSNVs} &&\ 381 | mv {input.phasedSNVsindex} {params.phasedSNVsindex} &&\ 382 | mv {input.SVs} {params.SVs} &&\ 383 | mv {input.snf} {params.snf} &&\ 384 | mv {input.bam} {params.bam} &&\ 385 | mv {input.bamindex} {params.bamindex} &&\ 386 | rm_last2 {input.SVs} &&\ 387 | rm_last2 {input.phasedSNVs} &&\ 388 | rm_last2 {input.bam} &&\ 389 | rm_last1 {data_dir}/snp/{wildcards.aligner} &&\ 390 | touch {output} 391 | """ 392 | else: 393 | rule mvNoParentalPhased: 394 | """ 395 | The rule here will make sure to get the rule for all command without parental comparison and later will delete the data. 
396 | """ 397 | input: 398 | stat = data_dir + "/stat.txt" if config['sample_list'] else data_dir + "/stat.NoReads.txt", 399 | # phasedSvsSNVs = data_dir + "/sv/{aligner}/sv_snp.vcf.gz", 400 | phasedSNVs = data_dir + "/phased/{aligner}/data.vcf.gz", 401 | phasedSNVsindex = data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 402 | SVs = data_dir + "/sv/{aligner}/sniffles.vcf", 403 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 404 | # phasedSVs = data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz", 405 | bam = data_dir + "/align/{aligner}/data_hap.bam", 406 | bamindex = data_dir + "/align/{aligner}/data_hap.bam.bai", 407 | output: 408 | stat = data_dir + "/result/.all.Reads.{aligner}.txt" if config['sample_list'] else data_dir + "/result/.all.noReads.{aligner}.txt", 409 | params: 410 | stat = data_dir + "/result/stat.{sample}.{{aligner}}.txt".format(sample=SAMPLE_NAME), 411 | # phasedSvsSNVs = data_dir + "/result/{aligner}.phased.sv_snp.vcf.gz", 412 | phasedSNVs = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 413 | phasedSNVsindex = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 414 | SVs = data_dir + "/result/{sample}.{{aligner}}.SVs.vcf".format(sample=SAMPLE_NAME), 415 | snf = data_dir + "/result/{sample}.{{aligner}}.SVs.snf".format(sample=SAMPLE_NAME), 416 | # phasedSVs = data_dir + "/result/{aligner}.SVs.phased.vcf.gz", 417 | bam = data_dir + "/result/{sample}.{{aligner}}.hap.bam".format(sample=SAMPLE_NAME), 418 | bamindex = data_dir + "/result/{sample}.{{aligner}}.hap.bam.bai".format(sample=SAMPLE_NAME), 419 | copy_gvcf = "True" if config['gvcf_snv'] else "False", 420 | shell:""" 421 | function rm_last2() {{ 422 | d1=$(dirname $1) 423 | d2=$(dirname $d1) 424 | rm -rf $d2 ||: 425 | }} 426 | function rm_last1() {{ 427 | d1=$(dirname $1) 428 | rm -rf $d1 ||: 429 | }} 430 | mkdir -p {data_dir}/log &&\ 431 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 432 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 433 | fi &&\ 434 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 435 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 436 | fi &&\ 437 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 438 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 439 | fi &&\ 440 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 441 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 442 | fi &&\ 443 | mv {input.stat} {params.stat} &&\ 444 | mv {input.phasedSNVs} {params.phasedSNVs} &&\ 445 | mv {input.phasedSNVsindex} {params.phasedSNVsindex} &&\ 446 | mv {input.SVs} {params.SVs} &&\ 447 | mv {input.snf} {params.snf} &&\ 448 | mv {input.bam} {params.bam} &&\ 449 | mv {input.bamindex} {params.bamindex} &&\ 450 | rm_last2 {input.SVs} &&\ 451 | rm_last2 {input.phasedSNVs} &&\ 452 | rm_last2 {input.bam} &&\ 453 | rm_last1 {data_dir}/snp/{wildcards.aligner} &&\ 454 | touch {output} 455 | """ 456 | # mv {input.phasedSVs} {params.phasedSVs} &&\ 457 | # mv {input.phasedSvsSNVs} {params.phasedSvsSNVs} &&\ 458 | # rm_last2 {input.phasedSvsSNVs} &&\ 459 | -------------------------------------------------------------------------------- /princess: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Snakemake file wrapper for princess. 
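A typical invocation looks like the following (paths and sample name are illustrative; see get_args() below for the full option list):

    princess all -r ont -d ./analysis_dir -f reference.fa -s reads.fastq.gz -sn SAMPLE01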
5 | """ 6 | import argparse 7 | import sys, os, subprocess, ntpath, yaml 8 | from distutils.dir_util import copy_tree 9 | import filecmp, shutil 10 | from pathlib import Path 11 | from collections import namedtuple 12 | import logging 13 | import filecmp 14 | from typing import Any 15 | 16 | # Create a custom logger 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | f_handler = None 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser( 24 | epilog="%(prog)s version 0.01. use command -h for info.", 25 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 26 | description="Princess A framework for long-reads analysis.", 27 | add_help=True, 28 | ) 29 | 30 | parent_parser = argparse.ArgumentParser( 31 | add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter 32 | ) 33 | parent_parser.add_argument( 34 | "-v", "--version", action="version", version="%(prog)s 0.01" 35 | ) 36 | 37 | # Adding the main params for any commands 38 | parent_parser.add_argument( 39 | "-d", 40 | "--directory", 41 | help="Working directory.", 42 | metavar="Working directory", 43 | required=True, 44 | ) 45 | parent_parser.add_argument( 46 | "-r", 47 | "--ReadType", 48 | dest="read_type", 49 | type=str.lower, 50 | choices=["ont", "clr", "ccs"], 51 | help="Read technology", 52 | required=True, 53 | ) 54 | parent_parser.add_argument( 55 | "-u", 56 | "--UseConda", 57 | dest="use_conda", 58 | help="Use conda for running default: %(default)s)", 59 | action="store_false", 60 | ) 61 | parent_parser.add_argument( 62 | "-e", 63 | "--Cluster", 64 | dest="is_cluster", 65 | help="Use cluster while running default: %(default)s)", 66 | action="store_false", 67 | ) 68 | parent_parser.add_argument( 69 | "-a", 70 | "--Aligner", 71 | dest="aligner", 72 | choices=["minimap", "ngmlr"], 73 | help="In case if you want to choose specific aligner otherwise default will be used default: %(default)s)", 74 | default="minimap", 75 | ) 76 | parent_parser.add_argument( 77 | "-s", 78 | "--samplesFiles", 79 | dest="samples_files", 80 | metavar="samplesFiles", 81 | nargs="+", 82 | help="list of Fasta, Fastq, or gz files.", 83 | ) 84 | parent_parser.add_argument( 85 | "-f", 86 | "--ref", 87 | dest="ref", 88 | help="The reference file will be used to align reads to.", 89 | required=True, 90 | ) 91 | parent_parser.add_argument( 92 | "-j", 93 | "--jobs", 94 | dest="jobs", 95 | type=str, 96 | help="Number of running jobs default: %(default)s )", 97 | default="200", 98 | ) 99 | parent_parser.add_argument( 100 | "-g", 101 | "--log", 102 | dest="log_file", 103 | type=str, 104 | help="Log file: %(default)s )", 105 | default="PrincessLog.txt", 106 | ) 107 | parent_parser.add_argument( 108 | "-sn", 109 | "--sample-name", 110 | dest="sample_name", 111 | type=str, 112 | help="A sample name to use for BAMs, SVs, and SNVs helps when you are planning to merge multiple samples in the downstream analysis %(default)s", 113 | default="SAMPLE", 114 | ) 115 | parent_parser.add_argument( 116 | "-sp", 117 | "--phase-sv", 118 | dest="phase_sv", 119 | help="Phase the identified SV, default: %(default)s", 120 | action="store_true", 121 | ) 122 | parent_parser.add_argument( 123 | "-ms", 124 | "--mosaic-sv", 125 | dest="mosaic_sv", 126 | help="Identify mosaic SV, default: %(default)s", 127 | action="store_true", 128 | ) 129 | parent_parser.add_argument( 130 | "-gv", 131 | "--gvcf-snv", 132 | dest="gvcf_snv", 133 | help="Identify gVCF SNVs, default: %(default)s", 134 | action="store_true", 135 | ) 136 | 137 | # 
Sub-commands: 138 | subparser = parser.add_subparsers( 139 | title="Sub-commands", description="Valid sub-commands", dest="command" 140 | ) 141 | 142 | # All subparser. 143 | all_subparser = subparser.add_parser( 144 | "all", 145 | help="""This command will run the following:\n 146 | Align the reads.\nIdentify SVs\nIdentify SNVs\nPhase both SNVs and SVs""", 147 | parents=[parent_parser], 148 | ) 149 | all_subparser.add_argument( 150 | "-c", 151 | "--chr", 152 | dest="chrs", 153 | type=str, 154 | help="Chromosomes list,\ 155 | if not specified Princess will use all Chromosomes.", 156 | nargs="+", 157 | default=[], 158 | ) 159 | all_subparser.add_argument( 160 | "-t", 161 | "--filter", 162 | dest="filter", 163 | help="Filter identified SNVs using Princess algorithm\ 164 | default: %(default)s)", 165 | action="store_false", 166 | ) 167 | all_subparser.add_argument( 168 | "-m", 169 | "--methylation", 170 | dest="detect_methylation", 171 | help="Identify methylation, mutually inclusive with -md default: %(default)s)", 172 | action="store_true", 173 | ) 174 | all_subparser.add_argument( 175 | "-md", 176 | "--methylationDirectory", 177 | metavar="Fast5 Directory", 178 | dest="methylation_dir", 179 | help="Fast5 directory will be used to identify\ 180 | methylation mutually inclusive with option -m default: %(default)s)", 181 | default=False, 182 | ) 183 | all_subparser.add_argument( 184 | "-cm", 185 | "--clair-model", 186 | metavar="Clair model", 187 | dest="clair_model", 188 | help="Clair model, if not supplied we will use default model came with conda installation of Clair3.\nThe folder path containing a Clair3 model (requiring six files in the folder, including pileup.data-00000-of-00002, pileup.data-00001-of-00002 pileup.index, full_alignment.data-00000-of-00002, full_alignment.data-00001-of-00002 and full_alignment.index)", 189 | default=None, 190 | ) 191 | all_subparser.set_defaults(func=all_analysis) 192 | 193 | # Align subparser. 194 | align_subparser = subparser.add_parser( 195 | "align", 196 | help="This command will align the input sequence files against the reference using either Minimap2 or NGMLR. You can use the -a option to choose the aligner, otherwise Minimap2 will be used by default.", 197 | parents=[parent_parser], 198 | ) 199 | align_subparser.set_defaults(func=align) 200 | 201 | # SV subparser. 202 | sv_subparser = subparser.add_parser( 203 | "sv", 204 | help="This command will use bam file \ 205 | to identify SV using Sniffles.", 206 | parents=[parent_parser], 207 | ) 208 | sv_subparser.set_defaults(func=sv) 209 | 210 | # SNV subparser. 
211 | snv_subparser = subparser.add_parser( 212 | "snv", 213 | help="This command will use bam file \ 214 | to identify SNVs using Clair3.", 215 | parents=[parent_parser], 216 | ) 217 | snv_subparser.add_argument( 218 | "-c", 219 | "--chr", 220 | dest="chrs", 221 | type=str, 222 | help="Chromosomes list,\ 223 | if not specified we will use all Chromosomes.", 224 | nargs="+", 225 | default=[], 226 | ) 227 | snv_subparser.add_argument( 228 | "-t", 229 | "--filter", 230 | dest="filter", 231 | help="Filter identified SNVs using Princess algorithm\ 232 | default: %(default)s)", 233 | action="store_false", 234 | ) 235 | snv_subparser.add_argument( 236 | "-cm", 237 | "--clair-model", 238 | metavar="Clair model", 239 | dest="clair_model", 240 | help="Clair model, if not supplied we will use default model came with conda installation of Clair3.\nThe folder path containing a Clair3 model (requiring six files in the folder, including pileup.data-00000-of-00002, pileup.data-00001-of-00002 pileup.index, full_alignment.data-00000-of-00002, full_alignment.data-00001-of-00002 and full_alignment.index)", 241 | default="", 242 | ) 243 | snv_subparser.set_defaults(func=snv) 244 | 245 | # VARIANT [SNV, and SV] subparser. 246 | variant_subparser = subparser.add_parser( 247 | "variant", 248 | help="This command will use bam file \ 249 | to identify SVs and SNVs.", 250 | parents=[parent_parser], 251 | ) 252 | variant_subparser.add_argument( 253 | "-c", 254 | "--chr", 255 | dest="chrs", 256 | type=str, 257 | help="Chromosomes list,\ 258 | if not specified we will use all Chromosomes.", 259 | nargs="+", 260 | default=[], 261 | ) 262 | variant_subparser.add_argument( 263 | "-t", 264 | "--filter", 265 | dest="filter", 266 | help="Filter identified SNVs using Princess algorithm\ 267 | default: %(default)s)", 268 | action="store_false", 269 | ) 270 | variant_subparser.add_argument( 271 | "-cm", 272 | "--clair-model", 273 | metavar="Clair model", 274 | dest="clair_model", 275 | help="Clair model, if not supplied we will use default model came with conda installation of Clair3.\nThe folder path containing a Clair3 model (requiring six files in the folder, including pileup.data-00000-of-00002, pileup.data-00001-of-00002 pileup.index, full_alignment.data-00000-of-00002, full_alignment.data-00001-of-00002 and full_alignment.index)", 276 | default=None, 277 | ) 278 | variant_subparser.set_defaults(func=variant) 279 | 280 | # Phase subparser. 281 | phase_subparser = subparser.add_parser( 282 | "phase", 283 | help="This command will use use reads \ 284 | to identify SNVs by Clair and Phase them.", 285 | parents=[parent_parser], 286 | ) 287 | phase_subparser.add_argument( 288 | "-c", 289 | "--chr", 290 | dest="chrs", 291 | type=str, 292 | help="Chromosomes list,\ 293 | if not specified we will use all Chromosomes.", 294 | nargs="+", 295 | default=[], 296 | ) 297 | phase_subparser.add_argument( 298 | "-t", 299 | "--filter", 300 | dest="filter", 301 | help="Filter identified SNVs using Princess algorithm\ 302 | default: %(default)s)", 303 | action="store_false", 304 | ) 305 | phase_subparser.set_defaults(func=phase) 306 | 307 | # Overview subparser. 
308 | overview_subparser = subparser.add_parser( 309 | "overview", 310 | help="This command will show what steps will run.", 311 | parents=[parent_parser], 312 | ) 313 | overview_subparser.add_argument( 314 | "-c", 315 | "--chr", 316 | dest="chrs", 317 | type=str, 318 | help="Chromosomes list,\ 319 | if not specified we will use all Chromosomes.", 320 | nargs="+", 321 | default=[], 322 | ) 323 | overview_subparser.set_defaults(func=overview) 324 | 325 | # if no argument print help. 326 | if len(sys.argv) == 1: 327 | parser.print_help(sys.stderr) 328 | sys.exit(1) 329 | 330 | args, unknownargs = parser.parse_known_args() 331 | unknownargs = sort_params(args, unknownargs) 332 | 333 | if "func" in args: 334 | ( 335 | current_dir, 336 | running_file, 337 | work_dir, 338 | conf_yaml, 339 | aligner, 340 | sample_list_from_config, 341 | number_of_jobs, 342 | number_of_tries, 343 | ) = required_vars(args, unknownargs) 344 | Main_vars = namedtuple( 345 | "Main_vars", 346 | "current_dir, running_file, work_dir, conf_yaml, aligner, sample_list_from_config, number_of_jobs, number_of_tries", 347 | ) 348 | main_vars = Main_vars( 349 | current_dir, 350 | running_file, 351 | work_dir, 352 | conf_yaml, 353 | aligner, 354 | sample_list_from_config, 355 | number_of_jobs, 356 | number_of_tries, 357 | ) 358 | log_dir = os.path.join(work_dir, args.log_file) 359 | global f_handler 360 | f_handler = logging.FileHandler(log_dir) 361 | f_handler.setLevel(logging.DEBUG) 362 | f_format = logging.Formatter( 363 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 364 | ) 365 | f_handler.setFormatter(f_format) 366 | logger.addHandler(f_handler) 367 | args.func(args, unknownargs, main_vars) 368 | else: 369 | parser.print_help() 370 | 371 | 372 | def overview(args, unknownargs, main_vars): 373 | if not args.samples_files and not main_vars.sample_list_from_config: 374 | print( 375 | "You need to support sequence read file/s either by using -s parameter or through sample_list filed in config.yaml file", 376 | file=sys.stderr, 377 | ) 378 | exit( 379 | f"Error exiting, see log file {os.path.join(args.directory, args.log_file)}" 380 | ) 381 | # check if the user gave existing chromosomes. 382 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 383 | is_valid_chrs(chr_list, args.ref) 384 | if not chr_list: 385 | chr_list = get_chrs(args.ref) 386 | chrs = "chrs=" + str(chr_list) 387 | samples, samples_names = get_sample_names(args, main_vars) 388 | # If we have samples let us create working directory If not exists. 
389 | if not os.path.exists(main_vars.work_dir): 390 | os.makedirs(main_vars.work_dir) 391 | sample_dir = "sample_directory=" + main_vars.work_dir 392 | reference = ( 393 | "reference=" + args.ref 394 | if args.ref 395 | else "reference=" + main_vars.conf_yaml["reference"] 396 | ) 397 | # If we are already in main princess directory do nothing 398 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 399 | pass 400 | else: 401 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 402 | os.chdir(main_vars.work_dir) 403 | chr_log = " ".join(chr_list) if chr_list else "All Chromosomes" 404 | logger.info("Analyzed Chromosomes: {}".format(chr_log)) 405 | running_command = "running_command=" + args.command 406 | 407 | cmd = [ 408 | "snakemake", 409 | "-n", 410 | "-p", 411 | "-r", 412 | "-j", 413 | args.jobs, 414 | "--config", 415 | sample_dir, 416 | samples, 417 | reference, 418 | chrs, 419 | running_command, 420 | *unknownargs, 421 | ] 422 | run_cmd(cmd) 423 | os.chdir(main_vars.current_dir) 424 | 425 | 426 | def align(args, unknownargs, main_vars): 427 | # Do we have samples? 428 | if not args.samples_files and not main_vars.sample_list_from_config: 429 | print( 430 | "You need to provide sequence read file(s), either with the -s parameter or through the sample_list field in the config.yaml file" 431 | ) 432 | exit( 433 | f"Error exiting, see log file {os.path.join(args.directory, args.log_file)}" 434 | ) 435 | # If we have samples let us create working directory If not exists. 436 | if not os.path.exists(main_vars.work_dir): 437 | os.makedirs(main_vars.work_dir) 438 | # Get samples to pass to Snakefile 439 | samples_names = ( 440 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 441 | ) 442 | sample_dir = "sample_directory=" + main_vars.work_dir 443 | reference = ( 444 | "reference=" + args.ref 445 | if args.ref 446 | else "reference=" + main_vars.conf_yaml["reference"] 447 | ) 448 | 449 | # If we are already in main princess directory do nothing 450 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 451 | pass 452 | else: 453 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 454 | # Move to working directory to start 455 | os.chdir(main_vars.work_dir) 456 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 457 | result = os.path.join( 458 | main_vars.work_dir, "result", ".aligning.{}.done".format(main_vars.aligner) 459 | ) 460 | 461 | reset_times = main_vars.number_of_tries 462 | running_command = "running_command=" + args.command 463 | delete_files = "delete_files=" + main_vars.running_file 464 | delete_samples = "delete_samples=" + str(samples_names) 465 | sample_name = "sample_name=" + args.sample_name 466 | # TODO: send only what needed instead of sending full object.
467 | samples, samples_names = get_sample_names(args, main_vars) 468 | if args.is_cluster: 469 | cmd = [ 470 | "snakemake", 471 | "-p", 472 | result, 473 | "-j", 474 | args.jobs, 475 | "--profile", 476 | "cluster", 477 | "--nolock", 478 | "--restart-times", 479 | reset_times, 480 | "--config", 481 | sample_dir, 482 | samples, 483 | reference, 484 | running_command, 485 | delete_files, 486 | delete_samples, 487 | sample_name, 488 | *unknownargs, 489 | ] 490 | else: 491 | cmd = [ 492 | "snakemake", 493 | "-p", 494 | result, 495 | "-j", 496 | args.jobs, 497 | "--cluster-config", 498 | cluster_config, 499 | "--nolock", 500 | "--restart-times", 501 | reset_times, 502 | "--config", 503 | sample_dir, 504 | samples, 505 | reference, 506 | running_command, 507 | delete_files, 508 | delete_samples, 509 | sample_name, 510 | *unknownargs, 511 | ] 512 | print(cmd) 513 | run_cmd(cmd) 514 | os.chdir(main_vars.current_dir) 515 | 516 | 517 | def sv(args, unknownargs, main_vars): 518 | if check_samples( 519 | main_vars.work_dir, 520 | main_vars.aligner, 521 | args.samples_files, 522 | main_vars.sample_list_from_config, 523 | args.command, 524 | args.log_file, 525 | ): 526 | pass 527 | if not os.path.exists(main_vars.work_dir): 528 | os.makedirs(main_vars.work_dir) 529 | # If we are already in main princess directory do nothing 530 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 531 | pass 532 | else: 533 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 534 | os.chdir(main_vars.work_dir) 535 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 536 | result = os.path.join( 537 | main_vars.work_dir, "result", ".SVs.{}.done".format(main_vars.aligner) 538 | ) 539 | sample_dir = "sample_directory=" + main_vars.work_dir 540 | reference = ( 541 | "reference=" + args.ref 542 | if args.ref 543 | else "reference=" + main_vars.conf_yaml["reference"] 544 | ) 545 | samples_names = ( 546 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 547 | ) 548 | reset_times = main_vars.number_of_tries 549 | running_command = "running_command=" + args.command 550 | delete_files = "delete_files=" + main_vars.running_file 551 | delete_samples = "delete_samples=" + str(samples_names) 552 | phase_sv = "phase_sv=" + str(args.phase_sv) 553 | mosaic_sv = "mosaic_sv=" + str(args.mosaic_sv) 554 | 555 | if samples_names: 556 | samples, samples_names_str = get_sample_names(args, main_vars) 557 | if args.is_cluster: 558 | cmd = [ 559 | "snakemake", 560 | "-p", 561 | result, 562 | "-j", 563 | args.jobs, 564 | "--profile", 565 | "cluster", 566 | "--config", 567 | sample_dir, 568 | samples, 569 | reference, 570 | running_command, 571 | delete_files, 572 | delete_samples, 573 | phase_sv, 574 | mosaic_sv, 575 | *unknownargs, 576 | ] 577 | else: 578 | cmd = [ 579 | "snakemake", 580 | "-p", 581 | result, 582 | "-j", 583 | args.jobs, 584 | "--cluster-config", 585 | cluster_config, 586 | "--nolock", 587 | "--restart-times", 588 | reset_times, 589 | "--config", 590 | sample_dir, 591 | samples, 592 | reference, 593 | running_command, 594 | delete_files, 595 | delete_samples, 596 | phase_sv, 597 | mosaic_sv, 598 | *unknownargs, 599 | ] 600 | else: 601 | if args.is_cluster: 602 | cmd = [ 603 | "snakemake", 604 | "-p", 605 | result, 606 | "-j", 607 | args.jobs, 608 | "--profile", 609 | "cluster", 610 | "--nolock", 611 | "--restart-times", 612 | reset_times, 613 | "--config", 614 | sample_dir, 615 | reference, 616 | running_command, 617 | delete_files, 618 | 
delete_samples, 619 | phase_sv, 620 | mosaic_sv, 621 | *unknownargs, 622 | ] 623 | else: 624 | cmd = [ 625 | "snakemake", 626 | "-p", 627 | result, 628 | "-j", 629 | args.jobs, 630 | "--cluster-config", 631 | cluster_config, 632 | "--config", 633 | sample_dir, 634 | reference, 635 | running_command, 636 | delete_files, 637 | delete_samples, 638 | phase_sv, 639 | mosaic_sv, 640 | *unknownargs, 641 | ] 642 | run_cmd(cmd) 643 | os.chdir(main_vars.current_dir) 644 | 645 | 646 | def snv(args, unknownargs, main_vars): 647 | if check_samples( 648 | main_vars.work_dir, 649 | main_vars.aligner, 650 | args.samples_files, 651 | main_vars.sample_list_from_config, 652 | args.command, 653 | args.log_file, 654 | ): 655 | pass 656 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 657 | is_valid_chrs(chr_list, args.ref) 658 | if not chr_list: 659 | chr_list = get_chrs(args.ref) 660 | chrs = "chrs=" + str(chr_list) 661 | 662 | # Will I filter the SNVs? 663 | # TODO: filtering was developed for Clair2, now we do not, we need to develop another procedure to filter variants identified by Clair3 664 | filter_snv = "filter_chrs=" + str(args.filter) 665 | 666 | # Which model to use 667 | clair_model: str = ( 668 | "clair_model=" + args.clair_model if args.clair_model else "clair_model=''" 669 | ) 670 | 671 | # If we have samples let us create working directory If not exists. 672 | if not os.path.exists(main_vars.work_dir): 673 | os.makedirs(main_vars.work_dir) 674 | # If we are already in main princess directory do nothing 675 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 676 | pass 677 | else: 678 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 679 | os.chdir(main_vars.work_dir) 680 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 681 | result = os.path.join( 682 | main_vars.work_dir, "result", ".SNVs.{}.done".format(main_vars.aligner) 683 | ) 684 | sample_dir = "sample_directory=" + main_vars.work_dir 685 | reference = ( 686 | "reference=" + args.ref 687 | if args.ref 688 | else "reference=" + main_vars.conf_yaml["reference"] 689 | ) 690 | samples_names = ( 691 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 692 | ) 693 | reset_times = main_vars.number_of_tries 694 | chr_log = " ".join(chr_list) if chr_list else "All Chromosomes" 695 | logger.info("Analyzed Chromosomes: ".format(chr_log)) 696 | logger.info("Clair model: ".format(clair_model)) 697 | running_command = "running_command=" + args.command 698 | delete_files = "delete_files=" + main_vars.running_file 699 | delete_samples = "delete_samples=" + str(samples_names) 700 | gvcf_snv = "gvcf_snv=" + str(args.gvcf_snv) 701 | 702 | if samples_names: 703 | samples, samples_names_str = get_sample_names(args, main_vars) 704 | if args.is_cluster: 705 | cmd = [ 706 | "snakemake", 707 | "-p", 708 | result, 709 | "-j", 710 | args.jobs, 711 | "--profile", 712 | "cluster", 713 | "--nolock", 714 | "--restart-times", 715 | reset_times, 716 | "--config", 717 | filter_snv, 718 | sample_dir, 719 | samples, 720 | reference, 721 | chrs, 722 | running_command, 723 | delete_files, 724 | delete_samples, 725 | clair_model, 726 | gvcf_snv, 727 | *unknownargs, 728 | ] 729 | else: 730 | cmd = [ 731 | "snakemake", 732 | "-p", 733 | result, 734 | "-j", 735 | args.jobs, 736 | "--cluster-config", 737 | cluster_config, 738 | "--nolock", 739 | "--restart-times", 740 | reset_times, 741 | "--config", 742 | filter_snv, 743 | sample_dir, 744 | samples, 745 | 
reference, 746 | chrs, 747 | running_command, 748 | delete_files, 749 | delete_samples, 750 | clair_model, 751 | gvcf_snv, 752 | *unknownargs, 753 | ] 754 | else: 755 | if args.is_cluster: 756 | cmd = [ 757 | "snakemake", 758 | "-p", 759 | result, 760 | "-j", 761 | args.jobs, 762 | "--profile", 763 | "cluster", 764 | "--nolock", 765 | "--restart-times", 766 | reset_times, 767 | "--config", 768 | filter_snv, 769 | sample_dir, 770 | reference, 771 | chrs, 772 | running_command, 773 | delete_files, 774 | delete_samples, 775 | clair_model, 776 | gvcf_snv, 777 | *unknownargs, 778 | ] 779 | else: 780 | cmd = [ 781 | "snakemake", 782 | "-p", 783 | result, 784 | "-j", 785 | args.jobs, 786 | "--cluster-config", 787 | cluster_config, 788 | "--nolock", 789 | "--restart-times", 790 | reset_times, 791 | "--config", 792 | filter_snv, 793 | sample_dir, 794 | reference, 795 | chrs, 796 | running_command, 797 | delete_files, 798 | delete_samples, 799 | clair_model, 800 | gvcf_snv, 801 | *unknownargs, 802 | ] 803 | run_cmd(cmd) 804 | os.chdir(main_vars.current_dir) 805 | 806 | 807 | def variant(args, unknownargs, main_vars): 808 | if check_samples( 809 | main_vars.work_dir, 810 | main_vars.aligner, 811 | args.samples_files, 812 | main_vars.sample_list_from_config, 813 | args.command, 814 | args.log_file, 815 | ): 816 | pass 817 | # check if the user gave existing chromosomes. 818 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 819 | is_valid_chrs(chr_list, args.ref) 820 | if not chr_list: 821 | chr_list = get_chrs(args.ref) 822 | chrs = "chrs=" + str(chr_list) 823 | # Will I filter the SNVs? 824 | filter_snv = "filter_chrs=" + str(args.filter) 825 | samples_names = ( 826 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 827 | ) 828 | # If we have samples let us create working directory If not exists. 
829 | if not os.path.exists(main_vars.work_dir): 830 | os.makedirs(main_vars.work_dir) 831 | # If we are already in main princess directory do nothing 832 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 833 | pass 834 | else: 835 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 836 | os.chdir(main_vars.work_dir) 837 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 838 | result = os.path.join( 839 | main_vars.work_dir, "result", ".variant.{}.done".format(main_vars.aligner) 840 | ) 841 | sample_dir = "sample_directory=" + main_vars.work_dir 842 | reference = ( 843 | "reference=" + args.ref 844 | if args.ref 845 | else "reference=" + main_vars.conf_yaml["reference"] 846 | ) 847 | reset_times = main_vars.number_of_tries 848 | running_command = "running_command=" + args.command 849 | delete_files = "delete_files=" + main_vars.running_file 850 | delete_samples = "delete_samples=" + str(samples_names) 851 | clair_model: str = ( 852 | "clair_model=" + args.clair_model if args.clair_model else "clair_model=''" 853 | ) 854 | 855 | if samples_names: 856 | samples, samples_names_str = get_sample_names(args, main_vars) 857 | if args.is_cluster: 858 | cmd = [ 859 | "snakemake", 860 | "-p", 861 | result, 862 | "-j", 863 | args.jobs, 864 | "--profile", 865 | "cluster", 866 | "--nolock", 867 | "--restart-times", 868 | reset_times, 869 | "--config", 870 | filter_snv, 871 | sample_dir, 872 | samples, 873 | reference, 874 | chrs, 875 | running_command, 876 | delete_files, 877 | delete_samples, 878 | clair_model, 879 | *unknownargs, 880 | ] 881 | else: 882 | cmd = [ 883 | "snakemake", 884 | "-p", 885 | result, 886 | "-j", 887 | args.jobs, 888 | "--cluster-config", 889 | cluster_config, 890 | "--nolock", 891 | "--restart-times", 892 | reset_times, 893 | "--config", 894 | filter_snv, 895 | sample_dir, 896 | samples, 897 | reference, 898 | chrs, 899 | running_command, 900 | delete_files, 901 | delete_samples, 902 | clair_model, 903 | *unknownargs, 904 | ] 905 | else: 906 | if args.is_cluster: 907 | cmd = [ 908 | "snakemake", 909 | "-p", 910 | result, 911 | "-j", 912 | args.jobs, 913 | "--profile", 914 | "cluster", 915 | "--nolock", 916 | "--restart-times", 917 | reset_times, 918 | "--config", 919 | filter_snv, 920 | sample_dir, 921 | reference, 922 | chrs, 923 | running_command, 924 | delete_files, 925 | delete_samples, 926 | clair_model, 927 | *unknownargs, 928 | ] 929 | else: 930 | cmd = [ 931 | "snakemake", 932 | "-p", 933 | result, 934 | "-j", 935 | args.jobs, 936 | "--cluster-config", 937 | cluster_config, 938 | "--nolock", 939 | "--restart-times", 940 | reset_times, 941 | "--config", 942 | filter_snv, 943 | sample_dir, 944 | reference, 945 | chrs, 946 | running_command, 947 | delete_files, 948 | delete_samples, 949 | clair_model, 950 | *unknownargs, 951 | ] 952 | 953 | log_chrs = " ".join(chr_list) if chr_list else "All Chromosomes" 954 | logger.info("{}{}".format("Chromosomes that will be analyzed: ", log_chrs)) 955 | logger.info("SNVs will be filtered: {}".format(str(args.filter))) 956 | logger.info("Clair model: {}".format(args.clair_model)) 957 | logger.info("Work directory: {}".format(sample_dir)) 958 | logger.info("Reference: {}".format(args.ref)) 959 | logger.info("Aligner: {}".format(main_vars.aligner)) 960 | logger.info("Cluster Will be used: {}".format(args.is_cluster)) 961 | logger.info("Samples:\n{}".format("\n".join(samples_names))) 962 | logger.info("Results:\t{}".format(result)) 963 | run_cmd(cmd) 964 | 
os.chdir(main_vars.current_dir) 965 | 966 | 967 | def all_analysis(args, unknownargs, main_vars): 968 | if check_samples( 969 | main_vars.work_dir, 970 | main_vars.aligner, 971 | args.samples_files, 972 | main_vars.sample_list_from_config, 973 | args.command, 974 | args.log_file, 975 | ): 976 | pass 977 | if ( 978 | args.detect_methylation 979 | and not (args.methylation_dir 980 | and dir_path(args.methylation_dir)) 981 | ): 982 | logger.error("Options -m and -md are mutually inclusive: -md must point to an existing directory when -m is set") 983 | exit( 984 | f"Error exiting, see log file {os.path.join(args.directory, args.log_file)}" 985 | ) 986 | 987 | # Methylation option and directory 988 | methylation_option = "methylation=" + str(args.detect_methylation) 989 | meth_dir = "fast5_dir=" + str(args.methylation_dir) 990 | 991 | # check if the user gave existing chromosomes. 992 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 993 | is_valid_chrs(chr_list, args.ref) 994 | if not chr_list: 995 | chr_list = get_chrs(args.ref) 996 | chrs = "chrs=" + str(chr_list) 997 | 998 | # Will I filter the SNVs? 999 | filter_snv = "filter_chrs=" + str(args.filter) 1000 | samples_names = ( 1001 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 1002 | ) 1003 | 1004 | # If we have samples let us create working directory If not exists. 1005 | if not os.path.exists(main_vars.work_dir): 1006 | os.makedirs(main_vars.work_dir) 1007 | # If we are already in main princess directory do nothing 1008 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 1009 | pass 1010 | else: 1011 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 1012 | os.chdir(main_vars.work_dir) 1013 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 1014 | sample_dir = "sample_directory=" + main_vars.work_dir 1015 | reference = ( 1016 | "reference=" + args.ref 1017 | if args.ref 1018 | else "reference=" + main_vars.conf_yaml["reference"] 1019 | ) 1020 | reset_times = main_vars.number_of_tries 1021 | running_command = "running_command=" + args.command 1022 | delete_files = "delete_files=" + main_vars.running_file 1023 | delete_samples = "delete_samples=" + str(samples_names) 1024 | sample_name = "sample_name=" + args.sample_name 1025 | phase_sv = "phase_sv=" + str(args.phase_sv) 1026 | mosaic_sv = "mosaic_sv=" + str(args.mosaic_sv) 1027 | clair_model = ( 1028 | "clair_model=" + args.clair_model if args.clair_model else "clair_model=''" 1029 | ) 1030 | gvcf_snv = "gvcf_snv=" + str(args.gvcf_snv) 1031 | 1032 | if samples_names: 1033 | samples, samples_names_str = get_sample_names(args, main_vars) 1034 | if args.is_cluster: 1035 | cmd = [ 1036 | "snakemake", 1037 | "-p", 1038 | "-j", 1039 | args.jobs, 1040 | "--profile", 1041 | "cluster", 1042 | "--nolock", 1043 | "--restart-times", 1044 | reset_times, 1045 | "--config", 1046 | methylation_option, 1047 | meth_dir, 1048 | filter_snv, 1049 | sample_dir, 1050 | samples, 1051 | reference, 1052 | chrs, 1053 | running_command, 1054 | delete_files, 1055 | delete_samples, 1056 | clair_model, 1057 | sample_name, 1058 | phase_sv, 1059 | mosaic_sv, 1060 | gvcf_snv, 1061 | *unknownargs, 1062 | ] 1063 | else: 1064 | cmd = [ 1065 | "snakemake", 1066 | "-p", 1067 | "-j", 1068 | args.jobs, 1069 | "--cluster-config", 1070 | cluster_config, 1071 | "--nolock", 1072 | "--restart-times", 1073 | reset_times, 1074 | "--config", 1075 | methylation_option, 1076 | meth_dir, 1077 | filter_snv, 1078 | sample_dir, 1079 | samples, 1080 | reference, 1081 | chrs, 1082 | 
running_command, 1083 | delete_files, 1084 | delete_samples, 1085 | clair_model, 1086 | sample_name, 1087 | phase_sv, 1088 | mosaic_sv, 1089 | gvcf_snv, 1090 | *unknownargs, 1091 | ] 1092 | else: 1093 | if args.is_cluster: 1094 | cmd = [ 1095 | "snakemake", 1096 | "-p", 1097 | "-j", 1098 | args.jobs, 1099 | "--profile", 1100 | "cluster", 1101 | "--nolock", 1102 | "--restart-times", 1103 | reset_times, 1104 | "--config", 1105 | methylation_option, 1106 | meth_dir, 1107 | filter_snv, 1108 | sample_dir, 1109 | reference, 1110 | chrs, 1111 | running_command, 1112 | delete_files, 1113 | delete_samples, 1114 | clair_model, 1115 | sample_name, 1116 | phase_sv, 1117 | mosaic_sv, 1118 | gvcf_snv, 1119 | *unknownargs, 1120 | ] 1121 | else: 1122 | cmd = [ 1123 | "snakemake", 1124 | "-p", 1125 | "-j", 1126 | args.jobs, 1127 | "--cluster-config", 1128 | cluster_config, 1129 | "--nolock", 1130 | "--restart-times", 1131 | reset_times, 1132 | "--config", 1133 | methylation_option, 1134 | meth_dir, 1135 | filter_snv, 1136 | sample_dir, 1137 | reference, 1138 | chrs, 1139 | running_command, 1140 | delete_files, 1141 | delete_samples, 1142 | clair_model, 1143 | sample_name, 1144 | phase_sv, 1145 | mosaic_sv, 1146 | gvcf_snv, 1147 | *unknownargs, 1148 | ] 1149 | 1150 | log_chrs = " ".join(chr_list) if chr_list else "All Chromosomes" 1151 | logger.info("{}{}".format("Chromosomes that will be analyzed: ", log_chrs)) 1152 | logger.info("SNVs will be filtered: {}".format(str(args.filter))) 1153 | logger.info("Work directory: {}".format(sample_dir)) 1154 | logger.info("Reference: {}".format(args.ref)) 1155 | logger.info("Aligner: {}".format(main_vars.aligner)) 1156 | logger.info("Cluster Will be used: {}".format(args.is_cluster)) 1157 | logger.info("Methylation will be detected: {}".format(args.detect_methylation)) 1158 | logger.info("Fast5 directory for Methylation: {}".format(args.methylation_dir)) 1159 | logger.info("Samples:\n{}".format("\n".join(samples_names))) 1160 | logger.info("Sample name: {}".format(args.sample_name)) 1161 | logger.info("Clair model: {}".format(args.clair_model)) 1162 | logger.info("Running command:\n{}".format(str(cmd))) 1163 | run_cmd(cmd) 1164 | os.chdir(main_vars.current_dir) 1165 | 1166 | 1167 | def phase(args, unknownargs, main_vars): 1168 | if check_samples( 1169 | main_vars.work_dir, 1170 | main_vars.aligner, 1171 | args.samples_files, 1172 | main_vars.sample_list_from_config, 1173 | args.command, 1174 | args.log_file, 1175 | ): 1176 | pass 1177 | # check if the user gave existing chromosomes. 1178 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 1179 | is_valid_chrs(chr_list, args.ref) 1180 | if not chr_list: 1181 | chr_list = get_chrs(args.ref) 1182 | chrs = "chrs=" + str(chr_list) 1183 | 1184 | # Will I filter the SNVs? 1185 | filter_snv = "filter_chrs=" + str(args.filter) 1186 | 1187 | # If we have samples let us create working directory If not exists. 
1188 | if not os.path.exists(main_vars.work_dir): 1189 | os.makedirs(main_vars.work_dir) 1190 | # If we are already in main princess directory do nothing 1191 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 1192 | pass 1193 | else: 1194 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 1195 | os.chdir(main_vars.work_dir) 1196 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 1197 | # result = os.path.join(main_vars.work_dir, 'phased', main_vars.aligner, 'data.vcf') 1198 | result = os.path.join( 1199 | main_vars.work_dir, "result", "phased.SNVs.{}.done".format(main_vars.aligner) 1200 | ) 1201 | sample_dir = "sample_directory=" + main_vars.work_dir 1202 | reference = ( 1203 | "reference=" + args.ref 1204 | if args.ref 1205 | else "reference=" + main_vars.conf_yaml["reference"] 1206 | ) 1207 | samples_names = ( 1208 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 1209 | ) 1210 | reset_times = main_vars.number_of_tries 1211 | running_command = "running_command=" + args.command 1212 | delete_files = "delete_files=" + main_vars.running_file 1213 | delete_samples = "delete_samples=" + str(samples_names) 1214 | 1215 | if samples_names: 1216 | samples, samples_names_str = get_sample_names(args, main_vars) 1217 | if args.is_cluster: 1218 | cmd = [ 1219 | "snakemake", 1220 | "-p", 1221 | result, 1222 | "-j", 1223 | args.jobs, 1224 | "--profile", 1225 | "cluster", 1226 | "--nolock", 1227 | "--restart-times", 1228 | reset_times, 1229 | "--config", 1230 | filter_snv, 1231 | sample_dir, 1232 | samples, 1233 | reference, 1234 | chrs, 1235 | running_command, 1236 | delete_files, 1237 | delete_samples, 1238 | *unknownargs, 1239 | ] 1240 | else: 1241 | cmd = [ 1242 | "snakemake", 1243 | "-p", 1244 | result, 1245 | "-j", 1246 | args.jobs, 1247 | "--cluster-config", 1248 | cluster_config, 1249 | "--nolock", 1250 | "--restart-times", 1251 | reset_times, 1252 | "--config", 1253 | filter_snv, 1254 | sample_dir, 1255 | samples, 1256 | reference, 1257 | chrs, 1258 | running_command, 1259 | delete_files, 1260 | delete_samples, 1261 | *unknownargs, 1262 | ] 1263 | else: 1264 | if args.is_cluster: 1265 | cmd = [ 1266 | "snakemake", 1267 | "-p", 1268 | result, 1269 | "-j", 1270 | args.jobs, 1271 | "--profile", 1272 | "cluster", 1273 | "--nolock", 1274 | "--restart-times", 1275 | reset_times, 1276 | "--config", 1277 | filter_snv, 1278 | sample_dir, 1279 | reference, 1280 | chrs, 1281 | running_command, 1282 | delete_files, 1283 | delete_samples, 1284 | *unknownargs, 1285 | ] 1286 | else: 1287 | cmd = [ 1288 | "snakemake", 1289 | "-p", 1290 | result, 1291 | "-j", 1292 | args.jobs, 1293 | "--cluster-config", 1294 | cluster_config, 1295 | "--nolock", 1296 | "--restart-times", 1297 | reset_times, 1298 | "--config", 1299 | filter_snv, 1300 | sample_dir, 1301 | reference, 1302 | chrs, 1303 | running_command, 1304 | delete_files, 1305 | delete_samples, 1306 | *unknownargs, 1307 | ] 1308 | log_chrs = " ".join(chr_list) if chr_list else "All Chromosomes" 1309 | logger.info("{}{}".format("Chromosomes that will be analyzed: ", log_chrs)) 1310 | logger.info("SNVs will be filtered: {}".format(str(args.filter))) 1311 | logger.info("Work directory: {}".format(sample_dir)) 1312 | logger.info("Reference: {}".format(args.ref)) 1313 | logger.info("Aligner: {}".format(main_vars.aligner)) 1314 | logger.info("Cluster Will be used: {}".format(args.is_cluster)) 1315 | logger.info("Samples:\n{}".format("\n".join(samples_names))) 1316 | 
logger.info("Samples:\n{}".format("\n".join(samples_names))) 1317 | logger.info("Results:\n{}".format(result)) 1318 | logger.info("running command\n{}".format("\t".join(cmd))) 1319 | run_cmd(cmd) 1320 | os.chdir(main_vars.current_dir) 1321 | 1322 | 1323 | def sort_params(args, unknownargs): 1324 | # To follow directly the param --config 1325 | if args.aligner: 1326 | unknownargs.insert(0, "aligner={}".format(args.aligner)) 1327 | 1328 | if args.read_type: 1329 | unknownargs.insert(0, "read_type={}".format(args.read_type)) 1330 | 1331 | # add other snakemake params at the tail of the list 1332 | if args.use_conda: 1333 | unknownargs.append("--use-conda") 1334 | 1335 | return unknownargs 1336 | 1337 | 1338 | def required_vars(args, unknownargs): 1339 | current_dir = os.getcwd() 1340 | running_file = os.path.dirname(os.path.realpath(__file__)) 1341 | work_dir = os.path.abspath(args.directory) 1342 | 1343 | # creating DIRECTORY 1344 | if not os.path.exists(work_dir): 1345 | os.makedirs(work_dir) 1346 | 1347 | # loading info from yaml file (configfile) 1348 | if not os.path.exists(os.path.join(work_dir, "config.yaml")) or not filecmp.cmp( 1349 | os.path.join(running_file, "config.yaml"), 1350 | os.path.exists(os.path.join(work_dir, "config.yaml")), 1351 | ): 1352 | shutil.copy(os.path.join(running_file, "config.yaml"), work_dir) 1353 | 1354 | with open(os.path.join(work_dir, "config.yaml"), "r") as myyaml: 1355 | conf_yaml = yaml.safe_load(myyaml) 1356 | 1357 | aligner = args.aligner if args.aligner else str(conf_yaml["aligner"]) 1358 | # TODO: you shall create this variable by checking first if it was passed as argument else use config file. 1359 | sample_list_from_config = conf_yaml["sample_list"] 1360 | number_of_jobs = args.jobs if args.jobs else str(conf_yaml["cluster_jobs"]) 1361 | number_of_tries = str(conf_yaml["number_of_tries"]) 1362 | 1363 | return ( 1364 | current_dir, 1365 | running_file, 1366 | work_dir, 1367 | conf_yaml, 1368 | aligner, 1369 | sample_list_from_config, 1370 | number_of_jobs, 1371 | number_of_tries, 1372 | ) 1373 | 1374 | 1375 | def get_sample_names(args, main_vars): 1376 | final_samples = "" 1377 | samples_names = "" 1378 | if args.samples_files or main_vars.sample_list_from_config: 1379 | samples = ( 1380 | [os.path.abspath(i) for i in args.samples_files] 1381 | if args.samples_files 1382 | else main_vars.sample_list_from_config 1383 | ) 1384 | # get samples names and soft link them in the new directory 1385 | for sample in samples: 1386 | if not os.path.isfile(sample): 1387 | print("This sample {} does not exist".format(sample)) 1388 | exit( 1389 | f"Error extincting, see log file {os.path.join(args.directory, args.log_file)}" 1390 | ) 1391 | absolute_name = ntpath.basename(sample) 1392 | if not os.path.islink( 1393 | os.path.join(main_vars.work_dir, absolute_name) 1394 | ) and not os.path.isfile(os.path.join(main_vars.work_dir, absolute_name)): 1395 | os.symlink(sample, os.path.join(main_vars.work_dir, absolute_name)) 1396 | if samples_names: 1397 | samples_names += " " + absolute_name 1398 | else: 1399 | samples_names += absolute_name 1400 | final_samples = "sample_list=" + samples_names 1401 | return final_samples, samples_names 1402 | 1403 | 1404 | def run_cmd(cmd): 1405 | try: 1406 | subprocess.run(cmd, check=True, universal_newlines=True) 1407 | except subprocess.CalledProcessError as e: 1408 | v = " ".join(cmd) 1409 | print(f"Running:\n{v}") 1410 | logger.error( 1411 | "Error in subprocess:\nCommand: {}\nError: {}".format(" ".join(cmd), e) 1412 | ) 1413 | 
1414 | 1415 | def is_valid_chrs(chr_list, ref): 1416 | if chr_list: 1417 | if os.path.isfile(ref + ".fai"): 1418 | chr_names = set() 1419 | with open(ref + ".fai", "r") as data_in: 1420 | for line in data_in: 1421 | chr_names.add(str(line.split()[0])) 1422 | if not set(chr_list).issubset(chr_names): 1423 | print( 1424 | "One or more of the chromosome names you gave {} do not exist in the reference.\nSupported chromosomes are: {}".format( 1425 | str(chr_list), sorted(chr_names) 1426 | ) 1427 | ) 1428 | else: 1429 | print( 1430 | "Please make sure that {ref}.fai exists.\nOtherwise run:\nsamtools faidx {ref}".format( 1431 | ref=ref 1432 | ) 1433 | ) 1434 | 1435 | 1436 | def get_chrs(ref): 1437 | if os.path.isfile(ref + ".fai"): 1438 | chr_names = [] 1439 | with open(ref + ".fai", "r") as data_in: 1440 | for line in data_in: 1441 | chr_names.append(str(line.split()[0])) 1442 | return chr_names 1443 | else: 1444 | print( 1445 | "Please make sure that {ref}.fai exists.\nOtherwise run:\nsamtools faidx {ref}".format( 1446 | ref=ref 1447 | ) 1448 | ) 1449 | 1450 | 1451 | def clean(source_dir, samples_names): 1452 | file_list = os.listdir(source_dir) 1453 | if samples_names: 1454 | for f in samples_names.split(): 1455 | os.remove(f) 1456 | for f in file_list: 1457 | if os.path.isfile(f): 1458 | os.remove(f) 1459 | else: 1460 | shutil.rmtree(f) 1461 | 1462 | 1463 | def dir_path(path): 1464 | return os.path.isdir(path) 1465 | 1466 | 1467 | def check_samples(work_dir, aligner, samples_files, sample_list, command, log_file): 1468 | if ( 1469 | not ( 1470 | os.path.exists(os.path.join(work_dir, "align", aligner, "data.bam")) 1471 | or os.path.exists(os.path.join(work_dir, "align", aligner, "data_hap.bam")) 1472 | ) 1473 | and not samples_files 1474 | and not sample_list 1475 | ): 1476 | logger.error( 1477 | "To run the {} command there must be an aligned file such as {}; otherwise use -s to provide samples, or set the sample_list field in config.yaml, so Princess knows which files to align".format( 1478 | command, os.path.join(work_dir, "align", aligner, "data.bam") 1479 | ) 1480 | ) 1481 | exit(f"Error exiting, see log file {os.path.join(work_dir, log_file)}") 1482 | else: 1483 | return True 1484 | 1485 | 1486 | def main(): 1487 | get_args() 1488 | 1489 | 1490 | if __name__ == "__main__": 1491 | main() 1492 | --------------------------------------------------------------------------------
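Note (editor's sketch): to make the .fai handling in is_valid_chrs() and get_chrs() concrete, a samtools faidx index is a tab-separated file whose first column is the sequence name, so collecting column one of every line yields the chromosome list that user-supplied names are checked against. The toy index below is an assumption for illustration (the offset and line-width columns are made up); only the first column matters here.

    import os
    import tempfile

    # Build a toy ref.fa.fai with two sequences (columns: name, length, offset, linebases, linewidth).
    toy_fai = os.path.join(tempfile.mkdtemp(), "ref.fa.fai")
    with open(toy_fai, "w") as out:
        out.write("chr21\t46709983\t7\t60\t61\n")
        out.write("chr22\t50818468\t47488609\t60\t61\n")

    # The first whitespace-separated field of each line is the chromosome name,
    # exactly what get_chrs() collects and is_valid_chrs() tests membership against.
    with open(toy_fai) as data_in:
        chr_names = [line.split()[0] for line in data_in]

    print(chr_names)                            # ['chr21', 'chr22']
    print(set(["chr21"]).issubset(chr_names))   # True  -> the name would be accepted
    print(set(["chrM"]).issubset(chr_names))    # False -> is_valid_chrs() would warn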