├── pictures ├── leia.jpg ├── start.txt ├── fail.txt └── success.txt ├── envs ├── clair3.yaml ├── minimap2.yaml ├── sniffles.yaml ├── clair3_no_depend.yaml ├── variant_tools.yaml ├── env_tools.txt ├── util.yaml ├── minimap_full.yaml ├── sniffles.yaml_back ├── whatshap.yaml ├── pythonRun.yaml ├── princess_env.yaml ├── clair3.yaml_back └── run_princess_env.yaml ├── cluster ├── config.yaml ├── lsf_status.py ├── pbs_status.py ├── key_mapping.yaml ├── slurm_status.py ├── scheduler.py └── cluster_config.yaml ├── LICENSE ├── scripts ├── process.py ├── update_meth_hp_ps.py ├── rawcoverage.py ├── update_sv_hp_ps.py └── phasing_report_update_vcf.py ├── config.yaml ├── .gitignore ├── modules ├── stat.smk ├── methylation.smk ├── phasing.smk ├── sv.smk ├── align.smk ├── snp.smk └── output.smk ├── Snakefile ├── README.md └── princess /pictures/leia.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MeHelmy/princess/HEAD/pictures/leia.jpg -------------------------------------------------------------------------------- /envs/clair3.yaml: -------------------------------------------------------------------------------- 1 | name: clair3.0.1.11 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - clair3=0.1.11 6 | -------------------------------------------------------------------------------- /pictures/start.txt: -------------------------------------------------------------------------------- 1 | ########################### 2 | ### Start analysis ### 3 | ########################### 4 | -------------------------------------------------------------------------------- /envs/minimap2.yaml: -------------------------------------------------------------------------------- 1 | name: Minimap2 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - minimap2=2.24 6 | - samtools=1.15.1 7 | -------------------------------------------------------------------------------- /envs/sniffles.yaml: -------------------------------------------------------------------------------- 1 | name: Sniffles2 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - sniffles=2.0.5 6 | - pysam=0.18.0=py39h5030a8b_2 7 | -------------------------------------------------------------------------------- /envs/clair3_no_depend.yaml: -------------------------------------------------------------------------------- 1 | name: clair3.0.1.11 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - clair3=1.0.3 #0.1.11 -------------------------------------------------------------------------------- /envs/variant_tools.yaml: -------------------------------------------------------------------------------- 1 | name: VariantTools 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - bcftools=1.9 6 | - bedtools=2.29.2 7 | - vcflib=1.0.0 8 | - tabix=0.2.6 9 | - survivor=1.0.6 -------------------------------------------------------------------------------- /envs/env_tools.txt: -------------------------------------------------------------------------------- 1 | conda create --no-default-packages -n princess_env ngmlr samtools minimap2 sniffles bcftools whatshap vcflib survivor tabix pandas numpy seaborn matplotlib biopython nanopolish pyfadix 2 | -------------------------------------------------------------------------------- /envs/util.yaml: -------------------------------------------------------------------------------- 1 | name: Utils 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - bcftools=1.15.1 6 | - bedtools=2.30.0 7 | - samtools=1.15.1 8 | - 
survivor=1.0.7 9 | - vcflib=1.0.3 10 | -------------------------------------------------------------------------------- /pictures/fail.txt: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ### Sorry, unsuccessful run ### 3 | ####################################### 4 | 5 | Please contact helmy dot medhat [@] gmail for more information 6 | -------------------------------------------------------------------------------- /pictures/success.txt: -------------------------------------------------------------------------------- 1 | ################################################ 2 | ### Successfully finished the analysis!! ### 3 | ################################################ 4 | 5 | Please contact helmy dot medhat [@] gmail for more information 6 | -------------------------------------------------------------------------------- /cluster/config.yaml: -------------------------------------------------------------------------------- 1 | restart-times: 3 2 | latency-wait: 1200 3 | cluster-config: "cluster/cluster_config.yaml" #abs path 4 | cluster: "scheduler.py" # 5 | #cluster-status: "pbs_status.py" # 6 | cluster-status: "slurm_status.py" # 7 | max-jobs-per-second: 30 8 | max-status-checks-per-second: 10 9 | cores: 99 # how many jobs you want to submit to your cluster queue 10 | local-cores: 1 11 | rerun-incomplete: true # recommended for cluster submissions 12 | keep-going: true 13 | -------------------------------------------------------------------------------- /cluster/lsf_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import os 5 | import sys 6 | import warnings 7 | import subprocess 8 | 9 | 10 | jobid = sys.argv[1] 11 | 12 | out= subprocess.run(['bjobs','-noheader',jobid],stdout=subprocess.PIPE).stdout.decode('utf-8') 13 | 14 | state = out.strip().split()[2] 15 | 16 | 17 | map_state={"PEND":'running', 18 | "RUN":'running', 19 | "PROV":"running", 20 | "WAIT":'running', 21 | "DONE":'success', 22 | "":'success'} 23 | 24 | print(map_state.get(state,'failed')) 25 | -------------------------------------------------------------------------------- /cluster/pbs_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import subprocess 5 | import xml.etree.cElementTree as ET 6 | 7 | jobid = sys.argv[1] 8 | 9 | try: 10 | res = subprocess.run("qstat -f -x {}".format(jobid), check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) 11 | 12 | xmldoc = ET.ElementTree(ET.fromstring(res.stdout.decode())).getroot() 13 | job_state = xmldoc.findall('.//job_state')[0].text 14 | 15 | if job_state == "C": 16 | exit_status = xmldoc.findall('.//exit_status')[0].text 17 | if exit_status == '0': 18 | print("success") 19 | else: 20 | print("failed") 21 | else: 22 | print("running") 23 | 24 | except (subprocess.CalledProcessError, IndexError, KeyboardInterrupt) as e: 25 | print("failed") 26 | -------------------------------------------------------------------------------- /envs/minimap_full.yaml: -------------------------------------------------------------------------------- 1 | name: Minimap2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - bzip2=1.0.8=h7f98852_4 11 | - c-ares=1.18.1=h7f98852_0 12 | - ca-certificates=2022.6.15=ha878542_0 13 
| - htslib=1.15.1=h9753748_0 14 | - k8=0.2.5=hd03093a_2 15 | - keyutils=1.6.1=h166bdaf_0 16 | - krb5=1.19.3=h3790be6_0 17 | - libcurl=7.83.1=h7bff187_0 18 | - libdeflate=1.10=h7f98852_0 19 | - libedit=3.1.20191231=he28a2e2_2 20 | - libev=4.33=h516909a_1 21 | - libgcc-ng=12.1.0=h8d9b700_16 22 | - libgomp=12.1.0=h8d9b700_16 23 | - libnghttp2=1.47.0=h727a467_0 24 | - libssh2=1.10.0=ha56f1ee_2 25 | - libstdcxx-ng=12.1.0=ha89aaad_16 26 | - libzlib=1.2.12=h166bdaf_2 27 | - minimap2=2.24=h7132678_1 28 | - ncurses=6.3=h27087fc_1 29 | - openssl=1.1.1q=h166bdaf_0 30 | - samtools=1.15.1=h1170115_0 31 | - xz=5.2.5=h516909a_1 32 | - zlib=1.2.12=h166bdaf_2 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Medhat 4 | ======= 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /cluster/key_mapping.yaml: -------------------------------------------------------------------------------- 1 | # only parameters defined in key_mapping (see below) are passed to the command in the order specified. 
2 | #system: "pbs" #check if system is defined below 3 | system: "slurm" #check if system is defined below 4 | 5 | slurm: 6 | command: "sbatch --parsable" 7 | key_mapping: 8 | name: "--job-name={}" 9 | threads: "-n {}" 10 | mem: "--mem={}" 11 | account: "--account={}" 12 | queue: "--partition={}" 13 | time: "--time={}" 14 | nodes: "-N {}" 15 | pbs: 16 | command: "qsub" 17 | key_mapping: 18 | name: "-N {}" 19 | account: "-A {}" 20 | queue: "-q {}" 21 | threads: "-l nodes=1:ppn={}" # always use 1 node 22 | mem: "-l mem={}" 23 | time: "-l walltime={}" #min= seconds x 100 24 | output: "-o {}" 25 | error: "-e {}" 26 | host: "-l select=1:{}" 27 | lsf: 28 | command: "bsub -e lsf_%J.log -o lsf_%J.log" 29 | key_mapping: 30 | queue: "-q {}" 31 | name: "-J {}" 32 | threads: "-n {}" 33 | mem: '-R "rusage[mem={}000]"' 34 | account: "-P {}" 35 | nodes: "-C {}" 36 | 37 | 38 | 39 | # for other cluster systems see: https://slurm.schedmd.com/rosetta.pdf 40 | # cluster = "qsub -A {cluster.account} -l walltime={cluster.time} -q \ 41 | # {cluster.queue} -l nodes=1:ppn={cluster.nCPUs} -l mem={cluster.memory}" 42 | -------------------------------------------------------------------------------- /envs/sniffles.yaml_back: -------------------------------------------------------------------------------- 1 | name: sniffles2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=1_gnu 10 | - bzip2=1.0.8=h7f98852_4 11 | - c-ares=1.18.1=h7f98852_0 12 | - ca-certificates=2021.10.8=ha878542_0 13 | - keyutils=1.6.1=h166bdaf_0 14 | - krb5=1.19.3=h3790be6_0 15 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 16 | - libcurl=7.82.0=h7bff187_0 17 | - libdeflate=1.10=h7f98852_0 18 | - libedit=3.1.20191231=he28a2e2_2 19 | - libev=4.33=h516909a_1 20 | - libffi=3.4.2=h7f98852_5 21 | - libgcc-ng=11.2.0=h1d223b6_14 22 | - libgomp=11.2.0=h1d223b6_14 23 | - libnghttp2=1.47.0=h727a467_0 24 | - libnsl=2.0.0=h7f98852_0 25 | - libssh2=1.10.0=ha56f1ee_2 26 | - libstdcxx-ng=11.2.0=he4da1e4_14 27 | - libuuid=2.32.1=h7f98852_1000 28 | - libzlib=1.2.11=h36c2ea0_1013 29 | - ncurses=6.3=h9c3ff4c_0 30 | - openssl=1.1.1l=h7f98852_0 31 | - pip=22.0.4=pyhd8ed1ab_0 32 | - pysam=0.18.0=py39h5030a8b_2 33 | - python=3.9.10=h85951f9_2_cpython 34 | - python_abi=3.9=2_cp39 35 | - readline=8.1=h46c0cb4_0 36 | - setuptools=60.10.0=py39hf3d152e_0 37 | - sniffles=2.0.5=pyhdfd78af_0 38 | - sqlite=3.37.1=h4ff8645_0 39 | - tk=8.6.12=h27826a3_0 40 | - tzdata=2022a=h191b570_0 41 | - wheel=0.37.1=pyhd8ed1ab_0 42 | - xz=5.2.5=h516909a_1 43 | - zlib=1.2.11=h36c2ea0_1013 44 | -------------------------------------------------------------------------------- /scripts/process.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser(description="Processing variant file to identifie the passed variant", usage="%(prog)s [options]", 5 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, argument_default=argparse.SUPPRESS) 6 | 7 | parser.add_argument("input", help="Input file from clariovant", nargs='?', type=argparse.FileType('r'), default=sys.stdin) 8 | parser.add_argument("output", help="The output file from clariovant", nargs='?', type=argparse.FileType('w'), default=sys.stdout) 9 | parser.add_argument("-f", "--filter", help="Minimum threshold for variant to be passed (default: %(default)s)", type=int, default=200 ) 10 | 11 | args = parser.parse_args() 12 | 13 | myFile = 
args.input 14 | dataOut = args.output 15 | threshold = args.filter 16 | 17 | 18 | # myFile = sys.argv[1] 19 | # with open(myFile, "r") as dataIn, open(myFile+"_filter.vcf", 'w') as dataOut: 20 | 21 | for line in myFile: 22 | lineSplit = line.split() 23 | if line.startswith("#"): 24 | dataOut.write(line) 25 | elif lineSplit[4].startswith("<"): 26 | pass 27 | else: 28 | if int(float(lineSplit[5])) >= threshold: 29 | lineSplit[6] = 'PASS' 30 | lineSplit[5] = str(int(float(lineSplit[5]))) 31 | dataOut.write("{}\n".format("\t".join(lineSplit))) 32 | 33 | myFile.close() 34 | dataOut.close() 35 | -------------------------------------------------------------------------------- /envs/whatshap.yaml: -------------------------------------------------------------------------------- 1 | name: Whatshap 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1 9 | - _openmp_mutex=4.5 10 | - biopython=1.79 11 | - bzip2=1.0.8 12 | - c-ares=1.18.1 13 | - ca-certificates=2022.6.15 14 | - htslib=1.15.1 15 | - isa-l=2.30.0 16 | - keyutils=1.6.1 17 | - krb5=1.19.3 18 | - ld_impl_linux-64=2.36.1 19 | - libblas=3.9.0 20 | - libcblas=3.9.0 21 | - libcurl=7.83.1 22 | - libdeflate=1.10 23 | - libedit=3.1.20191231 24 | - libev=4.33 25 | - libffi=3.4.2 26 | - libgcc-ng=12.1.0 27 | - libgfortran-ng=12.1.0 28 | - libgfortran5=12.1.0 29 | - libgomp=12.1.0 30 | - liblapack=3.9.0 31 | - libnghttp2=1.47.0 32 | - libnsl=2.0.0 33 | - libopenblas=0.3.21 34 | - libsqlite=3.39.2 35 | - libssh2=1.10.0 36 | - libstdcxx-ng=12.1.0 37 | - libuuid=2.32.1 38 | - libzlib=1.2.12 39 | - ncurses=6.3 40 | - networkx=2.8.6 41 | - numpy=1.23.2 42 | - openssl=1.1.1q 43 | - packaging=21.3 44 | - pbzip2=1.1.13 45 | - pigz=2.6 46 | - pip=22.2.2 47 | - pyfaidx=0.7.1 48 | - pyparsing=3.0.9 49 | - pysam=0.19.1 50 | - python=3.10.6 51 | - python-isal=1.0.1 52 | - python_abi=3.10 53 | - pyvcf3=1.0.3 54 | - readline=8.1.2 55 | - scipy=1.9.0 56 | - setuptools=65.2.0 57 | - six=1.16.0 58 | - tk=8.6.12 59 | - tzdata=2022c 60 | - whatshap=1.4 61 | - wheel=0.37.1 62 | - xopen=1.6.0 63 | - xz=5.2.6 64 | - zlib=1.2.12 65 | -------------------------------------------------------------------------------- /cluster/slurm_status.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import re 3 | import subprocess as sp 4 | import shlex 5 | import sys 6 | import time 7 | import logging 8 | logger = logging.getLogger("__name__") 9 | 10 | STATUS_ATTEMPTS = 20 11 | 12 | jobid = sys.argv[1] 13 | 14 | for i in range(STATUS_ATTEMPTS): 15 | try: 16 | sacct_res = sp.check_output(shlex.split("sacct -P -b -j {} -n".format(jobid))) 17 | res = {x.split("|")[0]: x.split("|")[1] for x in sacct_res.decode().strip().split("\n")} 18 | break 19 | except sp.CalledProcessError as e: 20 | logger.error("sacct process error") 21 | logger.error(e) 22 | except IndexError as e: 23 | pass 24 | # Try getting job with scontrol instead in case sacct is misconfigured 25 | try: 26 | sctrl_res = sp.check_output(shlex.split("scontrol -o show job {}".format(jobid))) 27 | m = re.search("JobState=(\w+)", sctrl_res.decode()) 28 | res = {jobid: m.group(1)} 29 | break 30 | except sp.CalledProcessError as e: 31 | logger.error("scontrol process error") 32 | logger.error(e) 33 | if i >= STATUS_ATTEMPTS - 1: 34 | print("failed") 35 | exit(0) 36 | else: 37 | time.sleep(1) 38 | 39 | status = res[jobid] 40 | 41 | if (status == "BOOT_FAIL"): 42 | print("failed") 43 | elif (status == 
"OUT_OF_MEMORY"): 44 | print("failed") 45 | elif (status.startswith("CANCELLED")): 46 | print("failed") 47 | elif (status == "COMPLETED"): 48 | print("success") 49 | elif (status == "DEADLINE"): 50 | print("failed") 51 | elif (status == "FAILED"): 52 | print("failed") 53 | elif (status == "NODE_FAIL"): 54 | print("failed") 55 | elif (status == "PREEMPTED"): 56 | print("failed") 57 | elif (status == "TIMEOUT"): 58 | print("failed") 59 | # Unclear whether SUSPENDED should be treated as running or failed 60 | elif (status == "SUSPENDED"): 61 | print("failed") 62 | else: 63 | print("running") 64 | -------------------------------------------------------------------------------- /envs/pythonRun.yaml: -------------------------------------------------------------------------------- 1 | name: PythonRun 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1 9 | - _openmp_mutex=4.5 10 | - biopython=1.74 11 | - ca-certificates=2022.12.7 12 | - certifi=2022.12.7 13 | - cycler=0.11.0 14 | - dbus=1.13.6 15 | - expat=2.5.0 16 | - fontconfig=2.14.2 17 | - freetype=2.12.1 18 | - gettext=0.21.1 19 | - glib=2.66.3 20 | - gst-plugins-base=1.14.5 21 | - gstreamer=1.14.5 22 | - icu=64.2 23 | - jpeg=9e 24 | - kiwisolver=1.4.4 25 | - ld_impl_linux-64=2.40 26 | - libblas=3.9.0 27 | - libcblas=3.9.0 28 | - libclang=9.0.1 29 | - libffi=3.2.1 30 | - libgcc-ng=12.2.0 31 | - libgfortran-ng=12.2.0 32 | - libgfortran5=12.2.0 33 | - libglib=2.66.3 34 | - libgomp=12.2.0 35 | - libiconv=1.16 36 | - liblapack=3.9.0 37 | - libllvm9=9.0.1 38 | - libopenblas=0.3.21 39 | - libpng=1.6.39 40 | - libsqlite=3.40.0 41 | - libstdcxx-ng=12.2.0 42 | - libuuid=2.32.1 43 | - libxcb=1.13 44 | - libxkbcommon=0.10.0 45 | - libxml2=2.9.10 46 | - libzlib=1.2.13 47 | - matplotlib=3.1.1 48 | - matplotlib-base=3.1.1 49 | - ncurses=6.3 50 | - nspr=4.35 51 | - nss=3.82 52 | - numpy=1.17.2 53 | - openssl=1.1.1t 54 | - packaging=23.0 55 | - pandas=1.2.3 56 | - patsy=0.5.3 57 | - pcre=8.45 58 | - pip=23.0.1 59 | - pthread-stubs=0.4 60 | - pyfaidx=0.5.5.2 61 | - pyparsing=3.0.9 62 | - pyqt=5.12.3 63 | - python=3.7.8 64 | - python-dateutil=2.8.2 65 | - python_abi=3.7 66 | - pytz=2022.7.1 67 | - qt=5.12.5 68 | - readline=8.1.2 69 | - scipy=1.5.3 70 | - seaborn=0.12.2 71 | - seaborn-base=0.12.2 72 | - setuptools=59.8.0 73 | - six=1.16.0 74 | - sqlite=3.40.0 75 | - statsmodels=0.13.5 76 | - tk=8.6.12 77 | - tornado=6.2 78 | - typing-extensions=4.5.0 79 | - typing_extensions=4.5.0 80 | - wheel=0.38.4 81 | - xorg-libxau=1.0.9 82 | - xorg-libxdmcp=1.1.3 83 | - xz=5.2.6 84 | - zlib=1.2.13 85 | - pip: 86 | - pyqt5-sip==4.19.18 87 | - pyqtchart==5.12 88 | - pyqtwebengine==5.12.1 -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | # Information about samples 4 | ########################### 5 | sample_directory: "" # samples by default should exist in this directory unless otherwise. 6 | read_type: "ont" # ont,clr or ccs 7 | sample_extension: "gz" # please add sample extension ex: fastq , fasta or gz # default gz 8 | # sample list If specified this will overlap the default behavior which is all samples in directory specified above. 9 | # If you leave empty the input will be all the samples in directory. 
10 | sample_list: [] 11 | sample_name: "SAMPLE" 12 | delete_files: "" 13 | delete_samples: "" 14 | ########################### 15 | 16 | # Information about reference 17 | ############################### 18 | reference: "/reference/GRCh38-2.1.0/genome.fa" 19 | chrs: [] 20 | 21 | 22 | # Samtools Parameters 23 | ##################### 24 | samtools_threads: 5 25 | ##################### 26 | 27 | 28 | # Aligner 29 | ######### 30 | aligner: "minimap" # minimap or ngmlr 31 | aligner_threads: 10 32 | #minimap_other_tags: "" 33 | minimap_other_tags: "-y" 34 | ######### 35 | 36 | 37 | # Structural Variant Parameters 38 | ############################### 39 | min_sv_len: 50 40 | sv_threads: 5 41 | phase_sv: 'False' 42 | mosaic_sv: 'False' 43 | 44 | 45 | 46 | # Calling Variant Parameters 47 | ############################ 48 | clair_location: "bin/Clair/clair.py" # not used anymore 49 | clair_coverage: 2 50 | clair_threads: 5 51 | # chr_split: 24925062 52 | chr_split: 29925062 53 | filter_chrs: True # Case sensitive options [True, Fasle]. 54 | clair_pypy: "/home/source/Clair/pypy3/pypy3.5-7.0.0-linux_x86_64-portable/bin/pypy" 55 | clair_model: "" 56 | gvcf_snv: 'False' 57 | #tmp_directory: "/tmp" 58 | tmp_directory: "" 59 | 60 | 61 | 62 | # Update SNPs 63 | ############# 64 | update_snps: False 65 | paternal_snps: "" 66 | maternal_snps: "" 67 | 68 | 69 | # Methylation 70 | ############# 71 | methylation: False 72 | fast5_dir: False 73 | methylation_threads: 8 74 | 75 | # Zipping Parameters 76 | #################### 77 | bgzip_threads: 5 78 | 79 | 80 | # Scripts 81 | ########## 82 | read_raw_coverage: "scripts/rawcoverage.py" 83 | read_raw_coverage_threads: 5 84 | 85 | updat_snps_script: "scripts/phasing_report_update_vcf.py" 86 | updat_sv: "scripts/update_sv_hp_ps.py" 87 | hap_methylation: "scripts/update_meth_hp_ps.py" 88 | 89 | 90 | # Cluster 91 | ########## 92 | cluster_jobs: 10 93 | number_of_tries: 0 94 | 95 | ... 96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | ======= 131 | # Ignore directories 132 | samples/ 133 | .snakemake/ 134 | -------------------------------------------------------------------------------- /modules/stat.smk: -------------------------------------------------------------------------------- 1 | ############################### 2 | ###### STATISTICS RULES ###### 3 | ############################### 4 | 5 | 6 | 7 | #### RAW READS STATISTICS #### 8 | ############################## 9 | 10 | rule readsStat: 11 | """ 12 | Input is the reads in directory output is info about reads 13 | """ 14 | input: expand(data_dir + "/{sample}", sample=sample_list) 15 | output:data_dir + "/statistics/raw_reads/reads_stat.txt", 16 | message: "Calculating read coverage statistics for: {input}", 17 | params: 18 | read_stat_script = rawcoverage_script, 19 | threads: config['read_raw_coverage_threads'] 20 | log: data_dir + "/statistics/raw_reads/reads_stat.log", 21 | benchmark: data_dir + "/benchmark/raw_reads/stat.benchmark.txt" 22 | conda: READ_STAT_ENV 23 | shell: 24 | """ 25 | python {params.read_stat_script} -i {input} -o {output} -t {threads} 2>{log} 26 | """ 27 | 28 | #### BAM STATISTICS #### 29 | ######################## 30 | 31 | rule bamStatistics: 32 | """ 33 | Calculate statistics from merged bam file 34 | """ 35 | input:data_dir + "/align/{aligner}/data.bam" 36 | output:data_dir + "/statistics/{aligner}/data.stat" 37 | message:"Calculating aligned reads statistics from bam file" 38 | benchmark: data_dir + "/benchmark/align/{aligner}/stat.benchmark.txt" 39 | conda: MINIMAP2_ENV 40 | shell:""" 41 | samtools stats {input} > {output} 42 | """ 43 | 44 | #### SV STATISTICS #### 45 | ####################### 46 | 47 | rule svStat: 48 | input: expand(data_dir + 
"/sv/{aligner}/sniffles.vcf", aligner=config['aligner']) 49 | output: data_dir + "/statistics/sv/data.stat" 50 | message: "calculating statistics for structural variant" 51 | benchmark: data_dir + "/benchmark/sv/stat.benchmark.txt" 52 | conda: VARIANT_ENV 53 | shell:""" 54 | SURVIVOR stats {input} -1 -1 -1 {output} 55 | """ 56 | 57 | #### SNPs STATISTICS #### 58 | ######################### 59 | 60 | rule snpStat: 61 | input: 62 | snp_file = expand(data_dir + "/phased/{aligner}/data.vcf.gz", aligner=config['aligner']) , 63 | snp_file_index = expand(data_dir + "/phased/{aligner}/data.vcf.gz.tbi", aligner=config['aligner']) , 64 | output: data_dir + "/statistics/snp/snp.txt", 65 | message: "Calculate SNPs statistics" 66 | benchmark: data_dir + "/benchmark/snp/stat.benchmark.txt" 67 | conda: VARIANT_ENV 68 | shell:""" 69 | bcftools stats {input.snp_file} > {output} 70 | """ 71 | #### ALL STATISTICS No READs #### 72 | ################################# 73 | 74 | rule statNoReads: 75 | input: 76 | expand(data_dir + "/statistics/{aligner}/data.stat", aligner=config['aligner']), 77 | data_dir + "/statistics/sv/data.stat", 78 | data_dir + "/statistics/snp/snp.txt", 79 | output: data_dir + "/stat.NoReads.txt" 80 | shell: "touch {output}" 81 | 82 | #### ALL STATISTICS #### 83 | ####################### 84 | 85 | rule stat: 86 | input: 87 | expand(data_dir + "/statistics/{aligner}/data.stat", aligner=config['aligner']), 88 | data_dir + "/statistics/raw_reads/reads_stat.txt", 89 | data_dir + "/statistics/sv/data.stat", 90 | data_dir + "/statistics/snp/snp.txt" 91 | output: data_dir + "/stat.txt" 92 | shell: "touch {output}" 93 | -------------------------------------------------------------------------------- /scripts/update_meth_hp_ps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | This script update Methylation file to add both HP haplotag and PS phasing block, It takes as input meth file, hp, ps. 5 | """ 6 | import argparse 7 | import sys, os 8 | from operator import itemgetter 9 | from collections import Counter 10 | 11 | # Python program to print 12 | # green text with red background 13 | # 14 | # from colorama import init 15 | # from termcolor import colored 16 | # 17 | # init() 18 | 19 | 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. use command -h for info.", 24 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 25 | description='Produce phasing report for Methylation', 26 | add_help=True, ) 27 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 28 | # parser.add_argument('input', help='Input file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) 29 | # parser.add_argument('output', help='Output file', nargs="?", type=argparse.FileType('w'), default=sys.stdout) 30 | 31 | parser.add_argument('input', nargs='?', help="Methylation file", 32 | type=argparse.FileType('r'), 33 | default=sys.stdin) 34 | parser.add_argument('hp', nargs='?', help="tab delimeted read\thp\tps file", 35 | type=argparse.FileType('r')) 36 | parser.add_argument('output', nargs='?', help="Output file, PS and HP will be added.", 37 | type=argparse.FileType('w+'), 38 | default=sys.stdout) 39 | 40 | parser.set_defaults(func=update_meth) 41 | 42 | # if not argument print help. 
43 | if len(sys.argv) == 1 and sys.stdin.isatty(): # sys.stdin.isatty() returns false if there's something in stdin 44 | parser.print_help(sys.stderr) 45 | sys.exit(1) 46 | 47 | args = parser.parse_args() 48 | 49 | 50 | if 'func' in args: 51 | args.func(args) 52 | else: 53 | parser.print_help() 54 | 55 | def update_meth(args): 56 | # check if the input from stdin 57 | if not sys.stdin.isatty(): # there is nothing in the stdin 58 | if args.input.name.endswith("gz"): 59 | import gzip 60 | myfile = gzip.open(args.input.name, 'rt') # t is not a must normally it is default. 61 | else: 62 | myfile = args.input 63 | else: 64 | myfile = args.input 65 | 66 | # read the Haplotype file as dictionary 67 | hp_dic = {} 68 | with args.hp as hp_in: 69 | for line in hp_in: 70 | id, hp, ps = line.split() 71 | hp_dic[id] = [hp.rsplit(":", 1)[-1], ps.rsplit(":", 1)[-1]] # read hp, ps 72 | 73 | 74 | with myfile as data_in, args.output as data_out: 75 | first = True 76 | n = 0 77 | for line in data_in: 78 | n+=1 79 | if first: 80 | first = False 81 | data_out.write(line.strip()+"\tHP\tPS\n") 82 | continue 83 | line_split = line.split() 84 | read = line_split[4] 85 | hp, ps = hp_dic.get(read, ['.', '.']) # In case f the read have not been haplotyped. 86 | data_out.write("{}\t{}\t{}\n".format(line.strip(), hp, ps)) 87 | 88 | def main(): 89 | args = get_args() 90 | 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /envs/princess_env.yaml: -------------------------------------------------------------------------------- 1 | name: princess_env 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - bioconda 6 | - defaults 7 | dependencies: 8 | - bcftools=1.9=h68d8f2e_7 9 | - bedtools=2.29.2=hc088bd4_0 10 | - biopython=1.74=py37h516909a_0 11 | - bwa=0.7.17=hed695b0_6 12 | - bzip2=1.0.8=h516909a_1 13 | - ca-certificates=2020.12.5=ha878542_0 14 | - certifi=2020.12.5=py37h89c1867_1 15 | - curl=7.65.3=hf8cf82a_0 16 | - cycler=0.10.0=py_1 17 | - dbus=1.13.6=he372182_0 18 | - decorator=4.4.0=py_0 19 | - eigen=3.3.7=h6bb024c_1000 20 | - expat=2.2.5=he1b5a44_1004 21 | - fontconfig=2.13.1=h86ecdb6_1001 22 | - freetype=2.10.0=he983fc9_1 23 | - gettext=0.19.8.1=hc5be6a0_1002 24 | - glib=2.58.3=h6f030ca_1002 25 | - gsl=2.5=h294904e_1 26 | - gst-plugins-base=1.14.5=h0935bb2_0 27 | - gstreamer=1.14.5=h36ae1b5_0 28 | - hdf5=1.10.5=nompi_h3c11f04_1104 29 | - htslib=1.9=ha228f0b_7 30 | - icu=64.2=he1b5a44_1 31 | - jpeg=9c=h14c3975_1001 32 | - kiwisolver=1.1.0=py37hc9558a2_0 33 | - krb5=1.16.3=h05b26f9_1001 34 | - libblas=3.8.0=14_openblas 35 | - libcblas=3.8.0=14_openblas 36 | - libclang=9.0.0=hc9558a2_1 37 | - libcurl=7.65.3=hda55be3_0 38 | - libdeflate=1.0=h14c3975_1 39 | - libedit=3.1.20170329=hf8c457e_1001 40 | - libffi=3.2.1=he1b5a44_1006 41 | - libgcc-ng=9.1.0=hdf63c60_0 42 | - libgfortran-ng=7.3.0=hdf63c60_2 43 | - libiconv=1.15=h516909a_1005 44 | - liblapack=3.8.0=14_openblas 45 | - libllvm9=9.0.0=hc9558a2_2 46 | - libopenblas=0.3.7=h6e990d7_2 47 | - libpng=1.6.37=hed695b0_0 48 | - libssh2=1.8.2=h22169c7_2 49 | - libstdcxx-ng=9.1.0=hdf63c60_0 50 | - libuuid=2.32.1=h14c3975_1000 51 | - libxcb=1.13=h14c3975_1002 52 | - libxkbcommon=0.9.1=hebb1f50_0 53 | - libxml2=2.9.9=hee79883_5 54 | - llvm-openmp=8.0.1=hc9558a2_0 55 | - matplotlib=3.1.1=py37_1 56 | - matplotlib-base=3.1.1=py37he7580a8_1 57 | - minimap2=2.17=h8b12597_1 58 | - nanopolish=0.11.2=h705302d_0 59 | - ncurses=6.1=hf484d3e_1002 60 | - networkx=2.4=py_0 61 | - ngmlr=0.2.7=he860b03_1 
62 | - nspr=4.23=he1b5a44_0 63 | - nss=3.47=he751ad9_0 64 | - numpy=1.17.2=py37h95a1406_0 65 | - openmp=8.0.1=0 66 | - openssl=1.1.1c=h516909a_0 67 | - pandas=0.25.2=py37hb3f55d8_0 68 | - patsy=0.5.1=py_0 69 | - pcre=8.43=he1b5a44_0 70 | - perl=5.26.2=h516909a_1006 71 | - pip=19.3.1=py37_0 72 | - pthread-stubs=0.4=h14c3975_1001 73 | - pyfaidx=0.5.5.2=py_1 74 | - pyparsing=2.4.2=py_0 75 | - pyqt=5.12.3=py37hcca6a23_0 76 | - pysam=0.15.3=py37hda2845c_1 77 | - python=3.7.3=h33d41f4_1 78 | - python-dateutil=2.8.0=py_0 79 | - python_abi=3.7=1_cp37m 80 | - pytz=2019.3=py_0 81 | - pyvcf=0.6.8=py37_1000 82 | - qt=5.12.5=h0c104cb_0 83 | - readline=8.0=hf8c457e_0 84 | - samtools=1.9=h10a08f8_12 85 | - scipy=1.3.1=py37h921218d_2 86 | - seaborn=0.9.0=py_1 87 | - setuptools=41.4.0=py37_0 88 | - six=1.12.0=py37_1000 89 | - sniffles=1.0.12=h8b12597_1 90 | - sqlite=3.30.1=hcee41ef_0 91 | - statsmodels=0.10.1=py37hc1659b7_0 92 | - survivor=1.0.6=h6bb024c_0 93 | - tabix=0.2.6=ha92aebf_0 94 | - tclap=1.2.1=h470a237_1 95 | - tk=8.6.9=hed695b0_1003 96 | - tornado=6.0.3=py37h516909a_0 97 | - vcflib=1.0.0_rc3=py37hc088bd4_0 98 | - whatshap=0.18=py37h6bb024c_0 99 | - wheel=0.33.6=py37_0 100 | - xopen=0.8.3=py37_0 101 | - xorg-libxau=1.0.9=h14c3975_0 102 | - xorg-libxdmcp=1.1.3=h516909a_0 103 | - xz=5.2.4=h14c3975_1001 104 | - zlib=1.2.11=h516909a_1006 105 | - pip: 106 | - pyqt5-sip==4.19.18 107 | - pyqtwebengine==5.12.1 108 | -------------------------------------------------------------------------------- /modules/methylation.smk: -------------------------------------------------------------------------------- 1 | ################################# 2 | ###### METHYLATION RULES ####### 3 | ################################# 4 | 5 | 6 | 7 | #### NANOPOLISH INDEX #### 8 | ########################## 9 | 10 | rule nanoIndex: 11 | """ 12 | Preparing index to links read ids with their signal-level data in the FAST5 files 13 | """ 14 | input: 15 | fastq_file=data_dir + "/{sample}", 16 | output: data_dir + "/{sample}.index.readdb" 17 | message: "Input file is {wildcards.sample}" 18 | params: 19 | fast5_dir = config['fast5_dir']#lambda wildcards: ont_sample_dir[wildcards.sample] 20 | benchmark: data_dir + "/benchmark/methylation/index.{sample}.benchmark.txt" 21 | conda: PRINCESS_ENV 22 | shell:""" 23 | nanopolish index -d {params.fast5_dir} {input.fastq_file} 24 | """ 25 | 26 | #### NANOPOLISH METHYLATION #### 27 | ################################ 28 | 29 | rule callMeth: 30 | """ 31 | Calling Methylation 32 | """ 33 | input: 34 | fastq_file=data_dir + "/{sample}", 35 | bam_file=data_dir + "/align/{aligner}/{sample}.bam", 36 | bam_index=data_dir + "/align/{aligner}/{sample}.bam.bai", 37 | fastq_index=data_dir + "/{sample}.index.readdb", 38 | output: data_dir + "/meth/{aligner}/{sample}.methylation_calls.tsv" 39 | params: 40 | ref = REFERENCES, 41 | threads: config['methylation_threads'] 42 | message: "Calling Methylation for sample: {wildcards.sample}" 43 | benchmark: data_dir + "/benchmark/methylation/{aligner}/call_methylation.{sample}.benchmark.txt" 44 | conda: PRINCESS_ENV 45 | shell:""" 46 | nanopolish call-methylation -t 8 -r {input.fastq_file} -b {input.bam_file} -g {params.ref} > {output} 47 | """ 48 | 49 | #### NANOPOLISH METHYLATION Haplotype #### 50 | ########################################## 51 | 52 | rule callMethHap: 53 | """ 54 | Haplotype Methylation 55 | """ 56 | input: 57 | meth = data_dir + "/meth/{aligner}/{sample}.methylation_calls.tsv", 58 | bam = data_dir + "/align/{aligner}/data_hap.tab", 59 | 
output: data_dir + "/meth/{aligner}/{sample}.methylation_calls_hap.tsv" 60 | params: 61 | update_script = config['hap_methylation'], 62 | message: "Updating Methylation for {wildcards.sample} using align/{wildcards.aligner}/data_hap.tab" 63 | benchmark: data_dir + "/benchmark/methylation/{aligner}/call_methylation.{sample}.hap.benchmark.txt" 64 | shell:""" 65 | python {params.update_script} {input.meth} {input.bam} {output} 66 | """ 67 | 68 | #### CALL ALL METHYLATION #### 69 | ############################## 70 | 71 | rule allMethylation: 72 | """ 73 | Call all methylation samples. 74 | """ 75 | input: lambda wildcards: expand(data_dir + "/meth/{aligner}/{sample}.methylation_calls.tsv", aligner=wildcards.aligner, sample=config['sample_list'].split()) 76 | output: data_dir + "/meth/{aligner}/methylation_calls.tsv", 77 | message: "Collecting all methylation samples {input}" 78 | shell:""" 79 | touch {output} 80 | """ 81 | 82 | #### CALL ALL METHYLATION PHASED & HAPLOTYPED #### 83 | ################################################# 84 | 85 | rule allMethylationHap: 86 | """ 87 | Call all methylation samples phased. 88 | """ 89 | input: lambda wildcards: expand(data_dir + "/meth/{aligner}/{sample}.methylation_calls_hap.tsv", aligner=wildcards.aligner, sample=config['sample_list'].split()) 90 | output: data_dir + "/meth/{aligner}/methylation_calls_hap.tsv", 91 | message: "Collecting all methylation samples {input}" 92 | shell:""" 93 | touch {output} 94 | """ 95 | -------------------------------------------------------------------------------- /modules/phasing.smk: -------------------------------------------------------------------------------- 1 | 2 | ############################ 3 | ###### PHASING RULES ###### 4 | ########################### 5 | 6 | 7 | 8 | 9 | #### GENOTYPING #### 10 | #################### 11 | 12 | rule gt: 13 | """ 14 | Genotype SNPs one chromosome per time. 15 | """ 16 | input: 17 | bam=data_dir + "/align/{aligner}/data.bam", 18 | bam_index=data_dir + "/align/{aligner}/data.bam.bai", 19 | snps=data_dir + "/snp/{aligner}/data.{chr}.vcf", 20 | output: 21 | data_dir + "/gt/{aligner}/data.{chr}.vcf" 22 | params: 23 | reference=REFERENCES, 24 | conda: WHATSHAP_ENV 25 | log: 26 | data_dir + "/gt/{aligner}/data.{chr}.log" 27 | benchmark: data_dir + "/benchmark/gt/{aligner}/{chr}.benchmark.txt" 28 | shell:""" 29 | whatshap genotype --reference {params.reference} \ 30 | --ignore-read-groups \ 31 | --output {output} {input.snps} {input.bam} > {log} 2>&1 32 | """ 33 | 34 | #### PHASING #### 35 | ################# 36 | 37 | rule phasing: 38 | """ 39 | Phase SNPs one chromosome per time 40 | """ 41 | input: 42 | bam=data_dir + "/align/{aligner}/data.bam", 43 | bam_index=data_dir + "/align/{aligner}/data.bam.bai", 44 | snps=data_dir + "/snp/{aligner}/data.{chr}.vcf", 45 | output: 46 | phased=temp(data_dir + "/phased/{aligner}/data.{chr}.vcf"), 47 | params: 48 | reference=REFERENCES, 49 | read_list=data_dir + "/phased/{aligner}/data.{chr}.reads", 50 | log: 51 | data_dir + "/phased/{aligner}/data.{chr}.log" 52 | conda: WHATSHAP_ENV 53 | benchmark: data_dir + "/benchmark/phase/{aligner}/{chr}.benchmark.txt" 54 | shell:""" 55 | whatshap phase --reference {params.reference} \ 56 | --output {output.phased} {input.snps} {input.bam} \ 57 | --ignore-read-groups \ 58 | --output-read-list {params.read_list} > {log} 2>&1 59 | """ 60 | 61 | #### CONCAT PHASING #### 62 | ######################## 63 | 64 | rule allPhased: 65 | """ 66 | Concat all the phased SNPs into one file. 
67 | """ 68 | input:lambda wildcards: expand(data_dir + "/phased/{aligner}/data.{chr}.vcf", aligner=wildcards.aligner, chr=chr_list), 69 | output: temp(data_dir + "/phased/{aligner}/data.vcf") 70 | conda: VARIANT_ENV 71 | params: 72 | sample_name = SAMPLE_NAME, 73 | benchmark: data_dir + "/benchmark/phase/{aligner}/concat_phased.benchmark.txt" 74 | shell:""" 75 | echo "{params.sample_name}" > sample_name.txt && vcfcat {input} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output} 76 | """ 77 | 78 | #### HAPLOTYPE BAM FILE #### 79 | ############################ 80 | 81 | rule partionBam: 82 | """ 83 | Partion a bam file based on the phased SNPs, 84 | It will use the updated SNPs if the parental SNPs were provided. 85 | """ 86 | input: 87 | bam = data_dir + "/align/{aligner}/data.bam", # SM filed must be set to the sample name in vcf file 88 | bam_index = data_dir + "/align/{aligner}/data.bam.bai", 89 | snp = lambda wildcards: data_dir + "/phased/{aligner}/data_updated.vcf.gz" if config['update_snps'] else data_dir + "/phased/{aligner}/data.vcf.gz", 90 | snp_index = lambda wildcards: data_dir + "/phased/{aligner}/data_updated.vcf.gz.tbi" if config['update_snps'] else data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 91 | output: 92 | hap_bam = data_dir + "/align/{aligner}/data_hap.bam" 93 | message: "Partitioning bam file" 94 | conda: WHATSHAP_ENV 95 | params: 96 | ref = REFERENCES 97 | shell:""" 98 | whatshap haplotag --ignore-read-groups -o {output.hap_bam} -r {params.ref} {input.snp} {input.bam} 99 | """ 100 | -------------------------------------------------------------------------------- /cluster/scheduler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | 4 | import sys, os 5 | import subprocess 6 | from subprocess import Popen, PIPE 7 | import yaml 8 | 9 | 10 | def eprint(*args, **kwargs): 11 | print(*args, file=sys.stderr, **kwargs) 12 | 13 | 14 | def run_cmd(cmd): 15 | eprint("Running from subprocess" + str(cmd)) 16 | try: 17 | subprocess.run(cmd, check=True, universal_newlines=True) 18 | except subprocess.CalledProcessError as e: 19 | logger.error("Error in subprocess:\n{}".format(e.returncode)) 20 | 21 | 22 | def convert_time_to_seconds(run_time): 23 | # Number of : in input: 24 | colons = run_time.count(":") 25 | if colons == 3: 26 | # dd:hh:mm:ss 27 | d, h, m, s = run_time.split(":") 28 | return str(day2sec(int(d)) + hours2sec(int(h)) + minutes2sec(int(m)) + int(s)) 29 | elif colons == 2: 30 | # hh:mm:ss 31 | h, m, s = run_time.split(":") 32 | return str(hours2sec(int(h)) + minutes2sec(int(m)) + int(s)) 33 | elif colons == 1: 34 | # mm:ss 35 | m, s = run_time.split(":") 36 | return str(minutes2sec(int(m)) + int(s)) 37 | else: 38 | return run_time 39 | 40 | 41 | def day2sec(days): 42 | return days * 24 * 60 * 60 43 | 44 | 45 | def hours2sec(hours): 46 | return hours * 60 * 60 47 | 48 | 49 | def minutes2sec(minutes): 50 | return minutes * 60 51 | 52 | 53 | def qsub_to_slurm_time(qsub_time): 54 | qsub_time = qsub_time.split(":") 55 | slurm_time = "" 56 | if len(qsub_time) == 4: 57 | slurm_time = "{}-{}:{}:{}".format( 58 | qsub_time[0], qsub_time[1], qsub_time[2], qsub_time[3] 59 | ) 60 | elif len(qsub_time) == 3: 61 | slurm_time = "{}:{}:{}".format(qsub_time[0], qsub_time[1], qsub_time[2]) 62 | return slurm_time 63 | 64 | 65 | # let snakemake read job_properties 66 | from snakemake.utils import read_job_properties 67 | 68 | 69 | jobscript = sys.argv[1] 70 | 71 | job_properties = 
read_job_properties(jobscript) 72 | 73 | 74 | # default parameters defined in cluster_spec (accessed via snakemake read_job_properties) 75 | cluster_param = job_properties["cluster"] 76 | 77 | 78 | if job_properties["type"] == "single": 79 | cluster_param["name"] = "snakejob.{}".format(job_properties["rule"]) 80 | elif job_properties["type"] == "group": 81 | cluster_param["name"] = job_properties["groupid"] 82 | else: 83 | raise NotImplementedError( 84 | f"Don't know what to do with job_properties['type']=={job_properties['type']}" 85 | ) 86 | 87 | 88 | # don't overwrite default parameters if defined in rule (or config file) 89 | if ("threads" in job_properties) and ("threads" not in cluster_param): 90 | cluster_param["threads"] = job_properties["threads"] 91 | for res in ["time", "mem"]: 92 | if (res in job_properties["resources"]) and (res not in cluster_param): 93 | cluster_param[res] = job_properties["resources"][res] 94 | 95 | # check which system you are on and load command command_options 96 | key_mapping_file = os.path.join(os.path.dirname(__file__), "key_mapping.yaml") 97 | command_options = yaml.load(open(key_mapping_file), Loader=yaml.BaseLoader) 98 | system = command_options["system"] 99 | command = command_options[system]["command"] 100 | key_mapping = command_options[system]["key_mapping"] 101 | 102 | 103 | ## TODO: Comment this line || test while using normal time 01:00:00:00 104 | # time in hours 105 | if "time" in cluster_param: 106 | # cluster_param["time"]=int(cluster_param["time"])*60 107 | # cluster_param["time"]=convert_time_to_seconds(cluster_param["time"]) 108 | if system == "pbs": 109 | cluster_param["time"] = cluster_param["time"] 110 | elif system == "slurm": 111 | cluster_param["time"] = qsub_to_slurm_time(cluster_param["time"]) 112 | 113 | 114 | # construct command: 115 | for key in key_mapping: 116 | if key in cluster_param: 117 | command += " " 118 | command += key_mapping[key].format(cluster_param[key]) 119 | 120 | command += " {}".format(jobscript) 121 | 122 | eprint("submit command: " + command) 123 | 124 | # run_cmd(command.split(' ')) 125 | p = Popen(command.split(" "), stdout=PIPE, stderr=PIPE) 126 | output, error = p.communicate() 127 | if p.returncode != 0: 128 | raise Exception( 129 | "Job can't be submitted\n" + output.decode("utf-8") + error.decode("utf-8") 130 | ) 131 | else: 132 | res = output.decode("utf-8") 133 | 134 | if system == "lsf": 135 | import re 136 | 137 | match = re.search(r"Job <(\d+)> is submitted", res) 138 | jobid = match.group(1) 139 | 140 | elif system == "pbs": 141 | jobid = res.strip().split(".")[0] 142 | 143 | else: 144 | jobid = int(res.strip().split()[-1]) 145 | 146 | print(jobid) 147 | -------------------------------------------------------------------------------- /envs/clair3.yaml_back: -------------------------------------------------------------------------------- 1 | name: clair3.0.1.11 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - _tflow_select=2.3.0=mkl 11 | - absl-py=1.0.0=pyhd8ed1ab_0 12 | - astor=0.8.1=pyh9f0ad1d_0 13 | - astunparse=1.6.3=pyhd8ed1ab_0 14 | - biopython=1.79=py36h8f6f2f9_0 15 | - blinker=1.4=py_1 16 | - blosc=1.21.1=hd32f23e_0 17 | - brotlipy=0.7.0=py36h8f6f2f9_1001 18 | - bzip2=1.0.8=h7f98852_4 19 | - c-ares=1.18.1=h7f98852_0 20 | - ca-certificates=2022.5.18.1=ha878542_0 21 | - cachetools=2.1.0=py_0 22 | - certifi=2021.5.30=py36h5fab9bb_0 23 | - cffi=1.14.4=py36h211aa47_0 24 | 
- charset-normalizer=2.0.12=pyhd8ed1ab_0 25 | - clair3=0.1.11=py36hb9dc472_5 26 | - click=8.0.1=py36h5fab9bb_0 27 | - cryptography=35.0.0=py36hb60f036_0 28 | - cycler=0.11.0=pyhd8ed1ab_0 29 | - dataclasses=0.8=pyh787bdff_2 30 | - expat=2.4.8=h27087fc_0 31 | - freetype=2.10.4=h0708190_1 32 | - gast=0.3.3=py_0 33 | - gdbm=1.18=h0a1914f_2 34 | - google-auth=1.2.1=py_0 35 | - google-auth-oauthlib=0.4.1=py_2 36 | - google-pasta=0.2.0=pyh8c360ce_0 37 | - grpcio=1.38.1=py36h8e87921_0 38 | - h5py=2.10.0=nompi_py36h4510012_106 39 | - hdf5=1.10.6=nompi_h6a2412b_1114 40 | - htslib=1.10.2=hd3b49d5_1 41 | - idna=3.3=pyhd8ed1ab_0 42 | - importlib-metadata=4.8.1=py36h5fab9bb_0 43 | - isa-l=2.30.0=ha770c72_4 44 | - jpeg=9e=h166bdaf_1 45 | - keras-preprocessing=1.1.2=pyhd8ed1ab_0 46 | - keyutils=1.6.1=h166bdaf_0 47 | - kiwisolver=1.3.1=py36h605e78d_1 48 | - krb5=1.19.3=h3790be6_0 49 | - lcms2=2.11=hcbb858e_1 50 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 51 | - libblas=3.9.0=14_linux64_openblas 52 | - libcblas=3.9.0=14_linux64_openblas 53 | - libcurl=7.83.1=h7bff187_0 54 | - libdeflate=1.6=h516909a_0 55 | - libedit=3.1.20191231=he28a2e2_2 56 | - libev=4.33=h516909a_1 57 | - libffi=3.2.1=he1b5a44_1007 58 | - libgcc-ng=12.1.0=h8d9b700_16 59 | - libgfortran-ng=12.1.0=h69a702a_16 60 | - libgfortran5=12.1.0=hdcd56e2_16 61 | - libgomp=12.1.0=h8d9b700_16 62 | - liblapack=3.9.0=14_linux64_openblas 63 | - libnghttp2=1.47.0=h727a467_0 64 | - libnsl=2.0.0=h7f98852_0 65 | - libopenblas=0.3.20=pthreads_h78a6416_0 66 | - libpng=1.6.37=h21135ba_2 67 | - libprotobuf=3.18.0=h780b84a_1 68 | - libssh2=1.10.0=ha56f1ee_2 69 | - libstdcxx-ng=12.1.0=ha89aaad_16 70 | - libtiff=4.1.0=hc3755c2_3 71 | - libzlib=1.2.12=h166bdaf_0 72 | - lz4-c=1.9.2=he1b5a44_3 73 | - lzo=2.10=h516909a_1000 74 | - markdown=3.3.7=pyhd8ed1ab_0 75 | - matplotlib-base=3.3.4=py36hd391965_0 76 | - mock=4.0.3=py36h5fab9bb_1 77 | - ncurses=6.2=h58526e2_4 78 | - networkx=2.7.1=pyhd8ed1ab_0 79 | - numexpr=2.7.3=py36h0cdc3f0_0 80 | - numpy=1.19.5=py36hfc0c790_2 81 | - oauthlib=3.2.0=pyhd8ed1ab_0 82 | - olefile=0.46=pyh9f0ad1d_1 83 | - openssl=1.1.1o=h166bdaf_0 84 | - opt_einsum=3.3.0=pyhd8ed1ab_1 85 | - pandas=1.1.5=py36h284efc9_0 86 | - parallel=20191122=0 87 | - pbzip2=1.1.13=0 88 | - perl=5.32.1=2_h7f98852_perl5 89 | - pigz=2.4=h84994c4_0 90 | - pillow=8.1.0=py36h4f9996e_1 91 | - pip=21.3.1=pyhd8ed1ab_0 92 | - protobuf=3.18.0=py36hc4f0c31_0 93 | - pyasn1=0.4.8=py_0 94 | - pyasn1-modules=0.0.5=py36_0 95 | - pycparser=2.21=pyhd8ed1ab_0 96 | - pyfaidx=0.6.4=pyh5e36f6f_0 97 | - pyjwt=2.4.0=pyhd8ed1ab_0 98 | - pyopenssl=22.0.0=pyhd8ed1ab_0 99 | - pyparsing=3.0.9=pyhd8ed1ab_0 100 | - pypy3.6=7.3.2=h45e8706_2 101 | - pysam=0.16.0.1=py36h4c34d4e_1 102 | - pysocks=1.7.1=py36h5fab9bb_3 103 | - pytables=3.6.1=py36hb7ec5aa_3 104 | - python=3.6.10=h8356626_1011_cpython 105 | - python-dateutil=2.8.2=pyhd8ed1ab_0 106 | - python-isal=0.11.1=py36h8f6f2f9_0 107 | - python_abi=3.6=2_cp36m 108 | - pytz=2022.1=pyhd8ed1ab_0 109 | - readline=8.1=h46c0cb4_0 110 | - requests=2.27.1=pyhd8ed1ab_0 111 | - requests-oauthlib=1.3.1=pyhd8ed1ab_0 112 | - rsa=3.1.4=py36_0 113 | - samtools=1.10=h2e538c0_3 114 | - scipy=1.5.3=py36h81d768a_1 115 | - setuptools=58.0.4=py36h5fab9bb_2 116 | - six=1.16.0=pyh6c4a22f_0 117 | - snappy=1.1.9=hbd366e4_1 118 | - sqlite=3.37.0=h9cd32fc_0 119 | - tensorboard=2.3.0=py_0 120 | - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 121 | - tensorflow=2.2.0=mkl_py36h5a57954_0 122 | - tensorflow-base=2.2.0=mkl_py36hd506778_0 123 | - tensorflow-estimator=2.6.0=py36hc4f0c31_0 124 
| - termcolor=1.1.0=py_2 125 | - tk=8.6.12=h27826a3_0 126 | - tornado=6.1=py36h8f6f2f9_1 127 | - typing_extensions=4.1.1=pyha770c72_0 128 | - urllib3=1.26.9=pyhd8ed1ab_0 129 | - werkzeug=2.0.2=pyhd8ed1ab_0 130 | - whatshap=1.0=py36hf1ae8f4_1 131 | - wheel=0.37.1=pyhd8ed1ab_0 132 | - wrapt=1.13.1=py36h8f6f2f9_0 133 | - xopen=1.2.0=py36h5fab9bb_0 134 | - xz=5.2.5=h516909a_1 135 | - zipp=3.6.0=pyhd8ed1ab_0 136 | - zlib=1.2.12=h166bdaf_0 137 | - zstd=1.4.4=h6597ccf_3 138 | -------------------------------------------------------------------------------- /modules/sv.smk: -------------------------------------------------------------------------------- 1 | 2 | ###################### 3 | ###### SV RULES ###### 4 | ##################### 5 | 6 | 7 | #### SNIFFLES #### 8 | ################## 9 | rule sniffles: 10 | """ 11 | Identify structural variants using Sniffles2. 12 | """ 13 | input: 14 | datain=data_dir + "/align/{aligner}/data_hap.bam" if config['phase_sv'] else data_dir + "/align/{aligner}/data.bam", 15 | data_index=data_dir + "/align/{aligner}/data_hap.bam.bai" if config['phase_sv'] else data_dir + "/align/{aligner}/data.bam.bai", 16 | output: 17 | dataout=data_dir + "/sv/{aligner}/sniffles.vcf", 18 | dataout_snf=data_dir + "/sv/{aligner}/sniffles.snf" 19 | message: "Running Sniffles in rule: {rule}\nUsing {input.datain} output:{output.dataout}" 20 | params: 21 | min_sv_len=config['min_sv_len'], 22 | sv_threads=config['sv_threads'], 23 | sample_name = SAMPLE_NAME, 24 | phase = "--phase" if config['phase_sv'] else "", 25 | mosaic = "--non-germline" if config['mosaic_sv'] else "", 26 | conda: SNIFFLES_ENV 27 | priority: 2 28 | log: data_dir + "/sv/{aligner}/sniffles.log" 29 | benchmark: data_dir + "/benchmark/sv/{aligner}/sv.benchmark.txt" 30 | shell:""" 31 | sniffles --minsvlen {params.min_sv_len} --sample-id {params.sample_name} -t {params.sv_threads} --input {input.datain} --vcf {output.dataout} --snf {output.dataout_snf} {params.phase} {params.mosaic} > {log} 2>&1 32 | """ 33 | 34 | #### HAPLOTYPE SVs #### 35 | ####################### 36 | # TODO: orphan code # # # # # # # # # # # # # # 37 | rule phaseSVs: 38 | """ 39 | This rules takes as input a taged tabed bam file 40 | from whatshap and vcf file contains SVs and update 41 | the SVs to add haplotype HP and phase blocks PS. 42 | """ 43 | input: 44 | bam = data_dir + "/align/{aligner}/data_hap.tab", 45 | sv = data_dir + "/sv/{aligner}/sniffles.vcf", 46 | output:data_dir + "/sv/{aligner}/sniffles_hp_updated.vcf" 47 | message: "Updating SVs using align/{aligner}/data_hap.tab" 48 | params: 49 | update_script = updat_sv, 50 | shell:""" 51 | python {params.update_script} {input.sv} {input.bam} {output} -c {params.min_conflict} 52 | """ 53 | 54 | #### SORTING SVs #### 55 | ##################### 56 | 57 | rule vcfSort: 58 | """ 59 | To concat the haplotype SVs with SNVs, SVs needs to be sorted first. 
60 | """ 61 | input: 62 | vcffile = data_dir + "/{sample}.vcf.gz", 63 | ref = REFERENCES, 64 | output:data_dir + "/{sample}.sorted.vcf.gz" 65 | conda: PRINCESS_ENV 66 | shell:""" 67 | zcat {input.vcffile} | awk 'BEGIN{{OFS="\t";}} /^#/{{print $0}} !/^#/{{ if ($2==0){{$2=1;print}} else {{print $0}} }}' | bedtools sort -header -faidx {input.ref}.fai -i - | bgzip > {output} 68 | """ 69 | 70 | #### BGZIP SVs #### 71 | ################### 72 | 73 | rule bgzipFile: 74 | """ 75 | General rule to bgzip files 76 | """ 77 | input:data_dir + "/{name}.vcf" 78 | output:data_dir + "/{name}.vcf.gz" 79 | threads: config['bgzip_threads'] 80 | conda: VARIANT_ENV 81 | shell:""" 82 | bgzip -c -@ {threads} {input} > {output} 83 | """ 84 | 85 | #### CHANGE SVs SAMPLE NAME #### 86 | ############################### 87 | 88 | rule changeSampleName: 89 | """ 90 | Sniffles name the sample as the bam, but Clair call it SAMPLE 91 | This rule will change the sample name in the SV file. 92 | """ 93 | input:data_dir + "/{sample}.sorted.vcf.gz" 94 | output:data_dir + "/{sample}.sorted.namechnage.vcf.gz" 95 | conda: PRINCESS_ENV 96 | shell:""" 97 | echo SAMPLE > sample.name && bcftools reheader -s sample.name -o {output} {input} && rm sample.name 98 | """ 99 | 100 | #### CONCAT SVs WITH SNPs #### 101 | ############################## 102 | 103 | rule SVsSNPsCombined: 104 | """ 105 | Concat haplotype SNPs with haplotype and Genotype SVs. 106 | """ 107 | input: 108 | sv = data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz", 109 | sv_index =data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz.tbi", 110 | snp = lambda wildcards: data_dir + "/phased/{aligner}/data_updated.vcf.gz" if config['update_snps'] else data_dir + "/phased/{aligner}/data.vcf.gz", 111 | ref = REFERENCES, 112 | output: data_dir + "/sv/{aligner}/sv_snp.vcf.gz" 113 | params: 114 | extension = "z" # output a compressed file. 
115 | message: "Concat sv with SNPs" 116 | log: data_dir + "/sv/{aligner}/sv_snp.log" 117 | threads: config['samtools_threads'] 118 | conda: PRINCESS_ENV 119 | shell:""" 120 | vcfcat {input.sv} {input.snp}| bedtools sort -header -faidx {input.ref}.fai -i - | bgzip > {output} 2> {log} 121 | """ 122 | # shell:""" 123 | # bcftools concat -a -O {params.extension} -o {output} --threads {threads} {input.sv} {input.snp} > {log} 2>&1 124 | # """ 125 | -------------------------------------------------------------------------------- /cluster/cluster_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | long: &long_queue ill-prod #scavenger 4 | short: &short_queue ill-prod #medium 5 | 6 | __default__: 7 | "nCPUs" : "16" 8 | "mem" : 20000 9 | "queue" : *long_queue #"analysis" 10 | "name" : "JOBNAME.{rule}.{wildcards}" 11 | "time" : "01:00:00:00" 12 | # "resources" : "\"select[mem>20000] rusage[mem=20000] span[hosts=1]\"" 13 | #"output" : "log/{rule}.PBS_JOBID.out" 14 | #"error" : "log/{rule}.PBS_JOBID.err" 15 | 16 | 17 | 18 | ########################## 19 | ###### ALIGN RULES ###### 20 | ######################### 21 | 22 | minimap2: 23 | queue: *long_queue 24 | time: "72:00:00" 25 | nCPUs: "12" 26 | mem: 20G 27 | 28 | indexBam: 29 | queue: *short_queue 30 | time: "10:00:00" 31 | nCPUs: "4" 32 | mem: 10G 33 | 34 | mergeAlign: 35 | queue: *long_queue 36 | time: "01:00:00:00" 37 | nCPUs: "8" 38 | mem: 10G 39 | 40 | sam2bam: 41 | queue: *long_queue 42 | time: "01:00:00:00" 43 | nCPUs: "2" 44 | mem: 10G 45 | 46 | addRG: 47 | queue: *long_queue 48 | time: "01:00:00:00" 49 | nCPUs: "2" 50 | mem: 10G 51 | 52 | bam2tab: 53 | queue: *short_queue 54 | time: "01:00:00:00" 55 | nCPUs: "2" 56 | mem: 10G 57 | 58 | mvAlign: 59 | queue: *short_queue 60 | time: "01:00:00" 61 | nCPUs: "2" 62 | mem: 1G 63 | 64 | #<*><*><*><*><*><*><*><*><*> 65 | 66 | 67 | ###################### 68 | ###### SV RULES ###### 69 | ##################### 70 | 71 | sniffles: 72 | time: "10:00:00" 73 | nCPUs: "16" 74 | mem: 30G 75 | 76 | phaseSVs: 77 | queue: *long_queue 78 | time: "24:00:00" 79 | nCPUs: "8" 80 | mem: 20G 81 | 82 | vcfSort: 83 | queue: *short_queue 84 | time: "10:00:00" 85 | nCPUs: "8" 86 | mem: 10G 87 | 88 | bgzipFile: 89 | queue: *long_queue 90 | time: "01:00:00:00" 91 | nCPUs: "2" 92 | mem: 10G 93 | 94 | changeSampleName: 95 | queue: *long_queue 96 | time: "01:00:00:00" 97 | nCPUs: "2" 98 | mem: 10G 99 | 100 | SVsSNPsCombined: 101 | queue: *long_queue 102 | time: "01:00:00:00" 103 | nCPUs: "8" 104 | mem: 10G 105 | 106 | #<*><*><*><*><*><*><*><*><*> 107 | 108 | 109 | 110 | ######################### 111 | ###### SNPs RULES ###### 112 | ######################### 113 | 114 | concatChromosome: 115 | queue: *long_queue 116 | time: "04:00:00:00" 117 | nCPUs: "2" 118 | mem: 10G 119 | 120 | concactSNPs: 121 | queue: *long_queue 122 | time: "01:00:00:00" 123 | nCPUs: "2" 124 | mem: 10G 125 | 126 | callSNVsChunk: 127 | queue: *long_queue 128 | time: "07:00:00:00" 129 | nCPUs: "5" 130 | mem: 30G 131 | host: "host=c86q-23+1:host=c86q-22+1:host=c86q-21+1:host=c86q-20+1:host=c86q-19+1:host=c86q-18+1:host=c86q-17+1:host=c86q-16+1:host=c86q-15+1:host=c86q-14+1:host=c86q-13+1:host=c86q-12+1:host=c86q-11" 132 | 133 | updateHeader: 134 | queue: *long_queue 135 | time: "01:00:00:00" 136 | nCPUs: "2" 137 | mem: 4G 138 | 139 | vcfIndex: 140 | queue: *long_queue 141 | time: "01:00:00:00" 142 | nCPUs: "2" 143 | mem: 4G 144 | 145 | mergeParentalSNPs: 146 | queue: *long_queue 147 | time: 
"01:00:00:00" 148 | nCPUs: "8" 149 | mem: 16G 150 | 151 | updateSNPs: 152 | queue: *long_queue 153 | time: "01:00:00:00" 154 | nCPUs: "4" 155 | mem: 20G 156 | 157 | #<*><*><*><*><*><*><*><*><*> 158 | 159 | 160 | 161 | ################################# 162 | ###### METHYLATION RULES ####### 163 | ################################# 164 | 165 | nanoIndex: 166 | queue: *long_queue 167 | nCPUs: "5" 168 | mem: 50G 169 | 170 | callMeth: 171 | queue: *long_queue 172 | time: "07:00:00:00" 173 | nCPUs: "8" 174 | mem: 50G 175 | 176 | allMethylation: 177 | queue: *long_queue 178 | nCPUs: "1" 179 | mem: 2G 180 | time: "00:40:00" 181 | 182 | #<*><*><*><*><*><*><*><*><*> 183 | 184 | 185 | 186 | ############################ 187 | ###### PHASING RULES ###### 188 | ########################### 189 | 190 | gt: 191 | queue: *long_queue 192 | nCPUs: "1" 193 | mem: 50G 194 | time: "72:00:00" 195 | 196 | phasing: 197 | queue: *long_queue 198 | nCPUs: "4" 199 | mem: 50G 200 | time: "05:00:00:00" 201 | 202 | allPhased: 203 | queue: *long_queue 204 | time: "01:00:00:00" 205 | nCPUs: "2" 206 | mem: 10G 207 | 208 | partionBam: 209 | queue: *long_queue 210 | nCPUs: "4" 211 | mem: 50G 212 | time: "05:00:00:00" 213 | 214 | #<*><*><*><*><*><*><*><*><*> 215 | 216 | 217 | 218 | ############################### 219 | ###### STATISTICS RULES ###### 220 | ############################### 221 | 222 | readsStat: 223 | queue: *long_queue 224 | nCPUs: "8" 225 | mem: 20G 226 | time: "05:00:00:00" 227 | 228 | bamStatistics: 229 | queue: *long_queue 230 | time: "01:00:00:00" 231 | nCPUs: "4" 232 | mem: 10G 233 | 234 | svStat: 235 | queue: *long_queue 236 | time: "01:00:00:00" 237 | nCPUs: "4" 238 | mem: 10G 239 | 240 | snpStat: 241 | queue: *long_queue 242 | time: "01:00:00:00" 243 | nCPUs: "4" 244 | mem: 10G 245 | 246 | stat: 247 | queue: *short_queue 248 | time: "00:00:10:00" 249 | nCPUs: "1" 250 | mem: 1G 251 | 252 | 253 | #<*><*><*><*><*><*><*><*><*> 254 | 255 | ... 256 | -------------------------------------------------------------------------------- /scripts/rawcoverage.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import concurrent.futures as cf 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | # import multiprocessing as mp 7 | import pandas as pd 8 | import seaborn as sns 9 | 10 | plt.switch_backend('agg') 11 | from functools import partial 12 | from os import path as opath 13 | import ntpath 14 | import sys 15 | from Bio import SeqIO 16 | 17 | 18 | def main(): 19 | args = get_args() 20 | files = flat_list(args.input) 21 | nfiles = len(files) 22 | nworkers = min(nfiles, args.threads) 23 | # cpus = mp.cpu_count() 24 | with cf.ProcessPoolExecutor(max_workers=nworkers) as executor, open(args.output, 'w') as data_out: 25 | # return pd.concat([i for i in executor.map(process_reads, files)], ignore_index=True) 26 | df = pd.concat([i for i in executor.map(process_reads, files)], ignore_index=True) 27 | data_out.write("Reads: {length}\n" 28 | "Bases: {nbases}\n" 29 | "Mean read length: {rmean}\n" 30 | "Median: {rmdeian}\n" 31 | "Max: {rmax}\n" 32 | "Min: {rmin}\n" 33 | "N50: {n50}". 
\ 34 | format(length=len(df), nbases=np.sum(df["lengths"]), rmean=np.mean(df["lengths"]), 35 | rmdeian=np.median(df["lengths"]), 36 | rmax=np.max(df["lengths"]), 37 | rmin=np.min(df["lengths"]), 38 | n50=get_N50(np.sort(df['lengths'])) 39 | )) 40 | 41 | plot_output = opath.join(opath.dirname(args.output), ntpath.basename(args.output).rsplit(".", 1)[0] + ".png") 42 | sns.set() 43 | myplot = sns.distplot(np.log(df['lengths'])) 44 | myplot.set(xlabel='Log Read Length') 45 | myplot.get_figure().savefig(plot_output) 46 | 47 | # if i want to count reads c = s.groupby(['length']).size().reset_index(name='count') 48 | 49 | 50 | def get_args(): 51 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. use command -h for more info.", 52 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 53 | description='Calulate statistics form fasta, fastq, fasta.gz and fastq.gz files ', 54 | add_help=True, ) 55 | 56 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 57 | 58 | parser.add_argument("-i", "--input", nargs="+", 59 | help=" one or more reads file ex: -i 1.fasta -i 2.fasta .... or -i 1.fasta 2.fasta", 60 | action="append", required=True, metavar="FOO.fasta/q/gz") 61 | parser.add_argument("-o", "--output", help=" output statistics file", metavar="FOO.txt") 62 | 63 | parser.add_argument("-t", "--threads", type=int, metavar='N', default=1, 64 | help=" Number of threads default %(default)d") 65 | 66 | args = parser.parse_args() 67 | 68 | return args 69 | 70 | 71 | def flat_list(my_list): 72 | """ 73 | Transform list of lists to flat list 74 | :param my_list: list of lists ex: [[1],[1, 2], [a,v]] 75 | :return: [1, 1, 2, a, v] 76 | """ 77 | return [element for each_list in my_list for element in each_list] 78 | 79 | 80 | def process_reads(read_file): 81 | file_handle, file_type = open_handle(read_file) 82 | return (pd.DataFrame( 83 | data=[len(rec) for rec in SeqIO.parse(file_handle, file_type)], 84 | columns=["lengths"]).dropna()) 85 | 86 | 87 | def open_handle(myfile): 88 | if opath.isfile(myfile): 89 | if myfile.endswith(('fastq.gz', 'fq.gz')): 90 | import gzip 91 | return gzip.open(myfile, 'rt'), "fastq" 92 | elif myfile.endswith('fasta.gz'): 93 | import gzip 94 | return gzip.open(myfile, 'rt'), "fasta" 95 | elif myfile.endswith('.fasta', ): 96 | return open(myfile, 'r'), 'fasta' 97 | elif myfile.endswith('.fastq'): 98 | return open(myfile, 'r'), 'fastq' 99 | # elif myfile.endswith("fastq.tar.gz"): 100 | # import tarfile 101 | # tar = tarfile.open(myfile, 'r:gz')#, 'fasta' 102 | # for member in tar.getmembers(): 103 | # f = tar.extractfile(member) 104 | # if f is not None: 105 | # print(type(f)) 106 | # return open(f, 'r'), 'fastq' 107 | # elif myfile.endswith("fasta.tar.gz"): 108 | # import tarfile 109 | # tar = tarfile.open(myfile, 'r:gz')#, 'fasta' 110 | # for member in tar.getmembers(): 111 | # f = tar.extractfile(member) 112 | # if f is not None: 113 | # return open(f, 'r'), 'fasta' 114 | else: 115 | sys.exit("This file {} is of unknown extension!".format(myfile)) 116 | else: 117 | sys.exit("This file {} does not exist.".format(myfile)) 118 | 119 | 120 | def get_N50(read_lengths): 121 | return read_lengths[np.where(np.cumsum(read_lengths) >= 0.5 * np.sum(read_lengths))[0][0]] 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /modules/align.smk: -------------------------------------------------------------------------------- 1 | ########################## 2 
| ###### ALIGN RULES ###### 3 | ######################### 4 | 5 | 6 | 7 | #### MINIMAP2 #### 8 | ################## 9 | 10 | # Minimap2 Parameters 11 | #==================== 12 | 13 | if config["read_type"].lower() in ["clr", "ccs"]: 14 | minimap2_read_type = "-H" 15 | x_param = "-ax map-pb" 16 | elif config["read_type"].lower() == "ont": 17 | x_param = "-ax map-ont" 18 | minimap2_read_type = "" 19 | else: 20 | minimap2_read_type = "" 21 | x_param = "" 22 | 23 | rule minimap2: 24 | """ 25 | Using Minimap2 to align reads 26 | """ 27 | input: 28 | datain=data_dir + "/{sample}" 29 | output: 30 | dataout=temp(data_dir + "/align/minimap/{sample}.bam") 31 | params: 32 | reference=REFERENCES, 33 | h = minimap2_read_type, 34 | md = "--MD", 35 | x = x_param, 36 | sample_name = SAMPLE_NAME, 37 | # rg = "@RG\\tSM:SAMPLE\\tID:LONG", should be used like -R {params.rg} 38 | minimap_other_tags = config['minimap_other_tags'], 39 | log: 40 | data_dir + "/align/minimap/{sample}.log" 41 | message: 42 | "Running minimap2 , sample is: {wildcards.sample} in rule {rule}" 43 | threads: config['aligner_threads'] 44 | benchmark: data_dir + "/benchmark/align/{sample}.minimap.benchmark.txt" 45 | conda: MINIMAP2_ENV 46 | shell:""" 47 | if [[ ! -z "{params.minimap_other_tags}" ]]; then 48 | minimap2 -Y -R '@RG\\tSM:{params.sample_name}\\tID:{params.sample_name}' {params.x} "{params.reference}" "{input.datain}" {params.h} "{params.md}" -t "{threads}" "{params.minimap_other_tags}" 2>{log} | samtools sort -@ {threads} - > "{output.dataout}" 2>>{log} 49 | else 50 | minimap2 -Y -R '@RG\\tSM:{params.sample_name}\\tID:{params.sample_name}' {params.x} "{params.reference}" "{input.datain}" {params.h} "{params.md}" -t "{threads}" 2>{log} | samtools sort -@ {threads} - > "{output.dataout}" 2>>{log} 51 | fi 52 | """ 53 | #### NGMLR #### 54 | ############### 55 | 56 | rule ngmlr: 57 | """ 58 | Using ngmlr to align reads 59 | """ 60 | input: 61 | datain=data_dir + "/{sample}" 62 | output: 63 | dataout=temp(data_dir + "/align/ngmlr/{sample}.sam") 64 | params: 65 | reference=REFERENCES, 66 | platform="pacbio" if config["read_type"] in ["clr", "ccs"] else "ont" if config["read_type"] == "ont" else "" 67 | log: 68 | data_dir + "/align/ngmlr/{sample}.log" 69 | message: 70 | "Running ngmlr , sample is: {wildcards.sample}" 71 | threads: config['aligner_threads'] 72 | benchmark: data_dir + "/benchmark/align/{sample}.ngmlr.benchmark.txt" 73 | conda: PRINCESS_ENV 74 | shell:""" 75 | ngmlr -r "{params.reference}" -q "{input.datain}" --rg-sm SAMPLE -o "{output.dataout}" -t "{threads}" -x "{params.platform}" --bam-fix > {log} 2>&1 76 | """ 77 | 78 | #### SAM2BAM #### 79 | ################ 80 | 81 | rule sam2bam: 82 | input: data_dir + "/align/ngmlr/{sample}.sam" 83 | output: temp(data_dir + "/align/ngmlr/{sample}.bam") 84 | message: "Covert SAM to sorted BAM" 85 | threads: config['aligner_threads'] 86 | benchmark: data_dir + "/benchmark/align/{sample}.sam2bam.benchmark.txt" 87 | conda: PRINCESS_ENV 88 | shell:""" 89 | samtools view -bhS {input} | samtools sort -@ {threads} - > {output} 90 | """ 91 | 92 | #### INDEX BAM #### 93 | ################### 94 | 95 | rule indexBam: 96 | """ 97 | Indexing bam file. 
98 | """ 99 | input: 100 | data_dir + "/{sample}.bam" 101 | output: 102 | temp(data_dir + "/{sample}.bam.bai") 103 | benchmark: data_dir + "/benchmark/align/{sample}.index.benchmark.txt" 104 | message: "Indexing {input}" 105 | conda: MINIMAP2_ENV 106 | shell: 107 | "samtools index {input}" 108 | 109 | #### MERGE BAM FILES #### 110 | ######################## 111 | 112 | rule mergeAlign: 113 | input: 114 | bams=lambda wildcards: expand(data_dir + "/align/{aligner}/{sample}.bam", aligner=wildcards.aligner, sample=sample_list), 115 | index_bams=lambda wildcards: expand(data_dir + "/align/{aligner}/{sample}.bam.bai", aligner=wildcards.aligner, sample=sample_list), 116 | output: 117 | file_name=temp(data_dir + "/align/{aligner}/data.bam") 118 | message:"Mergeing data" 119 | threads: config['samtools_threads'] 120 | benchmark: data_dir + "/benchmark/align/{aligner}.merging.benchmark.txt" 121 | log: 122 | data_dir + "/align/{aligner}/merge.log" 123 | conda: MINIMAP2_ENV 124 | threads: config['aligner_threads'] 125 | shell:""" 126 | samtools merge -@ {threads} {output} {input.bams} > {log} 2>&1 127 | """ 128 | 129 | #### ADD RG TO BAM FILE #### 130 | ############################ 131 | 132 | rule addRG: 133 | input:data_dir + "/{sample}.bam" 134 | output:temp(data_dir + "/{sample}_rg.bam") 135 | params: 136 | rg = "@RG\\tSM:SAMPLE\\tID:LONG", 137 | conda: PRINCESS_ENV 138 | shell:""" 139 | samtools addreplacerg -r "{params.rg}" -o {output} {input} 140 | """ 141 | 142 | 143 | 144 | #### CONVERT BAM FILE TO TAB #### 145 | ################################ 146 | 147 | rule bam2tab: 148 | """ 149 | This rules takes bam file and extract to tab delimeted file: reads HP PS. 150 | """ 151 | input: 152 | bam_file = data_dir + "/align/{aligner}/data_hap.bam", 153 | output: data_dir + "/align/{aligner}/data_hap.tab", 154 | message: "Extracting read hp and ps info from tagged bam file." 
155 | conda: PRINCESS_ENV 156 | benchmark: data_dir + "/benchmark/align/{aligner}.bam2tab.benchmark.txt" 157 | shell:""" 158 | samtools index {input} && samtools view {input.bam_file} | grep "PS:i:" | awk 'BEGIN{{OFS="\\t";}}{{print $1,$(NF-2), $(NF)}}' > {output} 159 | """ 160 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | 2 | # import Lib 3 | ############ 4 | import os, glob, ntpath, math, shutil 5 | from snakemake.utils import min_version 6 | 7 | ############################ 8 | 9 | # Snake Version 10 | ############### 11 | min_version("5.7.1") 12 | 13 | 14 | 15 | # Config File 16 | ############# 17 | # # if len(config) == 0: 18 | if os.path.isfile("config.yaml"): 19 | configfile: "config.yaml" 20 | else: 21 | sys.exit("Looks like there is no config.yaml file in " + os.getcwd() + " make sure there is one or at least specify one with the --configfile commandline parameter.") 22 | ############# 23 | 24 | 25 | 26 | # Listing samples 27 | ################# 28 | # GET SAMPLES EXTENSION 29 | sample_extension = config['sample_extension'] if config['sample_extension'] else "gz" 30 | 31 | # GET WORKING DIRECTORY DEFAULT IS CURRENT DIRECTORY 32 | data_dir = config["sample_directory"] if config['sample_directory'] else os.getcwd() 33 | 34 | # GET SAMPLES LIST 35 | sample_list = config['sample_list'] 36 | if not isinstance(sample_list, list): 37 | sample_list = sample_list.split() 38 | ############# 39 | 40 | 41 | 42 | # Output sample name 43 | ################### 44 | SAMPLE_NAME = config['sample_name'] 45 | ############# 46 | 47 | 48 | # Clean after success 49 | #################### 50 | source_dir = config['delete_files'] 51 | samples_names = config['delete_samples'] 52 | def clean(source_dir, data_dir, samples_names): 53 | file_list = os.listdir(source_dir) 54 | if samples_names: 55 | for sample in samples_names: os.remove(os.path.join(data_dir, os.path.basename(sample))) 56 | for sample in file_list: 57 | if os.path.isfile(sample): 58 | os.remove(sample) 59 | else: 60 | shutil.rmtree(sample) 61 | ############# 62 | 63 | 64 | 65 | # Config reference and chromosomes list 66 | ####################################### 67 | REFERENCES = config["reference"] 68 | chr_list = config['chrs'] 69 | 70 | # chromosomes List split to chunks 71 | split_size = config['chr_split'] if config['chr_split'] and (config['chr_split'] >= 1000000) else 1000000 72 | ref_index_file = REFERENCES+".fai" 73 | chr_range = {} 74 | with open(ref_index_file, 'r') as data_in: 75 | for line in data_in: 76 | chr, length = line.split()[0:2] 77 | if chr in chr_list: 78 | # Identify number of splits 79 | chr_split = int(length) // split_size 80 | chr_split = chr_split if chr_split > 1 else 1 81 | # step_value = int(length)//chr_split if chr_split > 0 else int(length) 82 | step_value = int(length)//chr_split 83 | ranges = list(range(0, int(length), step_value)) 84 | if len(ranges) == chr_split + 1: 85 | ranges[-1] = int(length) 86 | else: 87 | ranges.append(int(length)) 88 | ranges[0] = 1 89 | chr_range[chr] = ranges 90 | ############# 91 | 92 | 93 | 94 | # Declare aligner 95 | ################# 96 | aligner = config["aligner"] 97 | ############# 98 | 99 | 100 | 101 | # Methylation variables 102 | ####################### 103 | # ont_sample_dir = config['fast5_dir'] 104 | ############# 105 | 106 | 107 | 108 | # Preparing conda environments. 
109 | ############################### 110 | PRINCESS_ENV=os.getcwd()+"/envs/princess_env.yaml" 111 | SNIFFLES_ENV=os.getcwd()+"/envs/sniffles.yaml" 112 | #CLAIR_ENV=os.getcwd()+"/envs/clair3.yaml" 113 | CLAIR_ENV=os.getcwd()+"/envs/clair3_no_depend.yaml" 114 | MINIMAP2_ENV=os.getcwd()+"/envs/minimap2.yaml" 115 | WHATSHAP_ENV=os.getcwd()+"/envs/whatshap.yaml" 116 | VARIANT_ENV=os.getcwd()+"/envs/variant_tools.yaml" 117 | READ_STAT_ENV=os.getcwd()+"/envs/pythonRun.yaml" 118 | ############# 119 | 120 | 121 | 122 | # Importing scripts 123 | ################### 124 | rawcoverage_script = config['read_raw_coverage'] 125 | updat_sv = config['updat_sv'] 126 | ############# 127 | 128 | 129 | 130 | # Include all snakemake files sub-modules 131 | ######################################## 132 | prefixed = ["./modules/"+filename for filename in os.listdir('./modules') if filename.endswith(".smk")] 133 | for f in prefixed: 134 | include: f 135 | ################################### 136 | 137 | 138 | 139 | # Building output 140 | ################## 141 | final_output = [] 142 | 143 | if config['sample_list']: 144 | if not config['methylation']: 145 | pass 146 | # elif config['methylation'] and all(value for value in ont_sample_dir.values()): 147 | elif config['methylation'] and config['fast5_dir']: 148 | final_output.append(data_dir + "/result" + "/methylation.{}_calls_hap.tsv".format(aligner)) # DONE 149 | else: 150 | sys.exit("Every ONT sample should have corresponding fast5 directory, please correct fast5_dir files in config.yaml or use -md option") 151 | 152 | if config['update_snps'] and config['paternal_snps'] and config['maternal_snps']: 153 | final_output.extend([data_dir + "/result/.allReadsparental.{aligner}.txt".format(aligner=aligner)]) 154 | else: 155 | final_output.extend([data_dir + "/result/.all.Reads.{}.txt".format(aligner)]) 156 | else: 157 | if config['update_snps'] and config['paternal_snps'] and config['maternal_snps']: 158 | final_output.extend([data_dir + "/result/.allReadsparental.{aligner}.txt".format(aligner=aligner)]) 159 | else: 160 | final_output.extend([data_dir + "/result/.all.noReads.{}.txt".format(aligner)]) 161 | 162 | 163 | ############## 164 | 165 | 166 | # RULES 167 | ####### 168 | onstart: 169 | shell("cat pictures/start.txt") 170 | 171 | rule all: 172 | input: final_output 173 | 174 | ## ------------------------------------------------------------------------------------ ## 175 | ## Success and failure messages 176 | ## ------------------------------------------------------------------------------------ ## 177 | onsuccess: 178 | clean(source_dir, data_dir, samples_names) 179 | if os.path.exists(os.path.join(data_dir, ".snakemake")): 180 | import shutil 181 | shutil.rmtree(os.path.join(data_dir, ".snakemake"), ignore_errors=True) 182 | shell("mkdir -p {data_dir}/snake_log &&\ 183 | find . -maxdepth 1 \( -name 'snakejob*' -or -name 'slurm*' \) -type f -exec mv -t {data_dir}/snake_log {{}} \; &&\ 184 | cat {source_dir}/pictures/success.txt") 185 | 186 | 187 | onerror: 188 | shell("mkdir -p {data_dir}/snake_log &&\ 189 | find . 
-maxdepth 1 \( -name 'snakejob*' -or -name 'slurm*' \) -type f -exec mv -t {data_dir}/snake_log {{}} \; &&\ 190 | cat {source_dir}/pictures/fail.txt") 191 | -------------------------------------------------------------------------------- /envs/run_princess_env.yaml: -------------------------------------------------------------------------------- 1 | name: princess_env2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - aioeasywebdav=2.4.0=py37h89c1867_1001 11 | - aiohttp=3.8.1=py37h540881e_1 12 | - aiosignal=1.2.0=pyhd8ed1ab_0 13 | - amply=0.1.5=pyhd8ed1ab_0 14 | - appdirs=1.4.4=pyh9f0ad1d_0 15 | - async-timeout=4.0.2=pyhd8ed1ab_0 16 | - asynctest=0.13.0=py_0 17 | - atk-1.0=2.36.0=h3371d22_4 18 | - attmap=0.13.2=pyhd8ed1ab_0 19 | - attrs=21.4.0=pyhd8ed1ab_0 20 | - backports=1.0=py_2 21 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 22 | - bcrypt=3.2.2=py37h540881e_0 23 | - boto3=1.23.9=pyhd8ed1ab_0 24 | - botocore=1.26.10=pyhd8ed1ab_0 25 | - bottleneck=1.3.4=py37hda87dfa_1 26 | - brotli=1.0.9=h166bdaf_7 27 | - brotli-bin=1.0.9=h166bdaf_7 28 | - brotlipy=0.7.0=py37h540881e_1004 29 | - bzip2=1.0.8=h7f98852_4 30 | - c-ares=1.18.1=h7f98852_0 31 | - ca-certificates=2022.5.18.1=ha878542_0 32 | - cachetools=5.0.0=pyhd8ed1ab_0 33 | - cairo=1.16.0=ha61ee94_1011 34 | - certifi=2022.5.18.1=py37h89c1867_0 35 | - cffi=1.15.0=py37h036bc23_0 36 | - charset-normalizer=2.0.12=pyhd8ed1ab_0 37 | - coincbc=2.10.5=hcee13e7_1 38 | - configargparse=1.5.3=pyhd8ed1ab_0 39 | - connection_pool=0.0.3=pyhd3deb0d_0 40 | - cryptography=37.0.1=py37h9ce1e76_0 41 | - cycler=0.11.0=pyhd8ed1ab_0 42 | - datrie=0.8.2=py37h5e8e339_3 43 | - decorator=5.1.1=pyhd8ed1ab_0 44 | - defusedxml=0.7.1=pyhd8ed1ab_0 45 | - docutils=0.18.1=py37h89c1867_1 46 | - dpath=2.0.6=py37h89c1867_1 47 | - dropbox=11.31.0=pyhd8ed1ab_0 48 | - expat=2.4.8=h27087fc_0 49 | - fftw=3.3.10=nompi_h77c792f_102 50 | - filechunkio=1.8=py_2 51 | - filelock=3.7.0=pyhd8ed1ab_0 52 | - font-ttf-dejavu-sans-mono=2.37=hab24e00_0 53 | - font-ttf-inconsolata=3.000=h77eed37_0 54 | - font-ttf-source-code-pro=2.038=h77eed37_0 55 | - font-ttf-ubuntu=0.83=hab24e00_0 56 | - fontconfig=2.14.0=h8e229c2_0 57 | - fonts-conda-ecosystem=1=0 58 | - fonts-conda-forge=1=0 59 | - fonttools=4.33.3=py37h540881e_0 60 | - freetype=2.10.4=h0708190_1 61 | - fribidi=1.0.10=h36c2ea0_0 62 | - frozenlist=1.3.0=py37h540881e_1 63 | - ftputil=5.0.4=pyhd8ed1ab_0 64 | - gdk-pixbuf=2.42.8=hff1cb4f_0 65 | - gettext=0.19.8.1=h73d1719_1008 66 | - ghostscript=9.54.0=h27087fc_2 67 | - giflib=5.2.1=h36c2ea0_2 68 | - gitdb=4.0.9=pyhd8ed1ab_0 69 | - gitpython=3.1.27=pyhd8ed1ab_0 70 | - google-api-core=2.8.0=pyhd8ed1ab_1 71 | - google-api-python-client=2.49.0=pyhd8ed1ab_0 72 | - google-auth=2.6.6=pyh6c4a22f_0 73 | - google-auth-httplib2=0.1.0=pyhd8ed1ab_0 74 | - google-cloud-core=2.2.2=pyh6c4a22f_0 75 | - google-cloud-storage=2.1.0=pyh6c4a22f_0 76 | - google-crc32c=1.1.2=py37h5d4fa31_3 77 | - google-resumable-media=2.1.0=pyh6c4a22f_0 78 | - googleapis-common-protos=1.56.2=py37h89c1867_0 79 | - graphite2=1.3.13=h58526e2_1001 80 | - graphviz=3.0.0=h5abf519_1 81 | - grpcio=1.46.3=py37h0327239_0 82 | - gtk2=2.24.33=h90689f9_2 83 | - gts=0.7.6=h64030ff_2 84 | - harfbuzz=4.3.0=hf9f4e7c_0 85 | - httplib2=0.20.4=pyhd8ed1ab_0 86 | - icu=70.1=h27087fc_0 87 | - idna=3.3=pyhd8ed1ab_0 88 | - imagemagick=7.1.0_35=pl5321heb7c40d_0 89 | - importlib-metadata=4.11.4=py37h89c1867_0 90 | - 
importlib_metadata=4.11.4=hd8ed1ab_0 91 | - importlib_resources=5.7.1=pyhd8ed1ab_1 92 | - iniconfig=1.1.1=pyh9f0ad1d_0 93 | - jbig=2.1=h7f98852_2003 94 | - jinja2=3.1.2=pyhd8ed1ab_0 95 | - jmespath=1.0.0=pyhd8ed1ab_0 96 | - jpeg=9e=h166bdaf_1 97 | - jsonschema=4.5.1=pyhd8ed1ab_0 98 | - jupyter_core=4.10.0=py37h89c1867_0 99 | - kiwisolver=1.4.2=py37h7cecad7_1 100 | - lcms2=2.12=hddcbb42_0 101 | - ld_impl_linux-64=2.36.1=hea4e1c9_2 102 | - lerc=3.0=h9c3ff4c_0 103 | - libblas=3.9.0=14_linux64_openblas 104 | - libbrotlicommon=1.0.9=h166bdaf_7 105 | - libbrotlidec=1.0.9=h166bdaf_7 106 | - libbrotlienc=1.0.9=h166bdaf_7 107 | - libcblas=3.9.0=14_linux64_openblas 108 | - libcrc32c=1.1.2=h9c3ff4c_0 109 | - libdeflate=1.10=h7f98852_0 110 | - libffi=3.4.2=h7f98852_5 111 | - libgcc-ng=12.1.0=h8d9b700_16 112 | - libgd=2.3.3=h18fbbfe_3 113 | - libgfortran-ng=12.1.0=h69a702a_16 114 | - libgfortran5=12.1.0=hdcd56e2_16 115 | - libglib=2.70.2=h174f98d_4 116 | - libgomp=12.1.0=h8d9b700_16 117 | - libiconv=1.16=h516909a_0 118 | - liblapack=3.9.0=14_linux64_openblas 119 | - libnsl=2.0.0=h7f98852_0 120 | - libopenblas=0.3.20=pthreads_h78a6416_0 121 | - libpng=1.6.37=h21135ba_2 122 | - libprotobuf=3.20.1=h6239696_0 123 | - librsvg=2.54.3=h7abd40a_0 124 | - libsodium=1.0.18=h36c2ea0_1 125 | - libstdcxx-ng=12.1.0=ha89aaad_16 126 | - libtiff=4.3.0=h0fcbabc_4 127 | - libtool=2.4.6=h9c3ff4c_1008 128 | - libuuid=2.32.1=h7f98852_1000 129 | - libwebp=1.2.2=h3452ae3_0 130 | - libwebp-base=1.2.2=h7f98852_1 131 | - libxcb=1.13=h7f98852_1004 132 | - libxml2=2.9.14=h22db469_0 133 | - libzlib=1.2.12=h166bdaf_0 134 | - logmuse=0.2.6=pyh8c360ce_0 135 | - lz4-c=1.9.3=h9c3ff4c_1 136 | - markupsafe=2.1.1=py37h540881e_1 137 | - matplotlib-base=3.5.2=py37hc347a89_0 138 | - multidict=6.0.2=py37h540881e_1 139 | - munkres=1.1.4=pyh9f0ad1d_0 140 | - nbformat=5.4.0=pyhd8ed1ab_0 141 | - ncurses=6.3=h27087fc_1 142 | - networkx=2.7.1=pyhd8ed1ab_0 143 | - nomkl=1.0=h5ca1d4c_0 144 | - numexpr=2.8.0=py37h85a3170_102 145 | - numpy=1.21.6=py37h976b520_0 146 | - oauth2client=4.1.3=py_0 147 | - openjpeg=2.4.0=hb52868f_1 148 | - openssl=3.0.3=h166bdaf_0 149 | - packaging=21.3=pyhd8ed1ab_0 150 | - pandas=1.3.5=py37h8c16a72_0 151 | - pango=1.50.7=hbd2fdc8_0 152 | - paramiko=2.11.0=pyhd8ed1ab_0 153 | - pcre=8.45=h9c3ff4c_0 154 | - peppy=0.31.2=pyhd8ed1ab_2 155 | - perl=5.32.1=2_h7f98852_perl5 156 | - pillow=9.1.1=py37h44f0d7a_0 157 | - pip=22.1.1=pyhd8ed1ab_0 158 | - pixman=0.40.0=h36c2ea0_0 159 | - pkg-config=0.29.2=h36c2ea0_1008 160 | - plac=1.3.5=pyhd8ed1ab_0 161 | - pluggy=1.0.0=py37h89c1867_3 162 | - ply=3.11=py_1 163 | - prettytable=3.3.0=pyhd8ed1ab_0 164 | - protobuf=3.20.1=py37hd23a5d3_0 165 | - psutil=5.9.1=py37h540881e_0 166 | - pthread-stubs=0.4=h36c2ea0_1001 167 | - pulp=2.6.0=py37h89c1867_1 168 | - py=1.11.0=pyh6c4a22f_0 169 | - pyasn1=0.4.8=py_0 170 | - pyasn1-modules=0.2.7=py_0 171 | - pycparser=2.21=pyhd8ed1ab_0 172 | - pygments=2.12.0=pyhd8ed1ab_0 173 | - pygraphviz=1.6=py37h8f50634_0 174 | - pynacl=1.5.0=py37h540881e_1 175 | - pyopenssl=22.0.0=pyhd8ed1ab_0 176 | - pyparsing=3.0.9=pyhd8ed1ab_0 177 | - pyrsistent=0.18.1=py37h540881e_1 178 | - pysftp=0.2.9=py_1 179 | - pysocks=1.7.1=py37h89c1867_5 180 | - pytest=7.1.2=py37h89c1867_0 181 | - python=3.7.12=hf930737_100_cpython 182 | - python-dateutil=2.8.2=pyhd8ed1ab_0 183 | - python-fastjsonschema=2.15.3=pyhd8ed1ab_0 184 | - python-irodsclient=1.1.3=pyhd8ed1ab_0 185 | - python_abi=3.7=2_cp37m 186 | - pytz=2022.1=pyhd8ed1ab_0 187 | - pyu2f=0.1.5=pyhd8ed1ab_0 188 | - 
pyyaml=6.0=py37h540881e_4 189 | - ratelimiter=1.2.0=py_1002 190 | - readline=8.1=h46c0cb4_0 191 | - requests=2.27.1=pyhd8ed1ab_0 192 | - retry=0.9.2=py_0 193 | - rsa=4.8=pyhd8ed1ab_0 194 | - s3transfer=0.5.2=pyhd8ed1ab_0 195 | - scipy=1.7.3=py37hf2a6cf1_0 196 | - setuptools=62.3.2=py37h89c1867_0 197 | - six=1.16.0=pyh6c4a22f_0 198 | - slacker=0.14.0=py_0 199 | - smart_open=6.0.0=pyhd8ed1ab_0 200 | - smmap=3.0.5=pyh44b312d_0 201 | - snakemake=6.15.5=hdfd78af_0 202 | - snakemake-minimal=6.15.5=pyhdfd78af_0 203 | - sqlite=3.38.5=h4ff8645_0 204 | - stone=3.3.1=pyhd8ed1ab_0 205 | - stopit=1.1.2=py_0 206 | - tabulate=0.8.9=pyhd8ed1ab_0 207 | - tk=8.6.12=h27826a3_0 208 | - tomli=2.0.1=pyhd8ed1ab_0 209 | - toposort=1.7=pyhd8ed1ab_0 210 | - traitlets=5.2.1.post0=pyhd8ed1ab_0 211 | - typing-extensions=4.2.0=hd8ed1ab_1 212 | - typing_extensions=4.2.0=pyha770c72_1 213 | - ubiquerg=0.6.1=pyh9f0ad1d_0 214 | - unicodedata2=14.0.0=py37h540881e_1 215 | - uritemplate=4.1.1=pyhd8ed1ab_0 216 | - urllib3=1.26.9=pyhd8ed1ab_0 217 | - veracitools=0.1.3=py_0 218 | - wcwidth=0.2.5=pyh9f0ad1d_2 219 | - wheel=0.37.1=pyhd8ed1ab_0 220 | - wrapt=1.14.1=py37h540881e_0 221 | - xorg-kbproto=1.0.7=h7f98852_1002 222 | - xorg-libice=1.0.10=h7f98852_0 223 | - xorg-libsm=1.2.3=hd9c2040_1000 224 | - xorg-libx11=1.7.2=h7f98852_0 225 | - xorg-libxau=1.0.9=h7f98852_0 226 | - xorg-libxdmcp=1.1.3=h7f98852_0 227 | - xorg-libxext=1.3.4=h7f98852_1 228 | - xorg-libxrender=0.9.10=h7f98852_1003 229 | - xorg-libxt=1.2.1=h7f98852_2 230 | - xorg-renderproto=0.11.1=h7f98852_1002 231 | - xorg-xextproto=7.3.0=h7f98852_1002 232 | - xorg-xproto=7.0.31=h7f98852_1007 233 | - xz=5.2.5=h516909a_1 234 | - yaml=0.2.5=h7f98852_2 235 | - yarl=1.7.2=py37h540881e_2 236 | - yte=1.4.0=py37h89c1867_0 237 | - zipp=3.8.0=pyhd8ed1ab_0 238 | - zlib=1.2.12=h166bdaf_0 239 | - zstd=1.5.2=h8a70e8d_1 240 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Princess 2 | [![GitHub](https://img.shields.io/github/license/MeHelmy/princess)](https://opensource.org/licenses/MIT) ![GitHub last commit](https://img.shields.io/github/last-commit/MeHelmy/princess) 3 | --- 4 | ## What is new? 5 | - Clair3 for calling single nucleotide polymorphisms (SNPs) and insertions/deletions (Indels) 6 | - Ability to use different models than the default one that comes with Clair3, which can be helpful in cases where there is new kit/training dataset or when working with data other than the human genome. 7 | - Sniffles2 for detecting structural variants (SVs) 8 | - Generation of a gVCF file for cohort analysis 9 | - Generation of an SNF file for cohort structural variant analysis 10 | - The pipeline has been fully tested on both PBS and Slurm systems with easy configuration 11 | - The main conda environment has been updated for improved granularity. 12 | --- 13 | 14 | Princess is a fast and scalable framework to detect and report haplotype resolved Single Nucleotide Variants (SNV) and Structural Variations (SVs) at scale. It can leverage your cluster environment to speed up the detection which starts with one or many fasta or fastq files. 
15 | Publication: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02486-w 16 | 17 | 18 | 19 | 20 | ![princess](./pictures/leia.jpg) 21 | 22 | ## Princess 23 | 24 | * __Mapping__: Minimap2 or NGMLR 25 | * __SNVs__: Clair3 26 | * __SVs__: Sniffles2 27 | * __Phasing SNVs__: WhatsHap 28 | * __Phasing SVs__: Sniffles2 29 | * __Extend Phasing__: PRINCESS-subtool 30 | * __Phased Methylation__: Nanopolish + PRINCESS-subtool 31 | * __QC Statistics__ for each step 32 | 33 | --- 34 | 35 | ## Installation 36 | Princess was tested on CentOS release 6.7 with Conda version 4.7.12 installed; 37 | for more information about installing Conda, [press here](https://bioconda.github.io/user/install.html#install-conda "Install Conda"). 38 | To download the same Conda version, [press here](https://repo.continuum.io/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh "Conda 4.7.12"). 39 | 40 | 1. After Conda is installed, install Snakemake and PyYAML: 41 | ~~~ 42 | conda install snakemake=5.7.1 43 | conda install pyyaml 44 | ~~~ 45 | 2. Downloading PRINCESS 46 | ~~~ 47 | git clone https://github.com/MeHelmy/princess.git 48 | ~~~ 49 | 50 | --- 51 | 52 | ## Tutorial 53 | 54 | To get an overview of Princess, run the command `princess -h`. 55 | You will see the following list of sub-commands that can be used in Princess. 56 | 57 | ~~~ 58 | usage: princess [-h] {all,align,sv,snv,variant,phase,overview} ... 59 | 60 | Princess A framework for long-reads analysis. 61 | 62 | optional arguments: 63 | -h, --help show this help message and exit 64 | 65 | Sub-commands: 66 | Valid sub-commands 67 | 68 | {all,align,sv,snv,variant,phase,overview} 69 | all This command will run the following: Align the reads. 70 | Identify SVs Identify SNVs Phase both SNVs and SVs 71 | align This command will use the input sequence files and 72 | align them against the reference using either Minimap2 73 | or NGMLR use -a to choose aligner otherwise Minimap2 74 | will be used by default. 75 | sv This command will use bam file to identify SV using 76 | Sniffles. 77 | snv This command will use bam file to identify SNVs using 78 | Clair3. 79 | variant This command will use bam file to identify SVs and 80 | SNVs. 81 | phase This command will use use reads to identify SNVs by 82 | Clair and Phase them. 83 | overview This command will show what steps will run. 84 | 85 | princess version 0.01. use command -h for info. 86 | ~~~ 87 | 88 | 89 | Assume that we only want to run the `snv` command; to learn more about its options: 90 | 91 | `princess snv -h` 92 | 93 | 94 | ~~~ 95 | usage: princess snv [-h] [-v] -d Working directory -r {ont,clr,ccs} [-l] [-u] 96 | [-e] [-a {minimap,ngmlr}] 97 | [-s sampleFiles [sampleFiles ...]] -f REF [-j JOBS] 98 | [-g LOG_FILE] [-c CHRS [CHRS ...]] [-t] 99 | 100 | optional arguments: 101 | -h, --help show this help message and exit 102 | -v, --version show program's version number and exit 103 | -d Working directory, --directory Working directory 104 | Working directory.
105 | -r {ont,clr,ccs}, --ReadType {ont,clr,ccs} 106 | Read technology (Note: clr is not supported anymore by clair3) 107 | -l, --removeFiles remove princess source script after running default: 108 | False) 109 | -u, --UseConda Use conda for running default: True) 110 | -e, --Cluster Use cluster while running default: True) 111 | -a {minimap,ngmlr}, --Aligner {minimap,ngmlr} 112 | In case if you want to choose specific aligner 113 | otherwise default will be used default: minimap) 114 | -s sampleFiles [sampleFiles ...], --sampleFiles sampleFiles [sampleFiles ...] 115 | list of fatsa, fastq, or gz files. 116 | -f REF, --ref REF The reference file will be used to align reads to. 117 | -j JOBS, --jobs JOBS Number of running jobs default: 200 ) 118 | -g LOG_FILE, --log LOG_FILE 119 | Log file: PrincessLog.txt ) 120 | -c CHRS [CHRS ...], --chr CHRS [CHRS ...] 121 | Chromosomes list, if not specified we will use all 122 | Chromosomes. 123 | -t, --filter Filter identified SNVs using Princess algorithm 124 | default: True) 125 | ~~~ 126 | 127 | 128 | ~~~ 129 | princess all -d ./princess_all -r ont -s reads.split00.fastq.gz reads.split01.fastq.gz -f hs37d5_mainchr.fa 130 | ~~~ 131 | 132 | `-r` defines the read type. 133 | `-s` the samples that we would like to analyze. 134 | `-f` **full path** to the reference. 135 | 136 | *__Note__* 137 | I am assuming that the reference file is indexed; if not, please use the following command: 138 | `samtools faidx hs37d5_mainchr.fa`; as a result you will have `hs37d5_mainchr.fa.fai`. 139 | 140 | Done!! 141 | 142 | ### For methylation calling 143 | Methylation calling is part of the `all` option. 144 | 145 | ``` 146 | optional arguments: 147 | -h, --help show this help message and exit 148 | -v, --version show program's version number and exit 149 | -d Working directory, --directory Working directory 150 | Working directory. 151 | -r {ont,clr,ccs}, --ReadType {ont,clr,ccs} 152 | Read technology 153 | -l, --removeFiles remove princess source script after running default: False) 154 | -u, --UseConda Use conda for running default: True) 155 | -e, --Cluster Use cluster while running default: True) 156 | -a {minimap,ngmlr}, --Aligner {minimap,ngmlr} 157 | In case if you want to choose specific aligner otherwise default will be used default: minimap) 158 | -s sampleFiles [sampleFiles ...], --sampleFiles sampleFiles [sampleFiles ...] 159 | list of fatsa, fastq, or gz files. 160 | -f REF, --ref REF The reference file will be used to align reads to. 161 | -j JOBS, --jobs JOBS Number of running jobs default: 200 ) 162 | -g LOG_FILE, --log LOG_FILE 163 | Log file: PrincessLog.txt ) 164 | -c CHRS [CHRS ...], --chr CHRS [CHRS ...] 165 | Chromosomes list, if not specified we will use all Chromosomes. 166 | -t, --filter Filter identified SNVs using Princess algorithm default: True) 167 | -m, --methylation Identify methylation, mutually inclusive with -md default: False) 168 | -md Fast5 Directory, --methylationDirectory Fast5 Directory 169 | Fast5 directory will be used to identify methylation mutually inclusive with option -m default: False) 170 | ``` 171 | By choosing the flag __`--methylation`__, Princess will call methylation on the input data (ONT data). This option is mutually inclusive with the option __`--methylationDirectory`__, which requires the fast5 directory. 172 | 173 | ## Test case 174 | 175 | We uploaded a HiFi compressed data file from the publicly available HG002 data set.
176 | The complete data set (High-fidelity 15kb long-read dataset of HG002, Ashkenazim Son) is available [Here](https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data/AshkenazimTrio/HG002_NA24385_son/PacBio_CCS_15kb/) 177 | 178 | To download the test data, run the following command: 179 | ``` 180 | wget https://bcm.box.com/shared/static/sdml5d7csxprgu3cl5cve0lgv5jnrrlv --output-document HiFi.fastq.gz 181 | ``` 182 | After the download is finished you will have a HiFi fastq file called `HiFi.fastq.gz`. To run the test analysis, run the following command: 183 | ``` 184 | Full/Path/To/princess all --directory $PWD/analysis --ReadType ccs --ref Path/To/Reference/genome.fa --jobs 7 --sampleFiles $PWD/HiFi.fastq.gz --latency-wait 200 -p 185 | ``` 186 | all: The command to run the full analysis; for other options please run `princess -h`. 187 | --directory: The output directory; it can have any name, but use the full path. In this example the output is under the current directory. 188 | --ReadType: Read type; the supported read types are clr, ccs, and ont. 189 | --ref: Path to the reference; please run samtools faidx on the reference before running Princess. 190 | --jobs: Number of jobs to run on the cluster. 191 | --sampleFiles: The sample fastq file we downloaded; more than one file can be given, compressed or not. 192 | --latency-wait 200 -p: Additional Snakemake options to wait 200 seconds before collecting output. 193 | 194 | 195 | 196 | 197 | 198 | 199 | ## Output 200 | 201 | Princess will create these directories: 202 | - align contains a directory [minimap or ngmlr] based on the aligner that was specified. 203 | - sv contains the structural variant file sv/minimap/sniffles.vcf 204 | - snp contains single nucleotide variant calls per chromosome 205 | - phased contains phased variants 206 | - stat contains statistics 207 | - meth contains methylation info (if the user chooses to run methylation) 208 | 209 | ## Collect benchmark Statistics 210 | ``` 211 | cd benchmark # The benchmark directory contains timings for all the analyses that were done by PRINCESS 212 | find "$PWD" -type f | grep -v "myBenchMark.txt" > myBenchMark.txt 213 | while read -r line; do n=$(echo $line | awk -v FS=/ '{print $(NF-1)"-"$(NF)}'); awk -v f=$line -v o=$n 'NR!=1 {print o"\t"$(NF)}' $line ;done < myBenchMark.txt 214 | ``` 215 | 216 | 217 | --- 218 | 219 | ## Converting from PBS to Slurm 220 | 1- Please ensure that you modify the `cluster/cluster_config.yaml` to specify the appropriate long-running queue. For example, you can set the long queue as follows: 221 | `long: &long_queue long_queue` 222 | where `long_queue` is the name of a queue that can run jobs for a long time. Similarly, you can set the short queue in the following way: 223 | `short: &short_queue short_queue`. Please refer to your cluster system administrator for more details. 224 | 2- Please ensure that you change `cluster/config.yaml` from `cluster-status: "pbs_status.py"` to `cluster-status: "slurm_status.py"`. 225 | 3- In the `cluster/key_mapping.yaml` file, please change `system: "pbs"` to `system: "slurm"`. 226 | 4- Finally, in the `cluster/cluster_config.yaml` file, I set the CPUs and memory for each job to suit my cluster. 227 | E.g. 228 | ``` 229 | minimap2: 230 | queue: *long_queue 231 | time: "72:00:00" 232 | nCPUs: "12" 233 | mem: 20G 234 | ``` 235 | Here, I am using 12 CPUs, 20G memory, and the job running time is "72:00:00" maximum (three days).
You may need to use a different configuration based on the resource availability in your cluster. Please refer to your system administrator for more details. 236 | 237 | -------------------------------------------------------------------------------- /scripts/update_sv_hp_ps.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python3 3 | 4 | """ 5 | This script updates a VCF file to add both the HP haplotag and PS phasing block info fields. It takes as input a VCF file, HP, and PS. 6 | """ 7 | import argparse 8 | import sys, os 9 | from operator import itemgetter 10 | from collections import Counter 11 | 12 | # Python program to print 13 | # green text with red background 14 | # 15 | # from colorama import init 16 | # from termcolor import colored 17 | # 18 | # init() 19 | 20 | 21 | 22 | 23 | def get_args(): 24 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. use command -h for info.", 25 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 26 | description='Phase SVs using haplotyped reads in tab format', 27 | add_help=True, ) 28 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 29 | # parser.add_argument('input', help='Input file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) 30 | # parser.add_argument('output', help='Output file', nargs="?", type=argparse.FileType('w'), default=sys.stdout) 31 | 32 | parser.add_argument('input', nargs='?', help="Structural variant vcf file", 33 | type=argparse.FileType('r'), 34 | default=sys.stdin) 35 | parser.add_argument('hp', nargs='?', help="tab delimited read\thp\tps file", 36 | type=argparse.FileType('r')) 37 | parser.add_argument('output', nargs='?', help="Output file, PS and HP will be added.", 38 | type=argparse.FileType('w+'), 39 | default=sys.stdout) 40 | parser.add_argument('-c', '--conflict', dest='ignore_conflict', metavar='Max Conflict Reads', type=int, help='Minimum number of conflict reads to ignore', default=0) 41 | 42 | parser.set_defaults(func=update_vcf) 43 | 44 | # if no argument is given, print help. 45 | if len(sys.argv) == 1 and sys.stdin.isatty(): # sys.stdin.isatty() returns false if there's something in stdin 46 | parser.print_help(sys.stderr) 47 | sys.exit(1) 48 | 49 | args = parser.parse_args() 50 | 51 | 52 | if 'func' in args: 53 | args.func(args) 54 | else: 55 | parser.print_help() 56 | 57 | def update_vcf(args): 58 | # check if the input comes from stdin 59 | if not sys.stdin.isatty(): # something is being piped through stdin 60 | if args.input.name.endswith("gz"): 61 | import gzip 62 | myfile = gzip.open(args.input.name, 'rt') # 't' (text mode) is optional; it is the default.
63 | else: 64 | myfile = args.input 65 | else: 66 | myfile = args.input 67 | 68 | # read the Haplotyped reads file as dictionary 69 | hp_dic = {} 70 | with args.hp as hp_in: 71 | for line in hp_in: 72 | id, hp, ps = line.split() 73 | hp_dic[id] = [hp.rsplit(":", 1)[-1], ps.rsplit(":", 1)[-1]] # read -> [hp, ps] 74 | 75 | 76 | with myfile as data_in, args.output as data_out: 77 | for line in data_in: 78 | reads = [] 79 | if line.startswith('##'): 80 | data_out.write(line) 81 | elif line.startswith("#"): 82 | # data_out.write("##INFO=\n") 83 | data_out.write("##INFO=\n") 84 | data_out.write("##FORMAT=\n") 85 | data_out.write(line) 86 | else: 87 | line_split = line.split() 88 | if line_split[-1].split(":", 1)[0] == "1/1" or line_split[-1].split(":", 1)[0] == "0/0" or line_split[-1].split(":", 1)[0] == "./.": # no gt to phase 89 | data_out.write("{}\n".format("\t".join(line_split))) 90 | elif line_split[-1].split(":", 1)[0] == "0/1" or line_split[-1].split(":", 1)[0] == "1/0": 91 | reads = [i for i in line_split[7].split(";") if i.startswith("RNAMES")][0].split("=",1)[-1].split(",") 92 | svtype = [i for i in line_split[7].split(";") if i.startswith("SVTYPE")][0].split("=",1)[-1].split(",") 93 | svlen = [i for i in line_split[7].split(";") if i.startswith("SVLEN")][0].split("=",1)[-1].split(",") 94 | #reads = line_split[7].split(";")[10].split(",") #info field -> reads 95 | #reads[0] = reads[0].split("=")[-1] 96 | myvalues = list(map(hp_dic.get, reads)) # list of lists first element id hp second is ps or None on case there are no reads with hp and ps to support this sv 97 | id = line_split[2] 98 | # If any value not None 99 | # print(f'{line_split[1]}\t{id}\t{svtype[0]}\t{svlen[0]}\t{myvalues}') 100 | if any(myvalues): # any value is not none 101 | # print(f'{id}\t{svtype[0]}\t{svlen[0]}\t{myvalues}') 102 | ps_dict = categorize_ps_up(myvalues, args.ignore_conflict) 103 | if 0 in list(ps_dict.values()): # means that the hp is conflicting do not update anything and add flag that is is conflicting. 
104 | line_split[7] = "{info};CONFLICT={conflict}".format(info=line_split[7], conflict=1) 105 | line_split[-2] = "{}:{}".format(line_split[-2], "PS") 106 | line_split[-1] = "{}:{}".format(line_split[-1], ",".join(ps_dict.keys())) 107 | data_out.write("{}\n".format("\t".join(line_split))) 108 | else: # update the gt field and ps to sv 109 | line_split[7] = "{info};CONFLICT={conflict}".format(info=line_split[7], conflict=0) 110 | line_split[-2] = "{}:{}".format(line_split[-2], "PS") 111 | # if values are negative then it is hp=1 1|0 else it is hp2 0|1 112 | # line_split[-1] = line_split[-1].replace("/", "|") 113 | hp_new_value = line_split[-1].split(':') 114 | try: 115 | if list(ps_dict.values())[0] < 1: # haplotype 1 116 | hp_new_value[0] = "1|0" 117 | else: 118 | hp_new_value[0] = "0|1" 119 | except Exception as e: 120 | print(e) 121 | 122 | 123 | hp_new_value = ":".join(hp_new_value) 124 | line_split[-1] = "{}:{}".format(hp_new_value, ",".join(ps_dict.keys())) 125 | data_out.write("{}\n".format("\t".join(line_split))) 126 | else: # all are none 127 | line_split[7] = "{info};CONFLICT=2".format(info=line_split[7]) 128 | line_split[-2] = "{}:{}".format(line_split[-2], "PS") 129 | line_split[-1] = "{}:{}".format(line_split[-1], ".") 130 | data_out.write("{}\n".format("\t".join(line_split))) 131 | 132 | 133 | # Test case [['1', '23200'], ['2', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['1', '23200'], ['2', '23200'], ['1', '23200'], ['1', '23200'], ['2', '23200'], ['2', '23200'], ['2', '23200']] 134 | # 135 | # [['1', '13164067'], ['1', '13164067'], ['1', '13164067'], ['1', '13164067'], ['1', '13164067'], ['2', '12948612'], ['1', '13164067'], ['1', '13164067'], ['2', '12948612'], ['1', '13164067'], ['2', '12948612']] 136 | def categorize_ps(myvalues): 137 | myvalues = [i for i in myvalues if i is not None] # remove None 138 | ps_dict = {} 139 | for i in myvalues: 140 | hp = int(i[0]) 141 | ps = i[1] 142 | if ps in ps_dict: 143 | if hp == 1: 144 | if ps_dict[ps] < 0: 145 | ps_dict[ps] = ps_dict[ps] - 1 146 | else: #conflict 147 | ps_dict[ps] = 0 148 | 149 | else: # means that it is haplotype 2 hp=2 150 | if ps_dict[ps] > 0: 151 | ps_dict[ps] = ps_dict[ps] + 1 152 | else: #conflict 153 | ps_dict[ps] = 0 154 | else: 155 | if hp == 1: 156 | ps_dict[ps] = -1 157 | else: 158 | ps_dict[ps] = 1 159 | return ps_dict 160 | 161 | 162 | def most_frequent(List): 163 | return max(set(List), key = List.count) 164 | 165 | 166 | def categorize_ps_conflict(myvalues, max_conflict): 167 | ps_dict = {} 168 | myvalues = [i for i in myvalues if i is not None] # remove None 169 | hp = [i[0] for i in myvalues] 170 | hp_count = {i: hp.count(i) for i in hp} # i.e {'1': 3, '2': 2} or {'1': 3} 171 | # TODO: chek if we have two hap with differnt phase block they should not be counted as conflict 172 | if len(hp_count) > 1: # they are conflicting 173 | if min(hp_count.values()) <= max_conflict: # we are less than or equal the minium accepted number of conflict reads 174 | # calculate PS and HP 175 | for i in myvalues: 176 | # get the hp based on the hoghest number 177 | hp = max(hp_count, key = hp_count.get) # either 1 or 2 178 | ps = most_frequent([i[1] for i in myvalues if i[0] == hp]) 179 | ps_dict[ps] = int(hp) if int(hp) > 1 else -1 180 | else: # Number of reads conflicting are higher than user suggestion 181 | ps = most_frequent([i[1] for i in myvalues]) 182 | ps_dict[ps] = 0 # ['ps', 0] 183 | else: 184 | # Data are not conflict calculate normally: 185 | 
ps_dict = categorize_ps(myvalues) 186 | return ps_dict 187 | 188 | 189 | def categorize_ps_up(myvalues: 'list', min_conflict: 'int') -> {}: 190 | myvalues = [i for i in myvalues if i is not None] # remove None 191 | ps_dict = {} 192 | if min_conflict > 0: 193 | min_conflict += 1 194 | for i in myvalues: 195 | hp = int(i[0]) 196 | ps = i[1] 197 | if ps in ps_dict: 198 | if hp == 1: 199 | if ps_dict[ps] < 0: 200 | ps_dict[ps] = ps_dict[ps] - 1 201 | else: #conflict 202 | if min_conflict == 0: 203 | ps_dict[ps] = 0 204 | else: 205 | min_conflict -= 1 206 | ps_dict[ps] -= 1 207 | else: # means that it is haplotype 2 hp=2 208 | if ps_dict[ps] > 0: 209 | ps_dict[ps] = ps_dict[ps] + 1 210 | else: #conflict 211 | if min_conflict == 0: 212 | ps_dict[ps] = 0 213 | else: 214 | min_conflict -= 1 215 | ps_dict[ps] += 1 216 | else: 217 | if hp == 1: 218 | ps_dict[ps] = -1 219 | else: 220 | ps_dict[ps] = 1 221 | return ps_dict 222 | 223 | def main(): 224 | args = get_args() 225 | 226 | 227 | 228 | if __name__ == "__main__": 229 | # del_test_840 = [['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175'], ['1', '531175'], ['2', '531175'], ['1', '531175']] 230 | # 231 | # hp = [i[0] for i in del_test_840] 232 | # hp_count = {i: hp.count(i) for i in hp} 233 | # 234 | # print(f'Number of reads supports HP {hp_count}') 235 | # 236 | # print(f'CONF 19 DEL 840 {categorize_ps_up(del_test_840, 19)}') 237 | # exit(1) 238 | main() 239 | -------------------------------------------------------------------------------- /scripts/phasing_report_update_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | bamtools merge should be used before this script. where the vcf file should be merged with both paternal and maternal, respectively. 5 | """ 6 | import argparse 7 | import sys, re 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser(epilog="%(prog)s version 0.01. 
use command -h for info.", 11 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 12 | description='Produce phasing report', 13 | add_help=True, ) 14 | parser.add_argument('-v', '--version', action='version', version='%(prog)s 0.01') 15 | # parser.add_argument('input', help='Input file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) 16 | # parser.add_argument('output', help='Output file', nargs="?", type=argparse.FileType('w'), default=sys.stdout) 17 | 18 | # parser.add_argument('input', nargs='?', help="Phased vcf file", 19 | # type=argparse.FileType('r'), 20 | # default=sys.stdin) 21 | # parser.add_argument('output', nargs='?', help="Output file if no file result will be directed to stander output", 22 | # type=argparse.FileType('w+'), 23 | # default=sys.stdout) 24 | parser.add_argument('-i', '--input', nargs='?', help="Phased vcf file", required=True) 25 | parser.add_argument('-o', '--output', nargs='?', help="Output file for blocks", required=True) 26 | parser.add_argument('-s', '--stat', nargs='?', help="Output statistics file for phased datat", required=True) 27 | parser.add_argument("-u", '--update_snps', help="Output updated snp file", required=True) 28 | parser.add_argument('-t', '--tolerance', help="Percent of tolerance.", type=int, action='store', default=5) 29 | parser.add_argument('-n', '--min_snps', help="Minimum Number of SNPs per block.", type=int, action='store', default=10) 30 | 31 | parser.set_defaults(func=phase_filtering) 32 | args = parser.parse_args() 33 | if 'func' in args: 34 | args.func(args) 35 | else: 36 | parser.print_help() 37 | 38 | 39 | 40 | def phase_filtering(args): 41 | # check if the input from stdin 42 | # if sys.stdin.isatty(): 43 | # if args.input.name.endswith("gz"): 44 | # import gzip 45 | # myfile = gzip.open(args.input.name, 'rt') # t is not a must normally it is default. 
46 | # else: 47 | # myfile = args.input 48 | # else: 49 | myfile = args.input 50 | if myfile.endswith("gz"): 51 | import gzip 52 | myfile_open = gzip.open(myfile, 'rt', encoding='utf-8') 53 | #myfile_open = gzip.open(myfile, 'rt', encoding='utf-16') 54 | else: 55 | myfile_open = open(myfile, 'r') 56 | 57 | phasing_dictionary = {} 58 | 59 | 60 | with myfile_open as data_in, open(args.output, 'w') as data_out: 61 | nonphased_hetero = 0 62 | snp_number = 0 63 | maternal = 0 64 | paternal = 0 65 | homo_number = 0 66 | unknown_phased = 0 67 | non_sample_snp = 0 68 | for line in data_in: 69 | if line.startswith('#'): 70 | pass # chnage to print in output file 71 | else: 72 | snp_number += 1 73 | line_split = line.split() 74 | gt_flag, sample, father, mother = line_split[8:12] 75 | # gt_flag, sample, father, mother = line_split[8:12] 76 | if "1/1" in sample: 77 | homo_number +=1 78 | # 9 is first sample followed by father and mother 79 | # ID f0/1 f1/0 m0/1 m1/0 80 | # paternal 81 | if "|" in sample and bool(re.search(r'\d',father)) and not bool(re.search(r'\d', mother)): 82 | paternal += 1 83 | gt_index = gt_flag.split(":").index("PS") 84 | id = sample.split(":")[gt_index] 85 | 86 | f0_1 = 0 87 | f1_0 = 0 88 | if sample[0] == "0": 89 | f0_1 = 1 90 | else: 91 | f1_0 = 1 92 | 93 | if id not in phasing_dictionary: 94 | phasing_dictionary[id] = [f0_1, f1_0 , 0, 0] 95 | else: 96 | phasing_dictionary[id][0] += f0_1 97 | phasing_dictionary[id][1] += f1_0 98 | # maternal 99 | elif "|" in sample and bool(re.search(r'\d', mother)) and not bool(re.search(r'\d',father)): 100 | maternal += 1 101 | gt_index = gt_flag.split(":").index("PS") 102 | id = sample.split(":")[gt_index] 103 | 104 | m0_1 = 0 105 | m1_0 = 0 106 | if sample[0] == "0": 107 | m0_1 = 1 108 | else: 109 | m1_0 = 1 110 | 111 | if id not in phasing_dictionary: 112 | phasing_dictionary[id] = [0, 0, m0_1, m1_0] 113 | else: 114 | phasing_dictionary[id][2] += m0_1 115 | phasing_dictionary[id][3] += m1_0 116 | # Unknown if it is right or wrong cause no equivliant in mother or father 117 | elif "|" in sample: 118 | unknown_phased += 1 119 | elif "1/0" in sample or "0/1" in sample: 120 | #elif "1/1" not in sample and "." 
not in sample: 121 | nonphased_hetero += 1 # is it hetero or homo zygot 122 | elif sample.startswith("."): 123 | non_sample_snp += 1 124 | 125 | 126 | for k, v in phasing_dictionary.items(): 127 | data_out.write("{}\t{}\n".format(str(k), "\t".join(map(str, v)))) 128 | # print("Number SNPs: {snp}\nUnknown phased case: {unknown}\n \ 129 | # Number of non-phased Hetero: {not_phased_hetero}\n \ 130 | # Maternal phased: {mother}\nPaternal phased: \ 131 | # {father}\nDone".format(unknown=unknown_phased, not_phased_hetero=nonphased_hetero, snp=snp_number, mother=maternal, father=paternal)) 132 | with open(args.stat, 'w') as stat_out: 133 | stat_out.write("\ 134 | Number SNPs: {snp}\n\ 135 | Homozygot number 1/1: {homo}\n\ 136 | Unknown phased case: {unknown_cases}\n\ 137 | Number of non-phased Hetero: {not_phased_hetero}\n\ 138 | Total number of Phased SNPs: {total}\n\ 139 | Maternal phased: {mother}\n\ 140 | Paternal phased:{father}\n\ 141 | SNP only in paternal: {no_snp}".format(unknown_cases=unknown_phased, homo=homo_number, not_phased_hetero=nonphased_hetero, snp=snp_number, mother=maternal, father=paternal, total=str(maternal+paternal), no_snp = non_sample_snp)) 142 | 143 | 144 | # updating vcf phased snps 145 | chr = "" 146 | new_block_value = "" 147 | if args.update_snps: 148 | if myfile.endswith("gz"): 149 | import gzip 150 | myfile_open = gzip.open(myfile, 'rt', encoding='utf-8') 151 | #myfile_open = gzip.open(myfile, 'rt', encoding='utf-16') 152 | else: 153 | myfile_open = open(myfile, 'r') 154 | 155 | with myfile_open as data_in, open(args.update_snps , "w") as output: 156 | for line in data_in: 157 | if line.startswith('##'): 158 | output.write(line) 159 | elif line.startswith("#"): 160 | output.write("##INFO=\n") 161 | output.write("{}\n".format("\t".join(line.split()[:-2]))) 162 | else: 163 | line_split = line.split() 164 | chr_value = line_split[0] 165 | # identify wich chromosome we are using. 166 | if chr_value != chr: 167 | chr = chr_value 168 | first_block = True 169 | snp_format, format_value = line_split[8:10] 170 | format_value_split = format_value.split(':') 171 | if "PS" in snp_format and "|" in format_value.split(":")[0]: # It is phased 172 | # pritn(line) 173 | # gt_value = format_value.split(":")[0] 174 | block_value = format_value.split(":")[snp_format.split(":").index("PS")] 175 | block_not_conflict = False 176 | if block_value in phasing_dictionary: # this snp have a similr one in parents vcf file. 177 | block_not_conflict, gt_value = not_conflecting(phasing_dictionary[block_value], args) 178 | # update the PS value. 
179 | if block_not_conflict: 180 | # add +N of snps supoorting the block to p-snp 181 | # udate the PS tag for each chromsome to be the first value in the first block (ps) 182 | # write the updated line 183 | # Update gt 184 | if first_block: 185 | first_block = False 186 | new_block_value = block_value 187 | 188 | format_value_split[0] = gt_value 189 | # Update PS 190 | format_value_split[snp_format.split(":").index("PS")] = new_block_value 191 | line_split[9] = ":".join(format_value_split) 192 | line_split[7] = line_split[7] + ";parental-snps=" + str( 193 | sum(phasing_dictionary[block_value])) 194 | output.write("{}\n".format("\t".join(line_split[:-2]))) 195 | # print("block --> "+ new_block_value) 196 | else: 197 | # add -1 of snps supoorting the block to p-snp 198 | # udate the PS tag for each chromsome to be the first value in the first block (ps) 199 | # write the updated line 200 | # Update gt 201 | format_value_split[0] = gt_value 202 | # Update PS 203 | # format_value_split[snp_format.split(":").index("PS")] = new_block_value 204 | line_split[9] = ":".join(format_value_split) 205 | line_split[7] = line_split[7] + ";parental-snps=-" + str( 206 | sum(phasing_dictionary[block_value])) # add sum 207 | output.write("{}\n".format("\t".join(line_split[:-2]))) 208 | # print("block --> "+ new_block_value) 209 | else: 210 | # No information form parental about it 211 | # keep it the same add 0 to p-snp flag 212 | line_split[7] = line_split[7] + ";parental-snps=0" 213 | output.write("{}\n".format("\t".join(line_split[:-2]))) 214 | elif bool(re.search(r'\d',format_value)): 215 | # exit(format_value.split(":")) 216 | # elif "/" in format_value.split(":")[0]: 217 | # Write the line without change 218 | line_split[7] = line_split[7] + ";parental-snps=." 219 | output.write("{}\n".format("\t".join(line_split[:-2]))) 220 | # else: 221 | # print(line) 222 | 223 | 224 | 225 | 226 | 227 | def hasNumbers(inputString): 228 | return any(char.isdigit() for char in inputString) 229 | 230 | def is_not_conflict(block_snp_list, args): 231 | all_snps_in_block = sum(block_snp_list) 232 | if all_snps_in_block >= args.min_snps: 233 | # assuming that the list is formed like that F0|1 F1|0 M0|1 M1|0 234 | tolerance_percentage = args.tolerance * all_snps_in_block/100 # tolerance is 5% 235 | index_m, value_m = max(enumerate(block_snp_list[:2]), key=operator.itemgetter(1)) 236 | index_f, value_f = max(enumerate(block_snp_list[2:]), key=operator.itemgetter(1)) 237 | 238 | if block_snp_list.count(0) == 3 or ( ( (index_m == 0 and index_f == 1) or (index_m == 1 and index_f == 0) ) and ( any(i <= 5*sum(block_snp_list[:2])/100 for i in block_snp_list[:2]) and any(i <= 5*sum(block_snp_list[2:])/100 for i in block_snp_list[2:])) ): 239 | return (True, "{}|{}".format(index_m, index_f)) # it means 0|1 or 1|0 240 | 241 | def not_conflecting(block, args): 242 | tolerance = args.tolerance / sum(block) * 100 243 | max_index = block.index(max(block)) # bigest value in snps 244 | # assuming that the list is formed like that F0|1 F1|0 M0|1 M1|0 245 | # if sum(block) >= args.min_snps: 246 | if (max_index == 0 or max_index==3): 247 | non_conflict = block[0] + block[3] 248 | conflict = block[1] + block[2] 249 | gt = "0|1" # parental|maternal 250 | elif (max_index == 1 or max_index ==2): 251 | non_conflict = block[1] + block[2] 252 | conflict = block[0] + block[3] 253 | gt = "1|0" 254 | return (conflict / (conflict + non_conflict) * 100 <= tolerance, gt) 255 | # else: 256 | # return(False, "") 257 | 258 | 259 | 260 | 261 | def main(): 
262 | args = get_args() 263 | 264 | 265 | 266 | if __name__ == "__main__": 267 | main() 268 | -------------------------------------------------------------------------------- /modules/snp.smk: -------------------------------------------------------------------------------- 1 | ######################### 2 | ###### SNPs RULES ###### 3 | ######################### 4 | 5 | 6 | #### CLAIR ####### 7 | ################## 8 | 9 | # CLAIR Parameters 10 | #================= 11 | 12 | # if config["clair_model"]: 13 | # training_data=config["clair_model"] 14 | def platform(wildcards): 15 | if config['read_type'] == "ccs": 16 | return "hifi" 17 | elif config['read_type'] == "ont": 18 | return "ont" 19 | elif config['read_type'] == "clr": 20 | return "hifi" 21 | else: 22 | print("Unknow data type, supported format are: ont, ccs, and clr") 23 | exit(1) 24 | 25 | def get_model(conda_dir): 26 | training_data = "" 27 | if config['read_type'] == "ccs": 28 | training_data=config["clair_model"] if config["clair_model"] else None 29 | elif config['read_type'] == "ont": 30 | training_data=config["clair_model"] if config["clair_model"] else None 31 | elif config['read_type'] == "clr": 32 | training_data=config["clair_model"] if config["clair_model"] else None 33 | else: 34 | print("Unknown data type, supported format are: ont, ccs, and clr") 35 | exit(1) 36 | return training_data 37 | # if config['read_type'] == "ccs": 38 | # # training_data=config["training_data_ccs"] 39 | # platform="hifi" 40 | # training_data=config["clair_model"] if config["clair_model"] else os.path.join(os.environ['CONDA_PREFIX'], "bin/models/hifi") 41 | # elif config['read_type'] == "ont": 42 | # # training_data=config["training_data_ont"] 43 | # platform="ont" 44 | # training_data=config["clair_model"] if config["clair_model"] else os.path.join(os.environ['CONDA_PREFIX'], "bin/models/ont") 45 | # # training_data="/bin/models/ont" 46 | # elif config['read_type'] == "clr": 47 | # platform="hifi" 48 | # # training_data=config["training_data_clr"] 49 | # training_data=config["clair_model"] if config["clair_model"] else os.path.join(os.environ['CONDA_PREFIX'], "bin/models/hifi") 50 | # # training_data="/bin/models/hifi" 51 | # else: 52 | # print("Unknow data type, supported format are: ont, ccs, and clr") 53 | # exit(1) 54 | 55 | 56 | # CLAIR RULE 57 | #=========== 58 | 59 | 60 | # CLAIR CHUNK RULE 61 | #================= 62 | 63 | # [ ! -f {output.gvcf} ] && cp {output.vcf} {output.gvcf} && cp {output.vcf}.tbi {output.gvcf}.tbi &&\ 64 | if config['gvcf_snv']: 65 | rule callSNVsChunk: 66 | """ 67 | Calling SNPs using clair in case gVCF file is required. 
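    If no model is supplied through the clair_model config value, the model bundled with the
    Clair3 conda environment ($CONDA_PREFIX/bin/models/<platform>) is used instead (see the shell block below).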
68 | """ 69 | input: 70 | bam=data_dir + "/align/{aligner}/data.bam", 71 | data_index=data_dir + "/align/{aligner}/data.bam.bai", 72 | reference=REFERENCES, 73 | output: 74 | vcf = temp(data_dir + "/snp/{aligner}/chr.split.{chr}_{region,\d+}/merge_output.vcf.gz"), 75 | gvcf = temp(data_dir + "/snp/{aligner}/chr.split.{chr}_{region,\d+}/merge_output.gvcf.gz") 76 | params: 77 | train_data = lambda wildcards: get_model(os.environ['CONDA_PREFIX']), 78 | platform = platform, 79 | start = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region)], 80 | end = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region) + 1], 81 | gvcf = "--gvcf", 82 | benchmark: data_dir + "/benchmark/snp/{aligner}/chr.split.{chr}_{region}/{chr}_{region}.benchmark.txt" 83 | conda: CLAIR_ENV 84 | log: data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/data.split.{chr}_{region}.log" 85 | threads: config['clair_threads'] 86 | shell: 87 | """ 88 | if [ {params.train_data} == "None" ] 89 | then 90 | model="$CONDA_PREFIX/bin/models/{params.platform}" 91 | else 92 | model="{params.train_data}" 93 | fi 94 | 95 | echo $'{wildcards.chr}\t{params.start}\t{params.end}' > {wildcards.chr}.{params.start}.{params.end}.bed &&\ 96 | run_clair3.sh \ 97 | --bam_fn {input.bam} \ 98 | --ref_fn {input.reference} \ 99 | --threads {threads} \ 100 | --platform {params.platform} \ 101 | --model_path $model \ 102 | --output $PWD/snp/{wildcards.aligner}/chr.split.{wildcards.chr}_{wildcards.region} \ 103 | --bed_fn={wildcards.chr}.{params.start}.{params.end}.bed \ 104 | {params.gvcf} > {log} 2>&1 \ 105 | &&\ 106 | if [ ! -f {output.gvcf} ]; then 107 | cp {output.vcf} {output.gvcf} 108 | fi &&\ 109 | rm {wildcards.chr}.{params.start}.{params.end}.bed 110 | """ 111 | else: 112 | rule callSNVsChunk: 113 | """ 114 | Calling SNPs using clair 115 | """ 116 | input: 117 | bam=data_dir + "/align/{aligner}/data.bam", 118 | data_index=data_dir + "/align/{aligner}/data.bam.bai", 119 | reference=REFERENCES, 120 | output: 121 | vcf = temp(data_dir + "/snp/{aligner}/chr.split.{chr}_{region,\d+}/merge_output.vcf.gz") 122 | params: 123 | train_data = lambda wildcards: get_model(os.environ['CONDA_PREFIX']), 124 | platform = platform, 125 | start = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region)], 126 | end = lambda wildcards: chr_range[wildcards.chr][int(wildcards.region) + 1], 127 | benchmark: data_dir + "/benchmark/snp/{aligner}/chr.split.{chr}_{region}/{chr}_{region}.benchmark.txt" 128 | conda: CLAIR_ENV 129 | log: data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/data.split.{chr}_{region}.log" 130 | threads: config['clair_threads'] 131 | shell: 132 | """ 133 | if [ {params.train_data} == "None" ] 134 | then 135 | model="$CONDA_PREFIX/bin/models/{params.platform}" 136 | else 137 | model="{params.train_data}" 138 | fi 139 | 140 | echo $'{wildcards.chr}\t{params.start}\t{params.end}' > {wildcards.chr}.{params.start}.{params.end}.bed &&\ 141 | run_clair3.sh \ 142 | --bam_fn {input.bam} \ 143 | --ref_fn {input.reference} \ 144 | --threads {threads} \ 145 | --platform {params.platform} \ 146 | --model_path $model \ 147 | --output $PWD/snp/{wildcards.aligner}/chr.split.{wildcards.chr}_{wildcards.region} \ 148 | --bed_fn={wildcards.chr}.{params.start}.{params.end}.bed > {log} 2>&1 \ 149 | && rm {wildcards.chr}.{params.start}.{params.end}.bed 150 | """ 151 | # resources: 152 | # mem_mb=lambda wildcards, attempt: 1024 * (attempt + 1) if attempt < 3 153 | # --model_path $CONDA_PREFIX{params.train_data} \ 154 | # --model_path 
{params.train_data} \ 155 | 156 | #### CALL VARIANT BY CHUNKS ####### 157 | ################################### 158 | 159 | if config['gvcf_snv']: 160 | ## TODO: This function will raise an error if there is missing gvcf, missing gvcf results from calling varaint in alt contigs where there are no variants this no gVCF, solution is to check if the vcf file is empty then just copy it with gVCF name. 161 | # if $(head -n 1000 {input} | grep -q -v "#") ; then 162 | # vcfcat {input} | vcfstreamsort > {params.temp_chr}\ 163 | # && first_max=$(find_max {params.temp_chr} {params.read_type})\ 164 | # && threshold=$(filsn {params.temp_chr} $first_max)\ 165 | # && awk -v threshold=$threshold '/^#/{{print}} !/^#/{{if ( $6 >= threshold ) {{print $0}}}}' {params.temp_chr} | awk '/^#/ {{ print }} !/^#/ {{ if ($4 != $5 ) {{ print }} }}' > {output} 166 | # else 167 | # cp {input} {output} 168 | # fi 169 | rule concatChromosome: 170 | """ 171 | Concat split chromosomes regions in case of gVCF file is required. 172 | """ 173 | input: 174 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/merge_output.vcf.gz", aligner=wildcards.aligner, chr=wildcards.chr, region=list(range(0,len(chr_range[wildcards.chr]) - 1))), 175 | gvcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/merge_output.gvcf.gz", aligner=wildcards.aligner, chr=wildcards.chr, region=list(range(0,len(chr_range[wildcards.chr]) - 1))), 176 | output: 177 | vcf = temp(data_dir + "/snp/{aligner}/data.{chr}.vcf"), 178 | gvcf = temp(data_dir + "/snp/{aligner}/data.{chr}.gvcf") 179 | message: "Concat variant split per Chromosome" 180 | params: 181 | tmp_dir=config["tmp_directory"], 182 | conda: VARIANT_ENV 183 | benchmark: data_dir + "/benchmark/snp/{aligner}/{chr}.benchmark.txt" 184 | shell:""" 185 | if [[ ! -z "{params.tmp_dir}" ]]; then 186 | bcftools concat {input.vcf} | bcftools sort -T {params.tmp_dir} > {output.vcf} &&\ 187 | bcftools concat {input.gvcf} | bcftools sort -T {params.tmp_dir} > {output.gvcf} 188 | else 189 | bcftools concat {input.vcf} | bcftools sort > {output.vcf} &&\ 190 | bcftools concat {input.gvcf} | bcftools sort > {output.gvcf} 191 | fi 192 | """ 193 | else: 194 | rule concatChromosome: 195 | """ 196 | Concat split chromosomes regions 197 | """ 198 | input: 199 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/chr.split.{chr}_{region}/merge_output.vcf.gz", aligner=wildcards.aligner, chr=wildcards.chr, region=list(range(0,len(chr_range[wildcards.chr]) - 1))), 200 | output: 201 | vcf = temp(data_dir + "/snp/{aligner}/data.{chr}.vcf") 202 | message: "Concat variant split per Chromosome" 203 | params: 204 | tmp_dir=config["tmp_directory"], 205 | conda: VARIANT_ENV 206 | benchmark: data_dir + "/benchmark/snp/{aligner}/{chr}.benchmark.txt" 207 | shell:""" 208 | if [[ ! -z "{params.tmp_dir}" ]]; then 209 | bcftools concat {input.vcf} | bcftools sort -T {params.tmp_dir} > {output.vcf} 210 | else 211 | bcftools concat {input.vcf} | bcftools sort > {output.vcf} 212 | fi 213 | """ 214 | 215 | 216 | #### UPDATE HEADER ####### 217 | ########################## 218 | 219 | rule updateHeader: 220 | """ 221 | Update the phased SNPs in phased/aligner/data.vcf 222 | Where the PS in header defined as Integer where it should be String. 
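    For illustration (the Description wording here is hypothetical), a header line such as
    ##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set"> becomes
    ##FORMAT=<ID=PS,Number=1,Type=String,Description="Phase set"> after the sed command below.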
223 | Result: phased/aligner/data_update_header.vcf 224 | Will be used later in the mergeParentalSNPs rule 225 | """ 226 | input:data_dir + "/{sample}.vcf" 227 | output:data_dir + "/{sample}_update_header.vcf" 228 | message:"Update header: change the PS FORMAT type from Integer to String" 229 | shell:""" 230 | sed 's/ID=PS,Number=1,Type=Integer,Descri/ID=PS,Number=1,Type=String,Descri/' {input} > {output} 231 | """ 232 | 233 | 234 | #### INDEXING VCF FILE ######## 235 | ############################### 236 | 237 | rule vcfIndex: 238 | """ 239 | Index VCF file. 240 | """ 241 | input: data_dir + "/{sample}.vcf.gz" 242 | output: data_dir + "/{sample}.vcf.gz.tbi" 243 | message: "Indexing vcf file {input}" 244 | conda: VARIANT_ENV 245 | shell:""" 246 | tabix -p vcf {input} 247 | """ 248 | 249 | rule gvcfIndex: 250 | """ 251 | Index gVCF file. 252 | """ 253 | input: data_dir + "/{sample}.gvcf.gz" 254 | output: data_dir + "/{sample}.gvcf.gz.tbi" 255 | message: "Indexing gvcf file {input}" 256 | conda: VARIANT_ENV 257 | shell:""" 258 | tabix -p vcf {input} 259 | """ 260 | 261 | 262 | #### MERGING PHASED VCF FILE WITH PARENTAL SNPs ######## 263 | ######################################################## 264 | 265 | rule mergeParentalSNPs: 266 | """ 267 | If the user wants to update the identified SNVs, this will be the first rule in the sequence. 268 | Input: phased SNVs after updating the header with update_header, then bgzipped and indexed with 269 | bgzip_vcf and vcf_index respectively. 270 | """ 271 | input: 272 | sample_snps = data_dir + "/phased/{aligner}/data_update_header.vcf.gz", 273 | sample_snps_index = data_dir + "/phased/{aligner}/data_update_header.vcf.gz.tbi", 274 | maternal_snps = config['maternal_snps'], 275 | paternal_snps = config['paternal_snps'], 276 | output: data_dir + "/phased/{aligner}/data_paternal_maternal.vcf.gz" 277 | message: "Merging VCFs from sample, paternal, and maternal respectively" 278 | benchmark: data_dir + "/benchmark/snp/{aligner}/merge_parental.benchmark.txt" 279 | conda: VARIANT_ENV 280 | shell:""" 281 | bcftools merge {input.sample_snps} {input.paternal_snps} {input.maternal_snps} | bgzip > {output} 282 | """ 283 | 284 | #### UPDATING PHASED SNPs ######## 285 | ################################## 286 | 287 | rule updateSNPs: 288 | """ 289 | Here we take the input from mergeParentalSNPs; it needs to be unzipped first.
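    The update script (presumably scripts/phasing_report_update_vcf.py, shown earlier) rewrites GT and PS
    per phase block according to the parental genotype counts, appends a parental-snps INFO tag to each
    record, and writes per-block counts and a phasing summary to the paths passed via -o and -s.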
290 | """ 291 | input: data_dir + "/phased/{aligner}/data_paternal_maternal.vcf.gz" 292 | output: 293 | updated_vcf = data_dir + "/phased/{aligner}/data_updated.vcf", 294 | message: "Running update SNPs" 295 | params: 296 | update_script = config['updat_snps_script'], 297 | phased_stat = data_dir + "/statistics/phased/phasing_stat.txt", 298 | block_tsv = data_dir + "/statistics/phased/blocks.tsv", 299 | benchmark: data_dir + "/benchmark/snp/{aligner}/update_snps.benchmark.txt" 300 | conda: READ_STAT_ENV 301 | shell:""" 302 | mkdir -p statistics/phased && 303 | python {params.update_script} -i {input} -u {output.updated_vcf} -o {params.block_tsv} -s {params.phased_stat} 304 | """ 305 | 306 | 307 | #### CONCAT SNPs ######## 308 | ######################### 309 | 310 | if config['gvcf_snv']: 311 | rule concatSNPs: 312 | """ 313 | Rule to concat the identifed SNPs this will only be called by the user 314 | in case if he wanted to have only SNPs 315 | """ 316 | input: 317 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/data.{chr}.vcf", aligner=wildcards.aligner, chr=chr_list), 318 | gvcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/data.{chr}.gvcf", aligner=wildcards.aligner, chr=chr_list), 319 | output: 320 | vcf = data_dir + "/snp/{aligner}/data.vcf", 321 | gvcf = data_dir + "/snp/{aligner}/data.gvcf", 322 | message: "Concat SNP files" 323 | benchmark: data_dir + "/benchmark/snp/{aligner}/concat_snp.txt" 324 | params: 325 | sample_name = SAMPLE_NAME, 326 | conda: VARIANT_ENV 327 | shell:""" 328 | echo "{params.sample_name}" > sample_name.txt && vcfcat {input.vcf} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output.vcf} &&\ 329 | vcfcat {input.gvcf} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output.gvcf} 330 | """ 331 | else: 332 | rule concatSNPs: 333 | """ 334 | Rule to concat the identified SNPs this will only be called by the user 335 | in case if he wanted to have only SNPs and indels and no gVCF required. 
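    As in the gVCF branch above, the sample column of the concatenated VCF is renamed to the configured
    sample name with bcftools reheader (see the shell command below).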
336 | """ 337 | input: 338 | vcf = lambda wildcards: expand(data_dir + "/snp/{aligner}/data.{chr}.vcf", aligner=wildcards.aligner, chr=chr_list), 339 | output: 340 | vcf = data_dir + "/snp/{aligner}/data.vcf" 341 | message: "Concat SNP files" 342 | benchmark: data_dir + "/benchmark/snp/{aligner}/concat_snp.txt" 343 | params: 344 | sample_name = SAMPLE_NAME, 345 | conda: VARIANT_ENV 346 | shell:""" 347 | echo "{params.sample_name}" > sample_name.txt && vcfcat {input.vcf} | vcfstreamsort | bcftools reheader --samples sample_name.txt -o {output.vcf} 348 | """ 349 | 350 | # #### Bgzip gVCF ######### 351 | # ######################### 352 | 353 | rule bgzipgVCFFile: 354 | """ 355 | General rule to bgzip gVCF files 356 | """ 357 | input:data_dir + "/{name}.gvcf" 358 | output:data_dir + "/{name}.gvcf.gz" 359 | threads: config['bgzip_threads'] 360 | conda: VARIANT_ENV 361 | shell:""" 362 | bgzip -c -@ {threads} {input} > {output} 363 | """ 364 | -------------------------------------------------------------------------------- /modules/output.smk: -------------------------------------------------------------------------------- 1 | 2 | ########################## 3 | ###### OUTPUT RULES #### 4 | ######################### 5 | 6 | #### Align Moving ######## 7 | ######################### 8 | 9 | rule mvAlign: 10 | input: 11 | bam = data_dir + "/align/{aligner}/data.bam", 12 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 13 | output: 14 | bam = data_dir +'/result' + '/.aligning.{aligner}.done', 15 | message: "Moving Aligned bam to result directory {input.bam}" 16 | params: 17 | bam = data_dir +'/result' + "/aligning.{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 18 | shell:""" 19 | function rm_last2() {{ 20 | d1=$(dirname $1) 21 | d2=$(dirname $d1) 22 | rm -rf $d2 23 | }} 24 | mv {input.bamindex} {params.bam}.bai && mv {input.bam} {params.bam} &&\ 25 | mkdir -p {data_dir}/log &&\ 26 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : &&\ 27 | rm_last2 {input.bam} &&\ 28 | touch {output} 29 | """ 30 | 31 | 32 | #### SVs Moving ######## 33 | ######################## 34 | 35 | rule mvSV: 36 | input: 37 | vcf = data_dir + "/sv/{aligner}/sniffles.vcf", 38 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 39 | bam = data_dir + "/align/{aligner}/data.bam", 40 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 41 | output: 42 | vcf = data_dir +'/result' + '/.SVs.{aligner}.done', 43 | params: 44 | vcf = data_dir +'/result' + "/{sample}.{{aligner}}.SVs.vcf".format(sample=SAMPLE_NAME), 45 | snf = data_dir +'/result' + "/{sample}.{{aligner}}.SVs.snf".format(sample=SAMPLE_NAME), 46 | bam = data_dir +'/result' + "/{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 47 | bamindex = data_dir +'/result' + "/{sample}.{{aligner}}.bam.bai".format(sample=SAMPLE_NAME), 48 | message: "Moving called SVs to result directory {input}" 49 | priority: 1 50 | shell:""" 51 | function rm_last2() {{ 52 | d1=$(dirname $1) 53 | d2=$(dirname $d1) 54 | rm -rf $d2 55 | }} 56 | mkdir -p {data_dir}/log &&\ 57 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 58 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 59 | fi &&\ 60 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 61 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 62 | fi &&\ 63 | mv {input.vcf} {params.vcf} &&\ 64 | if [ -f {input.snf} ]; then 65 | mv {input.snf} {params.snf} && rm_last2 {input.snf} || : 66 | else 67 | rm_last2 {input.vcf} || : 68 | fi &&\ 69 | mv {input.bam} {params.bam} && 
mv {input.bamindex} {params.bamindex} && rm_last2 {input.bam} &&\ 70 | touch {output.vcf} 71 | """ 72 | 73 | 74 | #### SNVs Moving ######## 75 | ######################### 76 | 77 | if config['gvcf_snv']: 78 | rule mvSNV: 79 | input: 80 | vcf=data_dir + "/snp/{aligner}/data.vcf.gz", 81 | gvcf=data_dir + "/snp/{aligner}/data.gvcf.gz", 82 | vcfindex=data_dir + "/snp/{aligner}/data.vcf.gz.tbi", 83 | gvcfindex=data_dir + "/snp/{aligner}/data.gvcf.gz.tbi", 84 | bam=data_dir + "/align/{aligner}/data.bam", 85 | bamindex=data_dir + "/align/{aligner}/data.bam.bai", 86 | output: 87 | data_dir + '/result' + '/.SNVs.{aligner}.done' 88 | params: 89 | vcf=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 90 | gvcf=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.gvcf.gz".format(sample=SAMPLE_NAME), 91 | vcfindex=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 92 | gvcfindex=data_dir + '/result' + "/{sample}.{{aligner}}.SNVs.gvcf.gz.tbi".format(sample=SAMPLE_NAME), 93 | bam=data_dir + '/result' + "/{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 94 | bamindex=data_dir + '/result' + "/{sample}.{{aligner}}.bam.bai".format(sample=SAMPLE_NAME), 95 | message: "Moving called SNVs to result directory {input}" 96 | priority: 1 97 | shell: """ 98 | function rm_last2() {{ 99 | d1=$(dirname $1) 100 | d2=$(dirname $d1) 101 | rm -rf $d2 102 | }} 103 | mkdir -p {data_dir}/log &&\ 104 | mv {input.vcf} {params.vcf} &&\ 105 | mv {input.gvcf} {params.gvcf} &&\ 106 | mv {input.vcfindex} {params.vcfindex} &&\ 107 | mv {input.gvcfindex} {params.gvcfindex} &&\ 108 | mv {input.bam} {params.bam} &&\ 109 | mv {input.bamindex} {params.bamindex} &&\ 110 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 111 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 112 | fi &&\ 113 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 114 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 115 | fi &&\ 116 | rm_last2 {input.vcf} && rm_last2 {input.bam} &&\ 117 | touch {output} 118 | """ 119 | else: 120 | rule mvSNV: 121 | input: 122 | vcf = data_dir + "/snp/{aligner}/data.vcf.gz", 123 | vcfindex = data_dir + "/snp/{aligner}/data.vcf.gz.tbi", 124 | bam = data_dir + "/align/{aligner}/data.bam", 125 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 126 | output: 127 | data_dir +'/result' + '/.SNVs.{aligner}.done' 128 | params: 129 | vcf = data_dir +'/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 130 | vcfindex = data_dir +'/result' + "/{sample}.{{aligner}}.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 131 | bam = data_dir +'/result' + "/{sample}.{{aligner}}.bam".format(sample=SAMPLE_NAME), 132 | bamindex = data_dir +'/result' + "/{sample}.{{aligner}}.bam.bai".format(sample=SAMPLE_NAME), 133 | message: "Moving called SNVs to result directory {input}" 134 | priority: 1 135 | shell:""" 136 | function rm_last2() {{ 137 | d1=$(dirname $1) 138 | d2=$(dirname $d1) 139 | rm -rf $d2 140 | }} 141 | mkdir -p {data_dir}/log &&\ 142 | mv {input.vcf} {params.vcf} &&\ 143 | mv {input.vcfindex} {params.vcfindex} &&\ 144 | mv {input.bam} {params.bam} &&\ 145 | mv {input.bamindex} {params.bamindex} &&\ 146 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 147 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 148 | fi &&\ 149 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 150 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 151 | fi &&\ 
152 | rm_last2 {input.vcf} && rm_last2 {input.bam} &&\ 153 | touch {output} 154 | """ 155 | 156 | #### Variants Moving ######## 157 | ############################ 158 | 159 | rule mvVariants: 160 | input: 161 | snv = data_dir + "/snp/{aligner}/data.vcf.gz", 162 | snvindex = data_dir + "/snp/{aligner}/data.vcf.gz.tbi", 163 | sv = data_dir + "/sv/{aligner}/sniffles.vcf", 164 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 165 | bam = data_dir + "/align/{aligner}/data.bam", 166 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 167 | output: 168 | data_dir + "/result" + "/.variant.{aligner}.done" 169 | message: "Moving called SNVs to result directory {input}" 170 | shell:""" 171 | function rm_last2() {{ 172 | d1=$(dirname $1) 173 | d2=$(dirname $d1) 174 | rm -rf $d2 175 | }} 176 | mkdir -p {data_dir}/log &&\ 177 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 178 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 179 | fi &&\ 180 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 181 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 182 | fi &&\ 183 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 184 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 185 | fi &&\ 186 | if [ -f {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log ]; then 187 | mv {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log {data_dir}/log || : 188 | fi &&\ 189 | mv {input.snv} {data_dir}/result/SNVs.{wildcards.aligner}.vcf.gz &&\ 190 | mv {input.snvindex} {data_dir}/result/SNVs.{wildcards.aligner}.vcf.gz.tbi &&\ 191 | rm_last2 {input.snv} &&\ 192 | mv {input.sv} {data_dir}/result/SVs.{wildcards.aligner}.vcf &&\ 193 | mv {input.snf} {data_dir}/result/SVs.{wildcards.aligner}.snf &&\ 194 | rm_last2 {input.sv} &&\ 195 | rm_last2 {input.snf} &&\ 196 | mv {input.bam} {data_dir}/result/align.{wildcards.aligner}.bam &&\ 197 | mv {input.bamindex} {data_dir}/result/align.{wildcards.aligner}.bam.bai &&\ 198 | rm_last2 {input.bam} &&\ 199 | touch {output} 200 | """ 201 | 202 | #### Phasing Moving ######## 203 | ############################ 204 | 205 | rule mvPhasing: 206 | input: 207 | snv = data_dir + "/phased/{aligner}/data.vcf.gz", 208 | snvindex = data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 209 | bam = data_dir + "/align/{aligner}/data.bam", 210 | bamindex = data_dir + "/align/{aligner}/data.bam.bai", 211 | output: 212 | snv = data_dir + "/result" + "/phased.SNVs.{aligner}.done", 213 | params: 214 | snv = data_dir + "/result" + "/phased.SNVs.{aligner}.vcf.gz", 215 | snvindex = data_dir + "/result" + "/phased.SNVs.{aligner}.vcf.gz.tbi", 216 | bam = data_dir +'/result' + '/aligning.{aligner}.bam', 217 | bamindex = data_dir +'/result' + '/aligning.{aligner}.bam.bai', 218 | message: "Moving called phased SNVs to result directory {input}" 219 | shell:""" 220 | function rm_last2() {{ 221 | d1=$(dirname $1) 222 | d2=$(dirname $d1) 223 | rm -rf $d2 ||: 224 | }} 225 | function rm_last1() {{ 226 | d1=$(dirname $1) 227 | rm -rf $d1 ||: 228 | }} 229 | mkdir -p {data_dir}/log &&\ 230 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 231 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 232 | fi &&\ 233 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 234 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 235 | fi &&\ 236 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 237 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 238 | fi &&\ 239 | if [ -f 
{data_dir}/phased/{wildcards.aligner}/*.txt ]; then 240 | mv {data_dir}/phased/{wildcards.aligner}/*.txt {data_dir}/log || : 241 | fi &&\ 242 | if [ -f {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log ]; then 243 | mv {data_dir}/snp/{wildcards.aligner}/chrsplit/*.log {data_dir}/log || : 244 | fi &&\ 245 | mv {input.snv} {params.snv} &&\ 246 | mv {input.snvindex} {params.snvindex} &&\ 247 | mv {input.bam} {params.bam} && 248 | mv {input.bamindex} {params.bamindex} &&\ 249 | rm_last2 {input.snv} &&\ 250 | rm_last2 {input.bam} &&\ 251 | rm_last1 {data_dir}/snp/{wildcards.aligner} &&\ 252 | touch {output} 253 | """ 254 | 255 | #### All Moving ######## 256 | ######################## 257 | 258 | rule mvmethylation: 259 | input: 260 | methylation = data_dir + "/meth/"+ "{aligner}" + "/methylation_calls_hap.tsv", 261 | output: 262 | methylation = data_dir + "/result" + "/methylation.{aligner}_calls_hap.tsv" 263 | shell:""" 264 | function rm_last2() {{ 265 | d1=$(dirname $1) 266 | d2=$(dirname $d1) 267 | rm -rf $d2 ||: 268 | }} 269 | mv {input.methylation} {output.methylation} &&\ 270 | rm_last2 {input.methylation} 271 | """ 272 | 273 | rule mvParentalPhased: 274 | input: 275 | stat = data_dir + "/stat.txt" if config['sample_list'] else data_dir + "/stat.NoReads.txt", 276 | phasedSNVs = data_dir + "/phased/{aligner}/data_updated.vcf", 277 | # phasedSVs = data_dir + "/sv/{aligner}/sniffles_hp_updated.vcf", 278 | bam = data_dir + "/align/{aligner}/data_hap.bam", 279 | bamindex = data_dir + "/align/{aligner}/data_hap.bam.bai", 280 | output: 281 | stat = data_dir + "/result/.allReadsparental.{aligner}.txt" #if config['sample_list'] else data_dir + "/result/.allNoReadsparental.{aligner}.txt", 282 | params: 283 | stat = data_dir + "/result/stat.{aligner}.txt", 284 | phasedSNVs = data_dir + "/result/{aligner}.phased.SNVs.vcf", 285 | # phasedSVs = data_dir + "/result/{aligner}.phased.SVs.vcf", 286 | bam = data_dir + "/result/{aligner}.hap.bam", 287 | bamindex = data_dir + "/result/{aligner}.hap.bam.bai", 288 | shell:""" 289 | function rm_last2() {{ 290 | d1=$(dirname $1) 291 | d2=$(dirname $d1) 292 | rm -rf $d2 ||: 293 | }} 294 | function rm_last1() {{ 295 | d1=$(dirname $1) 296 | rm -rf $d1 ||: 297 | }} 298 | mkdir -p {data_dir}/log &&\ 299 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 300 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 301 | fi &&\ 302 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 303 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 304 | fi &&\ 305 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 306 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 307 | fi &&\ 308 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 309 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 310 | fi &&\ 311 | mv {input.stat} {params.stat} &&\ 312 | mv {data_dir}/statistics {data_dir}/result &&\ 313 | mv {input.phasedSNVs} {params.phasedSNVs} &&\ 314 | bgzip {params.phasedSNVs} &&\ 315 | tabix {params.phasedSNVs}.gz &&\ 316 | mv {input.bam} {params.bam} &&\ 317 | mv {input.bamindex} {params.bamindex} &&\ 318 | rm_last2 {input.phasedSNVs} &&\ 319 | rm_last2 {input.bam} &&\ 320 | touch {output} 321 | """ 322 | # mv {input.phasedSVs} {params.phasedSVs} &&\ 323 | # rm_last2 {input.phasedSVs} &&\ 324 | 325 | if config['gvcf_snv']: 326 | rule mvNoParentalPhased: 327 | """ 328 | This rule gathers the outputs of the all command when no parental comparison is requested, and afterwards deletes the intermediate data.
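        Because gvcf_snv is set, this branch also moves the bgzipped gVCF ({sample}.{aligner}.SNVs.gvcf.gz)
        to the result directory; the branch below omits it.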
329 | """ 330 | input: 331 | stat = data_dir + "/stat.txt" if config['sample_list'] else data_dir + "/stat.NoReads.txt", 332 | # phasedSvsSNVs = data_dir + "/sv/{aligner}/sv_snp.vcf.gz", 333 | phasedSNVs = data_dir + "/phased/{aligner}/data.vcf.gz", 334 | phasedSNVsindex = data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 335 | SVs = data_dir + "/sv/{aligner}/sniffles.vcf", 336 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 337 | # phasedSVs = data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz", 338 | bam = data_dir + "/align/{aligner}/data_hap.bam", 339 | bamindex = data_dir + "/align/{aligner}/data_hap.bam.bai", 340 | gvcf = data_dir + "/snp/{aligner}/data.gvcf.gz", 341 | output: 342 | stat = data_dir + "/result/.all.Reads.{aligner}.txt" if config['sample_list'] else data_dir + "/result/.all.noReads.{aligner}.txt", 343 | params: 344 | stat = data_dir + "/result/stat.{sample}.{{aligner}}.txt".format(sample=SAMPLE_NAME), 345 | # phasedSvsSNVs = data_dir + "/result/{aligner}.phased.sv_snp.vcf.gz", 346 | phasedSNVs = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 347 | phasedSNVsindex = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 348 | SVs = data_dir + "/result/{sample}.{{aligner}}.SVs.vcf".format(sample=SAMPLE_NAME), 349 | snf = data_dir + "/result/{sample}.{{aligner}}.SVs.snf".format(sample=SAMPLE_NAME), 350 | # phasedSVs = data_dir + "/result/{aligner}.SVs.phased.vcf.gz", 351 | bam = data_dir + "/result/{sample}.{{aligner}}.hap.bam".format(sample=SAMPLE_NAME), 352 | bamindex = data_dir + "/result/{sample}.{{aligner}}.hap.bam.bai".format(sample=SAMPLE_NAME), 353 | copy_gvcf = "True" if config['gvcf_snv'] else "False", 354 | gvcf = data_dir + "/result/{sample}.{{aligner}}.SNVs.gvcf.gz".format(sample=SAMPLE_NAME), 355 | shell:""" 356 | function rm_last2() {{ 357 | d1=$(dirname $1) 358 | d2=$(dirname $d1) 359 | rm -rf $d2 ||: 360 | }} 361 | function rm_last1() {{ 362 | d1=$(dirname $1) 363 | rm -rf $d1 ||: 364 | }} 365 | mkdir -p {data_dir}/log &&\ 366 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 367 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 368 | fi &&\ 369 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 370 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 371 | fi &&\ 372 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 373 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 374 | fi &&\ 375 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 376 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 377 | fi &&\ 378 | mv {input.gvcf} {params.gvcf} &&\ 379 | mv {input.stat} {params.stat} &&\ 380 | mv {input.phasedSNVs} {params.phasedSNVs} &&\ 381 | mv {input.phasedSNVsindex} {params.phasedSNVsindex} &&\ 382 | mv {input.SVs} {params.SVs} &&\ 383 | mv {input.snf} {params.snf} &&\ 384 | mv {input.bam} {params.bam} &&\ 385 | mv {input.bamindex} {params.bamindex} &&\ 386 | rm_last2 {input.SVs} &&\ 387 | rm_last2 {input.phasedSNVs} &&\ 388 | rm_last2 {input.bam} &&\ 389 | rm_last1 {data_dir}/snp/{wildcards.aligner} &&\ 390 | touch {output} 391 | """ 392 | else: 393 | rule mvNoParentalPhased: 394 | """ 395 | The rule here will make sure to get the rule for all command without parental comparison and later will delete the data. 
396 | """ 397 | input: 398 | stat = data_dir + "/stat.txt" if config['sample_list'] else data_dir + "/stat.NoReads.txt", 399 | # phasedSvsSNVs = data_dir + "/sv/{aligner}/sv_snp.vcf.gz", 400 | phasedSNVs = data_dir + "/phased/{aligner}/data.vcf.gz", 401 | phasedSNVsindex = data_dir + "/phased/{aligner}/data.vcf.gz.tbi", 402 | SVs = data_dir + "/sv/{aligner}/sniffles.vcf", 403 | snf = data_dir + "/sv/{aligner}/sniffles.snf", 404 | # phasedSVs = data_dir + "/sv/{aligner}/sniffles_hp_updated.sorted.namechnage.vcf.gz", 405 | bam = data_dir + "/align/{aligner}/data_hap.bam", 406 | bamindex = data_dir + "/align/{aligner}/data_hap.bam.bai", 407 | output: 408 | stat = data_dir + "/result/.all.Reads.{aligner}.txt" if config['sample_list'] else data_dir + "/result/.all.noReads.{aligner}.txt", 409 | params: 410 | stat = data_dir + "/result/stat.{sample}.{{aligner}}.txt".format(sample=SAMPLE_NAME), 411 | # phasedSvsSNVs = data_dir + "/result/{aligner}.phased.sv_snp.vcf.gz", 412 | phasedSNVs = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz".format(sample=SAMPLE_NAME), 413 | phasedSNVsindex = data_dir + "/result/{sample}.{{aligner}}.phased.SNVs.vcf.gz.tbi".format(sample=SAMPLE_NAME), 414 | SVs = data_dir + "/result/{sample}.{{aligner}}.SVs.vcf".format(sample=SAMPLE_NAME), 415 | snf = data_dir + "/result/{sample}.{{aligner}}.SVs.snf".format(sample=SAMPLE_NAME), 416 | # phasedSVs = data_dir + "/result/{aligner}.SVs.phased.vcf.gz", 417 | bam = data_dir + "/result/{sample}.{{aligner}}.hap.bam".format(sample=SAMPLE_NAME), 418 | bamindex = data_dir + "/result/{sample}.{{aligner}}.hap.bam.bai".format(sample=SAMPLE_NAME), 419 | copy_gvcf = "True" if config['gvcf_snv'] else "False", 420 | shell:""" 421 | function rm_last2() {{ 422 | d1=$(dirname $1) 423 | d2=$(dirname $d1) 424 | rm -rf $d2 ||: 425 | }} 426 | function rm_last1() {{ 427 | d1=$(dirname $1) 428 | rm -rf $d1 ||: 429 | }} 430 | mkdir -p {data_dir}/log &&\ 431 | if [ -f {data_dir}/sv/{wildcards.aligner}/*.log ]; then 432 | mv {data_dir}/sv/{wildcards.aligner}/*.log {data_dir}/log || : 433 | fi &&\ 434 | if [ -f {data_dir}/snp/{wildcards.aligner}/*.log ]; then 435 | mv {data_dir}/snp/{wildcards.aligner}/*.log {data_dir}/log || : 436 | fi &&\ 437 | if [ -f {data_dir}/align/{wildcards.aligner}/*.log ]; then 438 | mv {data_dir}/align/{wildcards.aligner}/*.log {data_dir}/log || : 439 | fi &&\ 440 | if [ -f {data_dir}/phased/{wildcards.aligner}/*.log ]; then 441 | mv {data_dir}/phased/{wildcards.aligner}/*.log {data_dir}/log || : 442 | fi &&\ 443 | mv {input.stat} {params.stat} &&\ 444 | mv {input.phasedSNVs} {params.phasedSNVs} &&\ 445 | mv {input.phasedSNVsindex} {params.phasedSNVsindex} &&\ 446 | mv {input.SVs} {params.SVs} &&\ 447 | mv {input.snf} {params.snf} &&\ 448 | mv {input.bam} {params.bam} &&\ 449 | mv {input.bamindex} {params.bamindex} &&\ 450 | rm_last2 {input.SVs} &&\ 451 | rm_last2 {input.phasedSNVs} &&\ 452 | rm_last2 {input.bam} &&\ 453 | rm_last1 {data_dir}/snp/{wildcards.aligner} &&\ 454 | touch {output} 455 | """ 456 | # mv {input.phasedSVs} {params.phasedSVs} &&\ 457 | # mv {input.phasedSvsSNVs} {params.phasedSvsSNVs} &&\ 458 | # rm_last2 {input.phasedSvsSNVs} &&\ 459 | -------------------------------------------------------------------------------- /princess: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Snakemake file wrapper for princess. 
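A typical invocation looks like the following (paths and sample name are illustrative; see get_args() below for the full option list):

    princess all -r ont -d ./analysis_dir -f reference.fa -s reads.fastq.gz -sn SAMPLE01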
5 | """ 6 | import argparse 7 | import sys, os, subprocess, ntpath, yaml 8 | from distutils.dir_util import copy_tree 9 | import filecmp, shutil 10 | from pathlib import Path 11 | from collections import namedtuple 12 | import logging 13 | import filecmp 14 | from typing import Any 15 | 16 | # Create a custom logger 17 | logger = logging.getLogger(__name__) 18 | logger.setLevel(logging.DEBUG) 19 | f_handler = None 20 | 21 | 22 | def get_args(): 23 | parser = argparse.ArgumentParser( 24 | epilog="%(prog)s version 0.01. use command -h for info.", 25 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 26 | description="Princess A framework for long-reads analysis.", 27 | add_help=True, 28 | ) 29 | 30 | parent_parser = argparse.ArgumentParser( 31 | add_help=False, formatter_class=argparse.ArgumentDefaultsHelpFormatter 32 | ) 33 | parent_parser.add_argument( 34 | "-v", "--version", action="version", version="%(prog)s 0.01" 35 | ) 36 | 37 | # Adding the main params for any commands 38 | parent_parser.add_argument( 39 | "-d", 40 | "--directory", 41 | help="Working directory.", 42 | metavar="Working directory", 43 | required=True, 44 | ) 45 | parent_parser.add_argument( 46 | "-r", 47 | "--ReadType", 48 | dest="read_type", 49 | type=str.lower, 50 | choices=["ont", "clr", "ccs"], 51 | help="Read technology", 52 | required=True, 53 | ) 54 | parent_parser.add_argument( 55 | "-u", 56 | "--UseConda", 57 | dest="use_conda", 58 | help="Use conda for running default: %(default)s)", 59 | action="store_false", 60 | ) 61 | parent_parser.add_argument( 62 | "-e", 63 | "--Cluster", 64 | dest="is_cluster", 65 | help="Use cluster while running default: %(default)s)", 66 | action="store_false", 67 | ) 68 | parent_parser.add_argument( 69 | "-a", 70 | "--Aligner", 71 | dest="aligner", 72 | choices=["minimap", "ngmlr"], 73 | help="In case if you want to choose specific aligner otherwise default will be used default: %(default)s)", 74 | default="minimap", 75 | ) 76 | parent_parser.add_argument( 77 | "-s", 78 | "--samplesFiles", 79 | dest="samples_files", 80 | metavar="samplesFiles", 81 | nargs="+", 82 | help="list of Fasta, Fastq, or gz files.", 83 | ) 84 | parent_parser.add_argument( 85 | "-f", 86 | "--ref", 87 | dest="ref", 88 | help="The reference file will be used to align reads to.", 89 | required=True, 90 | ) 91 | parent_parser.add_argument( 92 | "-j", 93 | "--jobs", 94 | dest="jobs", 95 | type=str, 96 | help="Number of running jobs default: %(default)s )", 97 | default="200", 98 | ) 99 | parent_parser.add_argument( 100 | "-g", 101 | "--log", 102 | dest="log_file", 103 | type=str, 104 | help="Log file: %(default)s )", 105 | default="PrincessLog.txt", 106 | ) 107 | parent_parser.add_argument( 108 | "-sn", 109 | "--sample-name", 110 | dest="sample_name", 111 | type=str, 112 | help="A sample name to use for BAMs, SVs, and SNVs helps when you are planning to merge multiple samples in the downstream analysis %(default)s", 113 | default="SAMPLE", 114 | ) 115 | parent_parser.add_argument( 116 | "-sp", 117 | "--phase-sv", 118 | dest="phase_sv", 119 | help="Phase the identified SV, default: %(default)s", 120 | action="store_true", 121 | ) 122 | parent_parser.add_argument( 123 | "-ms", 124 | "--mosaic-sv", 125 | dest="mosaic_sv", 126 | help="Identify mosaic SV, default: %(default)s", 127 | action="store_true", 128 | ) 129 | parent_parser.add_argument( 130 | "-gv", 131 | "--gvcf-snv", 132 | dest="gvcf_snv", 133 | help="Identify gVCF SNVs, default: %(default)s", 134 | action="store_true", 135 | ) 136 | 137 | # 
Sub-commands: 138 | subparser = parser.add_subparsers( 139 | title="Sub-commands", description="Valid sub-commands", dest="command" 140 | ) 141 | 142 | # All subparser. 143 | all_subparser = subparser.add_parser( 144 | "all", 145 | help="""This command will run the following:\n 146 | Align the reads.\nIdentify SVs\nIdentify SNVs\nPhase both SNVs and SVs""", 147 | parents=[parent_parser], 148 | ) 149 | all_subparser.add_argument( 150 | "-c", 151 | "--chr", 152 | dest="chrs", 153 | type=str, 154 | help="Chromosomes list,\ 155 | if not specified Princess will use all Chromosomes.", 156 | nargs="+", 157 | default=[], 158 | ) 159 | all_subparser.add_argument( 160 | "-t", 161 | "--filter", 162 | dest="filter", 163 | help="Filter identified SNVs using Princess algorithm\ 164 | default: %(default)s)", 165 | action="store_false", 166 | ) 167 | all_subparser.add_argument( 168 | "-m", 169 | "--methylation", 170 | dest="detect_methylation", 171 | help="Identify methylation, mutually inclusive with -md default: %(default)s)", 172 | action="store_true", 173 | ) 174 | all_subparser.add_argument( 175 | "-md", 176 | "--methylationDirectory", 177 | metavar="Fast5 Directory", 178 | dest="methylation_dir", 179 | help="Fast5 directory will be used to identify\ 180 | methylation mutually inclusive with option -m default: %(default)s)", 181 | default=False, 182 | ) 183 | all_subparser.add_argument( 184 | "-cm", 185 | "--clair-model", 186 | metavar="Clair model", 187 | dest="clair_model", 188 | help="Clair model, if not supplied we will use default model came with conda installation of Clair3.\nThe folder path containing a Clair3 model (requiring six files in the folder, including pileup.data-00000-of-00002, pileup.data-00001-of-00002 pileup.index, full_alignment.data-00000-of-00002, full_alignment.data-00001-of-00002 and full_alignment.index)", 189 | default=None, 190 | ) 191 | all_subparser.set_defaults(func=all_analysis) 192 | 193 | # Align subparser. 194 | align_subparser = subparser.add_parser( 195 | "align", 196 | help="This command will align the input sequence files against the reference using either Minimap2 or NGMLR. You can use the -a option to choose the aligner, otherwise Minimap2 will be used by default.", 197 | parents=[parent_parser], 198 | ) 199 | align_subparser.set_defaults(func=align) 200 | 201 | # SV subparser. 202 | sv_subparser = subparser.add_parser( 203 | "sv", 204 | help="This command will use bam file \ 205 | to identify SV using Sniffles.", 206 | parents=[parent_parser], 207 | ) 208 | sv_subparser.set_defaults(func=sv) 209 | 210 | # SNV subparser. 
211 | snv_subparser = subparser.add_parser( 212 | "snv", 213 | help="This command will use bam file \ 214 | to identify SNVs using Clair3.", 215 | parents=[parent_parser], 216 | ) 217 | snv_subparser.add_argument( 218 | "-c", 219 | "--chr", 220 | dest="chrs", 221 | type=str, 222 | help="Chromosomes list,\ 223 | if not specified we will use all Chromosomes.", 224 | nargs="+", 225 | default=[], 226 | ) 227 | snv_subparser.add_argument( 228 | "-t", 229 | "--filter", 230 | dest="filter", 231 | help="Filter identified SNVs using Princess algorithm\ 232 | default: %(default)s)", 233 | action="store_false", 234 | ) 235 | snv_subparser.add_argument( 236 | "-cm", 237 | "--clair-model", 238 | metavar="Clair model", 239 | dest="clair_model", 240 | help="Clair model, if not supplied we will use default model came with conda installation of Clair3.\nThe folder path containing a Clair3 model (requiring six files in the folder, including pileup.data-00000-of-00002, pileup.data-00001-of-00002 pileup.index, full_alignment.data-00000-of-00002, full_alignment.data-00001-of-00002 and full_alignment.index)", 241 | default="", 242 | ) 243 | snv_subparser.set_defaults(func=snv) 244 | 245 | # VARIANT [SNV, and SV] subparser. 246 | variant_subparser = subparser.add_parser( 247 | "variant", 248 | help="This command will use bam file \ 249 | to identify SVs and SNVs.", 250 | parents=[parent_parser], 251 | ) 252 | variant_subparser.add_argument( 253 | "-c", 254 | "--chr", 255 | dest="chrs", 256 | type=str, 257 | help="Chromosomes list,\ 258 | if not specified we will use all Chromosomes.", 259 | nargs="+", 260 | default=[], 261 | ) 262 | variant_subparser.add_argument( 263 | "-t", 264 | "--filter", 265 | dest="filter", 266 | help="Filter identified SNVs using Princess algorithm\ 267 | default: %(default)s)", 268 | action="store_false", 269 | ) 270 | variant_subparser.add_argument( 271 | "-cm", 272 | "--clair-model", 273 | metavar="Clair model", 274 | dest="clair_model", 275 | help="Clair model, if not supplied we will use default model came with conda installation of Clair3.\nThe folder path containing a Clair3 model (requiring six files in the folder, including pileup.data-00000-of-00002, pileup.data-00001-of-00002 pileup.index, full_alignment.data-00000-of-00002, full_alignment.data-00001-of-00002 and full_alignment.index)", 276 | default=None, 277 | ) 278 | variant_subparser.set_defaults(func=variant) 279 | 280 | # Phase subparser. 281 | phase_subparser = subparser.add_parser( 282 | "phase", 283 | help="This command will use use reads \ 284 | to identify SNVs by Clair and Phase them.", 285 | parents=[parent_parser], 286 | ) 287 | phase_subparser.add_argument( 288 | "-c", 289 | "--chr", 290 | dest="chrs", 291 | type=str, 292 | help="Chromosomes list,\ 293 | if not specified we will use all Chromosomes.", 294 | nargs="+", 295 | default=[], 296 | ) 297 | phase_subparser.add_argument( 298 | "-t", 299 | "--filter", 300 | dest="filter", 301 | help="Filter identified SNVs using Princess algorithm\ 302 | default: %(default)s)", 303 | action="store_false", 304 | ) 305 | phase_subparser.set_defaults(func=phase) 306 | 307 | # Overview subparser. 
308 | overview_subparser = subparser.add_parser( 309 | "overview", 310 | help="This command will show what steps will run.", 311 | parents=[parent_parser], 312 | ) 313 | overview_subparser.add_argument( 314 | "-c", 315 | "--chr", 316 | dest="chrs", 317 | type=str, 318 | help="Chromosomes list,\ 319 | if not specified we will use all Chromosomes.", 320 | nargs="+", 321 | default=[], 322 | ) 323 | overview_subparser.set_defaults(func=overview) 324 | 325 | # if no argument print help. 326 | if len(sys.argv) == 1: 327 | parser.print_help(sys.stderr) 328 | sys.exit(1) 329 | 330 | args, unknownargs = parser.parse_known_args() 331 | unknownargs = sort_params(args, unknownargs) 332 | 333 | if "func" in args: 334 | ( 335 | current_dir, 336 | running_file, 337 | work_dir, 338 | conf_yaml, 339 | aligner, 340 | sample_list_from_config, 341 | number_of_jobs, 342 | number_of_tries, 343 | ) = required_vars(args, unknownargs) 344 | Main_vars = namedtuple( 345 | "Main_vars", 346 | "current_dir, running_file, work_dir, conf_yaml, aligner, sample_list_from_config, number_of_jobs, number_of_tries", 347 | ) 348 | main_vars = Main_vars( 349 | current_dir, 350 | running_file, 351 | work_dir, 352 | conf_yaml, 353 | aligner, 354 | sample_list_from_config, 355 | number_of_jobs, 356 | number_of_tries, 357 | ) 358 | log_dir = os.path.join(work_dir, args.log_file) 359 | global f_handler 360 | f_handler = logging.FileHandler(log_dir) 361 | f_handler.setLevel(logging.DEBUG) 362 | f_format = logging.Formatter( 363 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 364 | ) 365 | f_handler.setFormatter(f_format) 366 | logger.addHandler(f_handler) 367 | args.func(args, unknownargs, main_vars) 368 | else: 369 | parser.print_help() 370 | 371 | 372 | def overview(args, unknownargs, main_vars): 373 | if not args.samples_files and not main_vars.sample_list_from_config: 374 | print( 375 | "You need to support sequence read file/s either by using -s parameter or through sample_list filed in config.yaml file", 376 | file=sys.stderr, 377 | ) 378 | exit( 379 | f"Error exiting, see log file {os.path.join(args.directory, args.log_file)}" 380 | ) 381 | # check if the user gave existing chromosomes. 382 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 383 | is_valid_chrs(chr_list, args.ref) 384 | if not chr_list: 385 | chr_list = get_chrs(args.ref) 386 | chrs = "chrs=" + str(chr_list) 387 | samples, samples_names = get_sample_names(args, main_vars) 388 | # If we have samples let us create working directory If not exists. 
389 | if not os.path.exists(main_vars.work_dir): 390 | os.makedirs(main_vars.work_dir) 391 | sample_dir = "sample_directory=" + main_vars.work_dir 392 | reference = ( 393 | "reference=" + args.ref 394 | if args.ref 395 | else "reference=" + main_vars.conf_yaml["reference"] 396 | ) 397 | # If we are already in main princess directory do nothing 398 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 399 | pass 400 | else: 401 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 402 | os.chdir(main_vars.work_dir) 403 | chr_log = " ".join(chr_list) if chr_list else "All Chromosomes" 404 | logger.info("Analyzed Chromosomes: {}".format(chr_log)) 405 | running_command = "running_command=" + args.command 406 | 407 | cmd = [ 408 | "snakemake", 409 | "-n", 410 | "-p", 411 | "-r", 412 | "-j", 413 | args.jobs, 414 | "--config", 415 | sample_dir, 416 | samples, 417 | reference, 418 | chrs, 419 | running_command, 420 | *unknownargs, 421 | ] 422 | run_cmd(cmd) 423 | os.chdir(main_vars.current_dir) 424 | 425 | 426 | def align(args, unknownargs, main_vars): 427 | # Do we have samples? 428 | if not args.samples_files and not main_vars.sample_list_from_config: 429 | print( 430 | "You need to provide sequence read file(s), either with the -s parameter or through the sample_list field in the config.yaml file" 431 | ) 432 | exit( 433 | f"Error exiting, see log file {os.path.join(args.directory, args.log_file)}" 434 | ) 435 | # If we have samples let us create working directory If not exists. 436 | if not os.path.exists(main_vars.work_dir): 437 | os.makedirs(main_vars.work_dir) 438 | # Get samples to pass to Snakefile 439 | samples_names = ( 440 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 441 | ) 442 | sample_dir = "sample_directory=" + main_vars.work_dir 443 | reference = ( 444 | "reference=" + args.ref 445 | if args.ref 446 | else "reference=" + main_vars.conf_yaml["reference"] 447 | ) 448 | 449 | # If we are already in main princess directory do nothing 450 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 451 | pass 452 | else: 453 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 454 | # Move to working directory to start 455 | os.chdir(main_vars.work_dir) 456 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 457 | result = os.path.join( 458 | main_vars.work_dir, "result", ".aligning.{}.done".format(main_vars.aligner) 459 | ) 460 | 461 | reset_times = main_vars.number_of_tries 462 | running_command = "running_command=" + args.command 463 | delete_files = "delete_files=" + main_vars.running_file 464 | delete_samples = "delete_samples=" + str(samples_names) 465 | sample_name = "sample_name=" + args.sample_name 466 | # TODO: send only what needed instead of sending full object.
467 | samples, samples_names = get_sample_names(args, main_vars) 468 | if args.is_cluster: 469 | cmd = [ 470 | "snakemake", 471 | "-p", 472 | result, 473 | "-j", 474 | args.jobs, 475 | "--profile", 476 | "cluster", 477 | "--nolock", 478 | "--restart-times", 479 | reset_times, 480 | "--config", 481 | sample_dir, 482 | samples, 483 | reference, 484 | running_command, 485 | delete_files, 486 | delete_samples, 487 | sample_name, 488 | *unknownargs, 489 | ] 490 | else: 491 | cmd = [ 492 | "snakemake", 493 | "-p", 494 | result, 495 | "-j", 496 | args.jobs, 497 | "--cluster-config", 498 | cluster_config, 499 | "--nolock", 500 | "--restart-times", 501 | reset_times, 502 | "--config", 503 | sample_dir, 504 | samples, 505 | reference, 506 | running_command, 507 | delete_files, 508 | delete_samples, 509 | sample_name, 510 | *unknownargs, 511 | ] 512 | print(cmd) 513 | run_cmd(cmd) 514 | os.chdir(main_vars.current_dir) 515 | 516 | 517 | def sv(args, unknownargs, main_vars): 518 | if check_samples( 519 | main_vars.work_dir, 520 | main_vars.aligner, 521 | args.samples_files, 522 | main_vars.sample_list_from_config, 523 | args.command, 524 | args.log_file, 525 | ): 526 | pass 527 | if not os.path.exists(main_vars.work_dir): 528 | os.makedirs(main_vars.work_dir) 529 | # If we are already in main princess directory do nothing 530 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 531 | pass 532 | else: 533 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 534 | os.chdir(main_vars.work_dir) 535 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 536 | result = os.path.join( 537 | main_vars.work_dir, "result", ".SVs.{}.done".format(main_vars.aligner) 538 | ) 539 | sample_dir = "sample_directory=" + main_vars.work_dir 540 | reference = ( 541 | "reference=" + args.ref 542 | if args.ref 543 | else "reference=" + main_vars.conf_yaml["reference"] 544 | ) 545 | samples_names = ( 546 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 547 | ) 548 | reset_times = main_vars.number_of_tries 549 | running_command = "running_command=" + args.command 550 | delete_files = "delete_files=" + main_vars.running_file 551 | delete_samples = "delete_samples=" + str(samples_names) 552 | phase_sv = "phase_sv=" + str(args.phase_sv) 553 | mosaic_sv = "mosaic_sv=" + str(args.mosaic_sv) 554 | 555 | if samples_names: 556 | samples, samples_names_str = get_sample_names(args, main_vars) 557 | if args.is_cluster: 558 | cmd = [ 559 | "snakemake", 560 | "-p", 561 | result, 562 | "-j", 563 | args.jobs, 564 | "--profile", 565 | "cluster", 566 | "--config", 567 | sample_dir, 568 | samples, 569 | reference, 570 | running_command, 571 | delete_files, 572 | delete_samples, 573 | phase_sv, 574 | mosaic_sv, 575 | *unknownargs, 576 | ] 577 | else: 578 | cmd = [ 579 | "snakemake", 580 | "-p", 581 | result, 582 | "-j", 583 | args.jobs, 584 | "--cluster-config", 585 | cluster_config, 586 | "--nolock", 587 | "--restart-times", 588 | reset_times, 589 | "--config", 590 | sample_dir, 591 | samples, 592 | reference, 593 | running_command, 594 | delete_files, 595 | delete_samples, 596 | phase_sv, 597 | mosaic_sv, 598 | *unknownargs, 599 | ] 600 | else: 601 | if args.is_cluster: 602 | cmd = [ 603 | "snakemake", 604 | "-p", 605 | result, 606 | "-j", 607 | args.jobs, 608 | "--profile", 609 | "cluster", 610 | "--nolock", 611 | "--restart-times", 612 | reset_times, 613 | "--config", 614 | sample_dir, 615 | reference, 616 | running_command, 617 | delete_files, 618 | 
delete_samples, 619 | phase_sv, 620 | mosaic_sv, 621 | *unknownargs, 622 | ] 623 | else: 624 | cmd = [ 625 | "snakemake", 626 | "-p", 627 | result, 628 | "-j", 629 | args.jobs, 630 | "--cluster-config", 631 | cluster_config, 632 | "--config", 633 | sample_dir, 634 | reference, 635 | running_command, 636 | delete_files, 637 | delete_samples, 638 | phase_sv, 639 | mosaic_sv, 640 | *unknownargs, 641 | ] 642 | run_cmd(cmd) 643 | os.chdir(main_vars.current_dir) 644 | 645 | 646 | def snv(args, unknownargs, main_vars): 647 | if check_samples( 648 | main_vars.work_dir, 649 | main_vars.aligner, 650 | args.samples_files, 651 | main_vars.sample_list_from_config, 652 | args.command, 653 | args.log_file, 654 | ): 655 | pass 656 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 657 | is_valid_chrs(chr_list, args.ref) 658 | if not chr_list: 659 | chr_list = get_chrs(args.ref) 660 | chrs = "chrs=" + str(chr_list) 661 | 662 | # Will I filter the SNVs? 663 | # TODO: filtering was developed for Clair2, now we do not, we need to develop another procedure to filter variants identified by Clair3 664 | filter_snv = "filter_chrs=" + str(args.filter) 665 | 666 | # Which model to use 667 | clair_model: str = ( 668 | "clair_model=" + args.clair_model if args.clair_model else "clair_model=''" 669 | ) 670 | 671 | # If we have samples let us create working directory If not exists. 672 | if not os.path.exists(main_vars.work_dir): 673 | os.makedirs(main_vars.work_dir) 674 | # If we are already in main princess directory do nothing 675 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 676 | pass 677 | else: 678 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 679 | os.chdir(main_vars.work_dir) 680 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 681 | result = os.path.join( 682 | main_vars.work_dir, "result", ".SNVs.{}.done".format(main_vars.aligner) 683 | ) 684 | sample_dir = "sample_directory=" + main_vars.work_dir 685 | reference = ( 686 | "reference=" + args.ref 687 | if args.ref 688 | else "reference=" + main_vars.conf_yaml["reference"] 689 | ) 690 | samples_names = ( 691 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 692 | ) 693 | reset_times = main_vars.number_of_tries 694 | chr_log = " ".join(chr_list) if chr_list else "All Chromosomes" 695 | logger.info("Analyzed Chromosomes: ".format(chr_log)) 696 | logger.info("Clair model: ".format(clair_model)) 697 | running_command = "running_command=" + args.command 698 | delete_files = "delete_files=" + main_vars.running_file 699 | delete_samples = "delete_samples=" + str(samples_names) 700 | gvcf_snv = "gvcf_snv=" + str(args.gvcf_snv) 701 | 702 | if samples_names: 703 | samples, samples_names_str = get_sample_names(args, main_vars) 704 | if args.is_cluster: 705 | cmd = [ 706 | "snakemake", 707 | "-p", 708 | result, 709 | "-j", 710 | args.jobs, 711 | "--profile", 712 | "cluster", 713 | "--nolock", 714 | "--restart-times", 715 | reset_times, 716 | "--config", 717 | filter_snv, 718 | sample_dir, 719 | samples, 720 | reference, 721 | chrs, 722 | running_command, 723 | delete_files, 724 | delete_samples, 725 | clair_model, 726 | gvcf_snv, 727 | *unknownargs, 728 | ] 729 | else: 730 | cmd = [ 731 | "snakemake", 732 | "-p", 733 | result, 734 | "-j", 735 | args.jobs, 736 | "--cluster-config", 737 | cluster_config, 738 | "--nolock", 739 | "--restart-times", 740 | reset_times, 741 | "--config", 742 | filter_snv, 743 | sample_dir, 744 | samples, 745 | 
reference, 746 | chrs, 747 | running_command, 748 | delete_files, 749 | delete_samples, 750 | clair_model, 751 | gvcf_snv, 752 | *unknownargs, 753 | ] 754 | else: 755 | if args.is_cluster: 756 | cmd = [ 757 | "snakemake", 758 | "-p", 759 | result, 760 | "-j", 761 | args.jobs, 762 | "--profile", 763 | "cluster", 764 | "--nolock", 765 | "--restart-times", 766 | reset_times, 767 | "--config", 768 | filter_snv, 769 | sample_dir, 770 | reference, 771 | chrs, 772 | running_command, 773 | delete_files, 774 | delete_samples, 775 | clair_model, 776 | gvcf_snv, 777 | *unknownargs, 778 | ] 779 | else: 780 | cmd = [ 781 | "snakemake", 782 | "-p", 783 | result, 784 | "-j", 785 | args.jobs, 786 | "--cluster-config", 787 | cluster_config, 788 | "--nolock", 789 | "--restart-times", 790 | reset_times, 791 | "--config", 792 | filter_snv, 793 | sample_dir, 794 | reference, 795 | chrs, 796 | running_command, 797 | delete_files, 798 | delete_samples, 799 | clair_model, 800 | gvcf_snv, 801 | *unknownargs, 802 | ] 803 | run_cmd(cmd) 804 | os.chdir(main_vars.current_dir) 805 | 806 | 807 | def variant(args, unknownargs, main_vars): 808 | if check_samples( 809 | main_vars.work_dir, 810 | main_vars.aligner, 811 | args.samples_files, 812 | main_vars.sample_list_from_config, 813 | args.command, 814 | args.log_file, 815 | ): 816 | pass 817 | # check if the user gave existing chromosomes. 818 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 819 | is_valid_chrs(chr_list, args.ref) 820 | if not chr_list: 821 | chr_list = get_chrs(args.ref) 822 | chrs = "chrs=" + str(chr_list) 823 | # Will I filter the SNVs? 824 | filter_snv = "filter_chrs=" + str(args.filter) 825 | samples_names = ( 826 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 827 | ) 828 | # If we have samples let us create working directory If not exists. 
829 | if not os.path.exists(main_vars.work_dir): 830 | os.makedirs(main_vars.work_dir) 831 | # If we are already in main princess directory do nothing 832 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 833 | pass 834 | else: 835 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 836 | os.chdir(main_vars.work_dir) 837 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 838 | result = os.path.join( 839 | main_vars.work_dir, "result", ".variant.{}.done".format(main_vars.aligner) 840 | ) 841 | sample_dir = "sample_directory=" + main_vars.work_dir 842 | reference = ( 843 | "reference=" + args.ref 844 | if args.ref 845 | else "reference=" + main_vars.conf_yaml["reference"] 846 | ) 847 | reset_times = main_vars.number_of_tries 848 | running_command = "running_command=" + args.command 849 | delete_files = "delete_files=" + main_vars.running_file 850 | delete_samples = "delete_samples=" + str(samples_names) 851 | clair_model: str = ( 852 | "clair_model=" + args.clair_model if args.clair_model else "clair_model=''" 853 | ) 854 | 855 | if samples_names: 856 | samples, samples_names_str = get_sample_names(args, main_vars) 857 | if args.is_cluster: 858 | cmd = [ 859 | "snakemake", 860 | "-p", 861 | result, 862 | "-j", 863 | args.jobs, 864 | "--profile", 865 | "cluster", 866 | "--nolock", 867 | "--restart-times", 868 | reset_times, 869 | "--config", 870 | filter_snv, 871 | sample_dir, 872 | samples, 873 | reference, 874 | chrs, 875 | running_command, 876 | delete_files, 877 | delete_samples, 878 | clair_model, 879 | *unknownargs, 880 | ] 881 | else: 882 | cmd = [ 883 | "snakemake", 884 | "-p", 885 | result, 886 | "-j", 887 | args.jobs, 888 | "--cluster-config", 889 | cluster_config, 890 | "--nolock", 891 | "--restart-times", 892 | reset_times, 893 | "--config", 894 | filter_snv, 895 | sample_dir, 896 | samples, 897 | reference, 898 | chrs, 899 | running_command, 900 | delete_files, 901 | delete_samples, 902 | clair_model, 903 | *unknownargs, 904 | ] 905 | else: 906 | if args.is_cluster: 907 | cmd = [ 908 | "snakemake", 909 | "-p", 910 | result, 911 | "-j", 912 | args.jobs, 913 | "--profile", 914 | "cluster", 915 | "--nolock", 916 | "--restart-times", 917 | reset_times, 918 | "--config", 919 | filter_snv, 920 | sample_dir, 921 | reference, 922 | chrs, 923 | running_command, 924 | delete_files, 925 | delete_samples, 926 | clair_model, 927 | *unknownargs, 928 | ] 929 | else: 930 | cmd = [ 931 | "snakemake", 932 | "-p", 933 | result, 934 | "-j", 935 | args.jobs, 936 | "--cluster-config", 937 | cluster_config, 938 | "--nolock", 939 | "--restart-times", 940 | reset_times, 941 | "--config", 942 | filter_snv, 943 | sample_dir, 944 | reference, 945 | chrs, 946 | running_command, 947 | delete_files, 948 | delete_samples, 949 | clair_model, 950 | *unknownargs, 951 | ] 952 | 953 | log_chrs = " ".join(chr_list) if chr_list else "All Chromosomes" 954 | logger.info("{}{}".format("Chromosomes that will be analyzed: ", log_chrs)) 955 | logger.info("SNVs will be filtered: {}".format(str(args.filter))) 956 | logger.info("Clair model: {}".format(args.clair_model)) 957 | logger.info("Work directory: {}".format(sample_dir)) 958 | logger.info("Reference: {}".format(args.ref)) 959 | logger.info("Aligner: {}".format(main_vars.aligner)) 960 | logger.info("Cluster Will be used: {}".format(args.is_cluster)) 961 | logger.info("Samples:\n{}".format("\n".join(samples_names))) 962 | logger.info("Results:\t{}".format(result)) 963 | run_cmd(cmd) 964 | 
os.chdir(main_vars.current_dir) 965 | 966 | 967 | def all_analysis(args, unknownargs, main_vars): 968 | if check_samples( 969 | main_vars.work_dir, 970 | main_vars.aligner, 971 | args.samples_files, 972 | main_vars.sample_list_from_config, 973 | args.command, 974 | args.log_file, 975 | ): 976 | pass 977 | if ( 978 | args.detect_methylation 979 | and not (args.methylation_dir 980 | and dir_path(args.methylation_dir)) 981 | ): 982 | logger.error("Options -m and -md are mutually inclusive: -md must point to an existing directory when -m is set") 983 | exit( 984 | f"Error exiting, see log file {os.path.join(args.directory, args.log_file)}" 985 | ) 986 | 987 | # Methylation option and directory 988 | methylation_option = "methylation=" + str(args.detect_methylation) 989 | meth_dir = "fast5_dir=" + str(args.methylation_dir) 990 | 991 | # check if the user gave existing chromosomes. 992 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 993 | is_valid_chrs(chr_list, args.ref) 994 | if not chr_list: 995 | chr_list = get_chrs(args.ref) 996 | chrs = "chrs=" + str(chr_list) 997 | 998 | # Will I filter the SNVs? 999 | filter_snv = "filter_chrs=" + str(args.filter) 1000 | samples_names = ( 1001 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 1002 | ) 1003 | 1004 | # If we have samples let us create working directory If not exists. 1005 | if not os.path.exists(main_vars.work_dir): 1006 | os.makedirs(main_vars.work_dir) 1007 | # If we are already in main princess directory do nothing 1008 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 1009 | pass 1010 | else: 1011 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 1012 | os.chdir(main_vars.work_dir) 1013 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 1014 | sample_dir = "sample_directory=" + main_vars.work_dir 1015 | reference = ( 1016 | "reference=" + args.ref 1017 | if args.ref 1018 | else "reference=" + main_vars.conf_yaml["reference"] 1019 | ) 1020 | reset_times = main_vars.number_of_tries 1021 | running_command = "running_command=" + args.command 1022 | delete_files = "delete_files=" + main_vars.running_file 1023 | delete_samples = "delete_samples=" + str(samples_names) 1024 | sample_name = "sample_name=" + args.sample_name 1025 | phase_sv = "phase_sv=" + str(args.phase_sv) 1026 | mosaic_sv = "mosaic_sv=" + str(args.mosaic_sv) 1027 | clair_model = ( 1028 | "clair_model=" + args.clair_model if args.clair_model else "clair_model=''" 1029 | ) 1030 | gvcf_snv = "gvcf_snv=" + str(args.gvcf_snv) 1031 | 1032 | if samples_names: 1033 | samples, samples_names_str = get_sample_names(args, main_vars) 1034 | if args.is_cluster: 1035 | cmd = [ 1036 | "snakemake", 1037 | "-p", 1038 | "-j", 1039 | args.jobs, 1040 | "--profile", 1041 | "cluster", 1042 | "--nolock", 1043 | "--restart-times", 1044 | reset_times, 1045 | "--config", 1046 | methylation_option, 1047 | meth_dir, 1048 | filter_snv, 1049 | sample_dir, 1050 | samples, 1051 | reference, 1052 | chrs, 1053 | running_command, 1054 | delete_files, 1055 | delete_samples, 1056 | clair_model, 1057 | sample_name, 1058 | phase_sv, 1059 | mosaic_sv, 1060 | gvcf_snv, 1061 | *unknownargs, 1062 | ] 1063 | else: 1064 | cmd = [ 1065 | "snakemake", 1066 | "-p", 1067 | "-j", 1068 | args.jobs, 1069 | "--cluster-config", 1070 | cluster_config, 1071 | "--nolock", 1072 | "--restart-times", 1073 | reset_times, 1074 | "--config", 1075 | methylation_option, 1076 | meth_dir, 1077 | filter_snv, 1078 | sample_dir, 1079 | samples, 1080 | reference, 1081 | chrs, 1082 | 
running_command, 1083 | delete_files, 1084 | delete_samples, 1085 | clair_model, 1086 | sample_name, 1087 | phase_sv, 1088 | mosaic_sv, 1089 | gvcf_snv, 1090 | *unknownargs, 1091 | ] 1092 | else: 1093 | if args.is_cluster: 1094 | cmd = [ 1095 | "snakemake", 1096 | "-p", 1097 | "-j", 1098 | args.jobs, 1099 | "--profile", 1100 | "cluster", 1101 | "--nolock", 1102 | "--restart-times", 1103 | reset_times, 1104 | "--config", 1105 | methylation_option, 1106 | meth_dir, 1107 | filter_snv, 1108 | sample_dir, 1109 | reference, 1110 | chrs, 1111 | running_command, 1112 | delete_files, 1113 | delete_samples, 1114 | clair_model, 1115 | sample_name, 1116 | phase_sv, 1117 | mosaic_sv, 1118 | gvcf_snv, 1119 | *unknownargs, 1120 | ] 1121 | else: 1122 | cmd = [ 1123 | "snakemake", 1124 | "-p", 1125 | "-j", 1126 | args.jobs, 1127 | "--cluster-config", 1128 | cluster_config, 1129 | "--nolock", 1130 | "--restart-times", 1131 | reset_times, 1132 | "--config", 1133 | methylation_option, 1134 | meth_dir, 1135 | filter_snv, 1136 | sample_dir, 1137 | reference, 1138 | chrs, 1139 | running_command, 1140 | delete_files, 1141 | delete_samples, 1142 | clair_model, 1143 | sample_name, 1144 | phase_sv, 1145 | mosaic_sv, 1146 | gvcf_snv, 1147 | *unknownargs, 1148 | ] 1149 | 1150 | log_chrs = " ".join(chr_list) if chr_list else "All Chromosomes" 1151 | logger.info("{}{}".format("Chromosomes that will be analyzed: ", log_chrs)) 1152 | logger.info("SNVs will be filtered: {}".format(str(args.filter))) 1153 | logger.info("Work directory: {}".format(sample_dir)) 1154 | logger.info("Reference: {}".format(args.ref)) 1155 | logger.info("Aligner: {}".format(main_vars.aligner)) 1156 | logger.info("Cluster Will be used: {}".format(args.is_cluster)) 1157 | logger.info("Methylation will be detected: {}".format(args.detect_methylation)) 1158 | logger.info("Fast5 directory for Methylation: {}".format(args.methylation_dir)) 1159 | logger.info("Samples:\n{}".format("\n".join(samples_names))) 1160 | logger.info("Sample name: {}".format(args.sample_name)) 1161 | logger.info("Clair model: {}".format(args.clair_model)) 1162 | logger.info("Running command:\n{}".format(str(cmd))) 1163 | run_cmd(cmd) 1164 | os.chdir(main_vars.current_dir) 1165 | 1166 | 1167 | def phase(args, unknownargs, main_vars): 1168 | if check_samples( 1169 | main_vars.work_dir, 1170 | main_vars.aligner, 1171 | args.samples_files, 1172 | main_vars.sample_list_from_config, 1173 | args.command, 1174 | args.log_file, 1175 | ): 1176 | pass 1177 | # check if the user gave existing chromosomes. 1178 | chr_list = args.chrs if args.chrs else main_vars.conf_yaml["chrs"] 1179 | is_valid_chrs(chr_list, args.ref) 1180 | if not chr_list: 1181 | chr_list = get_chrs(args.ref) 1182 | chrs = "chrs=" + str(chr_list) 1183 | 1184 | # Will I filter the SNVs? 1185 | filter_snv = "filter_chrs=" + str(args.filter) 1186 | 1187 | # If we have samples let us create working directory If not exists. 
1188 | if not os.path.exists(main_vars.work_dir): 1189 | os.makedirs(main_vars.work_dir) 1190 | # If we are already in main princess directory do nothing 1191 | if os.path.samefile(main_vars.running_file, main_vars.work_dir): 1192 | pass 1193 | else: 1194 | copy_tree(main_vars.running_file, main_vars.work_dir, update=1) 1195 | os.chdir(main_vars.work_dir) 1196 | cluster_config = os.path.join(main_vars.work_dir, "cluster", "cluster_config.yaml") 1197 | # result = os.path.join(main_vars.work_dir, 'phased', main_vars.aligner, 'data.vcf') 1198 | result = os.path.join( 1199 | main_vars.work_dir, "result", "phased.SNVs.{}.done".format(main_vars.aligner) 1200 | ) 1201 | sample_dir = "sample_directory=" + main_vars.work_dir 1202 | reference = ( 1203 | "reference=" + args.ref 1204 | if args.ref 1205 | else "reference=" + main_vars.conf_yaml["reference"] 1206 | ) 1207 | samples_names = ( 1208 | args.samples_files if args.samples_files else main_vars.sample_list_from_config 1209 | ) 1210 | reset_times = main_vars.number_of_tries 1211 | running_command = "running_command=" + args.command 1212 | delete_files = "delete_files=" + main_vars.running_file 1213 | delete_samples = "delete_samples=" + str(samples_names) 1214 | 1215 | if samples_names: 1216 | samples, samples_names_str = get_sample_names(args, main_vars) 1217 | if args.is_cluster: 1218 | cmd = [ 1219 | "snakemake", 1220 | "-p", 1221 | result, 1222 | "-j", 1223 | args.jobs, 1224 | "--profile", 1225 | "cluster", 1226 | "--nolock", 1227 | "--restart-times", 1228 | reset_times, 1229 | "--config", 1230 | filter_snv, 1231 | sample_dir, 1232 | samples, 1233 | reference, 1234 | chrs, 1235 | running_command, 1236 | delete_files, 1237 | delete_samples, 1238 | *unknownargs, 1239 | ] 1240 | else: 1241 | cmd = [ 1242 | "snakemake", 1243 | "-p", 1244 | result, 1245 | "-j", 1246 | args.jobs, 1247 | "--cluster-config", 1248 | cluster_config, 1249 | "--nolock", 1250 | "--restart-times", 1251 | reset_times, 1252 | "--config", 1253 | filter_snv, 1254 | sample_dir, 1255 | samples, 1256 | reference, 1257 | chrs, 1258 | running_command, 1259 | delete_files, 1260 | delete_samples, 1261 | *unknownargs, 1262 | ] 1263 | else: 1264 | if args.is_cluster: 1265 | cmd = [ 1266 | "snakemake", 1267 | "-p", 1268 | result, 1269 | "-j", 1270 | args.jobs, 1271 | "--profile", 1272 | "cluster", 1273 | "--nolock", 1274 | "--restart-times", 1275 | reset_times, 1276 | "--config", 1277 | filter_snv, 1278 | sample_dir, 1279 | reference, 1280 | chrs, 1281 | running_command, 1282 | delete_files, 1283 | delete_samples, 1284 | *unknownargs, 1285 | ] 1286 | else: 1287 | cmd = [ 1288 | "snakemake", 1289 | "-p", 1290 | result, 1291 | "-j", 1292 | args.jobs, 1293 | "--cluster-config", 1294 | cluster_config, 1295 | "--nolock", 1296 | "--restart-times", 1297 | reset_times, 1298 | "--config", 1299 | filter_snv, 1300 | sample_dir, 1301 | reference, 1302 | chrs, 1303 | running_command, 1304 | delete_files, 1305 | delete_samples, 1306 | *unknownargs, 1307 | ] 1308 | log_chrs = " ".join(chr_list) if chr_list else "All Chromosomes" 1309 | logger.info("{}{}".format("Chromosomes that will be analyzed: ", log_chrs)) 1310 | logger.info("SNVs will be filtered: {}".format(str(args.filter))) 1311 | logger.info("Work directory: {}".format(sample_dir)) 1312 | logger.info("Reference: {}".format(args.ref)) 1313 | logger.info("Aligner: {}".format(main_vars.aligner)) 1314 | logger.info("Cluster Will be used: {}".format(args.is_cluster)) 1315 | logger.info("Samples:\n{}".format("\n".join(samples_names))) 1316 | 
logger.info("Samples:\n{}".format("\n".join(samples_names))) 1317 | logger.info("Results:\n{}".format(result)) 1318 | logger.info("running command\n{}".format("\t".join(cmd))) 1319 | run_cmd(cmd) 1320 | os.chdir(main_vars.current_dir) 1321 | 1322 | 1323 | def sort_params(args, unknownargs): 1324 | # To follow directly the param --config 1325 | if args.aligner: 1326 | unknownargs.insert(0, "aligner={}".format(args.aligner)) 1327 | 1328 | if args.read_type: 1329 | unknownargs.insert(0, "read_type={}".format(args.read_type)) 1330 | 1331 | # add other snakemake params at the tail of the list 1332 | if args.use_conda: 1333 | unknownargs.append("--use-conda") 1334 | 1335 | return unknownargs 1336 | 1337 | 1338 | def required_vars(args, unknownargs): 1339 | current_dir = os.getcwd() 1340 | running_file = os.path.dirname(os.path.realpath(__file__)) 1341 | work_dir = os.path.abspath(args.directory) 1342 | 1343 | # creating DIRECTORY 1344 | if not os.path.exists(work_dir): 1345 | os.makedirs(work_dir) 1346 | 1347 | # loading info from yaml file (configfile) 1348 | if not os.path.exists(os.path.join(work_dir, "config.yaml")) or not filecmp.cmp( 1349 | os.path.join(running_file, "config.yaml"), 1350 | os.path.exists(os.path.join(work_dir, "config.yaml")), 1351 | ): 1352 | shutil.copy(os.path.join(running_file, "config.yaml"), work_dir) 1353 | 1354 | with open(os.path.join(work_dir, "config.yaml"), "r") as myyaml: 1355 | conf_yaml = yaml.safe_load(myyaml) 1356 | 1357 | aligner = args.aligner if args.aligner else str(conf_yaml["aligner"]) 1358 | # TODO: you shall create this variable by checking first if it was passed as argument else use config file. 1359 | sample_list_from_config = conf_yaml["sample_list"] 1360 | number_of_jobs = args.jobs if args.jobs else str(conf_yaml["cluster_jobs"]) 1361 | number_of_tries = str(conf_yaml["number_of_tries"]) 1362 | 1363 | return ( 1364 | current_dir, 1365 | running_file, 1366 | work_dir, 1367 | conf_yaml, 1368 | aligner, 1369 | sample_list_from_config, 1370 | number_of_jobs, 1371 | number_of_tries, 1372 | ) 1373 | 1374 | 1375 | def get_sample_names(args, main_vars): 1376 | final_samples = "" 1377 | samples_names = "" 1378 | if args.samples_files or main_vars.sample_list_from_config: 1379 | samples = ( 1380 | [os.path.abspath(i) for i in args.samples_files] 1381 | if args.samples_files 1382 | else main_vars.sample_list_from_config 1383 | ) 1384 | # get samples names and soft link them in the new directory 1385 | for sample in samples: 1386 | if not os.path.isfile(sample): 1387 | print("This sample {} does not exist".format(sample)) 1388 | exit( 1389 | f"Error extincting, see log file {os.path.join(args.directory, args.log_file)}" 1390 | ) 1391 | absolute_name = ntpath.basename(sample) 1392 | if not os.path.islink( 1393 | os.path.join(main_vars.work_dir, absolute_name) 1394 | ) and not os.path.isfile(os.path.join(main_vars.work_dir, absolute_name)): 1395 | os.symlink(sample, os.path.join(main_vars.work_dir, absolute_name)) 1396 | if samples_names: 1397 | samples_names += " " + absolute_name 1398 | else: 1399 | samples_names += absolute_name 1400 | final_samples = "sample_list=" + samples_names 1401 | return final_samples, samples_names 1402 | 1403 | 1404 | def run_cmd(cmd): 1405 | try: 1406 | subprocess.run(cmd, check=True, universal_newlines=True) 1407 | except subprocess.CalledProcessError as e: 1408 | v = " ".join(cmd) 1409 | print(f"Running:\n{v}") 1410 | logger.error( 1411 | "Error in subprocess:\nCommand: {}\nError: {}".format(" ".join(cmd), e) 1412 | ) 1413 | 
1414 | 1415 | def is_valid_chrs(chr_list, ref): 1416 | if chr_list: 1417 | if os.path.isfile(ref + ".fai"): 1418 | chr_names = set() 1419 | with open(ref + ".fai", "r") as data_in: 1420 | for line in data_in: 1421 | chr_names.add(str(line.split()[0])) 1422 | if not set(chr_list).issubset(chr_names): 1423 | print( 1424 | "One or more of the chromosome names you gave {} do not exist in the reference.\nSupported chromosomes are: {}".format( 1425 | str(chr_list), sorted(chr_names) 1426 | ) 1427 | ) 1428 | else: 1429 | print( 1430 | "Please make sure that {ref}.fai exists.\nOtherwise run:\nsamtools faidx {ref}".format( 1431 | ref=ref 1432 | ) 1433 | ) 1434 | 1435 | 1436 | def get_chrs(ref): 1437 | if os.path.isfile(ref + ".fai"): 1438 | chr_names = [] 1439 | with open(ref + ".fai", "r") as data_in: 1440 | for line in data_in: 1441 | chr_names.append(str(line.split()[0])) 1442 | return chr_names 1443 | else: 1444 | print( 1445 | "Please make sure that {ref}.fai exists.\nOtherwise run:\nsamtools faidx {ref}".format( 1446 | ref=ref 1447 | ) 1448 | ) 1449 | 1450 | 1451 | def clean(source_dir, samples_names): 1452 | file_list = os.listdir(source_dir) 1453 | if samples_names: 1454 | for f in samples_names.split(): 1455 | os.remove(f) 1456 | for f in file_list: 1457 | if os.path.isfile(f): 1458 | os.remove(f) 1459 | else: 1460 | shutil.rmtree(f) 1461 | 1462 | 1463 | def dir_path(path): 1464 | return os.path.isdir(path) 1465 | 1466 | 1467 | def check_samples(work_dir, aligner, samples_files, sample_list, command, log_file): 1468 | if ( 1469 | not ( 1470 | os.path.exists(os.path.join(work_dir, "align", aligner, "data.bam")) 1471 | or os.path.exists(os.path.join(work_dir, "align", aligner, "data_hap.bam")) 1472 | ) 1473 | and not samples_files 1474 | and not sample_list 1475 | ): 1476 | logger.error( 1477 | "To run the {} command there must be an aligned file such as {}; otherwise use -s to provide samples, or set the sample_list field in config.yaml, so Princess knows which files to align".format( 1478 | command, os.path.join(work_dir, "align", aligner, "data.bam") 1479 | ) 1480 | ) 1481 | exit(f"Error exiting, see log file {os.path.join(work_dir, log_file)}") 1482 | else: 1483 | return True 1484 | 1485 | 1486 | def main(): 1487 | get_args() 1488 | 1489 | 1490 | if __name__ == "__main__": 1491 | main() 1492 | --------------------------------------------------------------------------------
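Note (editor's sketch): to make the .fai handling in is_valid_chrs() and get_chrs() concrete, a samtools faidx index is a tab-separated file whose first column is the sequence name, so collecting column one of every line yields the chromosome list that user-supplied names are checked against. The toy index below is an assumption for illustration (the offset and line-width columns are made up); only the first column matters here.

    import os
    import tempfile

    # Build a toy ref.fa.fai with two sequences (columns: name, length, offset, linebases, linewidth).
    toy_fai = os.path.join(tempfile.mkdtemp(), "ref.fa.fai")
    with open(toy_fai, "w") as out:
        out.write("chr21\t46709983\t7\t60\t61\n")
        out.write("chr22\t50818468\t47488609\t60\t61\n")

    # The first whitespace-separated field of each line is the chromosome name,
    # exactly what get_chrs() collects and is_valid_chrs() tests membership against.
    with open(toy_fai) as data_in:
        chr_names = [line.split()[0] for line in data_in]

    print(chr_names)                            # ['chr21', 'chr22']
    print(set(["chr21"]).issubset(chr_names))   # True  -> the name would be accepted
    print(set(["chrM"]).issubset(chr_names))    # False -> is_valid_chrs() would warn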