├── environment ├── conda │ ├── deactivate │ │ └── env_vars.sh │ ├── activate │ │ └── env_vars.sh │ ├── conda_convert.yml │ ├── testing │ │ ├── testasm.yml │ │ └── rtest.yml │ ├── conda_dipassm.yml │ ├── conda_bifrost.yml │ ├── conda_merqury.yml │ ├── conda_pbtools.yml │ ├── conda_shelltools.yml │ ├── conda_rtools.yml │ ├── conda_export_env_vars.sh │ ├── conda_pyscript.yml │ ├── conda_biotools.yml │ └── conda_evaltools.yml ├── sync │ ├── include.txt │ ├── exclude.txt │ └── sync_commands.md └── snakemake │ ├── cluster │ ├── denbi_tu_slurm │ │ ├── denbi_cluster.json │ │ └── config.yaml │ ├── deep │ │ ├── config.yaml │ │ └── deep_cluster.json │ ├── denbi_tu_pbs │ │ ├── config.yaml │ │ └── denbi_cluster.json │ └── hhu_pbs │ │ ├── config.yaml │ │ └── hilbert_queues.md │ ├── demo │ └── config.yaml │ ├── server │ ├── denbi_europa │ │ └── config.yaml │ └── d3compute │ │ └── config.yaml │ └── laptop │ └── config.yaml ├── smk_config ├── selectors │ ├── hgsvc_blacklist.yml │ ├── test_pb.yml │ ├── ccs_prod_run.yml │ ├── test_ont.yml │ ├── hgsvc_clr_run.yml │ ├── clr_prod_linknhr_run.yml │ ├── clr_prod_run.yml │ ├── hgsvc_ccs_run.yml │ └── lansdorp.yml ├── demo │ ├── run_env.yml │ ├── na12878.yml │ └── params.yml ├── run_env │ ├── smk_cfg_env-laptop.yml │ ├── smk_cfg_env-hhu.yml │ ├── smk_cfg_env-mmci.yml │ └── smk_cfg_env-valet.yml ├── data_sources │ ├── sseq_local_denbi.yml │ ├── hgsvc_local_denbi.yml │ ├── hgsvc_ftp_src_illumina.yml │ ├── hgsvc_ftp_src_strandseq.yml │ └── hgsvc_local_hhu.yml ├── samples │ ├── hgsvc │ │ ├── AFR │ │ │ ├── ESN │ │ │ │ ├── hg03125.yml │ │ │ │ └── hg03371.yml │ │ │ ├── GWD │ │ │ │ ├── hg02818.yml │ │ │ │ └── hg02587.yml │ │ │ ├── MSL │ │ │ │ ├── hg03486.yml │ │ │ │ └── hg03065.yml │ │ │ ├── LWK │ │ │ │ └── na19036.yml │ │ │ ├── ACB │ │ │ │ └── hg02011.yml │ │ │ ├── ASW │ │ │ │ └── na19983.yml │ │ │ └── YRI │ │ │ │ ├── na19240.yml │ │ │ │ ├── na19238.yml │ │ │ │ └── na19239.yml │ │ ├── AMR │ │ │ ├── PEL │ │ │ │ └── hg01573.yml │ │ │ ├── CLM │ │ │ 
│ └── hg01114.yml │ │ │ ├── MXL │ │ │ │ └── na19650.yml │ │ │ └── PUR │ │ │ │ ├── hg00732.yml │ │ │ │ └── hg00731.yml │ │ ├── SAS │ │ │ ├── PJL │ │ │ │ └── hg02492.yml │ │ │ ├── BEB │ │ │ │ └── hg03009.yml │ │ │ ├── ITU │ │ │ │ ├── hg03721.yml │ │ │ │ └── hg03732.yml │ │ │ ├── STU │ │ │ │ └── hg03683.yml │ │ │ └── GIH │ │ │ │ └── na20847.yml │ │ ├── EUR │ │ │ ├── CEU │ │ │ │ └── na12329.yml │ │ │ ├── FIN │ │ │ │ └── hg00171.yml │ │ │ ├── IBS │ │ │ │ └── hg01505.yml │ │ │ ├── GBR │ │ │ │ └── hg00096.yml │ │ │ └── TSI │ │ │ │ └── na20509.yml │ │ └── EAS │ │ │ ├── KHV │ │ │ ├── hg01596.yml │ │ │ └── hg02018.yml │ │ │ ├── CDX │ │ │ └── hg00864.yml │ │ │ ├── JPT │ │ │ └── na18939.yml │ │ │ ├── CHB │ │ │ └── na18534.yml │ │ │ └── CHS │ │ │ ├── hg00514.yml │ │ │ ├── hg00513.yml │ │ │ └── hg00512.yml │ ├── na12878.yml │ ├── na24143.yml │ └── na24149.yml └── params │ ├── smk_cfg_params_RV7.yml │ ├── smk_cfg_params_RV8.yml │ ├── smk_cfg_params_RV10.yml │ ├── smk_cfg_params_RV9.yml │ ├── smk_cfg_params_RV11.yml │ └── smk_cfg_params_RV12.yml ├── annotation ├── grch38 │ ├── known_regions │ │ ├── ucsc_segdups.tsv.gz │ │ ├── GRCh38_p13_chromXY_PAR.tsv │ │ └── Modeled_regions_for_GRCh38.tsv │ ├── 20200723_GRCh38_p13_regions.bed │ └── issues │ │ └── grch38_p13_unknown.tsv ├── 20200507_ASanders_100cell_controls.txt ├── 1kg_hgsvc_colors.csv ├── sample_table.tsv ├── NA24385_selected_libraries_sseq.csv ├── in_preparation │ └── bl_supp_HG02818_HG03125_HG03486_NA19434.txt └── 20200507_ASanders_wgs_cells.txt ├── scripts ├── run_saarclust.R ├── install_saarclust.R ├── install_strandphaser.R ├── run_strandphaser.R ├── run_breakpointr.R ├── eval │ ├── response │ │ ├── response-reviewer3-comment12.py │ │ └── response-reviewer3-comment2.py │ └── extract_contigs.py ├── install_breakpointr.R ├── fb-parallel-timeout.sh ├── utilities │ ├── mem_profiler.py │ ├── check_scripts │ │ ├── fastq_checker.py │ │ ├── tagging_checker.py │ │ └── fasta_checker.py │ ├── process_logger.py │ ├── 
inspect_environment.py │ ├── version_checker.py │ └── summarize_vcf.py ├── dev │ ├── ref_phasing │ │ ├── prep_vcf.py │ │ └── prep_ref.py │ ├── cluster_splitter.py │ └── hybrid_renamer.py └── plot_saarclust_diagnostics.R ├── notes ├── align_ccs_racon.md ├── minimap_ctg_ref.md └── align_strandseq.md ├── smk_include ├── module_includes.smk ├── link_data_sources.smk ├── results │ ├── run_eur_trios.smk │ ├── run_sas_trios.smk │ ├── run_amr_trios.smk │ └── run_eas_trios.smk ├── dev │ └── run_all_eval.smk ├── haploid_read_coverage.smk └── eval_known_reference.smk ├── LICENSE ├── .gitignore ├── docs ├── demo.md └── autoconf.md ├── README.md └── notebooks ├── 2020_project └── processing │ └── clean_segdups_annotation.ipynb ├── dev └── merge_numpy_aln_dumps.ipynb ├── subsample_hg00733_strandseq.ipynb └── dump_sample_table.ipynb /environment/conda/deactivate/env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | unset SGE_ROOT 4 | unset SGE_CELL -------------------------------------------------------------------------------- /smk_config/selectors/hgsvc_blacklist.yml: -------------------------------------------------------------------------------- 1 | file_download_blacklist: "annotation/hgsvc_blacklist.txt" -------------------------------------------------------------------------------- /environment/conda/activate/env_vars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | export SGE_ROOT=/TL/deep-gridengine 4 | export SGE_CELL=deep -------------------------------------------------------------------------------- /environment/sync/include.txt: -------------------------------------------------------------------------------- 1 | *.fasta 2 | *.vcf.bgz 3 | *.vcf.bgz.tbi 4 | *stats 5 | *.pdf 6 | *.txt 7 | *.tsv 8 | *.log 9 | *.rsrc -------------------------------------------------------------------------------- /smk_config/demo/run_env.yml: 
-------------------------------------------------------------------------------- 1 | 2 | num_cpu_max: 24 3 | num_cpu_high: 24 4 | num_cpu_medium: 12 5 | num_cpu_low: 4 6 | 7 | env_module_singularity: False -------------------------------------------------------------------------------- /smk_config/selectors/test_pb.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - flye 5 | var_caller: 6 | - longshot 7 | name: 8 | - pacbio_test -------------------------------------------------------------------------------- /annotation/grch38/known_regions/ucsc_segdups.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ptrebert/project-diploid-assembly/HEAD/annotation/grch38/known_regions/ucsc_segdups.tsv.gz -------------------------------------------------------------------------------- /annotation/grch38/known_regions/GRCh38_p13_chromXY_PAR.tsv: -------------------------------------------------------------------------------- 1 | "PAR#1" "X" 10001 2781479 2 | "PAR#2" "X" 155701383 156030895 3 | "PAR#1" "Y" 10001 2781479 4 | "PAR#2" "Y" 56887903 57217415 5 | -------------------------------------------------------------------------------- /environment/conda/conda_convert.yml: -------------------------------------------------------------------------------- 1 | name: convert 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.6 8 | - pip 9 | - seqtk 10 | - dnaio 11 | -------------------------------------------------------------------------------- /smk_config/selectors/ccs_prod_run.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - pereg 5 | var_caller: 6 | - deepvar 7 | 8 | skip_targets: 9 | name: 10 | - pacbio_test 11 | - nanopore_test 12 | 
-------------------------------------------------------------------------------- /environment/conda/testing/testasm.yml: -------------------------------------------------------------------------------- 1 | name: testasm 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - Python=3.6.* 8 | - pip=19.2.3 9 | - flye=2.6 10 | - raven-assembler=0.0.1 -------------------------------------------------------------------------------- /smk_config/run_env/smk_cfg_env-laptop.yml: -------------------------------------------------------------------------------- 1 | 2 | notify: False 3 | notify_email: ebertp@hhu.de 4 | 5 | env_module_singularity: False 6 | 7 | num_cpu_max: 4 8 | num_cpu_high: 3 9 | num_cpu_medium: 2 10 | num_cpu_low: 1 11 | -------------------------------------------------------------------------------- /smk_config/run_env/smk_cfg_env-hhu.yml: -------------------------------------------------------------------------------- 1 | 2 | notify: False 3 | notify_email: ebertp@hhu.de 4 | 5 | env_module_singularity: Singularity 6 | 7 | num_cpu_max: 72 8 | num_cpu_high: 24 9 | num_cpu_medium: 12 10 | num_cpu_low: 6 11 | -------------------------------------------------------------------------------- /smk_config/run_env/smk_cfg_env-mmci.yml: -------------------------------------------------------------------------------- 1 | 2 | notify: True 3 | notify_email: pebert@mpi-inf.mpg.de 4 | 5 | env_module_singularity: False 6 | 7 | num_cpu_max: 48 8 | num_cpu_high: 24 9 | num_cpu_medium: 12 10 | num_cpu_low: 6 11 | -------------------------------------------------------------------------------- /smk_config/selectors/test_ont.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - shasta 5 | - flye 6 | var_caller: 7 | - longshot 8 | name: 9 | - nanopore_test 10 | 11 | select_target_path: REPORT_DRAFT_HAPLOID_ASSEMBLY 12 | 
-------------------------------------------------------------------------------- /smk_config/selectors/hgsvc_clr_run.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - flye 5 | hap_assembler: 6 | - flye 7 | var_caller: 8 | - longshot 9 | 10 | skip_targets: 11 | name: 12 | - pacbio_test 13 | - nanopore_test 14 | -------------------------------------------------------------------------------- /smk_config/run_env/smk_cfg_env-valet.yml: -------------------------------------------------------------------------------- 1 | 2 | notify: False 3 | notify_email: pebert@mpi-inf.mpg.de 4 | 5 | force_local_copy: True 6 | 7 | env_module_singularity: False 8 | 9 | num_cpu_max: 36 10 | num_cpu_high: 36 11 | num_cpu_medium: 12 12 | num_cpu_low: 6 13 | -------------------------------------------------------------------------------- /smk_config/selectors/clr_prod_linknhr_run.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - uw27 5 | - jax27 6 | - hhu26 7 | - hhu27 8 | var_caller: 9 | - longshot 10 | 11 | skip_targets: 12 | name: 13 | - pacbio_test 14 | - nanopore_test 15 | -------------------------------------------------------------------------------- /smk_config/selectors/clr_prod_run.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - flye 5 | - uw27r 6 | - mpi27r 7 | - jax27r 8 | - hhu26 9 | var_caller: 10 | - longshot 11 | 12 | skip_targets: 13 | name: 14 | - pacbio_test 15 | - nanopore_test 16 | -------------------------------------------------------------------------------- /smk_config/selectors/hgsvc_ccs_run.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - pereg 5 | hap_assembler: 6 | - pereg 7 | var_caller: 8 | - deepvar 9 | 10 | skip_targets: 
11 | name: 12 | - pacbio_test 13 | - nanopore_test 14 | - hifi_subsampling 15 | -------------------------------------------------------------------------------- /scripts/run_saarclust.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(SaaRclust)) 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | 7 | scaffoldDenovoAssembly( 8 | configfile = args[1], 9 | bamfolder = args[2], 10 | outputfolder = args[3] 11 | ) 12 | 13 | warnings() 14 | 15 | quit(save='no') -------------------------------------------------------------------------------- /environment/conda/conda_dipassm.yml: -------------------------------------------------------------------------------- 1 | name: dipassm 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - Python=3.6.* 7 | - pip=19.2.3 8 | - openssl=1.0.2t 9 | - Snakemake=5.10.0 10 | - drmaa=0.7.9 11 | - pyyaml=5.3 12 | - pandas=1.0.5 13 | - pytables=3.6.1 14 | - intervaltree=3.0.2 -------------------------------------------------------------------------------- /smk_config/selectors/lansdorp.yml: -------------------------------------------------------------------------------- 1 | 2 | select_targets: 3 | nhr_assembler: 4 | - hifiasm 5 | hap_assembler: 6 | - hifiasm 7 | var_caller: 8 | - deepvar 9 | name: 10 | - Lansdorp 11 | 12 | skip_targets: 13 | name: 14 | - pacbio_test 15 | - nanopore_test 16 | - hifi_subsampling 17 | 18 | -------------------------------------------------------------------------------- /environment/snakemake/cluster/denbi_tu_slurm/denbi_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": 3 | { 4 | "cores": "{threads}", 5 | "mem_mb": "{resources.mem_per_cpu_mb}", 6 | "name": "{jobid}_{rule}", 7 | "output": "log/cluster_jobs/{jobid}_{rule}.stdout", 8 | "error": "log/cluster_jobs/{jobid}_{rule}.stderr" 9 | } 10 | } 11 | 
-------------------------------------------------------------------------------- /environment/conda/conda_bifrost.yml: -------------------------------------------------------------------------------- 1 | name: bifrost 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - python=3.6 8 | - pip 9 | - c-compiler 10 | - cmake>=3.0.0 11 | - compilers 12 | - cxx-compiler 13 | - libclang>=8.0.0 14 | - pkg-config 15 | - pthread-stubs 16 | - xz 17 | - zlib 18 | -------------------------------------------------------------------------------- /environment/conda/conda_merqury.yml: -------------------------------------------------------------------------------- 1 | name: merqury 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - Python<3.8 7 | - r-base=3.6.1 8 | - r-ggplot2 9 | - r-argparse 10 | - r-scales 11 | - samtools 12 | - bedtools 13 | - igvtools 14 | - openjdk 15 | - c-compiler 16 | - cxx-compiler 17 | - compilers 18 | - make 19 | -------------------------------------------------------------------------------- /environment/conda/conda_pbtools.yml: -------------------------------------------------------------------------------- 1 | name: pbtools 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - python=2.7.* 7 | - python-consensuscore2=3.4.1 8 | - genomicconsensus=2.3.3 9 | - pbgcpp=1.9.0 10 | - pbccs=3.4.1 11 | - pbbam=1.0.6 12 | - pbmm2=1.1.0 13 | - pbcoretools=0.2.4 14 | - bam2fastx=1.3.0 15 | - hap.py=0.3.10 -------------------------------------------------------------------------------- /environment/conda/conda_shelltools.yml: -------------------------------------------------------------------------------- 1 | name: shelltools 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - Python=3.6.* 7 | - pip=19.2.3 8 | - wget=1.19.4 9 | - libssh2=1.8.0 10 | - openssl=1.0.2t 11 | - libcurl=7.64.0 12 | - curl=7.64.0 13 | - aria2=1.34.0 14 | - pbgzip=2016.08.04 15 | - zlib=1.2.11 16 | - 
bzip2=1.0.8 17 | -------------------------------------------------------------------------------- /environment/sync/exclude.txt: -------------------------------------------------------------------------------- 1 | *snakemake* 2 | *cluster_jobs* 3 | *references/* 4 | *output/alignments/* 5 | *output/container/* 6 | *output/check_files/* 7 | */haploid_fasta/* 8 | */haploid_fastq/* 9 | */haploid_bam/* 10 | */layout/* 11 | *temp* 12 | *tmp* 13 | *processing* 14 | *.sh 15 | *.bam 16 | *.sam 17 | *.bai 18 | *.pbi 19 | *.fastq 20 | *.fastq.gz 21 | *input/*.fasta -------------------------------------------------------------------------------- /environment/snakemake/demo/config.yaml: -------------------------------------------------------------------------------- 1 | 2 | cores: 24 3 | latency-wait: 120 4 | keep-going: True 5 | rerun-incomplete: True 6 | keep-incomplete: True 7 | restart-times: 0 8 | use-conda: True 9 | use-envmodules: False 10 | nolock: False 11 | resources: 12 | mem_total_mb=131072 13 | default-resources: 14 | - mem_per_cpu_mb=1024 15 | - mem_total_mb=1024 16 | - runtime_hrs=1 17 | - runtime_min=59 18 | -------------------------------------------------------------------------------- /smk_config/data_sources/sseq_local_denbi.yml: -------------------------------------------------------------------------------- 1 | 2 | data_source_strandseq_denbi_local: 3 | comment: "deNBI local Strand-seq source for NA24385 and sub-sampled HG00733" 4 | output: 'strandseq_local_denbi.json' 5 | server: 'localhost' 6 | data_source: '/beeond/data/strandseq' 7 | collect_files: 8 | - 'fastq.gz' 9 | sort_into: 10 | - 'fastq' 11 | assume_correct_filenames: True 12 | 13 | 14 | -------------------------------------------------------------------------------- /environment/snakemake/server/denbi_europa/config.yaml: -------------------------------------------------------------------------------- 1 | # Default runtime profile for deNBI Tuebingen infrastructure / single VM 2 | 3 | 
directory: /mnt/vol/quobyte/projects/diploid-assembly 4 | jobs: 36 5 | latency-wait: 300 6 | keep-going: True 7 | rerun-incomplete: True 8 | restart-times: 0 9 | default-resources: 10 | - runtime_hrs=1 11 | - runtime_min=59 12 | - mem_per_cpu_mb=2048 13 | - mem_total_mb=4096 -------------------------------------------------------------------------------- /environment/snakemake/laptop/config.yaml: -------------------------------------------------------------------------------- 1 | 2 | cores: 4 3 | latency-wait: 5 4 | keep-going: True 5 | rerun-incomplete: True 6 | keep-incomplete: True 7 | restart-times: 0 8 | use-conda: True 9 | use-envmodules: False 10 | nolock: False 11 | resources: 12 | mem_total_mb=14336 13 | default-resources: 14 | - mem_per_cpu_mb=1024 15 | - mem_total_mb=1024 16 | - runtime_hrs=1 17 | - runtime_min=59 18 | #forcerun: "config.dump" 19 | -------------------------------------------------------------------------------- /environment/snakemake/server/d3compute/config.yaml: -------------------------------------------------------------------------------- 1 | # Default runtime profile for TM infrastructure / D3 compute server 2 | 3 | cores: 48 4 | latency-wait: 10 5 | keep-going: True 6 | rerun-incomplete: True 7 | keep-incomplete: True 8 | restart-times: 0 9 | use-conda: True 10 | use-envmodules: False 11 | nolock: True 12 | resources: 13 | mem_total_mb=1520432 14 | default-resources: 15 | - mem_per_cpu_mb=1024 16 | - mem_total_mb=1024 17 | - runtime_hrs=1 18 | - runtime_min=59 19 | -------------------------------------------------------------------------------- /smk_config/data_sources/hgsvc_local_denbi.yml: -------------------------------------------------------------------------------- 1 | 2 | data_source_pacbio_hhu_local: 3 | comment: "HGSVC local deNBI-VALET source for HG00514 Sequel2 PacBio CCS data" 4 | output: 'hgsvc_local_denbi_hifi.json' 5 | server: 'localhost' 6 | data_source: '/beeond/data/share/2020-07_HG00514_HiFi' 7 | collect_files: 8 
| - 'fastq.gz' 9 | sort_into: 10 | - 'fastq' 11 | file_infix: 'hgsvc_pbsq2-' 12 | fix_tech: 'ccs' 13 | local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}' 14 | -------------------------------------------------------------------------------- /scripts/install_saarclust.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | if (is.element('SaaRclust', installed.packages()[,1])) { 4 | print('Removing previously installed version of SaaRclust') 5 | remove.packages('SaaRclust') 6 | } 7 | 8 | args = commandArgs(trailingOnly=TRUE) 9 | 10 | git.commit = args[1] 11 | 12 | devtools::install_git( 13 | "https://github.com/daewoooo/SaaRclust.git", # https: GitHub no longer serves unauthenticated git:// 14 | ref = git.commit, 15 | dependencies=FALSE, 16 | upgrade=FALSE 17 | ) 18 | 19 | quit(save="no") 20 | -------------------------------------------------------------------------------- /environment/conda/testing/rtest.yml: -------------------------------------------------------------------------------- 1 | name: rtest 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - r 6 | dependencies: 7 | - Python=3.6.* 8 | - pip=19.2.3 9 | - wget=1.19.4 10 | - r-base<3.6 11 | - r-devtools=2.2.1 12 | - r-usethis=1.5.1 13 | - r-withr=2.1.2 14 | - r-igraph=1.2.4.1 15 | - r-zoo=1.8_6 16 | - r-cluster=2.1.0 17 | - r-doparallel=1.0.15 18 | - r-foreach=1.4.7 19 | - r-biocmanager=1.30.7 20 | - bioconductor-rhtslib=1.14.1 21 | - bioconductor-bamsignals=1.14.0 -------------------------------------------------------------------------------- /scripts/install_strandphaser.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | if (is.element('StrandPhaseR', installed.packages()[,1])) { 4 | print('Removing previously installed version of StrandPhaseR') 5 | remove.packages('StrandPhaseR') 6 | } 7 | 8 | args = commandArgs(trailingOnly=TRUE) 9 | 10 | git.commit = args[1] 11 | 12 | devtools::install_git( 13 | 
"https://github.com/daewoooo/StrandPhaseR.git", # https: GitHub no longer serves unauthenticated git:// 14 | ref = git.commit, 15 | dependencies=FALSE, 16 | upgrade=FALSE 17 | ) 18 | 19 | quit(save="no") 20 | -------------------------------------------------------------------------------- /environment/conda/conda_rtools.yml: -------------------------------------------------------------------------------- 1 | name: rtools 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - Python=3.7.* 7 | - pip 8 | - ca-certificates>=2020.10 9 | - openssl>=1.1 10 | - quast=5.0.2 11 | - augustus>=3.3 12 | - busco>=4.1 # R dependency; update required b/c of LD_LIBRARY_PATH issue (2021/01) 13 | - circos=0.69.8 14 | - hmmer=3.1b2 15 | - blast>=2.10 16 | - htslib>=1.9 17 | - perl>=5.26 18 | - r-base>=4.0 19 | - minimap2=2.17 # this is made explicit b/c of QUAST 20 | 21 | -------------------------------------------------------------------------------- /annotation/20200507_ASanders_100cell_controls.txt: -------------------------------------------------------------------------------- 1 | GM12329x02PE20490 2 | GM18534Bx02PE20392 3 | GM18939x02PE20464 4 | GM19650Ax02PE20523 5 | GM19983x02PE20496 6 | GM20509Bx01PE20515 7 | GM20847Bx02PE20410 8 | HG00096x02PE20385 9 | HG00171Ax02PE20490 10 | HG00864x02PE20396 11 | HG01114x02PE20328 12 | HG01505x02PE20494 13 | HG01573x02PE20391 14 | HG01596x02PE20501 15 | HG02011x02PE20571 16 | HG02018x01PE20491 17 | HG02492x02PE20423 18 | HG02587x02PE20390 19 | HG03009x02PE20385 20 | HG03065x02PE20587 21 | HG03371x02PE20572 22 | HG03683x01PE20461 23 | HG03732x02PE20594 24 | GM19036Bx02PE20369 -------------------------------------------------------------------------------- /environment/snakemake/cluster/deep/config.yaml: -------------------------------------------------------------------------------- 1 | # Default runtime profile for DEEP infrastructure / SGE cluster 2 | 3 | directory: /TL/deep/fhgfs/projects/pebert/cbi/diploid-assembly 4 | drmaa-log-dir: 
/TL/deep/fhgfs/projects/pebert/cbi/diploid-assembly/log/cluster_jobs 5 | drmaa: " {cluster.clusterSpec}" 6 | jobname: "SMK{jobid}_{name}" 7 | cluster-config: /home/pebert/work/code/github/project-diploid-assembly/environment/snakemake/cluster/deep/deep_cluster.json 8 | local-cores: 4 9 | jobs: 200 10 | latency-wait: 1 11 | keep-going: True 12 | rerun-incomplete: False 13 | restart-times: 1 -------------------------------------------------------------------------------- /smk_config/data_sources/hgsvc_ftp_src_illumina.yml: -------------------------------------------------------------------------------- 1 | 2 | data_source_JAX_illumina: 3 | comment: "HGSVC FTP source for Illumina short read data (JAX)" 4 | output: 'hgsvc_JAX_illumina.json' 5 | server: 'ftp.1000genomes.ebi.ac.uk' 6 | data_source: 'vol1/ftp/data_collections/HGSVC2/working/20191004_Illumina' 7 | collect_files: 8 | - 'fastq.gz' 9 | sort_into: 10 | - 'fastq' 11 | file_infix: 'hgsvc_ilnvs-' 12 | fix_tech: '150pe' 13 | file_suffix: 'library_id' 14 | local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}_short' 15 | assume_paired_reads: True 16 | -------------------------------------------------------------------------------- /smk_config/data_sources/hgsvc_ftp_src_strandseq.yml: -------------------------------------------------------------------------------- 1 | 2 | 3 | data_source_EMBL_strandseq: 4 | comment: "HGSVC FTP source for Strand-seq data (EMBL)" 5 | output: 'hgsvc_EMBL_strandseq.json' 6 | server: 'ftp.1000genomes.ebi.ac.uk' 7 | data_source: 'vol1/ftp/data_collections/HGSVC2/working/20200120_Strandseq/fastq' 8 | collect_files: 9 | - 'fastq.gz' 10 | sort_into: 11 | - 'fastq' 12 | file_infix: 'hgsvc_ilnxs-' 13 | fix_tech: '80pe' 14 | file_suffix: 'library_id' 15 | local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}_sseq' 16 | assume_paired_reads: True -------------------------------------------------------------------------------- /environment/conda/conda_export_env_vars.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | REPOSITORY_PREFIX="/home/pebert/work/code/github/project-diploid-assembly" 4 | 5 | CONDA_PREFIX="/TL/epigenetics2/work/pebert/conda/envs/dipassm" 6 | 7 | cd "${CONDA_PREFIX}" || exit 1 # abort rather than create etc/conda in the wrong directory 8 | mkdir -p ./etc/conda/activate.d 9 | mkdir -p ./etc/conda/deactivate.d 10 | #touch ./etc/conda/activate.d/env_vars.sh 11 | #touch ./etc/conda/deactivate.d/env_vars.sh 12 | 13 | cp -f "${REPOSITORY_PREFIX}/environment/conda/activate/env_vars.sh" ./etc/conda/activate.d/env_vars.sh 14 | cp -f "${REPOSITORY_PREFIX}/environment/conda/deactivate/env_vars.sh" ./etc/conda/deactivate.d/env_vars.sh -------------------------------------------------------------------------------- /scripts/run_strandphaser.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(StrandPhaseR)) 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | 7 | bam.folder = args[1] 8 | config.file = args[2] 9 | variant.calls = args[3] 10 | wc.regions = args[4] 11 | output.folder = args[5] 12 | sample.individual = args[6] 13 | 14 | strandPhaseR( 15 | inputfolder=bam.folder, 16 | configfile=config.file, 17 | outputfolder=output.folder, 18 | positions=variant.calls, 19 | fillMissAllele=variant.calls, 20 | WCregions=wc.regions, 21 | exportVCF=sample.individual 22 | ) 23 | 24 | warnings() 25 | 26 | quit(save='no') -------------------------------------------------------------------------------- /environment/snakemake/cluster/denbi_tu_slurm/config.yaml: -------------------------------------------------------------------------------- 1 | # Default runtime profile for deNBI Tuebingen infrastructure / SLURM cluster 2 | 3 | directory: /mnt/vol/gridshare/projects/diploid-assembly 4 | cluster: " sbatch --export=ALL --cpus-per-task={cluster.cores} --mem-per-cpu={cluster.mem_mb}M --job-name={cluster.name} --output={cluster.output} --error={cluster.error} " 5 
| cluster-config: /mnt/vol/gridshare/user/code/project-diploid-assembly/environment/snakemake/cluster/denbi_tu_slurm/denbi_cluster.json # this profile lives in cluster/denbi_tu_slurm 6 | local-cores: 2 7 | jobs: 200 8 | latency-wait: 1 9 | keep-going: True 10 | rerun-incomplete: False 11 | restart-times: 1 12 | default-resources: 13 | - mem_per_cpu_mb=2048 14 | - mem_total_mb=4096 -------------------------------------------------------------------------------- /scripts/run_breakpointr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressMessages(library(breakpointR)) 4 | 5 | args = commandArgs(trailingOnly=TRUE) 6 | 7 | bam.folder = args[1] 8 | config.file = args[2] 9 | output.folder = args[3] 10 | num.cpu = as.integer(args[4]) # commandArgs() yields strings; pass the CPU count as a number 11 | output.file = args[5] 12 | 13 | breakpointr( 14 | inputfolder=bam.folder, 15 | outputfolder=output.folder, 16 | configfile=config.file, 17 | numCPU=num.cpu 18 | ) 19 | 20 | exportRegions( 21 | datapath=file.path(output.folder, "data"), 22 | file=output.file, 23 | collapseInversions=TRUE, 24 | collapseRegionSize=5000000, 25 | minRegionSize=5000000, 26 | state="wc" 27 | ) 28 | 29 | warnings() 30 | 31 | quit(save='no') -------------------------------------------------------------------------------- /environment/conda/conda_pyscript.yml: -------------------------------------------------------------------------------- 1 | name: pyscript 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - Python=3.6.* 7 | - pip=19.2.3 8 | - wget=1.19.4 9 | - openssl=1.0.2t 10 | - libblas=3.8.0 11 | - libcblas=3.8.0 12 | - liblapack=3.8.0 13 | - libopenblas=0.3.6 14 | - libgfortran-ng=7.3.0 15 | - libgcc-ng=9.1.0 16 | - libstdcxx-ng=9.1.0 17 | - libxml2=2.9.9 18 | - libcurl=7.64.0 19 | - lp_solve=5.5.2.5 20 | - openblas=0.3.6 21 | - make=4.2.1 22 | - numpy=1.17.1 23 | - matplotlib=3.1.1 24 | - htslib=1.9 25 | - libdeflate=1.3 # v1.0 may trigger pysam import error - DO NOT DOWNGRADE 26 | - pysam=0.15.3 27 | - dnaio=0.4.1 28 | - 
biopython=1.76 29 | -------------------------------------------------------------------------------- /environment/snakemake/cluster/deep/deep_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": { 3 | "clusterSpec": " -V -S /bin/bash -l h_vmem=32G,slots_free=4,mem_free=8G", 4 | "jobName": "{rule}__defaultSpec__" 5 | }, 6 | "arrow_contig_polishing_pass1": { 7 | "clusterSpec": " -V -S /bin/bash -l h_vmem=124G,slots_free=16,mem_free=80G", 8 | "jobName": "{rule}_arpol1" 9 | }, 10 | "racon_contig_polishing_pass1": { 11 | "clusterSpec": " -V -S /bin/bash -l h_vmem=124G,slots_free=8,mem_free=80G", 12 | "jobName": "{rule}_rcpol1" 13 | }, 14 | "quast_analysis_reference_strandseq_polished_haploid_assembly": { 15 | "clusterSpec": " -V -S /bin/bash -l h_vmem=124G,slots_free=12,mem_free=80G", 16 | "jobName": "{rule}_quast" 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /scripts/eval/response/response-reviewer3-comment12.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import upsetplot 5 | 6 | # plotting script for response letter 7 | # created by Tobias Marschall 8 | 9 | d = pd.read_csv('pete-zenodo/variants_freeze3_sv_insdel.tsv.gz', sep='\t') 10 | 11 | d = d.assign(PAV=lambda df: True) 12 | d = d.assign(PANGENIE_STRICT=lambda df: df.PG_CONF==4) 13 | d = d.assign(PANGENIE_LENIENT=lambda df: df.PG_CONF>0) 14 | d = d.assign(ILLUMINA=lambda df: df['1KGHC_OVERLAP']=='OVR') 15 | 16 | counts = d.groupby(by=['PAV', 'PANGENIE_STRICT','PANGENIE_LENIENT','ILLUMINA']).size() 17 | upsetplot.plot(counts, sort_by='cardinality') 18 | plt.savefig('response-reviewer3-comment12.pdf') 19 | 20 | print(counts) 21 | 22 | 23 | -------------------------------------------------------------------------------- 
/scripts/eval/response/response-reviewer3-comment2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import upsetplot 5 | 6 | # plotting script for response letter 7 | # created by Tobias Marschall 8 | 9 | d = pd.read_csv('pete-zenodo/variants_freeze3_sv_insdel.tsv.gz', sep='\t') 10 | 11 | d = d.assign(PAV=lambda df: True) 12 | d = d.assign(IN_AUDANO=lambda df: ~df.AUDANO2019.isna()) 13 | d = d.assign(IN_CHAISSON=lambda df: ~df.HGSVC1.isna()) 14 | 15 | print(d.groupby(by=['PAV', 'IN_AUDANO']).size()) 16 | print(d.groupby(by=['PAV', 'IN_CHAISSON']).size()) 17 | 18 | counts = d.groupby(by=['PAV', 'IN_AUDANO','IN_CHAISSON']).size() 19 | print(counts) 20 | upsetplot.plot(counts, sort_by='cardinality') 21 | plt.savefig('response-reviewer3-comment2.pdf') 22 | 23 | 24 | -------------------------------------------------------------------------------- /environment/snakemake/cluster/denbi_tu_pbs/config.yaml: -------------------------------------------------------------------------------- 1 | # Default runtime profile for deNBI Tuebingen infrastructure / PBS-TORQUE cluster 2 | 3 | cluster: >- 4 | qsub 5 | -d ../run_folder 6 | -l walltime={cluster.run_hrs}:{cluster.run_min}:00,nodes=1:ppn={cluster.cores},mem={cluster.mem_mb}M 7 | -N {cluster.name} -o {cluster.output} -e {cluster.error} 8 | cluster-config: environment/snakemake/cluster/denbi_tu_pbs/denbi_cluster.json 9 | local-cores: 2 10 | jobs: 40 11 | latency-wait: 300 12 | keep-going: True 13 | rerun-incomplete: True 14 | keep-incomplete: False 15 | restart-times: 1 16 | use-conda: True 17 | use-envmodules: False 18 | nolock: True 19 | max-status-checks-per-second: 0.01 20 | default-resources: 21 | - runtime_hrs=1 22 | - runtime_min=59 23 | - mem_per_cpu_mb=2048 24 | - mem_total_mb=4096 -------------------------------------------------------------------------------- /notes/align_ccs_racon.md: 
-------------------------------------------------------------------------------- 1 | # Align CCS reads to reference for Racon polishing 2 | 3 | Sent by Aaron Wenger via e-mail on 2019-06-10 4 | 5 | Relevance: 6 | - Racon accepts minimap's PAF output, but does not seem to perform polishing 7 | - Generally, for CCS reads, preset asm20 is suggested; not used here, see the following: 8 | 9 | ``` 10 | Use "-x map-pb" to minimap2 even when aligning CCS reads. 11 | Racon will clip out segments that have no coverage, and map-pb does 12 | a better job than `-x asm5` of avoiding alignment clipping at 13 | quality dropouts in the draft assembly. 14 | ``` 15 | 16 | ``` 17 | The parameters we use are: 18 | $ minimap2 -a -x map-pb --eqx -m 5000 --secondary=no draft-asm.fa reads.fastq | samtools sort | samtools view -q 10 -F0x704 - > draft-asm.reads.sam 19 | $ racon reads.fastq draft-asm.reads.sam draft-asm.fa -u > polished-asm.fa 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /environment/snakemake/cluster/hhu_pbs/config.yaml: -------------------------------------------------------------------------------- 1 | # Default runtime profile for HHU HILBERT infrastructure / PBS Professional 2 | 3 | cluster: >- 4 | qsub 5 | -A {cluster.account} -l walltime={cluster.run_hrs}:{cluster.run_min}:00 6 | -l select=1:ncpus={cluster.cores}:mem={cluster.mem_mb}mb{cluster.arch} 7 | -N {cluster.name} -o {cluster.output} -e {cluster.error} 8 | cluster-config: environment/snakemake/cluster/hhu_pbs/hilbert_cluster.json 9 | local-cores: 1 10 | jobs: 50 11 | latency-wait: 300 12 | keep-going: True 13 | keep-incomplete: False 14 | rerun-incomplete: True 15 | restart-times: 1 16 | max-status-checks-per-second: 0.001 17 | use-conda: True 18 | use-envmodules: True 19 | conda-prefix: /gpfs/project/ebertp/projects/conda_envs 20 | nolock: False 21 | default-resources: 22 | - mem_per_cpu_mb=1024 23 | - mem_total_mb=1024 24 | - runtime_hrs=1 25 | - 
runtime_min=59 26 | -------------------------------------------------------------------------------- /annotation/1kg_hgsvc_colors.csv: -------------------------------------------------------------------------------- 1 | super_pop population hex rgb 2 | AFR ACB f4971d 244,151,29 3 | AFR AFR DB7D27 219,125,39 4 | AFR ASW e9651e 233,101,30 5 | AFR ESN fecf0d 254,207,13 6 | AFR GWD fbeb09 251,235,9 7 | AFR LWK cb9a31 203,154,49 8 | AFR MSL dfb819 223,184,25 9 | AFR YRI feca6a 254,202,106 10 | AMR AMR D72519 215,37,25 11 | AMR CLM cc3133 204,49,51 12 | AMR MXL df0036 223,0,54 13 | AMR PEL e61420 230,20,32 14 | AMR PUR cb3413 203,52,19 15 | EAS CDX 369934 54,153,52 16 | EAS CHB aeca10 174,202,16 17 | EAS CHS 66b42a 102,180,42 18 | EAS EAS 41A22F 65,162,47 19 | EAS JPT 158d34 21,141,52 20 | EAS KHV 4dad38 77,173,56 21 | EUR ASK 2930de 41,48,222 22 | EUR CEU 264999 38,73,153 23 | EUR EUR 2D6F91 45,111,145 24 | EUR FIN 32bac5 50,186,197 25 | EUR GBR 70c0d2 112,192,210 26 | EUR IBS 6385af 99,133,175 27 | EUR TSI 293065 41,48,101 28 | SAS BEB 831b82 131,27,130 29 | SAS GIH 6b3f94 107,63,148 30 | SAS ITU b12e60 177,46,96 31 | SAS PJL dd1384 221,19,132 32 | SAS SAS 782B8A 120,43,138 33 | SAS STU a7529b 167,82,155 34 | -------------------------------------------------------------------------------- /scripts/install_breakpointr.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | if (is.element('breakpointR', installed.packages()[,1])) { 4 | print('Removing previously installed version of breakpointR') 5 | remove.packages('breakpointR') 6 | } 7 | 8 | args = commandArgs(trailingOnly=TRUE) 9 | 10 | git.commit = args[1] 11 | 12 | if (is.na(as.numeric(git.commit))) { 13 | # means proper git tag 14 | 15 | # if dev version is installed, breakpointRdata 16 | # is not automatically included as a dependency, 17 | # so trigger setup manually 18 | devtools::install_git( 19 | "https://github.com/daewoooo/breakpointRdata.git", 20 
| dependencies=FALSE, 21 | upgrade=FALSE 22 | ) 23 | 24 | devtools::install_git( 25 | "https://github.com/daewoooo/breakpointR.git", 26 | ref = git.commit, 27 | dependencies=FALSE, 28 | upgrade=FALSE 29 | ) 30 | } else { 31 | BiocManager::install( 32 | c("breakpointR"), 33 | update=FALSE 34 | ) 35 | } 36 | 37 | quit(save="no") -------------------------------------------------------------------------------- /annotation/grch38/known_regions/Modeled_regions_for_GRCh38.tsv: -------------------------------------------------------------------------------- 1 | #region_name chr start stop 2 | CEN1 1 122026460 125184587 3158128 3 | CEN2 2 92188146 94090557 1902412 4 | CEN3 3 90772459 93655574 2883116 5 | CEN4 4 49708101 51743951 2035851 6 | CEN5 5 46485901 50059807 3573907 7 | CEN6 6 58553889 59829934 1276046 8 | CEN7 7 58169654 60828234 2658581 9 | HET7 7 61377789 61528020 150232 10 | CEN8 8 44033745 45877265 1843521 11 | CEN9 9 43236168 45518558 2282391 12 | CEN10 10 39686683 41593521 1906839 13 | CEN11 11 51078349 54425074 3346726 14 | CEN12 12 34769408 37185252 2415845 15 | CEN13 13 16000001 18051248 2051248 16 | CEN14 14 16000001 18173523 2173523 17 | CEN15 15 17000001 19725254 2725254 18 | CEN16 16 36311159 38280682 1969524 19 | CEN17 17 22813680 26885980 4072301 20 | CEN18 18 15460900 20861206 5400307 21 | CEN19 19 24498981 27190874 2691894 22 | CEN20 20 26436233 30038348 3602116 23 | CEN21 21 10864561 12915808 2051248 24 | CEN22 22 12954789 15054318 2099530 25 | CENX X 58605580 62412542 3806963 26 | CENY Y 10316945 10544039 227095 -------------------------------------------------------------------------------- /environment/sync/sync_commands.md: -------------------------------------------------------------------------------- 1 | ### Sync from deNBI cloud cluster to MMCI/MPI 2 | 3 | Executed as daily cronjob at 6am on d3compute09 4 | 5 | ```bash 6 | rsync --recursive --delete-before --prune-empty-dirs \ 7 | 
--exclude-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/exclude.txt \ 8 | --include-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/include.txt \ 9 | centos@valet:/mnt/vol/beeond_backup/projects/diploid-assembly \ 10 | /scratch/bioinf/projects/diploid-genome-assembly/sync/denbi 11 | ``` 12 | 13 | Executed as daily cronjob at 6pm on lap-13-72 14 | 15 | ```bash 16 | rsync --recursive --delete-before --prune-empty-dirs \ 17 | -e "ssh contact.mpi-inf.mpg.de ssh" \ 18 | --exclude-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/exclude.txt \ 19 | --include-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/include.txt \ 20 | /mnt/sshfs/hhu/project/ebertp/projects/rfdga \ 21 | /scratch/bioinf/projects/diploid-genome-assembly/sync/hhu 22 | ``` 23 | -------------------------------------------------------------------------------- /smk_include/module_includes.smk: -------------------------------------------------------------------------------- 1 | 2 | # modules w/o other dependencies 3 | include: 'constraints.smk' 4 | include: 'aux_utilities.smk' 5 | include: 'environments.smk' 6 | include: 'link_data_sources.smk' 7 | include: 'scrape_data_sources.smk' 8 | include: 'query_data_repos.smk' 9 | include: 'handle_reference_download.smk' 10 | 11 | # input preparation stage, one or two dependencies to above modules 12 | include: 'handle_data_download.smk' 13 | include: 'preprocess_input.smk' 14 | include: 'preprocess_references.smk' 15 | 16 | # actual pipeline processing steps 17 | include: 'variant_calling.smk' 18 | include: 'integrative_phasing.smk' 19 | 20 | include: 'strandseq_dga_split.smk' 21 | include: 'strandseq_dga_joint.smk' 22 | 23 | include: 'collect_statistics.smk' 24 | include: 'run_alignments.smk' 25 | include: 'run_assemblies.smk' 26 | include: 'prepare_custom_references.smk' 27 | 28 | include: 'run_polishing.smk' 29 | 30 | include: 
'haploid_assembly_clustering.smk' 31 | include: 'haploid_read_coverage.smk' 32 | 33 | include: 'create_plots.smk' 34 | include: 'eval_known_reference.smk' 35 | 36 | include: 'targets.smk' -------------------------------------------------------------------------------- /scripts/fb-parallel-timeout.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# -lt 4 ]]; 4 | then 5 | echo "usage: $0 [regions file] [ncpus] [timeout] [logfile] [freebayes arguments]" 6 | echo 7 | echo "Run freebayes in parallel over regions listed in regions file, using ncpus processors." 8 | echo "Will merge and sort output, producing a uniform VCF stream on stdout. Flags to freebayes" 9 | echo "which would write to e.g. a particular file will obviously cause problms, so caution is" 10 | echo "encouraged when using this script." 11 | echo 12 | echo "This script: adapted by Tobias Marschall" 13 | echo 14 | echo "For original version, see this github repo:" 15 | echo 16 | echo "https://github.com/ekg/freebayes/blob/master/scripts/freebayes-parallel" 17 | echo 18 | exit 1 19 | fi 20 | 21 | regionsfile=$1 22 | shift 23 | ncpus=$1 24 | shift 25 | timeout=$1 26 | shift 27 | logfile=$1 28 | shift 29 | 30 | command=("freebayes" "$@") 31 | 32 | ( cat "$regionsfile" | parallel -k --joblog "$logfile" -j "$ncpus" "timeout ${timeout} ${command[@]}" --region {} ) | vcffirstheader | vcfstreamsort -w 10000 | vcfuniq -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Peter Ebert 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, 
and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /annotation/grch38/20200723_GRCh38_p13_regions.bed: -------------------------------------------------------------------------------- 1 | #chrom start end name score 2 | chr1 122026460 125184588 CEN1 1000 3 | chr10 39686683 41593522 CEN10 1000 4 | chr11 51078349 54425075 CEN11 1000 5 | chr12 34769408 37185253 CEN12 1000 6 | chr13 16000001 18051249 CEN13 1000 7 | chr14 16000001 18173524 CEN14 1000 8 | chr15 17000001 19725255 CEN15 1000 9 | chr16 36311159 38280683 CEN16 1000 10 | chr17 22813680 26885981 CEN17 1000 11 | chr18 15460900 20861207 CEN18 1000 12 | chr19 24498981 27190875 CEN19 1000 13 | chr2 92188146 94090558 CEN2 1000 14 | chr20 26436233 30038349 CEN20 1000 15 | chr21 10864561 12915809 CEN21 1000 16 | chr22 12954789 15054319 CEN22 1000 17 | chr3 90772459 93655575 CEN3 1000 18 | chr4 49708101 51743952 CEN4 1000 19 | chr5 46485901 50059808 CEN5 1000 20 | chr6 58553889 59829935 CEN6 1000 21 | chr7 58169654 60828235 CEN7 1000 22 | chr7 61377789 61528021 HET7 750 23 | chr8 44033745 45877266 CEN8 1000 24 | chr9 43236168 45518559 CEN9 1000 25 | chrX 10001 2781480 PAR1X 500 26 | chrX 58605580 62412543 
CENX 1000 27 | chrX 155701383 156030896 PAR2X 500 28 | chrY 10001 2781480 PAR1Y 500 29 | chrY 10316945 10544040 CENY 1000 30 | chrY 56887903 57217416 PAR2Y 500 31 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/ESN/hg03125.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03125: 3 | individual: HG03125 4 | sex: female 5 | super_population: AFR 6 | population: ESN 7 | family: NG34 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG03125_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03125_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03125_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG03125: 28 | - aliases: 29 | 1: &ccs_reads HG03125_hgsvc_pbsq2-ccs_1000 30 | - defaults: 31 | hap_reads: *ccs_reads 32 | vc_reads: *ccs_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *ccs_reads 35 | pol_pass: racon-p2 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: pereg 42 | hap_assembler: pereg 43 | var_caller: deepvar -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/GWD/hg02818.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG02818: 3 | individual: HG02818 4 | sex: female 5 | super_population: AFR 6 | population: GWD 7 | family: GB66 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG02818_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG02818_hgsvc_ilnxs-80pe_sseq 17 | source_type: local 18 | 
library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG02818_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG02818: 28 | - aliases: 29 | 1: &ccs_reads HG02818_hgsvc_pbsq2-ccs_1000 30 | - defaults: 31 | hap_reads: *ccs_reads 32 | vc_reads: *ccs_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *ccs_reads 35 | pol_pass: racon-p2 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: pereg 42 | hap_assembler: pereg 43 | var_caller: deepvar -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/MSL/hg03486.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03486: 3 | individual: HG03486 4 | sex: female 5 | super_population: AFR 6 | population: MSL 7 | family: SL61 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG03486_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03486_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03486_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG03486: 28 | - aliases: 29 | 1: &ccs_reads HG03486_hgsvc_pbsq2-ccs_1000 30 | - defaults: 31 | hap_reads: *ccs_reads 32 | vc_reads: *ccs_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *ccs_reads 35 | pol_pass: racon-p2 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: pereg 42 | hap_assembler: pereg 43 | var_caller: deepvar -------------------------------------------------------------------------------- /scripts/utilities/mem_profiler.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import time 5 | import psutil 6 | 7 | 8 | workdir = os.getcwd() 9 | logfile = os.path.join(workdir, 'memlog.txt') 10 | 11 | bytes_to_gigabytes = 1024 ** 3 12 | 13 | time_limit = 86400 14 | 15 | with open(logfile, 'w') as foo: 16 | pass 17 | 18 | sleep_time = 0 19 | 20 | with open(logfile, 'a') as log: 21 | header = '\t'.join(['#time', 'threads', 'load', 'mem_tot', 'mem_free', 'swap_tot', 'swap_free']) 22 | _ = log.write(header + '\n') 23 | while sleep_time < time_limit: 24 | now = str(time.ctime()).replace(' ', '_') 25 | threads = str(psutil.cpu_count(logical=True)) 26 | pct_cpu = str(round(psutil.cpu_percent(), 2)) 27 | mem = psutil.virtual_memory() 28 | mem_tot = str(round(mem.total / bytes_to_gigabytes, 2)) 29 | mem_av = str(round(mem.available / bytes_to_gigabytes, 2)) 30 | swap = psutil.swap_memory() 31 | swap_tot = str(round(swap.total / bytes_to_gigabytes, 2)) 32 | swap_free = str(round(swap.free / bytes_to_gigabytes, 2)) 33 | 34 | logline = '\t'.join([now, threads, pct_cpu, mem_tot, mem_av, swap_tot, swap_free]) 35 | _ = log.write(logline + '\n') 36 | 37 | sleep_time += 60 38 | time.sleep(60) 39 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AMR/PEL/hg01573.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG01573: 3 | individual: HG01573 4 | sex: female 5 | super_population: AMR 6 | population: PEL 7 | family: PEL003 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG01573_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "no squashed assembly possible" 16 | - strandseq: 17 | readset: &sseq_reads HG01573_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads 
HG01573_hgsvc_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB36890 24 | load_type: complete 25 | comment: "698 cohort" 26 | 27 | 28 | sample_targets_HG01573: 29 | - aliases: 30 | 2: &clr_hgsvc HG01573_hgsvc_pbsq2-clr_1000 31 | - defaults: 32 | hap_reads: *clr_hgsvc 33 | vc_reads: *clr_hgsvc 34 | sseq_reads: *sseq_reads 35 | pol_reads: *clr_hgsvc 36 | pol_pass: arrow-p1 37 | hap_assm_mode: split 38 | hap: 39 | - h1-un 40 | - h2-un 41 | - target: 42 | nhr_assembler: flye 43 | hap_assembler: flye 44 | var_caller: longshot 45 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/LWK/na19036.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA19036: 3 | individual: NA19036 4 | sex: female 5 | super_population: AFR 6 | population: LWK 7 | family: NA19036 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: NA19036_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "no squashed assembly possible" 16 | - strandseq: 17 | readset: &sseq_reads NA19036_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads NA19036_1kg_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB31736 24 | load_type: complete 25 | comment: "2504 cohort" 26 | 27 | 28 | sample_targets_NA19036: 29 | - aliases: 30 | 2: &clr_hgsvc NA19036_hgsvc_pbsq2-clr_1000 31 | - defaults: 32 | hap_reads: *clr_hgsvc 33 | vc_reads: *clr_hgsvc 34 | sseq_reads: *sseq_reads 35 | pol_reads: *clr_hgsvc 36 | pol_pass: arrow-p1 37 | hap_assm_mode: split 38 | hap: 39 | - h1-un 40 | - h2-un 41 | - h1 42 | - h2 43 | - target: 44 | nhr_assembler: flye 45 | hap_assembler: flye 46 | var_caller: longshot 47 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/SAS/PJL/hg02492.yml: 
-------------------------------------------------------------------------------- 1 | 2 | sample_description_HG02492: 3 | individual: HG02492 4 | sex: male 5 | super_population: SAS 6 | population: PJL 7 | family: PK06 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG02492_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG02492_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG02492_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG02492: 28 | - aliases: 29 | 1: &clr_reads HG02492_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: uw27 46 | hap_assembler: flye 47 | var_caller: longshot -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EUR/CEU/na12329.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA12329: 3 | individual: NA12329 4 | sex: female 5 | super_population: EUR 6 | population: CEU 7 | family: 1328 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: NA12329_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads NA12329_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads NA12329_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 
cohort" 25 | 26 | 27 | sample_targets_NA12329: 28 | - aliases: 29 | 1: &clr_reads NA12329_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: jax27 46 | hap_assembler: flye 47 | var_caller: longshot -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/SAS/BEB/hg03009.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03009: 3 | individual: HG03009 4 | sex: male 5 | super_population: SAS 6 | population: BEB 7 | family: HG03009 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: HG03009_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03009_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03009_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB31736 23 | load_type: complete 24 | comment: "2504 cohort" 25 | 26 | 27 | sample_targets_HG03009: 28 | - aliases: 29 | 1: &clr_reads HG03009_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: jax27 46 | hap_assembler: flye 47 | var_caller: longshot -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/SAS/ITU/hg03721.yml: 
-------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03721: 3 | individual: HG03721 4 | sex: female 5 | super_population: SAS 6 | population: ITU 7 | family: IT003 8 | member: parent 9 | data_sources: 10 | - long_reads: 11 | readset: HG03721_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03721_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03721_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG03721: 28 | - aliases: 29 | 1: &clr_reads HG03721_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: hhu26 46 | hap_assembler: flye 47 | var_caller: longshot 48 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/SAS/STU/hg03683.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03683: 3 | individual: HG03683 4 | sex: female 5 | super_population: SAS 6 | population: STU 7 | family: ST012 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG03683_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03683_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03683_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | 
comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG03683: 28 | - aliases: 29 | 1: &clr_reads HG03683_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: jax27 46 | hap_assembler: flye 47 | var_caller: longshot 48 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EAS/KHV/hg01596.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG01596: 3 | individual: HG01596 4 | sex: male 5 | super_population: EAS 6 | population: KHV 7 | family: VN002 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: HG01596_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG01596_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG01596_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB31736 23 | load_type: complete 24 | comment: "2504 cohort" 25 | 26 | 27 | sample_targets_HG01596: 28 | - aliases: 29 | 1: &clr_reads HG01596_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: hhu27 46 | hap_assembler: flye 47 | var_caller: longshot 48 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/SAS/ITU/hg03732.yml: 
-------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03732: 3 | individual: HG03732 4 | sex: male 5 | super_population: SAS 6 | population: ITU 7 | family: IT003 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG03732_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03732_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03732_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG03732: 28 | - aliases: 29 | 1: &clr_reads HG03732_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: hhu26 46 | hap_assembler: flye 47 | var_caller: longshot 48 | 49 | -------------------------------------------------------------------------------- /smk_config/data_sources/hgsvc_local_hhu.yml: -------------------------------------------------------------------------------- 1 | 2 | data_source_pacbio_hifi_hhu_local: 3 | comment: "HGSVC local HHU-HILBERT source for Sequel2 PacBio CCS data" 4 | output: 'hgsvc_local_hhu_hifi.json' 5 | server: 'localhost' 6 | data_source: '/gpfs/project/ebertp/data/globus/sequence_data/HiFi' 7 | collect_files: 8 | - 'fastq.gz' 9 | sort_into: 10 | - 'fastq' 11 | file_infix: 'hgsvc_pbsq2-' 12 | fix_tech: 'ccs' 13 | local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}' 14 | 15 | 16 | data_source_pacbio_clr_hhu_local: 17 | comment: "HGSVC local HHU-HILBERT source for Sequel2 PacBio CLR data" 18 | output: 
'hgsvc_local_hhu_clr.json' 19 | server: 'localhost' 20 | data_source: '/gpfs/project/ebertp/data/globus/sequence_data/CLR' 21 | collect_files: 22 | - 'bam' 23 | sort_into: 24 | - 'bam' 25 | file_infix: 'hgsvc_pbsq2-' 26 | fix_tech: 'clr' 27 | assume_pacbio_native: True 28 | local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}' 29 | 30 | 31 | data_source_strandseq_hhu_local: 32 | comment: "HHU local Strand-seq source for NA24385 and sub-sampled HG00733" 33 | output: 'strandseq_local_hhu.json' 34 | server: 'localhost' 35 | data_source: '/gpfs/project/ebertp/data/local_source/strandseq' 36 | collect_files: 37 | - 'fastq.gz' 38 | sort_into: 39 | - 'fastq' 40 | assume_correct_filenames: True 41 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EAS/CDX/hg00864.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00864: 3 | individual: HG00864 4 | sex: female 5 | super_population: EAS 6 | population: CDX 7 | family: HG00864 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: HG00864_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG00864_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG00864_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB31736 23 | load_type: complete 24 | comment: "2504 cohort" 25 | 26 | 27 | sample_targets_HG00864: 28 | - aliases: 29 | 1: &clr_reads HG00864_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: jax27 46 | 
hap_assembler: flye 47 | var_caller: longshot 48 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EUR/FIN/hg00171.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00171: 3 | individual: HG00171 4 | sex: female 5 | super_population: EUR 6 | population: FIN 7 | family: HG00171 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: HG00171_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG00171_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG00171_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB31736 23 | load_type: complete 24 | comment: "2504 cohort" 25 | 26 | 27 | sample_targets_HG00171: 28 | - aliases: 29 | 1: &clr_reads HG00171_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_reads 32 | vc_reads: *clr_reads 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_reads 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - target: 45 | nhr_assembler: hhu26 46 | hap_assembler: flye 47 | var_caller: longshot 48 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EUR/IBS/hg01505.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG01505: 3 | individual: HG01505 4 | sex: male 5 | super_population: EUR 6 | population: IBS 7 | family: IBS002 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG01505_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "seq_center:UMIGS" 16 | - strandseq: 17 | readset: &sseq_reads 
HG01505_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads HG01505_1kg_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB36890 24 | load_type: complete 25 | comment: "698 cohort" 26 | 27 | 28 | sample_targets_HG01505: 29 | - aliases: 30 | 1: &clr_reads HG01505_hgsvc_pbsq2-clr_1000 31 | - defaults: 32 | hap_reads: *clr_reads 33 | vc_reads: *clr_reads 34 | sseq_reads: *sseq_reads 35 | pol_reads: *clr_reads 36 | pol_pass: arrow-p1 37 | hap_assm_mode: split 38 | hap: 39 | - h1-un 40 | - h2-un 41 | - target: 42 | nhr_assembler: flye 43 | hap_assembler: flye 44 | var_caller: longshot 45 | - target: 46 | nhr_assembler: jax27 47 | hap_assembler: flye 48 | var_caller: longshot 49 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EAS/JPT/na18939.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA18939: 3 | individual: NA18939 4 | sex: female 5 | super_population: EAS 6 | population: JPT 7 | family: NA18939 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: NA18939_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "seq_center:UMIGS" 16 | - strandseq: 17 | readset: &sseq_reads NA18939_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads NA18939_1kg_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB31736 24 | load_type: complete 25 | comment: "2504 cohort" 26 | 27 | 28 | sample_targets_NA18939: 29 | - aliases: 30 | 1: &clr_reads NA18939_hgsvc_pbsq2-clr_1000 31 | - defaults: 32 | hap_reads: *clr_reads 33 | vc_reads: *clr_reads 34 | sseq_reads: *sseq_reads 35 | pol_reads: *clr_reads 36 | pol_pass: arrow-p1 37 | hap_assm_mode: split 38 | hap: 39 | - h1-un 40 | - h2-un 41 | - target: 42 | nhr_assembler: flye 43 | hap_assembler: 
flye 44 | var_caller: longshot 45 | - target: 46 | nhr_assembler: hhu26 47 | hap_assembler: flye 48 | var_caller: longshot -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EUR/GBR/hg00096.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00096: 3 | individual: HG00096 4 | sex: male 5 | super_population: EUR 6 | population: GBR 7 | family: HG00096 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: HG00096_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "seq_center:UMIGS" 16 | - strandseq: 17 | readset: &sseq_reads HG00096_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads HG00096_1kg_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB31736 24 | load_type: complete 25 | comment: "2504 cohort" 26 | 27 | 28 | sample_targets_HG00096: 29 | - aliases: 30 | 1: &clr_reads HG00096_hgsvc_pbsq2-clr_1000 31 | - defaults: 32 | hap_reads: *clr_reads 33 | vc_reads: *clr_reads 34 | sseq_reads: *sseq_reads 35 | pol_reads: *clr_reads 36 | pol_pass: arrow-p1 37 | hap_assm_mode: split 38 | hap: 39 | - h1-un 40 | - h2-un 41 | - target: 42 | nhr_assembler: flye 43 | hap_assembler: flye 44 | var_caller: longshot 45 | - target: 46 | nhr_assembler: jax27 47 | hap_assembler: flye 48 | var_caller: longshot 49 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EUR/TSI/na20509.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA20509: 3 | individual: NA20509 4 | sex: male 5 | super_population: EUR 6 | population: TSI 7 | family: NA20509 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: NA20509_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | 
#!/usr/bin/env python3
"""
Sanity check for a pair of FASTQ files.

Reports read names that occur more than once within a file and read names
that are shared between the two files, then prints the number of distinct
read names per file.
"""

import os
import io
import argparse

import dnaio


def parse_args():
    """Parse the two positional FASTQ paths (``fastq1``, ``fastq2``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('fastq1', type=str)
    parser.add_argument('fastq2', type=str)
    args = parser.parse_args()
    return args


def collect_read_names(fastq_path):
    """
    Collect the distinct record names of one FASTQ file.

    Parameters:
        fastq_path: path handed to ``dnaio.open``.

    Returns:
        set of ``record.name`` values seen in the file.

    Duplicated names are reported on stdout as an ``error:`` line; they do
    not raise.
    """
    reads = set()
    total_reads = 0

    # Fill the set while counting records instead of first materialising a
    # list of all names: the list was only needed for its length and kept a
    # second full copy of every read name alive.
    with dnaio.open(fastq_path) as fastx:
        for record in fastx:
            total_reads += 1
            reads.add(record.name)

    if total_reads != len(reads):
        print('error: read duplicates {}: {} out of {}'.format(fastq_path, total_reads - len(reads), total_reads))

    return reads


def main():
    """
    Compare the read names of both FASTQ files and print the findings.

    Returns:
        0 in all cases; problems are only reported on stdout.
    """
    args = parse_args()
    fq1_reads = collect_read_names(args.fastq1)
    fq2_reads = collect_read_names(args.fastq2)

    intersect = fq1_reads.intersection(fq2_reads)
    if len(intersect) > 0:
        print('error: read sets not disjoined: {} out of {} / {}'.format(len(intersect), len(fq1_reads), len(fq2_reads)))

    print('fq1 reads {}'.format(len(fq1_reads)))
    print('fq2 reads {}'.format(len(fq2_reads)))

    return 0


if __name__ == '__main__':
    main()
super_population: EAS 6 | population: CHB 7 | family: NA18534 8 | member: unrelated 9 | data_sources: 10 | - long_reads: 11 | readset: NA18534_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "seq_center:UMIGS" 16 | - strandseq: 17 | readset: &sseq_reads NA18534_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads NA18534_1kg_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB31736 24 | load_type: complete 25 | comment: "2504 cohort" 26 | 27 | 28 | sample_targets_NA18534: 29 | - aliases: 30 | 1: &clr_reads NA18534_hgsvc_pbsq2-clr_1000 31 | - defaults: 32 | hap_reads: *clr_reads 33 | vc_reads: *clr_reads 34 | sseq_reads: *sseq_reads 35 | pol_reads: *clr_reads 36 | pol_pass: arrow-p1 37 | hap_assm_mode: split 38 | hap: 39 | - h1-un 40 | - h2-un 41 | - target: 42 | nhr_assembler: flye 43 | hap_assembler: flye 44 | var_caller: longshot 45 | - target: 46 | nhr_assembler: jax27 47 | hap_assembler: flye 48 | var_caller: longshot 49 | 50 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EAS/KHV/hg02018.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG02018: 3 | individual: HG02018 4 | sex: female 5 | super_population: EAS 6 | population: KHV 7 | family: VN047 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG02018_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "no phased assembly possible" 16 | - strandseq: 17 | readset: &sseq_reads HG02018_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads HG02018_1kg_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB36890 24 | load_type: complete 25 | comment: "698 cohort" 26 | 27 | 28 | sample_targets_HG02018: 29 | - aliases: 
#!/usr/bin/env python3
"""
Sanity check for a haplotag table.

The input is a tab-separated table without header row whose four columns are
read as readname, haplotype, phaseset and chromosome; lines starting with
'#' are ignored. The script prints the share of rows per haplotype label and
the number of read names shared between the H1 / H2 / none groups.
"""

import os
import io
import argparse

import pandas as pd


def parse_args():
    """Parse the single positional argument ``tags`` (path to the table)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('tags', type=str)
    args = parser.parse_args()
    return args


def check_haplotags(file_path):
    """
    Print haplotype percentages and pairwise read-name overlaps.

    Parameters:
        file_path: path to the tab-separated haplotag table.

    Returns:
        None; all results are written to stdout.
    """
    names = ['readname', 'haplotype', 'phaseset', 'chromosome']
    # Only the first two columns are evaluated below, so skip parsing and
    # storing phaseset / chromosome. The full name list is still passed so
    # that the column positions stay the same.
    df = pd.read_csv(
        file_path,
        sep='\t',
        comment='#',
        header=None,
        names=names,
        usecols=['readname', 'haplotype']
    )

    hap_counts = df['haplotype'].value_counts()
    total = hap_counts.sum()

    print('--- percent tagged')

    for hap, count in hap_counts.items():
        print(hap, round(count / total * 100, 2))

    h1_reads = set(df.loc[df['haplotype'] == 'H1', 'readname'].values)
    h2_reads = set(df.loc[df['haplotype'] == 'H2', 'readname'].values)
    untagged = set(df.loc[df['haplotype'] == 'none', 'readname'].values)

    print('--- intersect')

    # A read name should carry exactly one label, i.e. all three counts are
    # expected to be zero for a consistent table.
    print('h1 v h2 ', len(h1_reads.intersection(h2_reads)))
    print('h1 v h0 ', len(h1_reads.intersection(untagged)))
    print('h2 v h0 ', len(h2_reads.intersection(untagged)))

    return


def main():
    """Command line entry point; returns 0."""
    args = parse_args()
    check_haplotags(args.tags)

    return 0


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""
Split a VCF into one uncompressed VCF per selected chromosome.

The contig names found in the VCF header are filtered with a regular
expression; for every match, ``bcftools view --regions <contig>`` is run and
its output written to the path obtained by formatting the output pattern
with the contig name.
"""

import os
import argparse
import re
import io
import subprocess

import pysam


def parse_args():
    """
    Parse the command line.

    --vcf-in / -i       input VCF (stored as ``input``)
    --out-pattern / -o  output path pattern with one ``{}`` placeholder for
                        the contig name (stored as ``output``)
    --chromosomes / -c  regular expression selecting contigs (stored as
                        ``chrom``); surrounding double quotes are stripped
                        by the caller
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vcf-in', '-i', dest='input', type=str)
    parser.add_argument('--out-pattern', '-o', dest='output', type=str)
    parser.add_argument('--chromosomes', '-c', dest='chrom', default='"^chr[0-9]+$"')
    args = parser.parse_args()
    return args


def main():
    """
    Run the per-chromosome split.

    Returns:
        0 on success.

    Raises:
        RuntimeError: if a bcftools call exits with a non-zero status; the
            message is the captured stdout/stderr of that call.
    """
    args = parse_args()

    chrom_match = re.compile(args.chrom.strip('"'))

    with pysam.VariantFile(args.input) as vcf:
        # A compiled pattern is not callable - the match has to go through
        # its .match() method.
        collected_chroms = [c for c in vcf.header.contigs if chrom_match.match(c) is not None]

    out_pattern = args.output.strip('"')
    for c in collected_chroms:
        out_file = out_pattern.format(c)
        # abspath() first: for a pattern without directory part dirname()
        # returns '' and os.makedirs('') raises FileNotFoundError.
        out_path = os.path.dirname(os.path.abspath(out_file))
        os.makedirs(out_path, exist_ok=True)
        # Argument list instead of a formatted shell string: contig names and
        # paths are passed to bcftools literally (no word splitting, no glob
        # expansion by a shell).
        call = [
            'bcftools', 'view',
            '--regions', c,
            '--output-type', 'v',
            '--output-file', out_file,
            args.input
        ]
        try:
            subprocess.check_output(call, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as spe:
            raise RuntimeError(spe.output.decode('utf-8')) from spe

    print(sorted(collected_chroms))
    return 0


if __name__ == '__main__':
    main()
liblapack=3.8.0 15 | - libopenblas=0.3.6 16 | - libgfortran-ng=7.3.0 17 | - libgcc-ng=9.1.0 18 | - libstdcxx-ng=9.1.0 19 | - libxml2=2.9.9 20 | - lp_solve=5.5.2.5 21 | - openblas=0.3.6 22 | - make=4.2.1 23 | - bwa=0.7.17 24 | - minimap2=2.17 25 | - mummer4 26 | - sambamba=0.7.1 # FIX github.com/biod/sambamba/issues/393 - DO NOT DOWNGRADE 27 | - ldc=1.13.0=h02c9852_1 # FIX https://github.com/bcbio/bcbio-nextgen/issues/3032 - DO NOT UP- or DOWNGRADE 28 | - htslib=1.9 29 | - libdeflate=1.3 # v1.0 may trigger pysam import error - DO NOT DOWNGRADE 30 | - pysam=0.15.3 31 | - samtools=1.9 32 | - bamtools=2.5.1 33 | - bedtools=2.29.0 34 | - bedops=2.4.37 35 | - bcftools=1.9 36 | - fastqc=0.11.8 37 | - freebayes=1.3.1 38 | - longshot=0.4.0 39 | - wtdbg=2.5 40 | - flye=2.7 41 | - canu=2.0 42 | - racon=1.4.10 43 | - lighter=1.1.2 44 | - bcalm=2.2.2 45 | - dnaio=0.4.2 46 | - cutadapt=2.10 47 | - trim-galore=0.6.5 48 | # - bifrost=1.0.3 49 | - graphaligner=1.0.11 50 | - ucsc-bedgraphtobigwig=377 51 | - pigz=2.4 52 | - pip: 53 | - git+https://bitbucket.org/whatshap/whatshap@a3f8c91 54 | -------------------------------------------------------------------------------- /smk_config/demo/na12878.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA12878: 3 | individual: NA12878 4 | sex: female 5 | super_population: EUR 6 | population: CEU 7 | family: 1463 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: NA12878_demo_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: complete 15 | - strandseq: 16 | readset: NA12878_demo_il25k-100pe_sseq 17 | library_fractions: one 18 | 19 | 20 | sample_targets_NA12878: 21 | - aliases: 22 | 1: &ccs_reads NA12878_demo_pbsq2-ccs_1000 23 | 2: &sseq_reads NA12878_demo_il25k-100pe_sseq 24 | - defaults: 25 | hap_reads: *ccs_reads 26 | sseq_reads: *sseq_reads 27 | vc_reads: *ccs_reads 28 | pol_reads: *ccs_reads 29 | hap_assm_mode: split 30 | hap: 31 | - 
h1-un 32 | - h2-un 33 | - target: 34 | nhr_assembler: flye 35 | hap_assembler: flye 36 | var_caller: freebayes 37 | pol_pass: racon-p2 38 | 39 | 40 | data_source_NA12878_demo: 41 | output: 'na12878_demo_local.json' 42 | server: 'localhost' 43 | data_source: '../demo_data/' 44 | collect_files: 45 | - 'fastq.gz' 46 | sort_into: 47 | - 'fastq' 48 | assume_correct_filenames: True 49 | 50 | force_local_copy: False 51 | 52 | link_data_input: 53 | - '../demo_data/NA12878_demo_reference.fasta' 54 | 55 | link_data_output: 56 | - 'references/assemblies/NA12878_demo_reference.fasta' -------------------------------------------------------------------------------- /smk_include/link_data_sources.smk: -------------------------------------------------------------------------------- 1 | 2 | CONFIG_FORCE_LOCAL_COPY = bool(config.get('force_local_copy', False)) 3 | 4 | if not CONFIG_FORCE_LOCAL_COPY: 5 | # making copies can be I/O intensive, 6 | # this should not run on a cluster submit node 7 | localrules: master_link_data_sources 8 | 9 | 10 | rule master_link_data_sources: 11 | """ 12 | This is the place to inject external data 13 | into the pipeline via symlinking 14 | """ 15 | input: 16 | ancient([os.path.abspath(fp) for fp in config.get('link_data_input', [])]) 17 | output: 18 | config.get('link_data_output', []) 19 | run: 20 | input_files = list(input) 21 | output_links = list(output) 22 | 23 | if len(input_files) != len(output_links): 24 | raise RuntimeError('Cannot inject data via sym linking, no 1-to-1 correspondence ' 25 | 'between input and output: {} vs {}'.format(len(input_files), len(output_links))) 26 | 27 | import os 28 | import shutil 29 | for input_file, output_link in zip(input_files, output_links): 30 | assert os.path.isfile(input_file), 'Invalid path to input file for linking/copying: {}'.format(input_file) 31 | os.makedirs(os.path.dirname(output_link), exist_ok=True) 32 | if CONFIG_FORCE_LOCAL_COPY: 33 | shutil.copy(input_file, output_link) 34 | else: 35 | 
#!/usr/bin/env python3
"""
Reduce a FASTA file to the sequences whose header matches a pattern.

Every header line (without the leading '>') is matched against the regular
expression given via --chromosomes; matching records are copied verbatim to
the output file, all other records are dropped.
"""

import os
import argparse
import re
import io


def parse_args():
    """
    Parse the command line.

    --fasta-in / -i     input FASTA (stored as ``input``)
    --fasta-out / -o    output FASTA (stored as ``output``)
    --chromosomes / -c  regular expression selecting records (stored as
                        ``chrom``); surrounding double quotes are stripped
                        by the caller
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--fasta-in', '-i', dest='input', type=str)
    parser.add_argument('--fasta-out', '-o', dest='output', type=str)
    parser.add_argument('--chromosomes', '-c', dest='chrom', default='"^chr[0-9]+$"')
    args = parser.parse_args()
    return args


def main():
    """
    Filter the input FASTA and print the sorted list of retained names.

    Returns:
        0 on success.
    """
    args = parse_args()

    chrom_match = re.compile(args.chrom.strip('"'))

    collected_chroms = []

    # abspath() first: for a bare file name dirname() returns '' and
    # os.makedirs('') raises FileNotFoundError.
    out_path = os.path.dirname(os.path.abspath(args.output))

    # Records are streamed into a temporary sibling file and renamed at the
    # end instead of being buffered completely in memory. Writing to a
    # separate file keeps the input intact while it is being read, also when
    # input and output are the same path.
    tmp_output = args.output + '.tmp'

    try:
        with open(args.input, 'r') as fasta:
            os.makedirs(out_path, exist_ok=True)
            with open(tmp_output, 'w') as dump:
                collect = False
                for line in fasta:
                    if line.startswith('>'):
                        chrom = chrom_match.match(line.strip().strip('>'))
                        # the header decides for all following sequence lines
                        collect = chrom is not None
                        if collect:
                            collected_chroms.append(chrom.group(0))
                    if collect:
                        dump.write(line)
        os.replace(tmp_output, args.output)
    finally:
        # only present if something failed before the rename
        if os.path.exists(tmp_output):
            os.remove(tmp_output)

    print(sorted(collected_chroms))
    return 0


if __name__ == '__main__':
    main()
*clr_hgsvc 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - defaults: 45 | hap_reads: *clr_hgsvc 46 | vc_reads: *clr_hgsvc 47 | sseq_reads: *sseq_reads 48 | pol_reads: *clr_hgsvc 49 | pol_pass: arrow-p1 50 | hap_assm_mode: split 51 | hap: 52 | - h1-un 53 | - h2-un 54 | - target: 55 | nhr_assembler: hhu26 56 | hap_assembler: flye 57 | var_caller: longshot 58 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/ASW/na19983.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA19983: 3 | individual: NA19983 4 | sex: female 5 | super_population: AFR 6 | population: ASW 7 | family: 2436 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: NA19983_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads NA19983_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads NA19983_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_NA19983: 28 | - aliases: 29 | 2: &clr_hgsvc NA19983_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_hgsvc 32 | vc_reads: *clr_hgsvc 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_hgsvc 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - defaults: 45 | hap_reads: *clr_hgsvc 46 | vc_reads: *clr_hgsvc 47 | sseq_reads: *sseq_reads 48 | pol_reads: *clr_hgsvc 49 | pol_pass: arrow-p1 50 | hap_assm_mode: split 51 | hap: 52 | - h1-un 53 | - h2-un 54 | - target: 55 | nhr_assembler: hhu26 56 | hap_assembler: 
flye 57 | var_caller: longshot 58 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/ESN/hg03371.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03371: 3 | individual: HG03371 4 | sex: male 5 | super_population: AFR 6 | population: ESN 7 | family: NG98 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG03371_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03371_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03371_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG03371: 28 | - aliases: 29 | 2: &clr_hgsvc HG03371_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_hgsvc 32 | vc_reads: *clr_hgsvc 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_hgsvc 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - defaults: 45 | hap_reads: *clr_hgsvc 46 | vc_reads: *clr_hgsvc 47 | sseq_reads: *sseq_reads 48 | pol_reads: *clr_hgsvc 49 | pol_pass: arrow-p1 50 | hap_assm_mode: split 51 | hap: 52 | - h1-un 53 | - h2-un 54 | - target: 55 | nhr_assembler: hhu26 56 | hap_assembler: flye 57 | var_caller: longshot 58 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/GWD/hg02587.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG02587: 3 | individual: HG02587 4 | sex: female 5 | super_population: AFR 6 | population: GWD 7 | family: GB24 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: 
HG02587_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG02587_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG02587_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG02587: 28 | - aliases: 29 | 2: &clr_hgsvc HG02587_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_hgsvc 32 | vc_reads: *clr_hgsvc 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_hgsvc 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - defaults: 45 | hap_reads: *clr_hgsvc 46 | vc_reads: *clr_hgsvc 47 | sseq_reads: *sseq_reads 48 | pol_reads: *clr_hgsvc 49 | pol_pass: arrow-p1 50 | hap_assm_mode: split 51 | hap: 52 | - h1-un 53 | - h2-un 54 | - target: 55 | nhr_assembler: hhu26 56 | hap_assembler: flye 57 | var_caller: longshot 58 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/MSL/hg03065.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG03065: 3 | individual: HG03065 4 | sex: male 5 | super_population: AFR 6 | population: MSL 7 | family: SL05 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG03065_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG03065_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG03065_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG03065: 28 | - aliases: 29 | 2: 
&clr_hgsvc HG03065_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_hgsvc 32 | vc_reads: *clr_hgsvc 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_hgsvc 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - defaults: 45 | hap_reads: *clr_hgsvc 46 | vc_reads: *clr_hgsvc 47 | sseq_reads: *sseq_reads 48 | pol_reads: *clr_hgsvc 49 | pol_pass: arrow-p1 50 | hap_assm_mode: split 51 | hap: 52 | - h1-un 53 | - h2-un 54 | - target: 55 | nhr_assembler: hhu26 56 | hap_assembler: flye 57 | var_caller: longshot 58 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AMR/CLM/hg01114.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG01114: 3 | individual: HG01114 4 | sex: female 5 | super_population: AMR 6 | population: CLM 7 | family: CLM03 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG01114_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | - strandseq: 16 | readset: &sseq_reads HG01114_hgsvc_ilnxs-80pe_sseq 17 | source_type: ftp 18 | library_fractions: one 19 | - short_reads: 20 | readset: &short_reads HG01114_1kg_ilnvs-150pe_short 21 | source_type: ena 22 | bioproject: PRJEB36890 23 | load_type: complete 24 | comment: "698 cohort" 25 | 26 | 27 | sample_targets_HG01114: 28 | - aliases: 29 | 2: &clr_hgsvc HG01114_hgsvc_pbsq2-clr_1000 30 | - defaults: 31 | hap_reads: *clr_hgsvc 32 | vc_reads: *clr_hgsvc 33 | sseq_reads: *sseq_reads 34 | pol_reads: *clr_hgsvc 35 | pol_pass: arrow-p1 36 | hap_assm_mode: split 37 | hap: 38 | - h1-un 39 | - h2-un 40 | - target: 41 | nhr_assembler: flye 42 | hap_assembler: flye 43 | var_caller: longshot 44 | - defaults: 45 | hap_reads: *clr_hgsvc 46 | vc_reads: *clr_hgsvc 47 | sseq_reads: *sseq_reads 48 | pol_reads: 
*clr_hgsvc 49 | pol_pass: arrow-p1 50 | hap_assm_mode: split 51 | hap: 52 | - h1-un 53 | - h2-un 54 | - target: 55 | nhr_assembler: jax27 56 | hap_assembler: flye 57 | var_caller: longshot 58 | -------------------------------------------------------------------------------- /environment/conda/conda_evaltools.yml: -------------------------------------------------------------------------------- 1 | name: evaltools 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - Python=3.6.* 7 | - pip=19.2.3 8 | - openssl=1.0.2t 9 | - gxx_impl_linux-64=7.3.0 10 | - gxx_linux-64=7.3.0 11 | - ld_impl_linux-64=2.33.1 12 | - libblas=3.8.0 13 | - libcblas=3.8.0 14 | - liblapack=3.8.0 15 | - libopenblas=0.3.6 16 | - libgfortran-ng=7.3.0 17 | - libgcc-ng=9.1.0 18 | - libstdcxx-ng=9.1.0 19 | - libxml2=2.9.9 20 | - lp_solve=5.5.2.5 21 | - openblas=0.3.6 22 | - make=4.2.1 23 | - bwa=0.7.17 24 | - minimap2=2.17 25 | - mummer4 26 | - sambamba=0.7.1 # FIX github.com/biod/sambamba/issues/393 - DO NOT DOWNGRADE 27 | - ldc=1.13.0=h02c9852_1 # FIX https://github.com/bcbio/bcbio-nextgen/issues/3032 - DO NOT UP- or DOWNGRADE 28 | - htslib=1.9 29 | - libdeflate=1.3 # v1.0 may trigger pysam import error - DO NOT DOWNGRADE 30 | - pysam=0.15.3 31 | - samtools=1.9 32 | - bamtools=2.5.1 33 | - bedtools=2.29.2 # FIX https://github.com/arq5x/bedtools2/issues/779 - DO NOT DOWNGRADE 34 | - bedops=2.4.37 35 | - bcftools=1.9 36 | - fastqc=0.11.8 37 | - freebayes=1.3.1 38 | - longshot=0.4.0 39 | - wtdbg=2.5 40 | - flye=2.7 41 | - canu=2.0 42 | - racon=1.4.10 43 | - lighter=1.1.2 44 | - bcalm=2.2.2 45 | - dnaio=0.4.2 46 | - cutadapt=2.10 47 | - trim-galore=0.6.5 48 | - graphaligner=1.0.11 49 | - ucsc-bedgraphtobigwig=377 50 | - ucsc-bigwigaverageoverbed=377 51 | - ucsc-bigwigcorrelate=377 52 | - ucsc-bigwigmerge=377 53 | - ucsc-bigwigcluster=377 54 | - pigz=2.4 55 | - pandas=1.0.5 56 | - pytables=3.6.1 57 | - intervaltree=3.0.2 58 | 
-------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AMR/MXL/na19650.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA19650: 3 | individual: NA19650 4 | sex: male 5 | super_population: AMR 6 | population: MXL 7 | family: m001 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: NA19650_hgsvc_pbsq2-clr 12 | technology: pacbio 13 | data_type: pacbio_native 14 | load_type: parts 15 | comment: "seq_center:UMIGS" 16 | - strandseq: 17 | readset: &sseq_reads NA19650_hgsvc_ilnxs-80pe_sseq 18 | source_type: ftp 19 | library_fractions: one 20 | - short_reads: 21 | readset: &short_reads NA19650_1kg_ilnvs-150pe_short 22 | source_type: ena 23 | bioproject: PRJEB36890 24 | load_type: complete 25 | comment: "698 cohort" 26 | 27 | 28 | sample_targets_NA19650: 29 | - aliases: 30 | 1: &clr_reads NA19650_hgsvc_pbsq2-clr_1000 31 | - defaults: 32 | hap_reads: *clr_reads 33 | vc_reads: *clr_reads 34 | sseq_reads: *sseq_reads 35 | pol_reads: *clr_reads 36 | pol_pass: arrow-p1 37 | hap_assm_mode: split 38 | hap: 39 | - h1-un 40 | - h2-un 41 | - target: 42 | nhr_assembler: flye 43 | hap_assembler: flye 44 | var_caller: longshot 45 | - defaults: 46 | hap_reads: *clr_reads 47 | vc_reads: *clr_reads 48 | sseq_reads: *sseq_reads 49 | pol_reads: *clr_reads 50 | pol_pass: arrow-p1 51 | hap_assm_mode: split 52 | hap: 53 | - h1-un 54 | - h2-un 55 | - target: 56 | nhr_assembler: uw27 57 | hap_assembler: flye 58 | var_caller: longshot 59 | -------------------------------------------------------------------------------- /annotation/sample_table.tsv: -------------------------------------------------------------------------------- 1 | individual sex super_population population family member HiFi CLR 2020_SKIP 2 | HG02011 male AFR ACB BB13 child 0 1 0 3 | NA19983 female AFR ASW 2436 child 0 1 0 4 | HG03125 female AFR ESN NG34 child 1 0 0 5 | HG03371 male AFR ESN NG98 
child 0 1 0 6 | HG02587 female AFR GWD GB24 child 0 1 0 7 | HG02818 female AFR GWD GB66 child 1 0 0 8 | NA19036 female AFR LWK NA19036 unrelated 0 1 1 9 | HG03065 male AFR MSL SL05 child 0 1 0 10 | HG03486 female AFR MSL SL61 child 1 0 0 11 | NA19238 female AFR YRI Y117 parent 1 1 0 12 | NA19239 male AFR YRI Y117 parent 1 1 0 13 | NA19240 female AFR YRI Y117 child 1 1 0 14 | HG01114 female AMR CLM CLM03 child 0 1 0 15 | NA19650 male AMR MXL m001 child 0 1 0 16 | HG01573 female AMR PEL PEL003 child 0 1 1 17 | HG00731 male AMR PUR PR05 parent 1 1 0 18 | HG00732 female AMR PUR PR05 parent 1 1 0 19 | HG00733 female AMR PUR PR05 child 1 1 0 20 | HG00864 female EAS CDX HG00864 unrelated 0 1 0 21 | NA18534 male EAS CHB NA18534 unrelated 0 1 0 22 | HG00512 male EAS CHS SH032 parent 1 1 0 23 | HG00513 female EAS CHS SH032 parent 1 1 0 24 | HG00514 female EAS CHS SH032 child 1 1 0 25 | NA18939 female EAS JPT NA18939 unrelated 0 1 0 26 | HG01596 male EAS KHV VN002 unrelated 0 1 0 27 | HG02018 female EAS KHV VN047 child 0 1 1 28 | NA24385 male EUR ASK 3140 child 1 0 0 29 | NA12329 female EUR CEU 1328 child 0 1 0 30 | NA12878 female EUR CEU 1463 child 1 0 0 31 | HG00171 female EUR FIN HG00171 unrelated 0 1 0 32 | HG00096 male EUR GBR HG00096 unrelated 0 1 0 33 | HG01505 male EUR IBS IBS002 child 0 1 0 34 | NA20509 male EUR TSI NA20509 unrelated 0 1 0 35 | HG03009 male SAS BEB HG03009 unrelated 0 1 0 36 | NA20847 female SAS GIH NA20847 unrelated 0 1 0 37 | HG03721 female SAS ITU IT003 parent 0 1 1 38 | HG03732 male SAS ITU IT003 child 0 1 0 39 | HG02492 male SAS PJL PK06 child 0 1 0 40 | HG03683 female SAS STU ST012 child 0 1 0 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | 
build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # PyCharm idea folder 107 | .idea/ 108 | 109 | # Cached data 110 | cache* 111 | .cache* -------------------------------------------------------------------------------- /docs/demo.md: -------------------------------------------------------------------------------- 1 | # DEMO 2 | 3 | ## Running the pipeline demo data 4 | 5 | The following instructions assume that you have read the [tutorial](tutorial.md) 6 | at least up to the point "*Snakemake execution 
environment*". Your working directory 7 | should thus look as follows: 8 | 9 | ```bash 10 | /work_dir$ ls -1 11 | project-diploid-assembly/ 12 | smk_env/ 13 | ``` 14 | 15 | Please download the [demo data (DOI: 10.5281/zenodo.3746293)](https://doi.org/10.5281/zenodo.3746293) 16 | into your working directory (~ 6.2 GB), and extract the gzipped tar: 17 | 18 | ```bash 19 | /work_dir$ tar xzvf pipeline_demo.tar.gz 20 | ``` 21 | 22 | After this operation, your working directory should look like this: 23 | 24 | ```bash 25 | /work_dir$ ls -1 26 | demo_data/ 27 | pipeline_demo.tar.gz 28 | project-diploid-assembly/ 29 | smk_env/ 30 | ``` 31 | 32 | The pipeline repository contains a Snakemake *profile* that specifies a compute environment 33 | with **24 CPU cores** and **64 GB of main memory**. You can either use the Snakemake *profile* and the 34 | pipeline run environment configuration that are shipped with the pipeline code in the repository, 35 | or you can use your own based on the information given in the [tutorial](tutorial.md). 36 | In both cases, please proceed to the instructions how to [execute the pipeline](execute.md). 37 | 38 | ## How to interpret the results of the demo 39 | 40 | In all brevity, just don't. The demo data is a heavily downsampled version of a publicly 41 | available PacBio Sequel-2 HiFi/CCS dataset retrieved from EBI/ENA (PRJNA540705), 42 | and of the respective Strand-seq data (PRJEB14185). The objective was to create a dataset 43 | that could be processed from start to finish with moderate resources and within a reasonable 44 | amount of time (less than 24 hours). A successful run of the demo data is a "proof of function" 45 | for the pipeline, but it does not generate "biologically interesting" results. 
-------------------------------------------------------------------------------- /environment/snakemake/cluster/denbi_tu_pbs/denbi_cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__": { 3 | "cores": "{threads}", 4 | "mem_mb": "{resources.mem_total_mb}", 5 | "name": "{jobid}_{rule}", 6 | "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout", 7 | "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr", 8 | "run_hrs": "{resources.runtime_hrs}", 9 | "run_min": "{resources.runtime_min}" 10 | }, 11 | "handle_partial_fastq_download_request": { 12 | "cores": "36", 13 | "mem_mb": "{resources.mem_total_mb}", 14 | "name": "{jobid}_{rule}", 15 | "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout", 16 | "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr", 17 | "run_hrs": "{resources.runtime_hrs}", 18 | "run_min": "{resources.runtime_min}" 19 | }, 20 | "merge_fastq_input_parts": { 21 | "cores": "36", 22 | "mem_mb": "{resources.mem_total_mb}", 23 | "name": "{jobid}_{rule}", 24 | "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout", 25 | "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr", 26 | "run_hrs": "{resources.runtime_hrs}", 27 | "run_min": "{resources.runtime_min}" 28 | }, 29 | "strandseq_dga_split_merge_tag_groups": { 30 | "cores": "18", 31 | "mem_mb": "{resources.mem_total_mb}", 32 | "name": "{jobid}_{rule}", 33 | "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout", 34 | "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr", 35 | "run_hrs": "{resources.runtime_hrs}", 36 | "run_min": "{resources.runtime_min}" 37 | }, 38 | "strandseq_dga_split_haplo_tagging": { 39 | "cores": "36", 40 | "mem_mb": "{resources.mem_total_mb}", 41 | "name": "{jobid}_{rule}", 42 | "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout", 43 | "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr", 44 | "run_hrs": "{resources.runtime_hrs}", 45 | "run_min": "{resources.runtime_min}" 46 | } 47 | 
} 48 | -------------------------------------------------------------------------------- /smk_include/results/run_eur_trios.smk: -------------------------------------------------------------------------------- 1 | 2 | localrules: run_hg00171_individual, 3 | run_na20509_individual, 4 | run_hg00096_individual, 5 | run_eur_trios, 6 | run_ceu_trio, 7 | run_ceu_child, 8 | run_ibs_trio, 9 | run_ibs_child, 10 | 11 | 12 | rule run_hg00171_individual: 13 | input: 14 | 'output/targets/EUR_FIN_HG00171/HG00171.fofn' 15 | message: 'Running EUR-FIN-HG00171 individual' 16 | 17 | ####################################################### 18 | 19 | rule run_na20509_individual: 20 | input: 21 | 'output/targets/EUR_TSI_NA20509/NA20509.fofn' 22 | message: 'Running EUR-TSI-NA20509 individual' 23 | 24 | ####################################################### 25 | 26 | rule run_hg00096_individual: 27 | input: 28 | 'output/targets/EUR_GBR_HG00096/HG00096.fofn' 29 | message: 'Running EUR-GBR-HG00096 individual' 30 | 31 | ####################################################### 32 | 33 | rule run_ceu_child: 34 | input: 35 | 'output/targets/EUR_CEU_1328/NA12329.fofn' 36 | message: 'Running EUR-CEU-1328 child' 37 | 38 | rule run_ceu_trio: 39 | input: 40 | rules.run_ceu_child.input, 41 | message: 'Running EUR-CEU-1328 trio' 42 | 43 | ######################################################## 44 | 45 | rule run_ibs_child: 46 | input: 47 | 'output/targets/EUR_IBS_IBS002/HG01505.fofn' 48 | message: 'Running EUR-IBS-IBS002 child' 49 | 50 | rule run_ibs_trio: 51 | input: 52 | rules.run_ibs_child.input, 53 | message: 'Running EUR-IBS-IBS002 trio' 54 | 55 | ######################################################## 56 | 57 | rule run_eur_trios: 58 | input: 59 | rules.run_hg00171_individual.input, 60 | rules.run_na20509_individual.input, 61 | rules.run_hg00096_individual.input, 62 | rules.run_ceu_trio.input, 63 | rules.run_ibs_trio.input, 64 | message: 'Running EUR trios' 65 | 
-------------------------------------------------------------------------------- /annotation/NA24385_selected_libraries_sseq.csv: -------------------------------------------------------------------------------- 1 | HG002x01PE20301 2 | HG002x01PE20303 3 | HG002x01PE20305 4 | HG002x01PE20306 5 | HG002x01PE20307 6 | HG002x01PE20308 7 | HG002x01PE20313 8 | HG002x01PE20315 9 | HG002x01PE20318 10 | HG002x01PE20319 11 | HG002x01PE20325 12 | HG002x01PE20327 13 | HG002x01PE20328 14 | HG002x01PE20329 15 | HG002x01PE20331 16 | HG002x01PE20332 17 | HG002x01PE20334 18 | HG002x01PE20335 19 | HG002x01PE20336 20 | HG002x01PE20337 21 | HG002x01PE20339 22 | HG002x01PE20340 23 | HG002x01PE20341 24 | HG002x01PE20342 25 | HG002x01PE20343 26 | HG002x01PE20345 27 | HG002x01PE20347 28 | HG002x01PE20350 29 | HG002x01PE20351 30 | HG002x01PE20352 31 | HG002x01PE20353 32 | HG002x01PE20355 33 | HG002x01PE20356 34 | HG002x01PE20357 35 | HG002x01PE20358 36 | HG002x01PE20359 37 | HG002x01PE20361 38 | HG002x01PE20362 39 | HG002x01PE20363 40 | HG002x01PE20364 41 | HG002x01PE20367 42 | HG002x01PE20368 43 | HG002x01PE20374 44 | HG002x01PE20376 45 | HG002x01PE20377 46 | HG002x01PE20378 47 | HG002x01PE20379 48 | HG002x01PE20381 49 | HG002x01PE20387 50 | HG002x01PE20388 51 | HG002x01PE20389 52 | HG002x01PE20391 53 | HG002x01PE20392 54 | HG002x01PE20393 55 | HG002x02PE20403 56 | HG002x02PE20405 57 | HG002x02PE20407 58 | HG002x02PE20414 59 | HG002x02PE20416 60 | HG002x02PE20417 61 | HG002x02PE20419 62 | HG002x02PE20421 63 | HG002x02PE20422 64 | HG002x02PE20424 65 | HG002x02PE20425 66 | HG002x02PE20428 67 | HG002x02PE20430 68 | HG002x02PE20431 69 | HG002x02PE20432 70 | HG002x02PE20434 71 | HG002x02PE20435 72 | HG002x02PE20439 73 | HG002x02PE20440 74 | HG002x02PE20443 75 | HG002x02PE20445 76 | HG002x02PE20450 77 | HG002x02PE20452 78 | HG002x02PE20454 79 | HG002x02PE20456 80 | HG002x02PE20458 81 | HG002x02PE20464 82 | HG002x02PE20466 83 | HG002x02PE20467 84 | HG002x02PE20469 85 | HG002x02PE20473 86 | 
HG002x02PE20479 87 | HG002x02PE20483 88 | HG002x02PE20484 89 | HG002x02PE20485 90 | HG002x02PE20486 91 | HG002x02PE20487 92 | HG002x02PE20488 93 | HG002x02PE20490 94 | HG002x02PE20491 95 | HG002x02PE20492 96 | HG002x02PE20494 97 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/YRI/na19240.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA19240: 3 | individual: NA19240 4 | sex: female 5 | super_population: AFR 6 | population: YRI 7 | family: Y117 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: NA19240_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: NA19240_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | - strandseq: 21 | readset: &sseq_reads NA19240_1kg_il25k-npe_sseq 22 | source_type: ena 23 | bioproject: PRJEB12849 24 | library_fractions: two 25 | - short_reads: 26 | readset: NA19240_1kg_il25k-125pe_short 27 | source_type: ena 28 | bioproject: PRJEB9396 29 | load_type: parts 30 | - short_reads: 31 | readset: NA19240_1kg_ilnvs-150pe_short 32 | source_type: ena 33 | bioproject: PRJEB36890 34 | load_type: complete 35 | comment: "698 cohort" 36 | 37 | 38 | sample_targets_NA19240: 39 | - aliases: 40 | 1: &ccs_reads NA19240_hgsvc_pbsq2-ccs_1000 41 | 2: &clr_reads NA19240_hgsvc_pbsq2-clr_1000 42 | - defaults: 43 | hap_reads: *ccs_reads 44 | vc_reads: *ccs_reads 45 | sseq_reads: *sseq_reads 46 | pol_reads: *ccs_reads 47 | pol_pass: racon-p2 48 | hap_assm_mode: split 49 | hap: 50 | - h1-un 51 | - h2-un 52 | - target: 53 | nhr_assembler: pereg 54 | hap_assembler: pereg 55 | var_caller: deepvar 56 | - defaults: 57 | hap_reads: *clr_reads 58 | vc_reads: *clr_reads 59 | sseq_reads: *sseq_reads 60 | pol_reads: *clr_reads 61 | pol_pass: arrow-p1 62 | hap_assm_mode: split 63 | hap: 64 | - h1-un 65 | - h2-un 66 | - 
target: 67 | nhr_assembler: uw27 68 | hap_assembler: flye 69 | var_caller: longshot -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Project repository: Phased Genome Assembly using Strand-seq (PGAS) 2 | 3 | ## Citation 4 | 5 | If you use this pipeline or extract and reuse original code/rules from this repository, 6 | please cite the following two papers: 7 | 8 | > Porubsky and Ebert et al. 9 | > "Fully Phased Human Genome Assembly without Parental Data Using Single-Cell Strand Sequencing and Long Reads." 10 | > Nature Biotechnology, December 2020 11 | > [DOI: 10.1038/s41587-020-0719-5](https://doi.org/10.1038/s41587-020-0719-5) 12 | 13 | > Ebert, Audano, Zhu and Rodriguez-Martin et al. 14 | > "Haplotype-resolved diverse human genomes and integrated analysis of structural variation" 15 | > Science, February 2021 16 | > [DOI: 10.1126/science.abf7117](https://doi.org/10.1126/science.abf7117) 17 | 18 | #### Deprecated citations 19 | 20 | Please do not reference the preprints ([10.1101/855049](https://doi.org/10.1101/855049) and [10.1101/2020.12.16.423102](https://doi.org/10.1101/2020.12.16.423102)) anymore. 21 | 22 | ## Scope of this repository 23 | 24 | This repository contains the Snakemake pipeline code plus some auxiliary scripts to go from raw 25 | input data to polished haploid assemblies. Any self-contained, general purpose software tool used in 26 | the pipeline is either available via conda/bioconda, or via github. In any case, the pipeline 27 | implementation covers the entire software setup required for a complete pipeline run. 28 | 29 | In particular, the code for the `SaaRclust`, `StrandPhaseR` and `breakpointR` R packages is 30 | available in [David Porubsky's github](https://github.com/daewoooo/SaaRclust). 
31 | 32 | ## Documentation 33 | 34 | There are several step-by-step manuals available that describe all use cases currently supported 35 | for this pipeline. First-time users should start by reading the [tutorial](docs/tutorial.md). 36 | If you encounter any problems or "strange behaviour" during pipeline execution, please check 37 | the [FAQ](docs/faq.md) for explanations and solutions. If this does not help, please open a 38 | [github issue](https://guides.github.com/features/issues). -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AFR/YRI/na19238.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA19238: 3 | individual: NA19238 4 | sex: female 5 | super_population: AFR 6 | population: YRI 7 | family: Y117 8 | member: parent 9 | data_sources: 10 | - long_reads: 11 | readset: NA19238_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: NA19238_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | - strandseq: 21 | readset: &sseq_reads NA19238_1kg_il25k-npe_sseq 22 | source_type: ena 23 | bioproject: PRJEB12849 24 | library_fractions: two 25 | - short_reads: 26 | readset: NA19238_1kg_il25k-125pe_short 27 | source_type: ena 28 | bioproject: PRJEB9396 29 | load_type: parts 30 | - short_reads: 31 | readset: NA19238_1kg_ilnvs-150pe_short 32 | source_type: ena 33 | bioproject: PRJEB31736 34 | load_type: complete 35 | comment: "2504 cohort" 36 | 37 | 38 | sample_targets_NA19238: 39 | - aliases: 40 | 1: &ccs_reads NA19238_hgsvc_pbsq2-ccs_1000 41 | 2: &clr_reads NA19238_hgsvc_pbsq2-clr_1000 42 | - defaults: 43 | hap_reads: *ccs_reads 44 | vc_reads: *ccs_reads 45 | sseq_reads: *sseq_reads 46 | pol_reads: *ccs_reads 47 | pol_pass: racon-p2 48 | hap_assm_mode: split 49 | hap: 50 | - h1-un 51 | - h2-un 52 | - target: 53 | nhr_assembler: pereg 54 | 
hap_assembler: pereg 55 | var_caller: deepvar 56 | - target: 57 | nhr_assembler: flye 58 | hap_assembler: flye 59 | var_caller: freebayes 60 | - defaults: 61 | hap_reads: *clr_reads 62 | vc_reads: *clr_reads 63 | sseq_reads: *sseq_reads 64 | pol_reads: *clr_reads 65 | pol_pass: arrow-p1 66 | hap_assm_mode: split 67 | hap: 68 | - h1-un 69 | - h2-un 70 | - target: 71 | nhr_assembler: jax27 72 | hap_assembler: flye 73 | var_caller: longshot 74 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AMR/PUR/hg00732.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00732: 3 | individual: HG00732 4 | sex: female 5 | super_population: AMR 6 | population: PUR 7 | family: PR05 8 | member: parent 9 | data_sources: 10 | - long_reads: 11 | readset: HG00732_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: HG00732_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | - strandseq: 21 | readset: &sseq_reads HG00732_1kg_il25k-npe_sseq 22 | source_type: ena 23 | bioproject: PRJEB12849 24 | library_fractions: two 25 | - short_reads: 26 | readset: &short_reads HG00732_1kg_il25k-125pe_short 27 | source_type: ena 28 | bioproject: PRJEB9396 29 | load_type: parts 30 | - short_reads: 31 | readset: HG00732_1kg_ilnvs-150pe_short 32 | source_type: ena 33 | bioproject: PRJEB31736 34 | load_type: complete 35 | comment: "2504 cohort" 36 | 37 | 38 | sample_targets_HG00732: 39 | - aliases: 40 | 1: &ccs_reads HG00732_hgsvc_pbsq2-ccs_1000 41 | 2: &clr_reads HG00732_hgsvc_pbsq2-clr_1000 42 | - defaults: 43 | hap_reads: *ccs_reads 44 | vc_reads: *ccs_reads 45 | sseq_reads: *sseq_reads 46 | pol_reads: *ccs_reads 47 | pol_pass: racon-p2 48 | hap_assm_mode: split 49 | hap: 50 | - h1-un 51 | - h2-un 52 | - target: 53 | nhr_assembler: pereg 54 | hap_assembler: pereg 55 | 
var_caller: deepvar 56 | - defaults: 57 | hap_reads: *clr_reads 58 | vc_reads: *clr_reads 59 | sseq_reads: *sseq_reads 60 | pol_reads: *clr_reads 61 | pol_pass: arrow-p1 62 | hap_assm_mode: split 63 | hap: 64 | - h1-un 65 | - h2-un 66 | - target: 67 | nhr_assembler: flye 68 | hap_assembler: flye 69 | var_caller: longshot 70 | - target: 71 | nhr_assembler: uw27 72 | hap_assembler: flye 73 | var_caller: longshot 74 | -------------------------------------------------------------------------------- /scripts/dev/cluster_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import io 5 | import dnaio 6 | 7 | infile = 'HG03486_hgsvc_pbsq2-ccs_1000_scV12-pereg.fasta.backup' 8 | outfile = 'HG03486_hgsvc_pbsq2-ccs_1000_scV12-pereg.fasta' 9 | 10 | target_cluster = 'cluster1' 11 | 12 | head_buffer = io.StringIO() 13 | tail_buffer = io.StringIO() 14 | 15 | current_buffer = head_buffer 16 | 17 | cluster_buffer = io.StringIO() 18 | 19 | with open(infile, 'r') as fasta: 20 | for line in fasta: 21 | if line.startswith('>'): 22 | if line.strip() == '>{}'.format(target_cluster): 23 | current_buffer = cluster_buffer 24 | continue 25 | elif cluster_buffer.tell() > 0: 26 | current_buffer = tail_buffer 27 | else: 28 | current_buffer = head_buffer 29 | current_buffer.write(line) 30 | 31 | splitter = 'N' * 100 32 | 33 | cluster_seq = cluster_buffer.getvalue().replace('\n', '').split(splitter) 34 | print('num contigs: {}'.format(len(cluster_seq))) 35 | seq_sizes = [len(s) for s in cluster_seq] 36 | print(seq_sizes) 37 | 38 | print(head_buffer.tell()) 39 | print(tail_buffer.tell()) 40 | 41 | cluster_buffer = [] 42 | 43 | suffices = ['A', 'B', 'C'] 44 | suffix_idx = 0 45 | 46 | with dnaio.FastaWriter(outfile, line_length=80) as fasta: 47 | current_block = 0 48 | for seq_size, seq in zip(seq_sizes, cluster_seq): 49 | current_block += seq_size 50 | cluster_buffer.append(seq) 51 | if current_block > 150e6: 
52 | print('Writing block size: {}'.format(current_block)) 53 | block_name = target_cluster + suffices[suffix_idx] 54 | fasta.write(block_name, splitter.join(cluster_buffer)) 55 | cluster_buffer = [] 56 | current_block = 0 57 | suffix_idx += 1 58 | 59 | 60 | print('Writing block size: {}'.format(current_block)) 61 | block_name = target_cluster + suffices[suffix_idx] 62 | fasta.write(block_name, splitter.join(cluster_buffer)) 63 | 64 | with open(outfile, 'a') as fasta: 65 | _ = fasta.write(tail_buffer.getvalue()) -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EAS/CHS/hg00514.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00514: 3 | individual: HG00514 4 | sex: female 5 | super_population: EAS 6 | population: CHS 7 | family: SH032 8 | member: child 9 | data_sources: 10 | - long_reads: 11 | readset: HG00514_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: HG00514_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | comment: "seq_center:UMIGS" 21 | - strandseq: 22 | readset: &sseq_reads HG00514_1kg_il25k-npe_sseq 23 | source_type: ena 24 | bioproject: PRJEB12849 25 | library_fractions: two 26 | - short_reads: 27 | readset: HG00514_1kg_il25k-125pe_short 28 | source_type: ena 29 | bioproject: PRJEB9396 30 | load_type: parts 31 | - short_reads: 32 | readset: HG00514_1kg_ilnvs-150pe_short 33 | source_type: ena 34 | bioproject: PRJEB36890 35 | load_type: complete 36 | comment: "698 cohort" 37 | 38 | 39 | sample_targets_HG00514: 40 | - aliases: 41 | 1: &ccs_reads HG00514_hgsvc_pbsq2-ccs_1000 42 | 3: &clr_hgsvc HG00514_hgsvc_pbsq2-clr_1000 43 | - defaults: 44 | hap_reads: *ccs_reads 45 | vc_reads: *ccs_reads 46 | sseq_reads: *sseq_reads 47 | pol_reads: *ccs_reads 48 | pol_pass: racon-p2 49 | hap_assm_mode: split 50 | hap: 51 | - h1-un 52 | 
- h2-un 53 | - target: 54 | nhr_assembler: pereg 55 | hap_assembler: pereg 56 | var_caller: deepvar 57 | - target: 58 | nhr_assembler: flye 59 | hap_assembler: flye 60 | var_caller: freebayes 61 | - defaults: 62 | hap_reads: *clr_hgsvc 63 | vc_reads: *clr_hgsvc 64 | sseq_reads: *sseq_reads 65 | pol_reads: *clr_hgsvc 66 | pol_pass: arrow-p1 67 | hap_assm_mode: split 68 | hap: 69 | - h1-un 70 | - h2-un 71 | - target: 72 | nhr_assembler: flye 73 | hap_assembler: flye 74 | var_caller: longshot -------------------------------------------------------------------------------- /scripts/utilities/check_scripts/fasta_checker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import io 5 | import argparse 6 | import hashlib 7 | 8 | import dnaio 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('fasta1', type=str) 13 | parser.add_argument('fasta2', type=str) 14 | args = parser.parse_args() 15 | return args 16 | 17 | 18 | def collect_contig_hashes(fasta_path): 19 | 20 | contigs = dict() 21 | duplicates = list() 22 | 23 | with dnaio.open(fasta_path) as fastx: 24 | for record in fastx: 25 | contig_name = record.name 26 | seq = record.sequence 27 | seq_hash = hashlib.blake2b(seq.encode('ascii')).digest() 28 | if seq_hash in contigs: 29 | # happens for flye assembler 30 | duplicates.append(contig_name) 31 | contigs[seq_hash] = contig_name, len(seq) 32 | 33 | print('assembled contigs {}'.format(len(contigs))) 34 | if duplicates: 35 | print('duplicate contig sequences: {}'.format(sorted(duplicates))) 36 | 37 | return contigs 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | fasta1 = collect_contig_hashes(args.fasta1) 43 | fasta2 = collect_contig_hashes(args.fasta2) 44 | 45 | total1 = sum([x[1] for x in fasta1.values()]) 46 | total2 = sum([x[1] for x in fasta2.values()]) 47 | 48 | intersect = set(fasta1.keys()).intersection(fasta2.keys()) 49 | if 
len(intersect) > 0: 50 | dups1 = [fasta1[h] for h in intersect] 51 | dups2 = [fasta2[h] for h in intersect] 52 | 53 | cluster1 = sum([x[1] for x in dups1]) 54 | cluster2 = sum([x[1] for x in dups2]) 55 | 56 | frac_h1 = round(len(dups1) / len(fasta1) * 100, 2) 57 | frac_h2 = round(len(dups2) / len(fasta2) * 100, 2) 58 | 59 | pct_bp_h1 = round(cluster1 / total1 * 100, 2) 60 | pct_bp_h2 = round(cluster2 / total2 * 100, 2) 61 | 62 | print('HOM frac h1 #ctg', frac_h1) 63 | print('HOM frac h1 pct. bp', pct_bp_h1) 64 | 65 | print('HOM frac h2 #ctg', frac_h2) 66 | print('HOM frac h2 pct. bp', pct_bp_h2) 67 | 68 | return 0 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /smk_config/demo/params.yml: -------------------------------------------------------------------------------- 1 | 2 | # === Software setup settings === 3 | # Specify git commits for SaaRclust 4 | # and StrandPhaseR setup 5 | git_commit_saarclust: ba65b53 6 | git_commit_strandphaser: e608407 7 | git_commit_breakpointr: 268d99d 8 | # arbitrarily tying a version number 9 | # to the git commits to avoid additional 10 | # wildcards - increment this number when 11 | # git commits are changed! 
12 | git_commit_version: 9 13 | 14 | peregrine_version: 0.1.5.5 15 | deepvariant_version: 0.9.0 16 | shasta_version: 0.4.0 17 | 18 | # Assembler settings 19 | shasta_target_coverage: 60 # tech-independent recommendation: cov between 40x and 80x 20 | flye_target_coverage: 50 # dev recommendation: ~30x, but we have enough RAM to go a bit higher 21 | 22 | # SaaRclust parameter sets 23 | # goal is to obtain 24 clusters 24 | min_contig_size: 50000 25 | min_region_to_order: 500000 26 | bin_size: 100000 27 | step_size: 100000 28 | prob_threshold: 0.25 29 | init_clusters: 25 30 | desired_clusters: 10 31 | min_mapq: 60 32 | 33 | # VARIANT CALLING 34 | # Postprocessing parameters 35 | filter_vcf_qual: 10 36 | filter_vcf_gq: 100 37 | 38 | freebayes_timeout_sec: 3600 39 | 40 | # not primary alignment || supplementary alignment 41 | bwa_strandseq_aln_discard: 2304 42 | 43 | # read unmapped || not primary alignment || failed QC || PCR dup 44 | minimap_readref_aln_discard: 1796 45 | 46 | # read unmapped || not primary alignment 47 | minimap_contigref_aln_discard: 260 48 | 49 | # read unmapped || not primary alignment || failed QC || PCR dup 50 | minimap_racon_aln_discard: 1796 # same as 0x704 51 | minimap_racon_aln_min_qual: 10 52 | 53 | # main chromosomes to be used 54 | # for known references for main 55 | # pipeline steps (i.e., everything 56 | # before evaluation) 57 | eval_known_ref: GRCh38_GCA_p13 58 | eval_align_ref: hg38_GCA_p13 59 | eval_gene_model: GRCh38_GENCODEv31_basic 60 | use_genome_size: NA12878_demo_reference 61 | main_chromosomes: 62 | - chr1 63 | - chr2 64 | - chr3 65 | - chr4 66 | - chr5 67 | - chr6 68 | - chr7 69 | - chr8 70 | - chr9 71 | - chr10 72 | - chr11 73 | - chr12 74 | - chr13 75 | - chr14 76 | - chr15 77 | - chr16 78 | - chr17 79 | - chr18 80 | - chr19 81 | - chr20 82 | - chr21 83 | - chr22 84 | - chrX 85 | - chrY 86 | -------------------------------------------------------------------------------- /smk_include/dev/run_all_eval.smk: 
-------------------------------------------------------------------------------- 1 | 2 | include: 'prep_custom_references.smk' 3 | include: 'run_kmer_analysis.smk' 4 | include: 'run_illumina_qv.smk' 5 | include: 'run_tech_comparison.smk' 6 | include: 'run_contig_remap.smk' 7 | include: 'run_bng_hybrids.smk' 8 | 9 | 10 | localrules: master_eval 11 | 12 | wildcard_constraints: 13 | folder_path = '[A-Za-z0-9\-_\/]+', # note: "." is NOT allowed in a folder path 14 | file_name = '[A-Za-z0-9\-_\.]+', 15 | known_ref = 'GRCh3[78][A-Za-z0-9_]+', 16 | genemodel = 'GRCh38[A-Za-z0-9_]+' 17 | 18 | 19 | def quast_busco_determine_targets(wildcards): 20 | """ 21 | Rerun QUAST-LG with manually fixed BUSCO database 22 | to get BUSCO stats as requested 23 | ODB source: 24 | https://busco-data.ezlab.org/v4/data/lineages/eukaryota_odb10.2020-09-10.tar.gz 25 | 26 | NB: this requires a manual fix for QUAST/BUSCO, i.e. adding the above database 27 | to the correct path in QUAST 28 | """ 29 | 30 | genemodel = 'GRCh38_GENCODEv31_basic' 31 | refgenome = 'GRCh38_HGSVC2_noalt' 32 | folder_path = 'evaluation/phased_assemblies' 33 | 34 | fixed_wildcards = { 35 | 'known_ref': refgenome, 36 | 'genemodel': genemodel, 37 | 'folder_path': folder_path 38 | } 39 | 40 | # output/{folder_path}/{file_name}.fasta' 41 | target_path = 'output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{{file_name}}/report.pdf'.format(**fixed_wildcards) 42 | 43 | load_path = os.path.join('output', folder_path) 44 | 45 | phased_assemblies = sorted([f for f in os.listdir(load_path) if f.endswith('.fasta')]) 46 | 47 | compute_targets = [] 48 | for ps_assm in phased_assemblies: 49 | target_file = target_path.format(**{'file_name': ps_assm.strip('.fasta')}) 50 | compute_targets.append(target_file) 51 | 52 | return compute_targets 53 | 54 | 55 | rule master_quast_busco: 56 | input: 57 | quast_busco_determine_targets 58 | 59 | 60 | rule master_eval: 61 | input: 62 | tech_comparison_determine_targets, 63 | 
kmer_analysis_determine_targets, 64 | illumina_qv_determine_targets, 65 | contig_remap_determine_targets, 66 | bng_hybrids_determine_targets, 67 | quast_busco_determine_targets -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EAS/CHS/hg00513.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00513: 3 | individual: HG00513 4 | sex: female 5 | super_population: EAS 6 | population: CHS 7 | family: SH032 8 | member: parent 9 | data_sources: 10 | - long_reads: 11 | readset: HG00513_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: HG00513_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | comment: "seq_center:UMIGS" 21 | - strandseq: 22 | readset: &sseq_reads HG00513_1kg_il25k-npe_sseq 23 | source_type: ena 24 | bioproject: PRJEB12849 25 | library_fractions: two 26 | - short_reads: 27 | readset: HG00513_1kg_il25k-125pe_short 28 | source_type: ena 29 | bioproject: PRJEB9396 30 | load_type: parts 31 | - short_reads: 32 | readset: HG00513_1kg_ilnvs-150pe_short 33 | source_type: ena 34 | bioproject: PRJEB31736 35 | load_type: complete 36 | comment: "2504 cohort" 37 | 38 | 39 | sample_targets_HG00513: 40 | - aliases: 41 | 1: &ccs_reads HG00513_hgsvc_pbsq2-ccs_1000 42 | 2: &clr_reads HG00513_hgsvc_pbsq2-clr_1000 43 | - defaults: 44 | hap_reads: *ccs_reads 45 | vc_reads: *ccs_reads 46 | sseq_reads: *sseq_reads 47 | pol_reads: *ccs_reads 48 | pol_pass: racon-p2 49 | hap_assm_mode: split 50 | hap: 51 | - h1-un 52 | - h2-un 53 | - target: 54 | nhr_assembler: pereg 55 | hap_assembler: pereg 56 | var_caller: deepvar 57 | - target: 58 | nhr_assembler: flye 59 | hap_assembler: flye 60 | var_caller: freebayes 61 | - defaults: 62 | hap_reads: *clr_reads 63 | vc_reads: *clr_reads 64 | sseq_reads: *sseq_reads 65 | pol_reads: *clr_reads 66 | pol_pass: 
arrow-p1 67 | hap_assm_mode: split 68 | hap: 69 | - h1-un 70 | - h2-un 71 | - target: 72 | nhr_assembler: flye 73 | hap_assembler: flye 74 | var_caller: longshot 75 | - target: 76 | nhr_assembler: hhu26 77 | hap_assembler: flye 78 | var_caller: longshot 79 | 80 | -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/EAS/CHS/hg00512.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00512: 3 | individual: HG00512 4 | sex: male 5 | super_population: EAS 6 | population: CHS 7 | family: SH032 8 | member: parent 9 | data_sources: 10 | - long_reads: 11 | readset: HG00512_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: HG00512_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | comment: "seq_center:UMIGS" 21 | - strandseq: 22 | readset: &sseq_reads HG00512_1kg_il25k-npe_sseq 23 | source_type: ena 24 | bioproject: PRJEB12849 25 | library_fractions: two 26 | - short_reads: 27 | readset: &short_reads HG00512_1kg_il25k-125pe_short 28 | source_type: ena 29 | bioproject: PRJEB9396 30 | load_type: parts 31 | - short_reads: 32 | readset: HG00512_1kg_ilnvs-150pe_short 33 | source_type: ena 34 | bioproject: PRJEB36890 35 | load_type: complete 36 | comment: "698 cohort" 37 | 38 | sample_targets_HG00512: 39 | - aliases: 40 | 1: &ccs_reads HG00512_hgsvc_pbsq2-ccs_1000 41 | 2: &clr_reads HG00512_hgsvc_pbsq2-clr_1000 42 | - defaults: 43 | hap_reads: *ccs_reads 44 | vc_reads: *ccs_reads 45 | sseq_reads: *sseq_reads 46 | pol_reads: *ccs_reads 47 | pol_pass: racon-p2 48 | hap_assm_mode: split 49 | hap: 50 | - h1-un 51 | - h2-un 52 | - target: 53 | nhr_assembler: pereg 54 | hap_assembler: pereg 55 | var_caller: deepvar 56 | - target: 57 | nhr_assembler: flye 58 | hap_assembler: flye 59 | var_caller: freebayes 60 | - defaults: 61 | hap_reads: *clr_reads 62 | 
vc_reads: *clr_reads 63 | sseq_reads: *sseq_reads 64 | pol_reads: *clr_reads 65 | pol_pass: arrow-p1 66 | hap_assm_mode: split 67 | hap: 68 | - h1-un 69 | - h2-un 70 | - target: 71 | nhr_assembler: flye 72 | hap_assembler: flye 73 | var_caller: longshot 74 | - target: 75 | nhr_assembler: hhu26 76 | hap_assembler: flye 77 | var_caller: longshot 78 | 79 | -------------------------------------------------------------------------------- /docs/autoconf.md: -------------------------------------------------------------------------------- 1 | # Autoconf 2 | 3 | ## Using the autoconf.py script 4 | 5 | Probably the easiest way to start a pipeline run for your own data is to use the `autoconf.py` script 6 | to generate the necessary configuration file. Since all pipeline configuration is realized with simple 7 | textual [YAML](https://yaml.org/) files, you can edit the auto-generated configuration files if you 8 | need more flexibility. Please note that the `autoconf.py` script only supports generating config files 9 | for one sample at a time. 10 | 11 | Run the `autoconf.py` script as follows: 12 | 13 | ```bash 14 | /work_dir$ conda activate ./smk_env 15 | (smk_env)/work_dir$ cd project-diploid-assembly 16 | (smk_env)/work_dir/project-diploid-assembly$ ./autoconf.py 17 | ``` 18 | 19 | The script interactively guides you through the configuration process by asking a series of 20 | basic questions about your data, e.g., the local storage path or the type of long reads. You can 21 | always accept the default value (if one is provided!) by hitting `<ENTER>`. 
You can reduce 22 | the number of questions to the bare minimum by accepting all defaults: 23 | 24 | ```bash 25 | (smk_env)/work_dir/project-diploid-assembly$ ./autoconf.py --accept-defaults 26 | ``` 27 | 28 | After you successfully completed the autoconf process, you find two additional folders in your working 29 | directory: 30 | 31 | ```bash 32 | /work_dir$ ls -1 33 | autoconf_config/ 34 | autoconf_linked_data/ 35 | project-diploid-assembly/ 36 | smk_env/ 37 | ``` 38 | 39 | The `autoconf_config` folder contains the generated configuration file, and the `autoconf_linked_data` 40 | folder contains symbolic links to the input data. The symbolic links are named following the pattern 41 | required by the pipeline to process your data correctly. 42 | 43 | **Caveat**: if the `autoconf.py` script fails at deriving well-behaved names for your input files, please 44 | open a github issue showing a handful of examples of the file names that cannot be processed. However, since 45 | file names are a matter of personal preference, or sometimes of project requirements, the worst case would be 46 | that you have to create appropriately named symbolic links to your input files yourself. 47 | 48 | Next, please proceed to the documentation on how to [execute the pipeline](execute.md). -------------------------------------------------------------------------------- /notebooks/2020_project/processing/clean_segdups_annotation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "\"\"\"\n", 12 | "What does this do?\n", 13 | "Clean up SD annotation downloaded from UCSC\n", 14 | "Rescales pct. id. 
into \"score\" BED column (0-1000), and creates combined name\n", 15 | "for output file\n", 16 | "\"\"\"\n", 17 | "\n", 18 | "sd_file = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/known_regions/segdups_hg38.tsv.gz'\n", 19 | "\n", 20 | "df = pd.read_csv(sd_file, sep='\\t')\n", 21 | "df['chrom'] = df['#chrom']\n", 22 | "df.drop('#chrom', axis=1, inplace=True)\n", 23 | "df['score'] = (df['fracMatch'] * 1000).round(0).astype(int)\n", 24 | "df['name'] = df['uid'].astype(str) + '@' + df['score'].astype(str) + '@' + df['name'].astype(str)\n", 25 | "\n", 26 | "\n", 27 | "new_sort_order = ['chrom'] + list(df.columns[:-1])\n", 28 | "df = df[new_sort_order]\n", 29 | "df.sort_values(['chrom', 'chromStart', 'chromEnd'], inplace=True)\n", 30 | "\n", 31 | "tsv_output = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/known_regions/ucsc_segdups.tsv.gz'\n", 32 | "df.to_csv(tsv_output, sep='\\t', header=True, index=False)\n", 33 | "\n", 34 | "bed_output = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/GRCh38_segdups.bed'\n", 35 | "with open(bed_output, 'w') as dump:\n", 36 | " dump.write('#')\n", 37 | " df[['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand']].to_csv(dump, sep='\\t', header=True, index=False)" 38 | ] 39 | } 40 | ], 41 | "metadata": { 42 | "kernelspec": { 43 | "display_name": "Python 3", 44 | "language": "python", 45 | "name": "python3" 46 | }, 47 | "language_info": { 48 | "codemirror_mode": { 49 | "name": "ipython", 50 | "version": 3 51 | }, 52 | "file_extension": ".py", 53 | "mimetype": "text/x-python", 54 | "name": "python", 55 | "nbconvert_exporter": "python", 56 | "pygments_lexer": "ipython3", 57 | "version": "3.7.6" 58 | } 59 | }, 60 | "nbformat": 4, 61 | "nbformat_minor": 4 62 | } 63 | -------------------------------------------------------------------------------- /scripts/utilities/process_logger.py: 
def main():
    """
    Endless monitoring loop: every 30 seconds, take a snapshot of all running
    processes and append a report to LOGFILE for each process that

    - is owned by USERNAME, and
    - has an executable outside of the "system_exe" locations, and
    - is not one of the "special_processes" (matched against the first
      item of the command line), and
    - has an executable path not containing any "whitelist" entry.

    For each such process, the executable / command line of its parent is
    logged together with all collected process attributes.
    LOGFILE is truncated once when the logger starts.

    The function does not return; stop it by terminating the process.
    """

    LOGFILE = '/home/ebertp/process.log'
    USERNAME = 'ebertp'

    # start with an empty log file
    with open(LOGFILE, 'w'):
        pass

    # reported if the parent of a suspect is not part of the current snapshot
    # (same placeholder psutil fills in for inaccessible attributes below)
    unknown_parent = 'N/A', 'N/A'

    while True:
        # pid -> (exe, cmdline) for ALL processes, needed to look up parents
        cache = dict()
        suspects = []
        for process in psutil.process_iter(attrs=attributes, ad_value='N/A'):
            info = process.info
            cache[info['pid']] = info['exe'], info['cmdline']
            if info['username'] != USERNAME:
                continue
            if info['exe'].startswith(tuple(system_exe)):
                continue
            try:
                if any(sp in info['cmdline'][0] for sp in special_processes):
                    continue
            except IndexError:
                # process has no cmdline
                pass
            if any(wl in info['exe'] for wl in whitelist):
                continue
            suspects.append(info)

        with open(LOGFILE, 'a') as logfile:
            for p_info in suspects:
                # the parent may be missing from the snapshot (e.g., it exited
                # in the meantime); a plain cache[...] lookup would raise
                # KeyError and kill the logger
                parent_exe, parent_cmdline = cache.get(p_info['ppid'], unknown_parent)
                _ = logfile.write('PARENT: {} / {}\n'.format(parent_exe, parent_cmdline))
                _ = logfile.write('OFFENDER\n')
                block = '\n'.join(['{}\t{}'.format(k, p_info[k]) for k in sorted(p_info.keys())])
                _ = logfile.write(block + '\n')
                _ = logfile.write('========\n')

        time.sleep(30)
-------------------------------------------------------------------------------- /smk_config/params/smk_cfg_params_RV7.yml: -------------------------------------------------------------------------------- 1 | 2 | # === Software setup settings === 3 | # Specify git commits for SaaRclust 4 | # and StrandPhaseR setup 5 | git_commit_saarclust: c0eb57f 6 | git_commit_strandphaser: e608407 7 | # arbitrarily tying a version number 8 | # to the git commits to avoid additional 9 | # wildcards - increment this number when 10 | # git commits are changed! 11 | git_commit_version: 7 12 | 13 | peregrine_version: 0.1.5.5 14 | deepvariant_version: 0.9.0 15 | shasta_version: 0.4.0 16 | 17 | # Assembler settings 18 | shasta_target_coverage: 60 # tech-independent recommendation: cov between 40x and 80x 19 | flye_target_coverage: 50 # dev recommendation: ~30x, but we have enough RAM to go a bit higher 20 | 21 | # SaaRclust parameter sets 22 | # goal is to obtain 24 clusters 23 | min_contig_size: 100000 24 | min_region_to_order: 500000 25 | bin_size: 200000 26 | step_size: 200000 27 | prob_threshold: 0.25 28 | init_clusters: 100 29 | 30 | # this solves a known HET inversion located on chr8 31 | sample_non_default_parameters: 32 | HG00733: 33 | init_clusters: 150 34 | 35 | # VARIANT CALLING 36 | # Postprocessing parameters 37 | filter_vcf_qual: 10 38 | filter_vcf_gq: 100 39 | 40 | freebayes_timeout_sec: 3600 41 | 42 | # not primary alignment || supplementary alignment 43 | bwa_strandseq_aln_discard: 2304 44 | 45 | # read unmapped || not primary alignment || failed QC || PCR dup 46 | minimap_readref_aln_discard: 1796 47 | 48 | # read unmapped || not primary alignment 49 | minimap_contigref_aln_discard: 260 50 | 51 | # read unmapped || not primary alignment || failed QC || PCR dup 52 | minimap_racon_aln_discard: 1796 # same as 0x704 53 | minimap_racon_aln_min_qual: 10 54 | 55 | # main chromosomes to be used 56 | # for known references for main 57 | # pipeline steps (i.e., everything 58 | 
# Aggregation targets for the SAS samples.
# Every rule in this file consists of "input" and "message" only, i.e., no rule
# executes a command; all of them are listed as localrules.
localrules: run_hg03009_individual,
            run_na20847_individual,
            run_sas_trios,
            run_itu_trio,
            run_itu_mother,
            run_itu_child,
            run_stu_trio,
            run_stu_child,
            run_pjl_trio,
            run_pjl_child


# Single individual HG03009 (SAS / BEB)
rule run_hg03009_individual:
    input:
        'output/targets/SAS_BEB_HG03009/HG03009.fofn'
    message: 'Running SAS-BEB-HG03009 individual'

########################################################

# Single individual NA20847 (SAS / GIH)
rule run_na20847_individual:
    input:
        'output/targets/SAS_GIH_NA20847/NA20847.fofn'
    message: 'Running SAS-GIH-NA20847 individual'

########################################################

# Family IT003 (SAS / ITU): mother
rule run_itu_mother:
    input:
        'output/targets/SAS_ITU_IT003/HG03721.fofn'
    message: 'Running SAS-ITU-IT003 mother'

# Family IT003 (SAS / ITU): child
rule run_itu_child:
    input:
        'output/targets/SAS_ITU_IT003/HG03732.fofn'
    message: 'Running SAS-ITU-IT003 child'

# Collects the inputs of the mother and child rules;
# note that this file defines no father rule for IT003
rule run_itu_trio:
    input:
        rules.run_itu_mother.input,
        rules.run_itu_child.input
    message: 'Running SAS-ITU-IT003 trio'

#########################################################

# Family ST012 (SAS / STU): child
rule run_stu_child:
    input:
        'output/targets/SAS_STU_ST012/HG03683.fofn'
    message: 'Running SAS-STU-ST012 child'

# "trio" target currently consists of the child only
rule run_stu_trio:
    input:
        rules.run_stu_child.input
    message: 'Running SAS-STU-ST012 trio'

#########################################################

# Family PK06 (SAS / PJL): child
rule run_pjl_child:
    input:
        'output/targets/SAS_PJL_PK06/HG02492.fofn'
    message: 'Running SAS-PJL-PK06 child'

# "trio" target currently consists of the child only
rule run_pjl_trio:
    input:
        rules.run_pjl_child.input
    message: 'Running SAS-PJL-PK06 trio'

#########################################################

# Top-level target: union of the inputs of all individual and trio rules above
rule run_sas_trios:
    input:
        rules.run_hg03009_individual.input,
        rules.run_na20847_individual.input,
        rules.run_itu_trio.input,
        rules.run_stu_trio.input,
        rules.run_pjl_trio.input
    message: 'Running SAS trios'
48 | input: 49 | rules.run_clm_child.input 50 | message: 'Running AMR-CLM-CLM03 trio' 51 | 52 | ############################################## 53 | 54 | rule run_mxl_child: 55 | input: 56 | 'output/targets/AMR_MXL_m001/NA19650.fofn' 57 | message: 'Running AMR-MXL-m001 child' 58 | 59 | rule run_mxl_trio: 60 | input: 61 | rules.run_mxl_child.input 62 | message: 'Running AMR-MXL-m001 trio' 63 | 64 | ############################################## 65 | 66 | rule run_pel_child: 67 | input: 68 | 'output/targets/AMR_PEL_PEL003/HG01573.fofn' 69 | message: 'Running AMR-PEL-PEL003 child' 70 | 71 | rule run_pel_trio: 72 | input: 73 | rules.run_pel_child.input 74 | message: 'Running AMR-PEL-PEL003 trio' 75 | 76 | ############################################## 77 | 78 | rule run_amr_trios: 79 | input: 80 | rules.run_pur_trio.input, 81 | rules.run_clm_trio.input, 82 | rules.run_mxl_trio.input, 83 | rules.run_pel_trio.input 84 | message: 'Running AMR trios' 85 | 86 | -------------------------------------------------------------------------------- /environment/snakemake/cluster/hhu_pbs/hilbert_queues.md: -------------------------------------------------------------------------------- 1 | 2 | Queue: default 3 | queue_type = Route 4 | total_jobs = 0 5 | state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 Begun 6 | :0 7 | resources_max.walltime = 167:59:59 8 | route_destinations = CUDA,short,workq,long 9 | route_retry_time = 30 10 | enabled = True 11 | started = True 12 | 13 | 14 | Queue: short 15 | queue_type = Execution 16 | Priority = 120 17 | total_jobs = -14 18 | state_count = Transit:0 Queued:0 Held:1 Waiting:0 Running:4 Exiting:0 Begun 19 | :0 20 | max_queued = [u:PBS_GENERIC=1024] 21 | from_route_only = True 22 | resources_max.ngpus = 0 23 | resources_max.walltime = 02:00:00 24 | resources_default.preempt_targets = NONE 25 | resources_assigned.mem = 11534336kb 26 | resources_assigned.mpiprocs = 0 27 | resources_assigned.ncpus = 4 28 | 
resources_assigned.nodect = 4 29 | max_run = [u:PBS_GENERIC=512] 30 | enabled = True 31 | started = True 32 | 33 | 34 | Queue: workq 35 | queue_type = Execution 36 | Priority = 100 37 | total_jobs = 182 38 | state_count = Transit:0 Queued:0 Held:11 Waiting:0 Running:105 Exiting:0 Be 39 | gun:1 40 | max_queued = [u:PBS_GENERIC=1024] 41 | from_route_only = True 42 | resources_max.ngpus = 0 43 | resources_max.walltime = 72:00:00 44 | resources_min.walltime = 02:00:01 45 | resources_default.preempt_targets = NONE 46 | resources_assigned.mem = 4672716800kb 47 | resources_assigned.mpiprocs = 2396 48 | resources_assigned.ncpus = 2626 49 | resources_assigned.nodect = 141 50 | enabled = True 51 | started = True 52 | 53 | 54 | Queue: long 55 | queue_type = Execution 56 | Priority = 80 57 | total_jobs = 82 58 | state_count = Transit:0 Queued:26 Held:2 Waiting:0 Running:95 Exiting:0 Beg 59 | un:4 60 | max_queued = [u:PBS_GENERIC=1024] 61 | from_route_only = True 62 | resources_max.ngpus = 0 63 | resources_max.walltime = 167:00:00 64 | resources_min.walltime = 72:00:01 65 | resources_default.preempt_targets = NONE 66 | resources_assigned.mem = 1525022720kb 67 | resources_assigned.mpiprocs = 24 68 | resources_assigned.ncpus = 272 69 | resources_assigned.nodect = 100 70 | max_run_res.ncpus = [o:PBS_ALL=1024] 71 | enabled = True 72 | started = True -------------------------------------------------------------------------------- /annotation/in_preparation/bl_supp_HG02818_HG03125_HG03486_NA19434.txt: -------------------------------------------------------------------------------- 1 | HG02818x02PE20320 2 | HG02818x02PE20338 3 | HG02818x02PE20341 4 | HG02818x02PE20359 5 | HG02818x02PE20362 6 | HG02818x02PE20383 7 | HG02818x02PE20385 8 | HG02818x02PE20386 9 | HG02818x02PE20387 10 | HG02818x02PE20391 11 | HG02818x02PE20393 12 | HG02818x02PE20395 13 | HG02818x02PE20396 14 | HG03125x02PE20301 15 | HG03125x02PE20302 16 | HG03125x02PE20305 17 | HG03125x02PE20306 18 | HG03125x02PE20308 19 | 
HG03125x02PE20309 20 | HG03125x02PE20324 21 | HG03125x02PE20325 22 | HG03125x02PE20328 23 | HG03125x02PE20331 24 | HG03125x02PE20333 25 | HG03125x02PE20336 26 | HG03125x02PE20337 27 | HG03125x02PE20342 28 | HG03125x02PE20344 29 | HG03125x02PE20347 30 | HG03125x02PE20352 31 | HG03125x02PE20354 32 | HG03125x02PE20363 33 | HG03125x02PE20366 34 | HG03125x02PE20367 35 | HG03125x02PE20373 36 | HG03125x02PE20374 37 | HG03125x02PE20379 38 | HG03125x02PE20380 39 | HG03125x02PE20382 40 | HG03125x02PE20384 41 | HG03125x02PE20386 42 | HG03125x02PE20388 43 | HG03125x02PE20389 44 | HG03125x02PE20393 45 | HG03486x02PE20504 46 | HG03486x02PE20509 47 | HG03486x02PE20513 48 | HG03486x02PE20516 49 | HG03486x02PE20520 50 | HG03486x02PE20521 51 | HG03486x02PE20527 52 | HG03486x02PE20528 53 | HG03486x02PE20531 54 | HG03486x02PE20535 55 | HG03486x02PE20539 56 | HG03486x02PE20550 57 | HG03486x02PE20553 58 | HG03486x02PE20554 59 | HG03486x02PE20557 60 | HG03486x02PE20561 61 | HG03486x02PE20570 62 | HG03486x02PE20574 63 | HG03486x02PE20581 64 | HG03486x02PE20588 65 | NA19434x02PE20501 66 | NA19434x02PE20503 67 | NA19434x02PE20505 68 | NA19434x02PE20507 69 | NA19434x02PE20508 70 | NA19434x02PE20514 71 | NA19434x02PE20516 72 | NA19434x02PE20523 73 | NA19434x02PE20524 74 | NA19434x02PE20525 75 | NA19434x02PE20532 76 | NA19434x02PE20533 77 | NA19434x02PE20534 78 | NA19434x02PE20535 79 | NA19434x02PE20539 80 | NA19434x02PE20540 81 | NA19434x02PE20541 82 | NA19434x02PE20546 83 | NA19434x02PE20548 84 | NA19434x02PE20551 85 | NA19434x02PE20554 86 | NA19434x02PE20556 87 | NA19434x02PE20561 88 | NA19434x02PE20563 89 | NA19434x02PE20564 90 | NA19434x02PE20567 91 | NA19434x02PE20571 92 | NA19434x02PE20572 93 | NA19434x02PE20579 94 | NA19434x02PE20580 95 | NA19434x02PE20582 96 | NA19434x02PE20584 97 | NA19434x02PE20587 98 | NA19434x02PE20588 99 | NA19434x02PE20594 100 | NA19434x02PE20595 101 | NA19434x02PE20596 -------------------------------------------------------------------------------- 
/smk_config/samples/hgsvc/AFR/YRI/na19239.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA19239: 3 | individual: NA19239 4 | sex: male 5 | super_population: AFR 6 | population: YRI 7 | family: Y117 8 | member: parent 9 | data_sources: 10 | - long_reads: 11 | readset: NA19239_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: NA19239_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | - strandseq: 21 | readset: &sseq_reads NA19239_1kg_il25k-npe_sseq 22 | source_type: ena 23 | bioproject: PRJEB12849 24 | library_fractions: two 25 | - short_reads: 26 | readset: NA19239_1kg_il25k-125pe_short 27 | source_type: ena 28 | bioproject: PRJEB9396 29 | load_type: parts 30 | - short_reads: 31 | readset: NA19239_1kg_ilnvs-150pe_short 32 | source_type: ena 33 | bioproject: PRJEB31736 34 | load_type: complete 35 | comment: "2504 cohort" 36 | 37 | 38 | sample_targets_NA19239: 39 | - aliases: 40 | 1: &ccs_reads NA19239_hgsvc_pbsq2-ccs_1000 41 | 2: &clr_reads NA19239_hgsvc_pbsq2-clr_1000 42 | - defaults: 43 | hap_reads: *ccs_reads 44 | vc_reads: *ccs_reads 45 | sseq_reads: *sseq_reads 46 | pol_reads: *ccs_reads 47 | pol_pass: racon-p2 48 | hap_assm_mode: split 49 | hap: 50 | - h1-un 51 | - h2-un 52 | - target: 53 | nhr_assembler: pereg 54 | hap_assembler: pereg 55 | var_caller: deepvar 56 | - target: 57 | nhr_assembler: flye 58 | hap_assembler: flye 59 | var_caller: freebayes 60 | - defaults: 61 | hap_reads: *clr_reads 62 | vc_reads: *clr_reads 63 | sseq_reads: *sseq_reads 64 | pol_reads: *clr_reads 65 | pol_pass: arrow-p1 66 | hap_assm_mode: split 67 | hap: 68 | - h1-un 69 | - h2-un 70 | - target: 71 | nhr_assembler: flye 72 | hap_assembler: flye 73 | var_caller: longshot 74 | - defaults: 75 | hap_reads: *clr_reads 76 | vc_reads: *clr_reads 77 | sseq_reads: *sseq_reads 78 | pol_reads: *clr_reads 79 | 
pol_pass: arrow-p1 80 | hap_assm_mode: split 81 | hap: 82 | - h1-un 83 | - h2-un 84 | - target: 85 | nhr_assembler: hhu26 86 | hap_assembler: flye 87 | var_caller: longshot -------------------------------------------------------------------------------- /smk_config/samples/hgsvc/AMR/PUR/hg00731.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_HG00731: 3 | individual: HG00731 4 | sex: male 5 | super_population: AMR 6 | population: PUR 7 | family: PR05 8 | member: parent 9 | data_sources: 10 | - long_reads: 11 | readset: HG00731_hgsvc_pbsq2-ccs 12 | technology: pacbio 13 | data_type: fastq 14 | load_type: parts 15 | - long_reads: 16 | readset: HG00731_hgsvc_pbsq2-clr 17 | technology: pacbio 18 | data_type: pacbio_native 19 | load_type: parts 20 | - strandseq: 21 | readset: &sseq_reads HG00731_1kg_il25k-npe_sseq 22 | source_type: ena 23 | bioproject: PRJEB12849 24 | library_fractions: two 25 | - short_reads: 26 | readset: HG00731_1kg_il25k-125pe_short 27 | source_type: ena 28 | bioproject: PRJEB9396 29 | load_type: parts 30 | - short_reads: 31 | readset: HG00731_1kg_ilnvs-150pe_short 32 | source_type: ena 33 | bioproject: PRJEB31736 34 | load_type: complete 35 | comment: "2504 cohort" 36 | 37 | 38 | sample_targets_HG00731: 39 | - aliases: 40 | 1: &ccs_reads HG00731_hgsvc_pbsq2-ccs_1000 41 | 2: &clr_reads HG00731_hgsvc_pbsq2-clr_1000 42 | - defaults: 43 | hap_reads: *ccs_reads 44 | vc_reads: *ccs_reads 45 | sseq_reads: *sseq_reads 46 | pol_reads: *ccs_reads 47 | pol_pass: racon-p2 48 | hap_assm_mode: split 49 | hap: 50 | - h1-un 51 | - h2-un 52 | - target: 53 | nhr_assembler: pereg 54 | hap_assembler: pereg 55 | var_caller: deepvar 56 | - target: 57 | nhr_assembler: flye 58 | hap_assembler: flye 59 | var_caller: freebayes 60 | - defaults: 61 | hap_reads: *clr_reads 62 | vc_reads: *clr_reads 63 | sseq_reads: *sseq_reads 64 | pol_reads: *clr_reads 65 | pol_pass: arrow-p1 66 | hap_assm_mode: split 67 | hap: 
68 | - h1-un 69 | - h2-un 70 | - target: 71 | nhr_assembler: flye 72 | hap_assembler: flye 73 | var_caller: longshot 74 | - defaults: 75 | hap_reads: *clr_reads 76 | vc_reads: *clr_reads 77 | sseq_reads: *sseq_reads 78 | pol_reads: *clr_reads 79 | pol_pass: arrow-p1 80 | hap_assm_mode: split 81 | hap: 82 | - h1-un 83 | - h2-un 84 | - target: 85 | nhr_assembler: hhu26 86 | hap_assembler: flye 87 | var_caller: longshot 88 | -------------------------------------------------------------------------------- /smk_config/params/smk_cfg_params_RV8.yml: -------------------------------------------------------------------------------- 1 | 2 | # === Software setup settings === 3 | # Specify git commits for SaaRclust 4 | # and StrandPhaseR setup 5 | git_commit_saarclust: 71b3763 6 | git_commit_strandphaser: e608407 7 | git_commit_breakpointr: 268d99d 8 | # arbitrarily tying a version number 9 | # to the git commits to avoid additional 10 | # wildcards - increment this number when 11 | # git commits are changed! 
12 | git_commit_version: 8 13 | 14 | peregrine_version: 0.1.5.5 15 | deepvariant_version: 0.9.0 16 | shasta_version: 0.4.0 17 | 18 | # Assembler settings 19 | shasta_target_coverage: 60 # tech-independent recommendation: cov between 40x and 80x 20 | flye_target_coverage: 50 # dev recommendation: ~30x, but we have enough RAM to go a bit higher 21 | 22 | # SaaRclust parameter sets 23 | # goal is to obtain 24 clusters 24 | min_contig_size: 100000 25 | min_region_to_order: 500000 26 | bin_size: 200000 27 | step_size: 200000 28 | prob_threshold: 0.25 29 | init_clusters: 100 30 | desired_clusters: 24 31 | 32 | # this solves a known HET inversion located on chr8 33 | sample_non_default_parameters: 34 | HG00733: 35 | use_only_in: 36 | write_saarclust_config_file: 37 | init_clusters: 150 38 | desired_clusters: 25 39 | 40 | 41 | # VARIANT CALLING 42 | # Postprocessing parameters 43 | filter_vcf_qual: 10 44 | filter_vcf_gq: 100 45 | 46 | freebayes_timeout_sec: 3600 47 | 48 | # not primary alignment || supplementary alignment 49 | bwa_strandseq_aln_discard: 2304 50 | 51 | # read unmapped || not primary alignment || failed QC || PCR dup 52 | minimap_readref_aln_discard: 1796 53 | 54 | # read unmapped || not primary alignment 55 | minimap_contigref_aln_discard: 260 56 | 57 | # read unmapped || not primary alignment || failed QC || PCR dup 58 | minimap_racon_aln_discard: 1796 # same as 0x704 59 | minimap_racon_aln_min_qual: 10 60 | 61 | # main chromosomes to be used 62 | # for known references for main 63 | # pipeline steps (i.e., everything 64 | # before evaluation) 65 | eval_known_ref: GRCh38_GCA_p13 66 | eval_align_ref: hg38_GCA_p13 67 | eval_gene_model: GRCh38_GENCODEv31_basic 68 | use_genome_size: hg38_GCA_p13 69 | main_chromosomes: 70 | - chr1 71 | - chr2 72 | - chr3 73 | - chr4 74 | - chr5 75 | - chr6 76 | - chr7 77 | - chr8 78 | - chr9 79 | - chr10 80 | - chr11 81 | - chr12 82 | - chr13 83 | - chr14 84 | - chr15 85 | - chr16 86 | - chr17 87 | - chr18 88 | - chr19 89 | - 
chr20 90 | - chr21 91 | - chr22 92 | - chrX 93 | - chrY 94 | -------------------------------------------------------------------------------- /scripts/utilities/inspect_environment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import subprocess as sp 7 | 8 | 9 | def main(): 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--outfile', '-o', type=str, dest='outfile') 13 | parser.add_argument('--logfile', '-l', type=str, dest='logfile') 14 | parser.add_argument('--export-conda-env', '-e', action='store_true', default=False, dest='export') 15 | 16 | args = parser.parse_args() 17 | 18 | outfile = args.outfile 19 | logfile = args.logfile 20 | 21 | try: 22 | os.makedirs(os.path.dirname(os.path.abspath(outfile)), exist_ok=True) 23 | os.makedirs(os.path.dirname(os.path.abspath(logfile)), exist_ok=True) 24 | except TypeError: 25 | # since Conda environments (or the Singularity module on Hilbert) 26 | # only support Python2 (...), exist_ok may cause an exception 27 | # Ignore that and hope that Snakemake creates everything... 
def main():
    """
    Check that a tool satisfies a minimum version requirement.

    The tool's version output is read line by line from stdin. The first
    substring that looks like a version number (MAJOR.MINOR or
    MAJOR.MINOR.PATCH) and is greater than or equal to the version given via
    "--at-least" confirms the requirement. In that case, the file given via
    "--outfile" is created. All processing steps are recorded in the file
    given via "--logfile".

    Versions are compared component by component from left to right (i.e.,
    1.10 is newer than 1.9); missing components are treated as zero
    (i.e., 1.2 equals 1.2.0).

    :return: 0 if the minimum version is confirmed, 1 otherwise
    :raises ValueError: if the "--at-least" value has non-numeric components
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--outfile', '-o', type=str, dest='outfile')
    parser.add_argument('--at-least', '-a', type=str, dest='atleast')
    parser.add_argument('--logfile', '-l', type=str, dest='logfile')

    args = parser.parse_args()

    outfile = args.outfile
    logfile = args.logfile

    try:
        os.makedirs(os.path.dirname(os.path.abspath(outfile)), exist_ok=True)
        os.makedirs(os.path.dirname(os.path.abspath(logfile)), exist_ok=True)
    except TypeError:
        # since Conda environments (or the Singularity module on Hilbert)
        # only support Python2 (...), exist_ok may cause an exception
        # Ignore that and hope that Snakemake creates everything...
        pass

    req_version = [int(v) for v in args.atleast.split('.')]

    version_pattern = re.compile('[0-9]+\\.[0-9]+(\\.[0-9]+)?')

    match_found = False
    tool_version = []

    with open(logfile, 'w') as log:
        _ = log.write('Minimum version required: {}\n'.format(args.atleast))
        for line in sys.stdin.readlines():
            _ = log.write('Processing line: {}\n'.format(line.strip()))
            mobj = version_pattern.search(line.strip())
            if mobj is None:
                continue
            version_info = mobj.group(0)
            _ = log.write('Potential version info found: {}\n'.format(version_info))
            tool_version = [int(v) for v in version_info.split('.')]

            # Pad both versions with zeros to the same length, then compare
            # the lists lexicographically. A per-component check (any single
            # component larger than required) would accept 1.5 for a required
            # 2.0 and reject an exactly matching version.
            num_parts = max(len(req_version), len(tool_version))
            required = req_version + [0] * (num_parts - len(req_version))
            detected = tool_version + [0] * (num_parts - len(tool_version))

            if detected >= required:
                _ = log.write('Minimum version matched...\n')
                match_found = True
                break
            _ = log.write('Version info did not match...\n')

        if match_found:
            exit_code = 0
            with open(outfile, 'w') as touch:
                _ = touch.write('Version confirmed: {}\n'.format('.'.join([str(v) for v in tool_version])))
        else:
            exit_code = 1
            _ = log.write('No match found')

    return exit_code
This is incompatible with default locale (UTF-8) on many Linux systems 7 | """ 8 | input: 9 | 'output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.psort.sam.bam' 10 | output: 11 | 'output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.sorted.bedGraph' 12 | log: 13 | bedtools = 'log/output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.bg.log', 14 | sort = 'log/output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.sort.log', 15 | benchmark: 16 | os.path.join('run/output/alignments/hap_reads_to_reference', 17 | '{folder_path}', 18 | '{file_name}_map-to_{aln_reference}.{hap}.bg' + '.t{}.rsrc'.format(config['num_cpu_medium'])) 19 | conda: 20 | '../environment/conda/conda_biotools.yml' 21 | threads: config['num_cpu_medium'] 22 | resources: 23 | runtime_hrs = lambda wildcards, attempt: attempt * 6, 24 | mem_total_mb = lambda wildcards, attempt: attempt * 24576 + 49152, 25 | mem_per_cpu_mb = lambda wildcards, attempt: int((attempt * 24576 + 49152) / config['num_cpu_medium']) 26 | shell: 27 | 'bedtools genomecov -bg -ibam {input} 2> {log.bedtools}' 28 | ' | ' 29 | 'LC_COLLATE=C sort --buffer-size={resources.mem_total_mb}M --parallel={threads} ' 30 | '-k1,1 -k2,2n > {output} 2> {log.sort}' 31 | 32 | 33 | rule convert_hap_read_coverage: 34 | input: 35 | bg_track = 'output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.sorted.bedGraph', 36 | sizes = 'references/assemblies/{aln_reference}.sizes' 37 | output: 38 | 'output/cov_tracks/hap_reads/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.bigWig' 39 | log: 40 | 'log/output/cov_tracks/hap_reads/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.log' 41 | benchmark: 42 | 'run/output/cov_tracks/hap_reads/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.rsrc' 43 | conda: 44 | 
'../environment/conda/conda_biotools.yml' 45 | resources: 46 | runtime_hrs = lambda wildcards, attempt: attempt * 6, 47 | mem_total_mb = lambda wildcards, attempt: attempt * 16384, 48 | mem_per_cpu_mb = lambda wildcards, attempt: attempt * 16384 49 | shell: 50 | 'bedGraphToBigWig {input.bg_track} {input.sizes} {output} 2> {log}' 51 | -------------------------------------------------------------------------------- /smk_include/results/run_eas_trios.smk: -------------------------------------------------------------------------------- 1 | 2 | localrules: run_hg01596_individual, 3 | run_na18534_individual, 4 | run_na18939_individual, 5 | run_hg00864_individual, 6 | run_eas_trios, 7 | run_chs_trio, 8 | run_chs_father, 9 | run_chs_mother, 10 | run_chs_child, 11 | run_khv_trio, 12 | run_khv_child 13 | 14 | 15 | rule run_hg01596_individual: 16 | input: 17 | 'output/targets/EAS_KHV_HG01596/HG01596.fofn' 18 | message: 'Running EAS-KHV-HG01596 individual' 19 | 20 | ##################################################### 21 | 22 | rule run_na18534_individual: 23 | input: 24 | 'output/targets/EAS_CHB_NA18534/NA18534.fofn' 25 | message: 'Running EAS-CHB-NA18534 individual' 26 | 27 | ##################################################### 28 | 29 | rule run_na18939_individual: 30 | input: 31 | 'output/targets/EAS_JPT_NA18939/NA18939.fofn' 32 | message: 'Running EAS-JPT-NA18939 individual' 33 | 34 | ##################################################### 35 | 36 | rule run_hg00864_individual: 37 | input: 38 | 'output/targets/EAS_CDX_HG00864/HG00864.fofn' 39 | message: 'Running EAS-CDX-HG00864 individual' 40 | 41 | ##################################################### 42 | 43 | rule run_chs_mother: 44 | input: 45 | 'output/targets/EAS_CHS_SH032/HG00512.fofn' 46 | message: 'Running EAS-CHS-SH032 mother' 47 | 48 | 49 | rule run_chs_father: 50 | input: 51 | 'output/targets/EAS_CHS_SH032/HG00513.fofn' 52 | message: 'Running EAS-CHS-SH032 father' 53 | 54 | 55 | rule run_chs_child: 56 | input: 
57 | 'output/targets/EAS_CHS_SH032/HG00514.fofn' 58 | message: 'Running EAS-CHS-SH032 child' 59 | 60 | 61 | rule run_chs_trio: 62 | input: 63 | rules.run_chs_father.input, 64 | rules.run_chs_mother.input, 65 | rules.run_chs_child.input 66 | message: 'Running EAS-CHS-SH032 trio' 67 | 68 | ####################################################### 69 | 70 | rule run_khv_child: 71 | input: 72 | 'output/targets/EAS_KHV_VN047/HG02018.fofn' 73 | message: 'Running EAS-KHV-VN047 child' 74 | 75 | 76 | rule run_khv_trio: 77 | input: 78 | rules.run_khv_child.input 79 | message: 'Running EAS-KHV-VN047 trio' 80 | 81 | ######################################################## 82 | 83 | rule run_eas_trios: 84 | input: 85 | rules.run_hg01596_individual.input, 86 | rules.run_na18534_individual.input, 87 | rules.run_na18939_individual.input, 88 | rules.run_hg00864_individual.input, 89 | rules.run_chs_trio.input, 90 | rules.run_khv_trio.input 91 | message: 'Running EAS trios' 92 | -------------------------------------------------------------------------------- /smk_config/params/smk_cfg_params_RV10.yml: -------------------------------------------------------------------------------- 1 | 2 | # === Software setup settings === 3 | # Specify git commits for SaaRclust 4 | # and StrandPhaseR setup 5 | git_commit_saarclust: 9b4aa00 6 | git_commit_strandphaser: e608407 7 | git_commit_breakpointr: 268d99d 8 | # arbitrarily tying a version number 9 | # to the git commits to avoid additional 10 | # wildcards - increment this number when 11 | # git commits are changed! 12 | git_commit_version: 10 13 | 14 | peregrine_version: 0.1.5.5 15 | deepvariant_version: 0.9.0 16 | shasta_version: 0.4.0 17 | 18 | # Assembler settings 19 | shasta_target_coverage: 60 # tech-independent recommendation: cov between 40x and 80x 20 | flye_target_coverage: 50 # dev recommendation: ~30x, but we have enough RAM to go a bit higher 21 | 22 | # SaaRclust parameter sets 23 | # goal is to obtain approx. 
24 | # 24 clusters (for human) 25 | min_contig_size: 100000 26 | min_region_to_order: 500000 27 | bin_size: 200000 28 | step_size: 200000 29 | prob_threshold: 0.25 30 | init_clusters: 100 31 | desired_clusters: 24 32 | min_mapq: 10 33 | 34 | # this solves a known HET inversion located on chr8 35 | sample_non_default_parameters: 36 | HG00733: 37 | use_only_in: 38 | write_saarclust_config_file: 39 | init_clusters: 150 40 | desired_clusters: 25 41 | NA24385: 42 | use_only_in: 43 | write_saarclust_config_file: 44 | init_clusters: 150 45 | desired_clusters: 25 46 | 47 | # VARIANT CALLING 48 | # Postprocessing parameters 49 | filter_vcf_qual: 10 50 | filter_vcf_gq: 100 51 | 52 | freebayes_timeout_sec: 3600 53 | 54 | # not primary alignment || supplementary alignment 55 | bwa_strandseq_aln_discard: 2304 56 | 57 | # read unmapped || not primary alignment || failed QC || PCR dup 58 | minimap_readref_aln_discard: 1796 59 | 60 | # read unmapped || not primary alignment 61 | minimap_contigref_aln_discard: 260 62 | 63 | # read unmapped || not primary alignment || failed QC || PCR dup 64 | minimap_racon_aln_discard: 1796 # same as 0x704 65 | minimap_racon_aln_min_qual: 10 66 | 67 | # main chromosomes to be used 68 | # for known references for main 69 | # pipeline steps (i.e., everything 70 | # before evaluation) 71 | eval_known_ref: GRCh38_GCA_p13 72 | eval_align_ref: hg38_GCA_p13 73 | eval_gene_model: GRCh38_GENCODEv31_basic 74 | use_genome_size: hg38_GCA_p13 75 | main_chromosomes: 76 | - chr1 77 | - chr2 78 | - chr3 79 | - chr4 80 | - chr5 81 | - chr6 82 | - chr7 83 | - chr8 84 | - chr9 85 | - chr10 86 | - chr11 87 | - chr12 88 | - chr13 89 | - chr14 90 | - chr15 91 | - chr16 92 | - chr17 93 | - chr18 94 | - chr19 95 | - chr20 96 | - chr21 97 | - chr22 98 | - chrX 99 | - chrY 100 | -------------------------------------------------------------------------------- /smk_config/params/smk_cfg_params_RV9.yml: 
-------------------------------------------------------------------------------- 1 | 2 | # === Software setup settings === 3 | # Specify git commits for SaaRclust 4 | # and StrandPhaseR setup 5 | git_commit_saarclust: ba65b53 6 | git_commit_strandphaser: e608407 7 | git_commit_breakpointr: 268d99d 8 | # arbitrarily tying a version number 9 | # to the git commits to avoid additional 10 | # wildcards - increment this number when 11 | # git commits are changed! 12 | git_commit_version: 9 13 | 14 | peregrine_version: 0.1.5.5 15 | deepvariant_version: 0.9.0 16 | shasta_version: 0.4.0 17 | 18 | # Assembler settings 19 | shasta_target_coverage: 60 # tech-independent recommendation: cov between 40x and 80x 20 | flye_target_coverage: 50 # dev recommendation: ~30x, but we have enough RAM to go a bit higher 21 | 22 | # SaaRclust parameter sets 23 | # goal is to obtain approx. 24 | # 24 clusters (for human) 25 | min_contig_size: 100000 26 | min_region_to_order: 500000 27 | bin_size: 200000 28 | step_size: 200000 29 | prob_threshold: 0.25 30 | init_clusters: 100 31 | desired_clusters: 24 32 | min_mapq: 10 33 | 34 | # this solves a known HET inversion located on chr8 35 | sample_non_default_parameters: 36 | HG00733: 37 | use_only_in: 38 | write_saarclust_config_file: 39 | init_clusters: 150 40 | desired_clusters: 25 41 | NA24385: 42 | use_only_in: 43 | write_saarclust_config_file: 44 | init_clusters: 150 45 | desired_clusters: 25 46 | 47 | # VARIANT CALLING 48 | # Postprocessing parameters 49 | filter_vcf_qual: 10 50 | filter_vcf_gq: 100 51 | 52 | freebayes_timeout_sec: 3600 53 | 54 | # not primary alignment || supplementary alignment 55 | bwa_strandseq_aln_discard: 2304 56 | 57 | # read unmapped || not primary alignment || failed QC || PCR dup 58 | minimap_readref_aln_discard: 1796 59 | 60 | # read unmapped || not primary alignment 61 | minimap_contigref_aln_discard: 260 62 | 63 | # read unmapped || not primary alignment || failed QC || PCR dup 64 | minimap_racon_aln_discard: 
1796 # same as 0x704 65 | minimap_racon_aln_min_qual: 10 66 | 67 | # main chromosomes to be used 68 | # for known references for main 69 | # pipeline steps (i.e., everything 70 | # before evaluation) 71 | eval_known_ref: GRCh38_GCA_p13 72 | eval_align_ref: hg38_GCA_p13 73 | eval_gene_model: GRCh38_GENCODEv31_basic 74 | use_genome_size: hg38_GCA_p13 75 | main_chromosomes: 76 | - chr1 77 | - chr2 78 | - chr3 79 | - chr4 80 | - chr5 81 | - chr6 82 | - chr7 83 | - chr8 84 | - chr9 85 | - chr10 86 | - chr11 87 | - chr12 88 | - chr13 89 | - chr14 90 | - chr15 91 | - chr16 92 | - chr17 93 | - chr18 94 | - chr19 95 | - chr20 96 | - chr21 97 | - chr22 98 | - chrX 99 | - chrY 100 | -------------------------------------------------------------------------------- /smk_config/params/smk_cfg_params_RV11.yml: -------------------------------------------------------------------------------- 1 | 2 | # === Software setup settings === 3 | # Specify git commits for SaaRclust 4 | # and StrandPhaseR setup 5 | git_commit_saarclust: aac02ed 6 | git_commit_strandphaser: e608407 7 | git_commit_breakpointr: 268d99d 8 | # arbitrarily tying a version number 9 | # to the git commits to avoid additional 10 | # wildcards - increment this number when 11 | # git commits are changed! 12 | git_commit_version: 11 13 | 14 | peregrine_version: 0.1.6.1 15 | deepvariant_version: 0.10.0 16 | shasta_version: 0.4.0 17 | 18 | # Assembler settings 19 | shasta_target_coverage: 60 # tech-independent recommendation: cov between 40x and 80x 20 | flye_target_coverage: 50 # dev recommendation: ~30x, but we have enough RAM to go a bit higher 21 | 22 | # SaaRclust parameter sets 23 | # goal is to obtain approx. 
24 | # 24 clusters (for human) 25 | min_contig_size: 100000 26 | min_region_to_order: 500000 27 | bin_size: 200000 28 | step_size: 200000 29 | prob_threshold: 0.25 30 | init_clusters: 100 31 | desired_clusters: 24 32 | min_mapq: 10 33 | 34 | # this solves a known HET inversion located on chr8 35 | sample_non_default_parameters: 36 | HG00733: 37 | use_only_in: 38 | write_saarclust_config_file: 39 | init_clusters: 150 40 | desired_clusters: 25 41 | NA24385: 42 | use_only_in: 43 | write_saarclust_config_file: 44 | init_clusters: 150 45 | desired_clusters: 25 46 | 47 | # VARIANT CALLING 48 | # Postprocessing parameters 49 | filter_vcf_qual: 10 50 | filter_vcf_gq: 100 51 | 52 | freebayes_timeout_sec: 3600 53 | 54 | # not primary alignment || supplementary alignment 55 | bwa_strandseq_aln_discard: 2304 56 | 57 | # read unmapped || not primary alignment || failed QC || PCR dup 58 | minimap_readref_aln_discard: 1796 59 | 60 | # read unmapped || not primary alignment 61 | minimap_contigref_aln_discard: 260 62 | 63 | # read unmapped || not primary alignment || failed QC || PCR dup 64 | minimap_racon_aln_discard: 1796 # same as 0x704 65 | minimap_racon_aln_min_qual: 10 66 | 67 | # main chromosomes to be used 68 | # for known references for main 69 | # pipeline steps (i.e., everything 70 | # before evaluation) 71 | eval_known_ref: GRCh38_GCA_p13 72 | eval_align_ref: hg38_GCA_p13 73 | eval_gene_model: GRCh38_GENCODEv31_basic 74 | use_genome_size: hg38_GCA_p13 75 | main_chromosomes: 76 | - chr1 77 | - chr2 78 | - chr3 79 | - chr4 80 | - chr5 81 | - chr6 82 | - chr7 83 | - chr8 84 | - chr9 85 | - chr10 86 | - chr11 87 | - chr12 88 | - chr13 89 | - chr14 90 | - chr15 91 | - chr16 92 | - chr17 93 | - chr18 94 | - chr19 95 | - chr20 96 | - chr21 97 | - chr22 98 | - chrX 99 | - chrY 100 | -------------------------------------------------------------------------------- /scripts/dev/hybrid_renamer.py: -------------------------------------------------------------------------------- 1 | 
def get_haplotype(file_name):
    """Return the haplotype label ('h1-un' or 'h2-un') encoded in a file name.

    Raises ValueError if neither haplotype marker is present.
    """
    markers = (
        ('h1-un', ('h1-un', '_h1_')),
        ('h2-un', ('h2-un', '_h2_')),
    )
    for label, tags in markers:
        if any(tag in file_name for tag in tags):
            return label
    raise ValueError('Unrecognized haplotype: {}'.format(file_name))


def get_read_info(sample, file_name):
    """Return the read set label for a sample.

    Two samples have fixed labels; for all others the label is derived from
    the read type ('ccs' is checked before 'clr') found in the lower-cased
    file name. Raises ValueError if no read type is found.
    """
    fixed_labels = {
        'NA24385': 'hpg_pbsq2-ccs_1000',
        'NA12878': 'giab_pbsq2-ccs_1000',
    }
    if sample in fixed_labels:
        return fixed_labels[sample]

    lowered = file_name.lower()
    by_read_type = (
        ('ccs', 'hgsvc_pbsq2-ccs_1000'),
        ('clr', 'hgsvc_pbsq2-clr_1000'),
    )
    for read_type, label in by_read_type:
        if read_type in lowered:
            return label
    raise ValueError('Unrecognized read type: {}'.format(file_name))


def get_assembler_info(read_info):
    """Return the (assembler, polisher) pair matching the read type in `read_info`.

    Raises ValueError if `read_info` contains neither 'ccs' nor 'clr'.
    """
    tool_pairs = (
        ('ccs', ('pereg', 'racon-p2')),
        ('clr', ('flye', 'arrow-p1')),
    )
    for read_type, pair in tool_pairs:
        if read_type in read_info:
            return pair
    raise ValueError('Cannot match assembler to reads: {}'.format(read_info))


def get_new_file_ext(file_name):
    """Return the new file extension for a scaffolding output file.

    The 'not_scaffolded' marker takes precedence over the file suffix.
    Raises ValueError for any file that is neither marked nor .agp/.fasta.
    """
    if 'not_scaffolded' in file_name.lower():
        return 'bng-unsupported.fasta'
    if file_name.endswith('.agp'):
        return 'bng-hybrid.agp'
    if file_name.endswith('.fasta'):
        return 'bng-scaffolds.fasta'
    raise ValueError('Cannot handle file name: {}'.format(file_name))


def build_new_name(file_name):
    """Assemble the new file name from the components encoded in `file_name`.

    The sample is the part before the first underscore; a 'GM' sample prefix
    is rewritten to 'HG' for GM00864 and to 'NA' for every other sample.
    """
    sample = file_name.split('_', 1)[0]
    if sample.startswith('GM'):
        new_prefix = 'HG' if sample == 'GM00864' else 'NA'
        sample = sample.replace('GM', new_prefix)

    read_info = get_read_info(sample, file_name)
    hap = get_haplotype(file_name)
    assembler, polisher = get_assembler_info(read_info)
    new_file_ext = get_new_file_ext(file_name)

    parts = (sample, read_info, assembler, hap, polisher, new_file_ext)
    return '{}_{}-{}.{}.{}.{}'.format(*parts)
rule compute_delta_assembly_reference:
    """
    Align an assembly (query) against a known reference with nucmer
    and write the resulting delta file.
    """
    input:
        known_ref = 'references/assemblies/{known_ref}.fasta',
        assembly = 'output/{folder_path}/{file_name}.fasta'
    output:
        delta = 'output/evaluation/mummer_delta/{known_ref}/{folder_path}/{file_name}.delta'
    log:
        'log/output/evaluation/mummer_delta/{known_ref}/{folder_path}/{file_name}.mummer.log'
    benchmark:
        'run/output/evaluation/mummer_delta/{{known_ref}}/{{folder_path}}/{{file_name}}.mummer.t{}.rsrc'.format(config['num_cpu_medium'])
    conda:
        '../environment/conda/conda_biotools.yml'
    threads: config['num_cpu_medium']
    resources:
        mem_per_cpu_mb = lambda wildcards, attempt: int((24576 + attempt * 24576) / config['num_cpu_medium']),
        mem_total_mb = lambda wildcards, attempt: 24576 + attempt * 24576,
        runtime_hrs = lambda wildcards, attempt: 6 * attempt
    shell:
        # the query is the named input "assembly"; there is no input called "custom_ref"
        'nucmer --maxmatch -l 100 -c 500 --threads={threads} {input.known_ref} {input.assembly} --delta={output.delta} &> {log}'
'references/assemblies/{known_ref}.fasta', 34 | genes = 'references/downloads/{genemodel}.gff3.gz', 35 | assembly = 'output/{folder_path}/{file_name}.fasta', 36 | output: 37 | pdf_report = 'output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{file_name}/report.pdf', 38 | html_icarus = 'output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{file_name}/icarus.html', 39 | log: 40 | 'log/output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{file_name}/quast_run.log', 41 | benchmark: 42 | 'run/output/evaluation/quastlg_busco/{{known_ref}}-{{genemodel}}/{{folder_path}}/{{file_name}}/quast_run.t{}.rsrc'.format(config['num_cpu_medium']) 43 | conda: 44 | '../environment/conda/conda_rtools.yml' 45 | threads: config['num_cpu_medium'] 46 | resources: 47 | mem_per_cpu_mb = lambda wildcards, attempt: int((36864 + attempt * 36864) / config['num_cpu_medium']), 48 | mem_total_mb = lambda wildcards, attempt: 36864 + attempt * 36864, 49 | runtime_hrs = lambda wildcards, attempt: 16 * attempt 50 | params: 51 | output_dir = lambda wildcards, output: os.path.dirname(output.pdf_report) 52 | priority: 100 53 | shell: 54 | 'quast-lg.py --threads {threads} -r {input.known_ref}' 55 | ' --features gene:{input.genes} --conserved-genes-finding' 56 | ' --output-dir {params.output_dir} {input.assembly}' 57 | ' &> {log}' 58 | -------------------------------------------------------------------------------- /annotation/20200507_ASanders_wgs_cells.txt: -------------------------------------------------------------------------------- 1 | GM19650Ax02PE20523 2 | GM20847Bx02PE20410 3 | HG01114x02PE20328 4 | HG01505x02PE20494 5 | HG01596x02PE20501 6 | HG02018x01PE20491 7 | HG02587x02PE20340 8 | HG00096x02PE20385 9 | HG00171Ax02PE20490 10 | HG03065x02PE20587 11 | HG03371x02PE20572 12 | GM18939x02PE20464 13 | GM19036Bx02PE20369 14 | GM12329x02PE20440 15 | GM12329x02PE20472 16 | GM18939x02PE20440 17 | GM19036Bx02PE20387 18 | HG00171Ax02PE20432 19 | 
HG00171Ax02PE20440 20 | HG01573x02PE20332 21 | HG01573x02PE20356 22 | HG01573x02PE20380 23 | HG01573x02PE20388 24 | HG02011x02PE20571 25 | HG02011x02PE20595 26 | HG02011x02PE20596 27 | HG02492x02PE20456 28 | HG02587x02PE20348 29 | HG02587x02PE20372 30 | HG02587x02PE20388 31 | HG02587x02PE20395 32 | HG03732x02PE20571 33 | GM18939x02PE20456 34 | GM12329x02PE20448 35 | GM12329x02PE20455 36 | GM12329x02PE20456 37 | GM12329x02PE20479 38 | GM12329x02PE20480 39 | GM12329x02PE20488 40 | GM12329x02PE20490 41 | GM12329x02PE20495 42 | GM12329x02PE20496 43 | GM18939x02PE20423 44 | GM18939x02PE20448 45 | GM18939x02PE20463 46 | GM18939x02PE20472 47 | GM18939x02PE20479 48 | GM18939x02PE20487 49 | GM18939x02PE20488 50 | GM18939x02PE20494 51 | GM18939x02PE20495 52 | HG00171Ax02PE20424 53 | HG00171Ax02PE20439 54 | HG00171Ax02PE20448 55 | HG00171Ax02PE20456 56 | HG00171Ax02PE20472 57 | HG00171Ax02PE20479 58 | HG00171Ax02PE20480 59 | HG00171Ax02PE20487 60 | HG00171Ax02PE20488 61 | HG00171Ax02PE20495 62 | HG00171Ax02PE20496 63 | HG01505x02PE20424 64 | HG01505x02PE20432 65 | HG01505x02PE20440 66 | HG01505x02PE20487 67 | HG01505x02PE20496 68 | HG02011x02PE20524 69 | HG02011x02PE20532 70 | HG02011x02PE20540 71 | HG02011x02PE20548 72 | HG02011x02PE20556 73 | HG02011x02PE20580 74 | HG02011x02PE20587 75 | HG02011x02PE20588 76 | HG02492x02PE20423 77 | HG02492x02PE20439 78 | HG02492x02PE20440 79 | HG02492x02PE20472 80 | HG02492x02PE20479 81 | HG02492x02PE20480 82 | HG02492x02PE20487 83 | HG02492x02PE20488 84 | HG02492x02PE20495 85 | HG02492x02PE20496 86 | HG02587x02PE20323 87 | HG02587x02PE20324 88 | HG02587x02PE20332 89 | HG02587x02PE20339 90 | HG02587x02PE20355 91 | HG02587x02PE20356 92 | HG02587x02PE20371 93 | HG02587x02PE20379 94 | HG02587x02PE20380 95 | HG02587x02PE20387 96 | HG02587x02PE20390 97 | HG03009x02PE20384 98 | HG03065x02PE20524 99 | HG03065x02PE20532 100 | HG03065x02PE20572 101 | HG03065x02PE20580 102 | HG03065x02PE20588 103 | HG03065x02PE20596 104 | HG03683x01PE20425 105 | 
HG03683x01PE20461 106 | HG03732x02PE20512 107 | HG03732x02PE20548 108 | HG03732x02PE20572 109 | HG03732x02PE20580 110 | HG03732x02PE20587 111 | HG03732x02PE20588 112 | HG03732x02PE20594 113 | HG03732x02PE20595 114 | HG03732x02PE20596 115 | GM19036Bx02PE20372 116 | HG01573x02PE20324 117 | HG01573x02PE20331 118 | HG01573x02PE20340 119 | HG01573x02PE20396 120 | GM18534Bx02PE20392 121 | HG03065x02PE20556 122 | GM20509Bx01PE20515 123 | GM20509Bx01PE20580 124 | GM20509Bx01PE20504 -------------------------------------------------------------------------------- /notes/minimap_ctg_ref.md: -------------------------------------------------------------------------------- 1 | # Snakefile: minimap contig to reference assembly alignment 2 | 3 | Sent via e-mail by David on 2019-10-08 4 | 5 | Relevance: 6 | - minimap2 parameters to get reasonable alignments between de novo and reference 7 | - reference (`ref` below) refers here to, e.g., GRCh38 8 | 9 | 10 | ``` 11 | ## Snakefile to align denovo contigs to the reference genome 12 | 13 | ## Set config file 14 | configfile: "Snake.config.json" 15 | 16 | FASTA, = glob_wildcards("clustered_assembly/{fasta}.fasta") 17 | 18 | rule all: 19 | input: 20 | #expand("alignments/{fasta}.bed", fasta=FASTA) 21 | "alignments/HG00733_sra_pbsq1-clr_sqa_clustered_v2.bed" 22 | 23 | rule align_fasta: 24 | input: 25 | fasta = "clustered_assembly/{fasta}.fasta", 26 | ref = config["reference"] 27 | output: 28 | "alignments/{fasta}.bam" 29 | log: 30 | "log/{fasta}.bam.log" 31 | threads: 32 | 8 33 | shell: 34 | #"minimap2 -ax asm20 --eqx -r 20000 -s 30000 -t {threads} {input.ref} {input.fasta} | samtools view -F 260 -b - | samtools sort - > {output} 2> {log}" 35 | "minimap2 --secondary=no --eqx -Y -ax asm20 -m 10000 -z 10000,50 -r 50000 --end-bonus=100 -O 5,56 -E 4,1 -B 5 -t {threads} {input.ref} {input.fasta} | samtools view -F 260 -b - | samtools sort - > {output} 2> {log}" 36 | 37 | rule bam2bed: 38 | input: 39 | "alignments/{fasta}.bam" 40 | output: 41 | 
"alignments/{fasta}.bed" 42 | log: 43 | "log/{fasta}.bed.log" 44 | shell: 45 | "bedtools bamtobed -i {input} > {output} 2> {log}" 46 | 47 | ``` 48 | 49 | Diagnostic plot of the BED output file can be produced as follows: 50 | 51 | ```R 52 | library(SaaRclust) 53 | library(BSgenome.Hsapiens.UCSC.hg38) 54 | 55 | bedfile <- "bedfile with aligned contigs to the reference" 56 | plt1 <- plotClusteredContigs(bedfile = bedfile, min.mapq = 10, bsgenome = BSgenome.Hsapiens.UCSC.hg38, report = 'clustering') 57 | plt2 <- plotClusteredContigs(bedfile = bedfile, min.mapq = 10, bsgenome = BSgenome.Hsapiens.UCSC.hg38, report = 'ordering') 58 | plt3 <- plotClusteredContigs(bedfile = bedfile, min.mapq = 10, bsgenome = BSgenome.Hsapiens.UCSC.hg38, report = 'orienting') 59 | 60 | #To save the plots: 61 | plot destination = "location and the file name where the plot should be saved" 62 | ggsave(filename = , plot = plt1, width = 12, height = 6) 63 | ggsave(filename = , plot = plt2, width = 12, height = 6) 64 | ggsave(filename = , plot = plt3, width = 12, height = 6) 65 | ``` 66 | 67 | ## Note 68 | 69 | Add following packages to `rtools` environment after bug fixing stage. 70 | 71 | ```yaml 72 | - bioconductor-bsgenome=1.50.0=r351_0 73 | - bioconductor-bsgenome.hsapiens.ucsc.hg38=1.4.1=r351_5 74 | ``` -------------------------------------------------------------------------------- /smk_config/params/smk_cfg_params_RV12.yml: -------------------------------------------------------------------------------- 1 | 2 | # === Software setup settings === 3 | # Specify git commits for SaaRclust 4 | # and StrandPhaseR setup 5 | git_commit_saarclust: d51c66f 6 | git_commit_strandphaser: e608407 7 | git_commit_breakpointr: 268d99d 8 | # arbitrarily tying a version number 9 | # to the git commits to avoid additional 10 | # wildcards - increment this number when 11 | # git commits are changed! 
12 | git_commit_version: 12 13 | 14 | peregrine_version: 0.1.6.1 15 | deepvariant_version: 0.10.0 16 | shasta_version: 0.4.0 17 | 18 | # Assembler settings 19 | shasta_target_coverage: 60 # tech-independent recommendation: cov between 40x and 80x 20 | flye_target_coverage: 50 # dev recommendation: ~30x, but we have enough RAM to go a bit higher 21 | 22 | # SaaRclust parameter sets 23 | # goal is to obtain approx. 24 | # 24 clusters (for human) 25 | min_contig_size: 100000 26 | min_region_to_order: 500000 27 | bin_size: 200000 28 | step_size: 200000 29 | prob_threshold: 0.25 30 | init_clusters: 100 31 | desired_clusters: 24 32 | min_mapq: 10 33 | 34 | # this solves a known HET inversion located on chr8 35 | sample_non_default_parameters: 36 | HG00733: 37 | use_only_in: 38 | write_saarclust_config_file: 39 | init_clusters: 150 40 | desired_clusters: 25 41 | NA24385: 42 | use_only_in: 43 | write_saarclust_config_file: 44 | init_clusters: 150 45 | desired_clusters: 25 46 | NA20847: 47 | use_only_in: 48 | write_saarclust_config_file: 49 | desired_clusters: 23 50 | HG00864: 51 | use_only_in: 52 | write_saarclust_config_file: 53 | desired_clusters: 23 54 | 55 | # VARIANT CALLING 56 | # Postprocessing parameters 57 | filter_vcf_qual: 10 58 | filter_vcf_gq: 100 59 | 60 | freebayes_timeout_sec: 3600 61 | 62 | # not primary alignment || supplementary alignment 63 | bwa_strandseq_aln_discard: 2304 64 | 65 | # read unmapped || not primary alignment || failed QC || PCR dup 66 | minimap_readref_aln_discard: 1796 67 | 68 | # read unmapped || not primary alignment 69 | minimap_contigref_aln_discard: 260 70 | 71 | # read unmapped || not primary alignment || failed QC || PCR dup 72 | minimap_racon_aln_discard: 1796 # same as 0x704 73 | minimap_racon_aln_min_qual: 10 74 | 75 | # main chromosomes to be used 76 | # for known references for main 77 | # pipeline steps (i.e., everything 78 | # before evaluation) 79 | eval_known_ref: GRCh38_GCA_p13 80 | eval_align_ref: hg38_GCA_p13 81 | 
eval_gene_model: GRCh38_GENCODEv31_basic 82 | use_genome_size: hg38_GCA_p13 83 | main_chromosomes: 84 | - chr1 85 | - chr2 86 | - chr3 87 | - chr4 88 | - chr5 89 | - chr6 90 | - chr7 91 | - chr8 92 | - chr9 93 | - chr10 94 | - chr11 95 | - chr12 96 | - chr13 97 | - chr14 98 | - chr15 99 | - chr16 100 | - chr17 101 | - chr18 102 | - chr19 103 | - chr20 104 | - chr21 105 | - chr22 106 | - chrX 107 | - chrY 108 | -------------------------------------------------------------------------------- /notebooks/dev/merge_numpy_aln_dumps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "2846211526\n", 13 | "num_hap\n", 14 | "1 2945021\n", 15 | "2 3464902\n", 16 | "3 2505441\n", 17 | "4 2542069\n", 18 | "5 2043097\n", 19 | " ... \n", 20 | "60 5065712\n", 21 | "61 5569952\n", 22 | "62 21048551\n", 23 | "63 65643213\n", 24 | "64 2636716984\n", 25 | "Name: length, Length: 64, dtype: int64\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import os\n", 31 | "import numpy as np\n", 32 | "import numpy.ma as ma\n", 33 | "import pandas as pd\n", 34 | "import collections as col\n", 35 | "\n", 36 | "input_path = '/home/local/work/data/hgsvc/aln_summary'\n", 37 | "\n", 38 | "\n", 39 | "# chrom_regions = []\n", 40 | "\n", 41 | "# for dump in os.listdir(input_path):\n", 42 | "# if 'hifi-prio' not in dump or not dump.endswith('.npy'):\n", 43 | "# continue\n", 44 | "# a = np.load(os.path.join(input_path, dump))\n", 45 | "# chrom = dump.rsplit('.', 2)[-2]\n", 46 | "# print(chrom)\n", 47 | "# genomic_coordinates = np.arange(a.size, dtype=np.int32)\n", 48 | "# for i in range(1, 65, 1):\n", 49 | "# select_regions = ma.masked_array(genomic_coordinates, mask=(a == i))\n", 50 | "# df = pd.DataFrame(\n", 51 | "# [(s.start, s.stop) for s in 
#!/usr/bin/env Rscript

# Create SaaRclust diagnostic plots (clustering, ordering, orienting) for a
# BED file of contig-to-reference alignments and save them as PDF files.
#
# Positional arguments:
#   1: BED file
#   2: reference genome (only 'hg38' is accepted)
#   3: output prefix; plots are saved as <prefix>.<report>.pdf
#   4: plot title suffix
#   5: (optional) logical flag, TRUE for a haploid assembly; default FALSE

suppressMessages(library(SaaRclust))

args <- commandArgs(trailingOnly=TRUE)

bed.file <- args[1]
ref.genome <- args[2]
output.folder <- args[3]
plot.title <- args[4]
haploid.assembly <- args[5]

if (is.na(haploid.assembly)) {
    haploid.assembly <- FALSE
} else {
    haploid.assembly <- as.logical(haploid.assembly)
    # as.logical() yields NA for strings it cannot interpret; fail with a
    # clear message instead of an opaque error in the if() below
    if (is.na(haploid.assembly)) {
        stop('Cannot interpret haploid assembly flag as logical: ', args[5])
    }
}

stopifnot(ref.genome == 'hg38')

suppressMessages(library(BSgenome.Hsapiens.UCSC.hg38))

plot.chromosomes <- paste0('chr', c(1:22, 'X'))

# Build one diagnostic plot; arguments that are shared by all reports are
# set here, report-specific ones are passed through "..."
make.plot <- function(report, title.prefix, ...) {
    plotClusteredContigs(
        bedfile = bed.file,
        min.mapq = 10,
        bsgenome = BSgenome.Hsapiens.UCSC.hg38,
        report = report,
        ...,
        title = paste(title.prefix, plot.title, sep=': '),
        chromosomes = plot.chromosomes
    )
}

# Save a plot as <output.folder>.<suffix>; does nothing if the plot is NULL
save.plot <- function(plot.obj, suffix) {
    if (!is.null(plot.obj)) {
        ggsave(filename = paste(output.folder, suffix, sep='.'), plot = plot.obj, width = 16, height = 8)
    }
}

plot.clustering <- NULL
plot.ordering <- NULL
plot.orienting <- NULL

if (!haploid.assembly) {

    # no ordering plot is generated in this case
    plot.clustering <- make.plot('clustering', 'Clustering')
    plot.orienting <- make.plot('orienting', 'Orientation')

} else {

    contig.info.fields <- c('cluster.SRC', 'contig.ID', 'order', 'cluster.ID')

    plot.clustering <- make.plot(
        'clustering', 'Clustering',
        info.delim = '_',
        info.fields = contig.info.fields,
        col.by = 'cluster.ID'
    )

    plot.ordering <- make.plot(
        'ordering', 'Ordering',
        info.delim = '_',
        info.fields = contig.info.fields
    )

    plot.orienting <- make.plot(
        'orienting', 'Orientation',
        info.delim = '_',
        info.fields = contig.info.fields
    )
}

save.plot(plot.clustering, 'clustering.pdf')
save.plot(plot.ordering, 'ordering.pdf')
save.plot(plot.orienting, 'orienting.pdf')

warnings()

quit(save='no')
{output} 2> {log} 45 | """ 46 | 47 | rule merge_mono_di: 48 | input: 49 | bam1="alignments/{sample_dir}/mono/{sample}.{plate}.{libum}.mono.bam", 50 | bam2="alignments/{sample_dir}/di/{sample}.{plate}.{libum}.di.bam" 51 | output: 52 | temp("alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.bam") 53 | log: 54 | "log/{sample_dir}/{sample}.{plate}.{libum}.monodi.bam.log" 55 | threads: 56 | 8 57 | shell: 58 | "samtools merge -@ {threads} -O BAM {output} {input} 2> {log}" 59 | 60 | rule sort_mono_di: 61 | input: 62 | "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.bam" 63 | output: 64 | temp("alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.bam") 65 | log: 66 | "log/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.bam.log" 67 | threads: 68 | 8 69 | shell: 70 | "samtools sort -@ {threads} -O BAM {input} -o {output} 2> {log}" 71 | 72 | 73 | rule markDuplicates: 74 | input: 75 | "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.bam" 76 | output: 77 | "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam" 78 | log: 79 | "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam.log" 80 | threads: 81 | 8 82 | shell: 83 | "sambamba markdup -t {threads} {input} {output} 2> {log}" 84 | 85 | 86 | rule index_bam: 87 | input: 88 | "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam" 89 | output: 90 | "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam.bai" 91 | threads: 92 | 8 93 | shell: 94 | "samtools index -@ {threads} {input} {output}" 95 | 96 | ``` 97 | -------------------------------------------------------------------------------- /scripts/utilities/summarize_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import argparse as argp 5 | 6 | import pandas as pd 7 | import pysam 8 | 9 | 10 | GT_MAP = { 11 | (0, 0): 'HOM', 12 | (1, 1): 'HOM', 13 | (0, 1): 'HET', 14 | (1, 0): 'HET' 15 | } 16 | 
def parse_filename(file_path):
    """Derive sample, long-read platform and haplotype code from a VCF file name.

    The base name must have the form ``<short_reads>_map-to_<long_read_assm>``:

    * sample: first ``_``-separated field of ``<short_reads>``
    * haplotype: second ``.``-separated field of ``<long_read_assm>``;
      ``h1-un`` is encoded as 10 and ``h2-un`` as 20
    * platform: in the second ``_``-separated field of ``<long_read_assm>``,
      the part after the first ``-``; ``ccs`` is reported as ``'HiFi'`` and
      ``clr`` as ``'CLR'``

    Args:
        file_path: path to the VCF file; only the base name is inspected.

    Returns:
        Tuple ``(sample, platform, hap)`` with ``platform`` in
        ``{'HiFi', 'CLR'}`` and ``hap`` in ``{10, 20}``.

    Raises:
        ValueError: if the ``_map-to_`` separator does not occur exactly once,
            or the haplotype or platform is not one of the values above.
    """
    fname = os.path.basename(file_path)
    parts = fname.split('_map-to_')
    if len(parts) != 2:
        raise ValueError('Unexpected file name (need exactly one "_map-to_"): {}'.format(fname))
    short_reads, long_read_assm = parts
    sample = short_reads.split('_')[0]

    hap_codes = {'h1-un': 10, 'h2-un': 20}
    hap_label = long_read_assm.split('.')[1]
    if hap_label not in hap_codes:
        raise ValueError('Unexpected haplotype: {}'.format(fname))
    hap = hap_codes[hap_label]

    platform_names = {'ccs': 'HiFi', 'clr': 'CLR'}
    platform = long_read_assm.split('_')[1].split('-')[1]
    if platform not in platform_names:
        # The original assert formatted an undefined name (long_reads) and so
        # raised NameError here; it was also stripped under `python -O`.
        raise ValueError('Unknown long read tech: {}'.format(fname))
    platform = platform_names[platform]

    return sample, platform, hap
def read_seqs_from_fasta(fasta_path, contigs):
    """Collect the FASTA records of selected contigs into an in-memory buffer.

    A record is selected when either its complete header (without ``>``) or
    the first whitespace-delimited word of the header is contained in
    ``contigs``. Selected records are copied in file order; blank lines are
    dropped and every copied line is newline-terminated, so the buffer holds
    well-formed FASTA text without empty lines.

    Args:
        fasta_path: path to a plain-text FASTA file.
        contigs: container of contig names supporting ``in`` (e.g. a set).

    Returns:
        io.StringIO holding the selected records; read it with ``getvalue()``
        (the stream position is left at the end of the written data).
    """
    seq_buffer = io.StringIO()

    keep_record = False
    with open(fasta_path, 'r') as fasta:
        for line in fasta:
            if line.startswith('>'):
                header = line.strip().strip('>')
                header_words = header.split(None, 1)
                seq_id = header_words[0] if header_words else ''
                keep_record = header in contigs or seq_id in contigs
            elif not line.strip():
                # never copy blank lines into the output
                continue
            if keep_record:
                # the last line of a file may lack its terminating newline
                seq_buffer.write(line if line.endswith('\n') else line + '\n')
    return seq_buffer
def main():
    """Write one FASTA per (sample, haplotype, platform) group of the contig table.

    Reads the tab-separated contig table, looks up the assembly FASTA for
    each group via ``cache_fasta_paths`` and writes the group's contigs to
    ``<output_folder>/<assembly name with '.fasta' -> '.ctg3q29.fasta'>``.
    Groups without a matching assembly are reported and skipped; output
    files that already exist are left untouched.

    Returns:
        0 on completion.
    """
    args = parse_command_line()
    os.makedirs(args.output_folder, exist_ok=True)

    df = pd.read_csv(args.contig_table, sep='\t', header=0)

    fasta_cache = cache_fasta_paths(args.fasta_folder)

    for (sample, hap, tech), contigs in df.groupby(['sample', 'haplotype', 'platform']):
        try:
            fasta_file = fasta_cache[(sample, hap, tech)]
        except KeyError:
            print('skipping ', sample, hap, tech)
            continue

        outname = fasta_file.replace('.fasta', '.ctg3q29.fasta')
        outpath = os.path.join(args.output_folder, outname)
        # Check for an existing result BEFORE scanning the assembly:
        # reading a whole-genome FASTA only to discard it is wasted work.
        if os.path.isfile(outpath):
            continue

        contig_names = set(contigs['contig_id'])
        fasta_path = os.path.join(args.fasta_folder, fasta_file)
        contig_seqs = read_seqs_from_fasta(fasta_path, contig_names)

        with open(outpath, 'w') as dump:
            _ = dump.write(contig_seqs.getvalue())
        print('done ', sample, hap, tech)

    return 0
"out_folder = '/scratch/bioinf/projects/diploid-genome-assembly/pebert/subsampling'\n", 27 | "os.makedirs(out_folder, exist_ok=True)\n", 28 | "\n", 29 | "all_files = os.listdir(folder)\n", 30 | "\n", 31 | "all_libs = set([x.split('_')[3] for x in all_files if x.endswith('.fastq.gz')])\n", 32 | "\n", 33 | "total_libs = len(all_libs)\n", 34 | "\n", 35 | "all_subsets = []\n", 36 | "\n", 37 | "while 1:\n", 38 | "\n", 39 | " for factor in [0.8, 0.6, 0.4, 0.2]:\n", 40 | " select_num = int(round(total_libs * factor))\n", 41 | " selected_libs = set(random.sample(all_libs, select_num))\n", 42 | " all_subsets.append(selected_libs)\n", 43 | "\n", 44 | " all_jaccards = []\n", 45 | "\n", 46 | " for i in all_subsets:\n", 47 | " for j in all_subsets:\n", 48 | " if i == j:\n", 49 | " continue\n", 50 | " isect = i.intersection(j)\n", 51 | " union = i.union(j)\n", 52 | " jaccard = len(isect) / len(union)\n", 53 | " all_jaccards.append(jaccard)\n", 54 | "\n", 55 | " if all([j < 0.5 for j in all_jaccards]):\n", 56 | " print('copying')\n", 57 | " for subset, label in zip(all_subsets, ['sub80', 'sub60', 'sub40', 'sub20']):\n", 58 | " out_path = os.path.join(out_folder, label)\n", 59 | " os.makedirs(out_path, exist_ok=True)\n", 60 | " for lib in subset:\n", 61 | " subset_files = [f for f in all_files if lib in f]\n", 62 | " for sf in subset_files:\n", 63 | " new_file = sf.replace('1kg', label)\n", 64 | " old_path = os.path.join(folder, sf)\n", 65 | " new_path = os.path.join(out_path, new_file)\n", 66 | " shutil.copy(old_path, new_path)\n", 67 | " break\n", 68 | " else:\n", 69 | " print('max j ', max(all_jaccards))\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "\n", 74 | "\n", 75 | "\n" 76 | ] 77 | } 78 | ], 79 | "metadata": { 80 | "kernelspec": { 81 | "display_name": "Python 3", 82 | "language": "python", 83 | "name": "python3" 84 | }, 85 | "language_info": { 86 | "codemirror_mode": { 87 | "name": "ipython", 88 | "version": 3 89 | }, 90 | "file_extension": ".py", 91 | "mimetype": 
"text/x-python", 92 | "name": "python", 93 | "nbconvert_exporter": "python", 94 | "pygments_lexer": "ipython3", 95 | "version": "3.6.7" 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 2 100 | } 101 | -------------------------------------------------------------------------------- /smk_config/samples/na24143.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA24143: 3 | individual: NA24143 4 | sex: female 5 | super_population: EUR 6 | population: AKJ 7 | family: 3140 8 | member: parent 9 | comment: "Sample alias: HG004" 10 | data_sources: 11 | - long_reads: 12 | readset: NA24143_hpg_pbsq2-ccs 13 | technology: pacbio 14 | data_type: fastq 15 | load_type: parts 16 | source_type: amazon 17 | comment: "https://github.com/human-pangenomics/HG002_Data_Freeze_v1.0#motherhg004na24143" 18 | - strandseq: 19 | readset: &sseq_reads NA24143_bccrc_ilany-75pe_sseq 20 | source_type: local 21 | library_fractions: one 22 | comment: "Lansdorp collaboration data" 23 | 24 | 25 | sample_targets_NA24143: 26 | - aliases: 27 | 1: &ccs_reads NA24143_hpg_pbsq2-ccs_1000 28 | - defaults: 29 | hap_reads: *ccs_reads 30 | vc_reads: *ccs_reads 31 | sseq_reads: *sseq_reads 32 | pol_reads: *ccs_reads 33 | pol_pass: racon-p2 34 | hap_assm_mode: split 35 | hap: 36 | - h1-un 37 | - h2-un 38 | - target: 39 | name: Lansdorp 40 | nhr_assembler: hifiasm 41 | hap_assembler: hifiasm 42 | var_caller: deepvar 43 | 44 | 45 | sample_data_sources_NA24143: 46 | input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part1: 47 | local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part1.fastq.gz 48 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel733_2_B01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191124_055423.fastq.gz 49 | input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part2: 50 | local_path: 
input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part2.fastq.gz 51 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel735_1_A01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191126_155613.fastq.gz 52 | input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part3: 53 | local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part3.fastq.gz 54 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel735_2_B01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191127_220906.fastq.gz 55 | input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part4: 56 | local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part4.fastq.gz 57 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel735_3_C01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191129_043425.fastq.gz 58 | input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part5: 59 | local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part5.fastq.gz 60 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_21kb/PBmixSequel724_1_A01_PBTA_30hours_21kbV2PD_70pM_HumanHG004_CCS/m64017_191115_211223.fastq.gz 61 | input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part6: 62 | local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part6.fastq.gz 63 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_21kb/PBmixSequel725_1_A01_PBTA_30hours_21kbV2PD_70pM_HumanHG004_CCS/m64017_191118_150849.fastq.gz 64 | 65 | -------------------------------------------------------------------------------- /smk_config/samples/na24149.yml: -------------------------------------------------------------------------------- 1 | 2 | sample_description_NA24149: 3 | individual: NA24149 4 | sex: male 5 | super_population: EUR 6 | 
population: AKJ 7 | family: 3140 8 | member: parent 9 | comment: "Sample alias: HG003" 10 | data_sources: 11 | - long_reads: 12 | readset: NA24149_hpg_pbsq2-ccs 13 | technology: pacbio 14 | data_type: fastq 15 | load_type: parts 16 | source_type: amazon 17 | comment: "https://github.com/human-pangenomics/HG002_Data_Freeze_v1.0#fatherhg003na24149" 18 | - strandseq: 19 | readset: &sseq_reads NA24149_bccrc_ilany-75pe_sseq 20 | source_type: local 21 | library_fractions: one 22 | comment: "Lansdorp collaboration data" 23 | 24 | 25 | sample_targets_NA24149: 26 | - aliases: 27 | 1: &ccs_reads NA24149_hpg_pbsq2-ccs_1000 28 | - defaults: 29 | hap_reads: *ccs_reads 30 | vc_reads: *ccs_reads 31 | sseq_reads: *sseq_reads 32 | pol_reads: *ccs_reads 33 | pol_pass: racon-p2 34 | hap_assm_mode: split 35 | hap: 36 | - h1-un 37 | - h2-un 38 | - target: 39 | name: Lansdorp 40 | nhr_assembler: hifiasm 41 | hap_assembler: hifiasm 42 | var_caller: deepvar 43 | 44 | 45 | sample_data_sources_NA24149: 46 | input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part1: 47 | local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part1.fastq.gz 48 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_14kb/PBmixSequel740_2_B01_PBST_30hours_14kbV2PD_70pM_HumanHG003_CCS/m64017_191205_225630.fastq.gz 49 | input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part2: 50 | local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part2.fastq.gz 51 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_14kb/PBmixSequel740_3_C01_PBST_30hours_14kbV2PD_70pM_HumanHG003_CCS/m64017_191207_052215.fastq.gz 52 | input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part3: 53 | local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part3.fastq.gz 54 | remote_path: 
https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel737_1_A01_PBTG_30hours_15kbV2PD_70pM_HumanHG003_CCS/m64017_191202_204405.fastq.gz 55 | input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part4: 56 | local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part4.fastq.gz 57 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel740_1_A01_PBTG_30hours_15kbV2PD_70pM_HumanHG003_CCS/m64017_191204_164321.fastq.gz 58 | input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part5: 59 | local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part5.fastq.gz 60 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_19kb/PBmixSequel729_1_A01_PBTH_30hours_19kbV2PD_70pM_HumanHG003_CCS/m64017_191120_193948.fastq.gz 61 | input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part6: 62 | local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part6.fastq.gz 63 | remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_19kb/PBmixSequel733_1_A01_PBTH_30hours_19kbV2PD_70pM_HumanHG003_CCS/m64017_191122_184406.fastq.gz 64 | 65 | -------------------------------------------------------------------------------- /annotation/grch38/issues/grch38_p13_unknown.tsv: -------------------------------------------------------------------------------- 1 | "HG-2562" "Unknown" "chr14:105,171,721-105,336,833" 1 "Open" "" "Ensembl, NCBI, UCSC" "HG00733 contains a 7kbp insertion haplotype not seen in the Reference." 
2 | "HG-2552" "Unknown" "chrX:149,479,317-149,843,545" 1 "Open" "" "Ensembl, NCBI, UCSC" "Possible Inversion in reference components AC244197.3_AC244098.2" 3 | "HG-2550" "Unknown" "chr11:61,949,767-62,116,633" 1 "Open" "" "Ensembl, NCBI, UCSC" "Possible Inversion in reference component AP003733.5" 4 | "HG-2549" "Unknown" "chr11:215,457-356,450" 1 "Under Review" "" "Ensembl, NCBI, UCSC" "Possible Inversion in reference component AC136475.7" 5 | "HG-2547" "Unknown" "chr9:123,855,313-124,032,767" 1 "Open" "" "Ensembl, NCBI, UCSC" "Possible Inversion in reference component AC006450.13" 6 | "HG-2546" "Unknown" "chr6:106,695,378-106,856,632" 1 "Open" "" "Ensembl, NCBI, UCSC" "Possible Inversion in reference component AL080314.29" 7 | "HG-2536" "Unknown" "chr21:6,427,259-6,580,181" 1 "Open" "" "Ensembl, NCBI, UCSC" "Does GRCh38 placement of FP236240.8 on 21p arm represent a true duplication?" 8 | "HG-2497" "Unknown" "chr13:32,231,913-32,398,469" 1 "Awaiting External Info" "" "Ensembl, NCBI, UCSC" "GRCh38 represents rare allele in BRCA2 (rs169547)" 9 | "HG-2467" "Unknown" "chr15:77,667,045-77,764,477" 1 "Awaiting External Info" "" "Ensembl, NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AC110607.7" 10 | "HG-2426" "Unknown" "chr17:43,196,179-43,329,548" 1 "Under Review" "" "Ensembl, NCBI, UCSC" "Two bases (AA) missing from reference assembly in intron of NBR1 gene" 11 | "HG-2425" "Unknown" "chr16:21,176,310-22,760,988" 2 "Under Review" "" "Ensembl, NCBI, UCSC" "Potential rare variant haplotype at 16p12 in GRCh38" 12 | "HG-2359" "Unknown" "chr19:23,998,162-24,111,739" 1 "Awaiting External Info" "" "Ensembl, NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AC092279.2" 13 | "HG-2356" "Unknown" "chr14:103,846,805-103,934,844" 1 "Under Review" "" "Ensembl, NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AL132712.4" 14 | "HG-2165" "Unknown" "chr5:157,700,922-157,847,717" 1 "Awaiting Elec Data" "GRCh39" "Ensembl, 
NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AC026407.4" 15 | "HG-2113" "Unknown" "chr11:30,834,079-30,977,299" 1 "Awaiting Elec Data" "" "Ensembl, NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AL135932.7" 16 | "HG-2101" "Unknown" "chr1:86,959,754-87,131,991" 1 "Awaiting External Info" "" "Ensembl, NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AC093155.2" 17 | "HG-2082" "Unknown" "chr7:68,728,846-68,835,472" 1 "Awaiting Exptl Data" "" "Ensembl, NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AC004929.2" 18 | "HG-2069" "Unknown" "chr3:37,553,864-37,716,135" 1 "Under Review" "" "Ensembl, NCBI, UCSC" "Possible misassembly or indel variation in GRCh38 within AP006240.1" 19 | "HG-2020" "Unknown" "chr10:46,853,171-47,145,966" 1 "Open" "" "Ensembl, NCBI, UCSC" "PTPN20A is missing from GRCh38" 20 | "HG-1574" "Unknown" "chr22:50,342,656-50,806,138" 1 "Awaiting Exptl Data" "" "Ensembl, NCBI, UCSC" "Chr 22 ABC12 pathway" 21 | "HG-994" "Unknown" "chr16:88,986,311-89,130,142" 1 "Awaiting External Info" "" "Ensembl, NCBI, UCSC" "Possible missing sequence in assembly component AC135782.4" 22 | "HG-675" "Unknown" "chr12:40,461,422-40,561,522" 1 "Stalled" "" "Ensembl, NCBI, UCSC" "GeneID: 283463 (MUC19) has poor alignment to the Reference" 23 | -------------------------------------------------------------------------------- /notebooks/dump_sample_table.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "\n", 11 | "import yaml\n", 12 | "import pandas as pd\n", 13 | "\n", 14 | "base_path = '/home/local/work/code/github/project-diploid-assembly/smk_config/samples'\n", 15 | "\n", 16 | "ignore_samples = set([\n", 17 | " 'NA19434',\n", 18 | " 'HG03721',\n", 19 | " 'HG01573',\n", 20 | " 'HG02018',\n", 21 
| " 'NA19036',\n", 22 | " 'NA19320'\n", 23 | "])\n", 24 | "\n", 25 | "samples = []\n", 26 | "for root, dirs, files in os.walk(base_path):\n", 27 | " yaml_configs = [f for f in files if f.endswith('.yml') or f.endswith('.yaml')]\n", 28 | " for cfg in yaml_configs:\n", 29 | " with open(os.path.join(root, cfg), 'r') as dump:\n", 30 | " metadata = yaml.safe_load(dump)\n", 31 | " is_sample = [k for k in metadata.keys() if k.startswith('sample_description')]\n", 32 | " if not is_sample:\n", 33 | " continue\n", 34 | " metadata = metadata[is_sample.pop()]\n", 35 | " metadata['HiFi'] = 0\n", 36 | " metadata['CLR'] = 0\n", 37 | " metadata['2020_SKIP'] = 1 if metadata['individual'] in ignore_samples else 0\n", 38 | " if metadata['population'] == 'AKJ':\n", 39 | " metadata['population'] = 'ASK'\n", 40 | " for ds in metadata['data_sources']:\n", 41 | " if 'long_reads' not in ds:\n", 42 | " continue\n", 43 | " attributes = ds['long_reads']\n", 44 | " if 'pbsq2' not in attributes['readset']:\n", 45 | " continue\n", 46 | " if '-ccs' in attributes['readset']:\n", 47 | " metadata['HiFi'] = 1\n", 48 | " continue\n", 49 | " if '-clr' in attributes['readset']:\n", 50 | " metadata['CLR'] = 1\n", 51 | " continue\n", 52 | " del metadata['data_sources']\n", 53 | " samples.append(metadata)\n", 54 | "\n", 55 | "sample_table = pd.DataFrame(samples)\n", 56 | "sample_table = sample_table[[\n", 57 | " 'individual',\n", 58 | " 'sex',\n", 59 | " 'super_population',\n", 60 | " 'population',\n", 61 | " 'family',\n", 62 | " 'member',\n", 63 | " 'HiFi',\n", 64 | " 'CLR',\n", 65 | " '2020_SKIP'\n", 66 | "]]\n", 67 | "out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/sample_table.tsv'\n", 68 | "sample_table.sort_values(['super_population', 'population', 'individual'], inplace=True)\n", 69 | "sample_table.to_csv(out_path, sep='\\t', header=True, index=False)" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": 
"python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": { 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.7.6" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 4 94 | } 95 | --------------------------------------------------------------------------------