├── environment
    ├── conda
    │   ├── deactivate
    │   │   └── env_vars.sh
    │   ├── activate
    │   │   └── env_vars.sh
    │   ├── conda_convert.yml
    │   ├── testing
    │   │   ├── testasm.yml
    │   │   └── rtest.yml
    │   ├── conda_dipassm.yml
    │   ├── conda_bifrost.yml
    │   ├── conda_merqury.yml
    │   ├── conda_pbtools.yml
    │   ├── conda_shelltools.yml
    │   ├── conda_rtools.yml
    │   ├── conda_export_env_vars.sh
    │   ├── conda_pyscript.yml
    │   ├── conda_biotools.yml
    │   └── conda_evaltools.yml
    ├── sync
    │   ├── include.txt
    │   ├── exclude.txt
    │   └── sync_commands.md
    └── snakemake
    │   ├── cluster
    │       ├── denbi_tu_slurm
    │       │   ├── denbi_cluster.json
    │       │   └── config.yaml
    │       ├── deep
    │       │   ├── config.yaml
    │       │   └── deep_cluster.json
    │       ├── denbi_tu_pbs
    │       │   ├── config.yaml
    │       │   └── denbi_cluster.json
    │       └── hhu_pbs
    │       │   ├── config.yaml
    │       │   └── hilbert_queues.md
    │   ├── demo
    │       └── config.yaml
    │   ├── server
    │       ├── denbi_europa
    │       │   └── config.yaml
    │       └── d3compute
    │       │   └── config.yaml
    │   └── laptop
    │       └── config.yaml
├── smk_config
    ├── selectors
    │   ├── hgsvc_blacklist.yml
    │   ├── test_pb.yml
    │   ├── ccs_prod_run.yml
    │   ├── test_ont.yml
    │   ├── hgsvc_clr_run.yml
    │   ├── clr_prod_linknhr_run.yml
    │   ├── clr_prod_run.yml
    │   ├── hgsvc_ccs_run.yml
    │   └── lansdorp.yml
    ├── demo
    │   ├── run_env.yml
    │   ├── na12878.yml
    │   └── params.yml
    ├── run_env
    │   ├── smk_cfg_env-laptop.yml
    │   ├── smk_cfg_env-hhu.yml
    │   ├── smk_cfg_env-mmci.yml
    │   └── smk_cfg_env-valet.yml
    ├── data_sources
    │   ├── sseq_local_denbi.yml
    │   ├── hgsvc_local_denbi.yml
    │   ├── hgsvc_ftp_src_illumina.yml
    │   ├── hgsvc_ftp_src_strandseq.yml
    │   └── hgsvc_local_hhu.yml
    ├── samples
    │   ├── hgsvc
    │   │   ├── AFR
    │   │   │   ├── ESN
    │   │   │   │   ├── hg03125.yml
    │   │   │   │   └── hg03371.yml
    │   │   │   ├── GWD
    │   │   │   │   ├── hg02818.yml
    │   │   │   │   └── hg02587.yml
    │   │   │   ├── MSL
    │   │   │   │   ├── hg03486.yml
    │   │   │   │   └── hg03065.yml
    │   │   │   ├── LWK
    │   │   │   │   └── na19036.yml
    │   │   │   ├── ACB
    │   │   │   │   └── hg02011.yml
    │   │   │   ├── ASW
    │   │   │   │   └── na19983.yml
    │   │   │   └── YRI
    │   │   │   │   ├── na19240.yml
    │   │   │   │   ├── na19238.yml
    │   │   │   │   └── na19239.yml
    │   │   ├── AMR
    │   │   │   ├── PEL
    │   │   │   │   └── hg01573.yml
    │   │   │   ├── CLM
    │   │   │   │   └── hg01114.yml
    │   │   │   ├── MXL
    │   │   │   │   └── na19650.yml
    │   │   │   └── PUR
    │   │   │   │   ├── hg00732.yml
    │   │   │   │   └── hg00731.yml
    │   │   ├── SAS
    │   │   │   ├── PJL
    │   │   │   │   └── hg02492.yml
    │   │   │   ├── BEB
    │   │   │   │   └── hg03009.yml
    │   │   │   ├── ITU
    │   │   │   │   ├── hg03721.yml
    │   │   │   │   └── hg03732.yml
    │   │   │   ├── STU
    │   │   │   │   └── hg03683.yml
    │   │   │   └── GIH
    │   │   │   │   └── na20847.yml
    │   │   ├── EUR
    │   │   │   ├── CEU
    │   │   │   │   └── na12329.yml
    │   │   │   ├── FIN
    │   │   │   │   └── hg00171.yml
    │   │   │   ├── IBS
    │   │   │   │   └── hg01505.yml
    │   │   │   ├── GBR
    │   │   │   │   └── hg00096.yml
    │   │   │   └── TSI
    │   │   │   │   └── na20509.yml
    │   │   └── EAS
    │   │   │   ├── KHV
    │   │   │       ├── hg01596.yml
    │   │   │       └── hg02018.yml
    │   │   │   ├── CDX
    │   │   │       └── hg00864.yml
    │   │   │   ├── JPT
    │   │   │       └── na18939.yml
    │   │   │   ├── CHB
    │   │   │       └── na18534.yml
    │   │   │   └── CHS
    │   │   │       ├── hg00514.yml
    │   │   │       ├── hg00513.yml
    │   │   │       └── hg00512.yml
    │   ├── na12878.yml
    │   ├── na24143.yml
    │   └── na24149.yml
    └── params
    │   ├── smk_cfg_params_RV7.yml
    │   ├── smk_cfg_params_RV8.yml
    │   ├── smk_cfg_params_RV10.yml
    │   ├── smk_cfg_params_RV9.yml
    │   ├── smk_cfg_params_RV11.yml
    │   └── smk_cfg_params_RV12.yml
├── annotation
    ├── grch38
    │   ├── known_regions
    │   │   ├── ucsc_segdups.tsv.gz
    │   │   ├── GRCh38_p13_chromXY_PAR.tsv
    │   │   └── Modeled_regions_for_GRCh38.tsv
    │   ├── 20200723_GRCh38_p13_regions.bed
    │   └── issues
    │   │   └── grch38_p13_unknown.tsv
    ├── 20200507_ASanders_100cell_controls.txt
    ├── 1kg_hgsvc_colors.csv
    ├── sample_table.tsv
    ├── NA24385_selected_libraries_sseq.csv
    ├── in_preparation
    │   └── bl_supp_HG02818_HG03125_HG03486_NA19434.txt
    └── 20200507_ASanders_wgs_cells.txt
├── scripts
    ├── run_saarclust.R
    ├── install_saarclust.R
    ├── install_strandphaser.R
    ├── run_strandphaser.R
    ├── run_breakpointr.R
    ├── eval
    │   ├── response
    │   │   ├── response-reviewer3-comment12.py
    │   │   └── response-reviewer3-comment2.py
    │   └── extract_contigs.py
    ├── install_breakpointr.R
    ├── fb-parallel-timeout.sh
    ├── utilities
    │   ├── mem_profiler.py
    │   ├── check_scripts
    │   │   ├── fastq_checker.py
    │   │   ├── tagging_checker.py
    │   │   └── fasta_checker.py
    │   ├── process_logger.py
    │   ├── inspect_environment.py
    │   ├── version_checker.py
    │   └── summarize_vcf.py
    ├── dev
    │   ├── ref_phasing
    │   │   ├── prep_vcf.py
    │   │   └── prep_ref.py
    │   ├── cluster_splitter.py
    │   └── hybrid_renamer.py
    └── plot_saarclust_diagnostics.R
├── notes
    ├── align_ccs_racon.md
    ├── minimap_ctg_ref.md
    └── align_strandseq.md
├── smk_include
    ├── module_includes.smk
    ├── link_data_sources.smk
    ├── results
    │   ├── run_eur_trios.smk
    │   ├── run_sas_trios.smk
    │   ├── run_amr_trios.smk
    │   └── run_eas_trios.smk
    ├── dev
    │   └── run_all_eval.smk
    ├── haploid_read_coverage.smk
    └── eval_known_reference.smk
├── LICENSE
├── .gitignore
├── docs
    ├── demo.md
    └── autoconf.md
├── README.md
└── notebooks
    ├── 2020_project
        └── processing
        │   └── clean_segdups_annotation.ipynb
    ├── dev
        └── merge_numpy_aln_dumps.ipynb
    ├── subsample_hg00733_strandseq.ipynb
    └── dump_sample_table.ipynb


/environment/conda/deactivate/env_vars.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # Conda deactivate hook: remove the SGE variables exported by the activate hook.
4 | unset SGE_ROOT SGE_CELL


--------------------------------------------------------------------------------
/smk_config/selectors/hgsvc_blacklist.yml:
--------------------------------------------------------------------------------
1 | file_download_blacklist: "annotation/hgsvc_blacklist.txt"


--------------------------------------------------------------------------------
/environment/conda/activate/env_vars.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | # Conda activate hook: set the SGE root directory and cell name, then export both.
4 | SGE_ROOT=/TL/deep-gridengine
5 | SGE_CELL=deep
6 | export SGE_ROOT SGE_CELL


--------------------------------------------------------------------------------
/environment/sync/include.txt:
--------------------------------------------------------------------------------
1 | *.fasta
2 | *.vcf.bgz
3 | *.vcf.bgz.tbi
4 | *stats
5 | *.pdf
6 | *.txt
7 | *.tsv
8 | *.log
9 | *.rsrc


--------------------------------------------------------------------------------
/smk_config/demo/run_env.yml:
--------------------------------------------------------------------------------
1 | 
2 | num_cpu_max: 24
3 | num_cpu_high: 24
4 | num_cpu_medium: 12
5 | num_cpu_low: 4
6 | 
7 | env_module_singularity: False


--------------------------------------------------------------------------------
/smk_config/selectors/test_pb.yml:
--------------------------------------------------------------------------------
1 | 
2 | select_targets:
3 |   nhr_assembler:
4 |     - flye
5 |   var_caller:
6 |     - longshot
7 |   name:
8 |     - pacbio_test


--------------------------------------------------------------------------------
/annotation/grch38/known_regions/ucsc_segdups.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ptrebert/project-diploid-assembly/HEAD/annotation/grch38/known_regions/ucsc_segdups.tsv.gz


--------------------------------------------------------------------------------
/annotation/grch38/known_regions/GRCh38_p13_chromXY_PAR.tsv:
--------------------------------------------------------------------------------
1 | "PAR#1"	"X"	10001	2781479
2 | "PAR#2"	"X"	155701383	156030895
3 | "PAR#1"	"Y"	10001	2781479
4 | "PAR#2"	"Y"	56887903	57217415
5 | 


--------------------------------------------------------------------------------
/environment/conda/conda_convert.yml:
--------------------------------------------------------------------------------
 1 | name: convert
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.6
 8 |   - pip
 9 |   - seqtk
10 |   - dnaio
11 | 


--------------------------------------------------------------------------------
/smk_config/selectors/ccs_prod_run.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | select_targets:
 3 |   nhr_assembler:
 4 |     - pereg
 5 |   var_caller:
 6 |     - deepvar
 7 | 
 8 | skip_targets:
 9 |   name:
10 |     - pacbio_test
11 |     - nanopore_test
12 | 


--------------------------------------------------------------------------------
/environment/conda/testing/testasm.yml:
--------------------------------------------------------------------------------
 1 | name: testasm
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - Python=3.6.*
 8 |   - pip=19.2.3
 9 |   - flye=2.6
10 |   - raven-assembler=0.0.1


--------------------------------------------------------------------------------
/smk_config/run_env/smk_cfg_env-laptop.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | notify: False
 3 | notify_email: ebertp@hhu.de
 4 | 
 5 | env_module_singularity: False
 6 | 
 7 | num_cpu_max: 4
 8 | num_cpu_high: 3
 9 | num_cpu_medium: 2
10 | num_cpu_low: 1
11 | 


--------------------------------------------------------------------------------
/smk_config/run_env/smk_cfg_env-hhu.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | notify: False
 3 | notify_email: ebertp@hhu.de
 4 | 
 5 | env_module_singularity: Singularity
 6 | 
 7 | num_cpu_max: 72
 8 | num_cpu_high: 24
 9 | num_cpu_medium: 12
10 | num_cpu_low: 6
11 | 


--------------------------------------------------------------------------------
/smk_config/run_env/smk_cfg_env-mmci.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | notify: True
 3 | notify_email: pebert@mpi-inf.mpg.de
 4 | 
 5 | env_module_singularity: False
 6 | 
 7 | num_cpu_max: 48
 8 | num_cpu_high: 24
 9 | num_cpu_medium: 12
10 | num_cpu_low: 6
11 | 


--------------------------------------------------------------------------------
/smk_config/selectors/test_ont.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | select_targets:
 3 |   nhr_assembler:
 4 |     - shasta
 5 |     - flye
 6 |   var_caller:
 7 |     - longshot
 8 |   name:
 9 |     - nanopore_test
10 | 
11 | select_target_path: REPORT_DRAFT_HAPLOID_ASSEMBLY
12 | 


--------------------------------------------------------------------------------
/smk_config/selectors/hgsvc_clr_run.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | select_targets:
 3 |   nhr_assembler:
 4 |     - flye
 5 |   hap_assembler:
 6 |     - flye
 7 |   var_caller:
 8 |     - longshot
 9 | 
10 | skip_targets:
11 |   name:
12 |     - pacbio_test
13 |     - nanopore_test
14 | 


--------------------------------------------------------------------------------
/smk_config/run_env/smk_cfg_env-valet.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | notify: False
 3 | notify_email: pebert@mpi-inf.mpg.de
 4 | 
 5 | force_local_copy: True
 6 | 
 7 | env_module_singularity: False
 8 | 
 9 | num_cpu_max: 36
10 | num_cpu_high: 36
11 | num_cpu_medium: 12
12 | num_cpu_low: 6
13 | 


--------------------------------------------------------------------------------
/smk_config/selectors/clr_prod_linknhr_run.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | select_targets:
 3 |   nhr_assembler:
 4 |     - uw27
 5 |     - jax27
 6 |     - hhu26
 7 |     - hhu27
 8 |   var_caller:
 9 |     - longshot
10 | 
11 | skip_targets:
12 |   name:
13 |     - pacbio_test
14 |     - nanopore_test
15 | 


--------------------------------------------------------------------------------
/smk_config/selectors/clr_prod_run.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | select_targets:
 3 |   nhr_assembler:
 4 |     - flye
 5 |     - uw27r
 6 |     - mpi27r
 7 |     - jax27r
 8 |     - hhu26
 9 |   var_caller:
10 |     - longshot
11 | 
12 | skip_targets:
13 |   name:
14 |     - pacbio_test
15 |     - nanopore_test
16 | 


--------------------------------------------------------------------------------
/smk_config/selectors/hgsvc_ccs_run.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | select_targets:
 3 |   nhr_assembler:
 4 |     - pereg
 5 |   hap_assembler:
 6 |     - pereg
 7 |   var_caller:
 8 |     - deepvar
 9 | 
10 | skip_targets:
11 |   name:
12 |     - pacbio_test
13 |     - nanopore_test
14 |     - hifi_subsampling
15 | 


--------------------------------------------------------------------------------
/scripts/run_saarclust.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Command-line wrapper: loads SaaRclust and calls scaffoldDenovoAssembly.
 4 | # Positional arguments:
 5 | #   1. configuration file (passed as configfile)
 6 | #   2. folder with BAM files (passed as bamfolder)
 7 | #   3. output folder (passed as outputfolder)
 8 | 
 9 | suppressMessages(library(SaaRclust))
10 | 
11 | cli.args <- commandArgs(trailingOnly = TRUE)
12 | config.file <- cli.args[1]
13 | bam.folder <- cli.args[2]
14 | output.folder <- cli.args[3]
15 | 
16 | scaffoldDenovoAssembly(
17 |     configfile = config.file,
18 |     bamfolder = bam.folder,
19 |     outputfolder = output.folder
20 | )
21 | 
22 | # report warnings raised during the run, if any
23 | warnings()
24 | 
25 | quit(save = "no")


--------------------------------------------------------------------------------
/environment/conda/conda_dipassm.yml:
--------------------------------------------------------------------------------
 1 | name: dipassm
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - Python=3.6.*
 7 |   - pip=19.2.3
 8 |   - openssl=1.0.2t
 9 |   - Snakemake=5.10.0
10 |   - drmaa=0.7.9
11 |   - pyyaml=5.3
12 |   - pandas=1.0.5
13 |   - pytables=3.6.1
14 |   - intervaltree=3.0.2


--------------------------------------------------------------------------------
/smk_config/selectors/lansdorp.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | select_targets:
 3 |   nhr_assembler:
 4 |     - hifiasm
 5 |   hap_assembler:
 6 |     - hifiasm
 7 |   var_caller:
 8 |     - deepvar
 9 |   name:
10 |     - Lansdorp
11 | 
12 | skip_targets:
13 |   name:
14 |     - pacbio_test
15 |     - nanopore_test
16 |     - hifi_subsampling
17 | 
18 | 


--------------------------------------------------------------------------------
/environment/snakemake/cluster/denbi_tu_slurm/denbi_cluster.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "__default__":
 3 |   {
 4 |     "cores": "{threads}",
 5 |     "mem_mb": "{resources.mem_per_cpu_mb}",
 6 |     "name": "{jobid}_{rule}",
 7 |     "output": "log/cluster_jobs/{jobid}_{rule}.stdout",
 8 |     "error": "log/cluster_jobs/{jobid}_{rule}.stderr"
 9 |   }
10 | }
11 | 


--------------------------------------------------------------------------------
/environment/conda/conda_bifrost.yml:
--------------------------------------------------------------------------------
 1 | name: bifrost
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 | dependencies:
 7 |   - python=3.6
 8 |   - pip
 9 |   - c-compiler
10 |   - cmake>=3.0.0
11 |   - compilers
12 |   - cxx-compiler
13 |   - libclang>=8.0.0
14 |   - pkg-config
15 |   - pthread-stubs
16 |   - xz
17 |   - zlib
18 | 


--------------------------------------------------------------------------------
/environment/conda/conda_merqury.yml:
--------------------------------------------------------------------------------
 1 | name: merqury
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - Python<3.8
 7 |   - r-base=3.6.1
 8 |   - r-ggplot2
 9 |   - r-argparse
10 |   - r-scales
11 |   - samtools
12 |   - bedtools
13 |   - igvtools
14 |   - openjdk
15 |   - c-compiler
16 |   - cxx-compiler
17 |   - compilers
18 |   - make
19 | 


--------------------------------------------------------------------------------
/environment/conda/conda_pbtools.yml:
--------------------------------------------------------------------------------
 1 | name: pbtools
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - python=2.7.*
 7 |   - python-consensuscore2=3.4.1
 8 |   - genomicconsensus=2.3.3
 9 |   - pbgcpp=1.9.0
10 |   - pbccs=3.4.1
11 |   - pbbam=1.0.6
12 |   - pbmm2=1.1.0
13 |   - pbcoretools=0.2.4
14 |   - bam2fastx=1.3.0
15 |   - hap.py=0.3.10


--------------------------------------------------------------------------------
/environment/conda/conda_shelltools.yml:
--------------------------------------------------------------------------------
 1 | name: shelltools
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - Python=3.6.*
 7 |   - pip=19.2.3
 8 |   - wget=1.19.4
 9 |   - libssh2=1.8.0
10 |   - openssl=1.0.2t
11 |   - libcurl=7.64.0
12 |   - curl=7.64.0
13 |   - aria2=1.34.0
14 |   - pbgzip=2016.08.04
15 |   - zlib=1.2.11
16 |   - bzip2=1.0.8
17 | 


--------------------------------------------------------------------------------
/environment/sync/exclude.txt:
--------------------------------------------------------------------------------
 1 | *snakemake*
 2 | *cluster_jobs*
 3 | *references/*
 4 | *output/alignments/*
 5 | *output/container/*
 6 | *output/check_files/*
 7 | */haploid_fasta/*
 8 | */haploid_fastq/*
 9 | */haploid_bam/*
10 | */layout/*
11 | *temp*
12 | *tmp*
13 | *processing*
14 | *.sh
15 | *.bam
16 | *.sam
17 | *.bai
18 | *.pbi
19 | *.fastq
20 | *.fastq.gz
21 | *input/*.fasta


--------------------------------------------------------------------------------
/environment/snakemake/demo/config.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | cores: 24
 3 | latency-wait: 120
 4 | keep-going: True
 5 | rerun-incomplete: True
 6 | keep-incomplete: True
 7 | restart-times: 0
 8 | use-conda: True
 9 | use-envmodules: False
10 | nolock: False
11 | resources:
12 |   mem_total_mb=131072
13 | default-resources:
14 |   - mem_per_cpu_mb=1024
15 |   - mem_total_mb=1024
16 |   - runtime_hrs=1
17 |   - runtime_min=59
18 | 


--------------------------------------------------------------------------------
/smk_config/data_sources/sseq_local_denbi.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | data_source_strandseq_denbi_local:
 3 |   comment: "deNBI local Strand-seq source for NA24385 and sub-sampled HG00733"
 4 |   output: 'strandseq_local_denbi.json'
 5 |   server: 'localhost'
 6 |   data_source: '/beeond/data/strandseq'
 7 |   collect_files:
 8 |     - 'fastq.gz'
 9 |   sort_into:
10 |     - 'fastq'
11 |   assume_correct_filenames: True
12 | 
13 | 
14 | 


--------------------------------------------------------------------------------
/environment/snakemake/server/denbi_europa/config.yaml:
--------------------------------------------------------------------------------
 1 | # Default runtime profile for deNBI Tuebingen infrastructure / single VM
 2 | 
 3 | directory: /mnt/vol/quobyte/projects/diploid-assembly
 4 | jobs: 36
 5 | latency-wait: 300
 6 | keep-going: True
 7 | rerun-incomplete: True
 8 | restart-times: 0
 9 | default-resources:
10 |   - runtime_hrs=1
11 |   - runtime_min=59
12 |   - mem_per_cpu_mb=2048
13 |   - mem_total_mb=4096


--------------------------------------------------------------------------------
/environment/snakemake/laptop/config.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | cores: 4
 3 | latency-wait: 5
 4 | keep-going: True
 5 | rerun-incomplete: True
 6 | keep-incomplete: True
 7 | restart-times: 0
 8 | use-conda: True
 9 | use-envmodules: False
10 | nolock: False
11 | resources:
12 |   mem_total_mb=14336
13 | default-resources:
14 |   - mem_per_cpu_mb=1024
15 |   - mem_total_mb=1024
16 |   - runtime_hrs=1
17 |   - runtime_min=59
18 | #forcerun: "config.dump"
19 | 


--------------------------------------------------------------------------------
/environment/snakemake/server/d3compute/config.yaml:
--------------------------------------------------------------------------------
 1 | # Default runtime profile for TM infrastructure / D3 compute server
 2 | 
 3 | cores: 48
 4 | latency-wait: 10
 5 | keep-going: True
 6 | rerun-incomplete: True
 7 | keep-incomplete: True
 8 | restart-times: 0
 9 | use-conda: True
10 | use-envmodules: False
11 | nolock: True
12 | resources:
13 |   mem_total_mb=1520432
14 | default-resources:
15 |   - mem_per_cpu_mb=1024
16 |   - mem_total_mb=1024
17 |   - runtime_hrs=1
18 |   - runtime_min=59
19 | 


--------------------------------------------------------------------------------
/smk_config/data_sources/hgsvc_local_denbi.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | data_source_pacbio_hhu_local:  # NOTE(review): key says "hhu" but this file describes a deNBI source - confirm it cannot clash with the key in hgsvc_local_hhu.yml
 3 |   comment: "HGSVC local deNBI-VALET source for HG00514 Sequel2 PacBio CCS data"
 4 |   output: 'hgsvc_local_denbi_hifi.json'
 5 |   server: 'localhost'
 6 |   data_source: '/beeond/data/share/2020-07_HG00514_HiFi'
 7 |   collect_files:
 8 |     - 'fastq.gz'
 9 |   sort_into:
10 |     - 'fastq'
11 |   file_infix: 'hgsvc_pbsq2-'
12 |   fix_tech: 'ccs'
13 |   local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}'
14 | 


--------------------------------------------------------------------------------
/scripts/install_saarclust.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # (Re-)install SaaRclust from its GitHub repository at a given git reference.
 4 | # Usage: install_saarclust.R <git-ref>
 5 | 
 6 | args = commandArgs(trailingOnly=TRUE)
 7 | 
 8 | # Validate the argument before touching the installed package, so that a
 9 | # bad call does not leave the environment without SaaRclust.
10 | if (length(args) < 1 || is.na(args[1]) || !nzchar(args[1])) {
11 |     stop('Usage: install_saarclust.R <git-ref>')
12 | }
13 | 
14 | git.commit = args[1]
15 | 
16 | if (is.element('SaaRclust', installed.packages()[,1])) {
17 |     print('Removing previously installed version of SaaRclust')
18 |     remove.packages('SaaRclust')
19 | }
20 | 
21 | # GitHub no longer serves the unauthenticated git:// protocol; use https.
22 | devtools::install_git(
23 |     "https://github.com/daewoooo/SaaRclust.git",
24 |     ref = git.commit,
25 |     dependencies=FALSE,
26 |     upgrade=FALSE
27 | )
28 | 
29 | quit(save="no")
30 | 


--------------------------------------------------------------------------------
/environment/conda/testing/rtest.yml:
--------------------------------------------------------------------------------
 1 | name: rtest
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - r
 6 | dependencies:
 7 |   - Python=3.6.*
 8 |   - pip=19.2.3
 9 |   - wget=1.19.4
10 |   - r-base<3.6
11 |   - r-devtools=2.2.1
12 |   - r-usethis=1.5.1
13 |   - r-withr=2.1.2
14 |   - r-igraph=1.2.4.1
15 |   - r-zoo=1.8_6
16 |   - r-cluster=2.1.0
17 |   - r-doparallel=1.0.15
18 |   - r-foreach=1.4.7
19 |   - r-biocmanager=1.30.7
20 |   - bioconductor-rhtslib=1.14.1
21 |   - bioconductor-bamsignals=1.14.0


--------------------------------------------------------------------------------
/scripts/install_strandphaser.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # (Re-)install StrandPhaseR from its GitHub repository at a given git reference.
 4 | # Usage: install_strandphaser.R <git-ref>
 5 | 
 6 | args = commandArgs(trailingOnly=TRUE)
 7 | 
 8 | # Validate the argument before touching the installed package, so that a
 9 | # bad call does not leave the environment without StrandPhaseR.
10 | if (length(args) < 1 || is.na(args[1]) || !nzchar(args[1])) {
11 |     stop('Usage: install_strandphaser.R <git-ref>')
12 | }
13 | 
14 | git.commit = args[1]
15 | 
16 | if (is.element('StrandPhaseR', installed.packages()[,1])) {
17 |     print('Removing previously installed version of StrandPhaseR')
18 |     remove.packages('StrandPhaseR')
19 | }
20 | 
21 | # GitHub no longer serves the unauthenticated git:// protocol; use https.
22 | devtools::install_git(
23 |     "https://github.com/daewoooo/StrandPhaseR.git",
24 |     ref = git.commit,
25 |     dependencies=FALSE,
26 |     upgrade=FALSE
27 | )
28 | 
29 | quit(save="no")
30 | 


--------------------------------------------------------------------------------
/environment/conda/conda_rtools.yml:
--------------------------------------------------------------------------------
 1 | name: rtools
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - Python=3.7.*
 7 |   - pip
 8 |   - ca-certificates>=2020.10
 9 |   - openssl>=1.1
10 |   - quast=5.0.2
11 |   - augustus>=3.3
12 |   - busco>=4.1  # R dependency; update required b/c of LD_LIBRARY_PATH issue (2021/01)
13 |   - circos=0.69.8
14 |   - hmmer=3.1b2
15 |   - blast>=2.10
16 |   - htslib>=1.9
17 |   - perl>=5.26
18 |   - r-base>=4.0
19 |   - minimap2=2.17  # this is made explicit b/c of QUAST
20 | 
21 | 


--------------------------------------------------------------------------------
/annotation/20200507_ASanders_100cell_controls.txt:
--------------------------------------------------------------------------------
 1 | GM12329x02PE20490
 2 | GM18534Bx02PE20392
 3 | GM18939x02PE20464
 4 | GM19650Ax02PE20523
 5 | GM19983x02PE20496
 6 | GM20509Bx01PE20515
 7 | GM20847Bx02PE20410
 8 | HG00096x02PE20385
 9 | HG00171Ax02PE20490
10 | HG00864x02PE20396
11 | HG01114x02PE20328
12 | HG01505x02PE20494
13 | HG01573x02PE20391
14 | HG01596x02PE20501
15 | HG02011x02PE20571
16 | HG02018x01PE20491
17 | HG02492x02PE20423
18 | HG02587x02PE20390
19 | HG03009x02PE20385
20 | HG03065x02PE20587
21 | HG03371x02PE20572
22 | HG03683x01PE20461
23 | HG03732x02PE20594
24 | GM19036Bx02PE20369


--------------------------------------------------------------------------------
/environment/snakemake/cluster/deep/config.yaml:
--------------------------------------------------------------------------------
 1 | # Default runtime profile for DEEP infrastructure / SGE cluster
 2 | 
 3 | directory: /TL/deep/fhgfs/projects/pebert/cbi/diploid-assembly
 4 | drmaa-log-dir: /TL/deep/fhgfs/projects/pebert/cbi/diploid-assembly/log/cluster_jobs
 5 | drmaa: " {cluster.clusterSpec}"
 6 | jobname: "SMK{jobid}_{name}"
 7 | cluster-config: /home/pebert/work/code/github/project-diploid-assembly/environment/snakemake/cluster/deep/deep_cluster.json
 8 | local-cores: 4
 9 | jobs: 200
10 | latency-wait: 1
11 | keep-going: True
12 | rerun-incomplete: False
13 | restart-times: 1


--------------------------------------------------------------------------------
/smk_config/data_sources/hgsvc_ftp_src_illumina.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | data_source_JAX_illumina:
 3 |   comment: "HGSVC FTP source for Illumina short read data (JAX)"
 4 |   output: 'hgsvc_JAX_illumina.json'
 5 |   server: 'ftp.1000genomes.ebi.ac.uk'
 6 |   data_source: 'vol1/ftp/data_collections/HGSVC2/working/20191004_Illumina'
 7 |   collect_files:
 8 |     - 'fastq.gz'
 9 |   sort_into:
10 |     - 'fastq'
11 |   file_infix: 'hgsvc_ilnvs-'
12 |   fix_tech: '150pe'
13 |   file_suffix: 'library_id'
14 |   local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}_short'
15 |   assume_paired_reads: True
16 | 


--------------------------------------------------------------------------------
/smk_config/data_sources/hgsvc_ftp_src_strandseq.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | data_source_EMBL_strandseq:
 4 |   comment: "HGSVC FTP source for Strand-seq data (EMBL)"
 5 |   output: 'hgsvc_EMBL_strandseq.json'
 6 |   server: 'ftp.1000genomes.ebi.ac.uk'
 7 |   data_source: 'vol1/ftp/data_collections/HGSVC2/working/20200120_Strandseq/fastq'
 8 |   collect_files:
 9 |     - 'fastq.gz'
10 |   sort_into:
11 |     - 'fastq'
12 |   file_infix: 'hgsvc_ilnxs-'
13 |   fix_tech: '80pe'
14 |   file_suffix: 'library_id'
15 |   local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}_sseq'
16 |   assume_paired_reads: True


--------------------------------------------------------------------------------
/environment/conda/conda_export_env_vars.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copy the repository's activate/deactivate env_vars.sh hooks into the
 4 | # etc/conda/{activate.d,deactivate.d} folders of the conda environment.
 5 | 
 6 | # Stop at the first failing command or unset variable; without this, a
 7 | # failed "cd" would let mkdir/cp operate in the current directory instead.
 8 | set -eu
 9 | 
10 | REPOSITORY_PREFIX="/home/pebert/work/code/github/project-diploid-assembly"
11 | 
12 | CONDA_PREFIX="/TL/epigenetics2/work/pebert/conda/envs/dipassm"
13 | 
14 | cd "${CONDA_PREFIX}"
15 | mkdir -p ./etc/conda/activate.d
16 | mkdir -p ./etc/conda/deactivate.d
17 | 
18 | # quoted so that prefixes containing spaces or glob characters stay intact
19 | cp -f "${REPOSITORY_PREFIX}/environment/conda/activate/env_vars.sh" ./etc/conda/activate.d/env_vars.sh
20 | cp -f "${REPOSITORY_PREFIX}/environment/conda/deactivate/env_vars.sh" ./etc/conda/deactivate.d/env_vars.sh


--------------------------------------------------------------------------------
/scripts/run_strandphaser.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Command-line wrapper: loads StrandPhaseR and calls strandPhaseR.
 4 | # Positional arguments:
 5 | #   1. input folder (passed as inputfolder)
 6 | #   2. configuration file (passed as configfile)
 7 | #   3. variant calls (passed as both positions and fillMissAllele)
 8 | #   4. WC regions (passed as WCregions)
 9 | #   5. output folder (passed as outputfolder)
10 | #   6. sample/individual (passed as exportVCF)
11 | 
12 | suppressMessages(library(StrandPhaseR))
13 | 
14 | cli.args <- commandArgs(trailingOnly = TRUE)
15 | 
16 | input.dir <- cli.args[1]
17 | phasing.config <- cli.args[2]
18 | variants <- cli.args[3]
19 | wc.region.file <- cli.args[4]
20 | output.dir <- cli.args[5]
21 | individual <- cli.args[6]
22 | 
23 | strandPhaseR(
24 |     inputfolder = input.dir,
25 |     configfile = phasing.config,
26 |     outputfolder = output.dir,
27 |     positions = variants,
28 |     fillMissAllele = variants,
29 |     WCregions = wc.region.file,
30 |     exportVCF = individual
31 | )
32 | 
33 | # report warnings raised during the run, if any
34 | warnings()
35 | 
36 | quit(save = "no")


--------------------------------------------------------------------------------
/environment/snakemake/cluster/denbi_tu_slurm/config.yaml:
--------------------------------------------------------------------------------
 1 | # Default runtime profile for deNBI Tuebingen infrastructure / SLURM cluster
 2 | 
 3 | directory: /mnt/vol/gridshare/projects/diploid-assembly
 4 | # Submit command; the {cluster.*} placeholders are filled from the cluster-config JSON.
 5 | cluster: " sbatch --export=ALL --cpus-per-task={cluster.cores} --mem-per-cpu={cluster.mem_mb}M --job-name={cluster.name} --output={cluster.output} --error={cluster.error} "
 6 | # This profile lives in cluster/denbi_tu_slurm (the PBS variant is cluster/denbi_tu_pbs);
 7 | # the repository has no cluster/denbi_tu folder, so the path must name denbi_tu_slurm.
 8 | cluster-config: /mnt/vol/gridshare/user/code/project-diploid-assembly/environment/snakemake/cluster/denbi_tu_slurm/denbi_cluster.json
 9 | local-cores: 2
10 | jobs: 200
11 | latency-wait: 1
12 | keep-going: True
13 | rerun-incomplete: False
14 | restart-times: 1
15 | default-resources:
16 |   - mem_per_cpu_mb=2048
17 |   - mem_total_mb=4096


--------------------------------------------------------------------------------
/scripts/run_breakpointr.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Command-line wrapper: runs breakpointr on a folder of BAM files, then
 4 | # exports the regions in state "wc" from the "data" subfolder of the output.
 5 | # Positional arguments:
 6 | #   1. folder with BAM files (passed as inputfolder)
 7 | #   2. configuration file (passed as configfile)
 8 | #   3. output folder (passed as outputfolder)
 9 | #   4. number of CPUs (passed as numCPU)
10 | #   5. output file for the exported regions
11 | 
12 | suppressMessages(library(breakpointR))
13 | 
14 | args = commandArgs(trailingOnly=TRUE)
15 | 
16 | bam.folder = args[1]
17 | config.file = args[2]
18 | output.folder = args[3]
19 | # commandArgs() returns character strings; numCPU is a count, so hand over
20 | # a number instead of text such as "4"
21 | num.cpu = as.integer(args[4])
22 | output.file = args[5]
23 | 
24 | breakpointr(
25 |     inputfolder=bam.folder,
26 |     outputfolder=output.folder,
27 |     configfile=config.file,
28 |     numCPU=num.cpu
29 | )
30 | 
31 | exportRegions(
32 |     datapath=file.path(output.folder, "data"),
33 |     file=output.file,
34 |     collapseInversions=TRUE,
35 |     collapseRegionSize=5000000,
36 |     minRegionSize=5000000,
37 |     state="wc"
38 | )
39 | 
40 | # report warnings raised during the run, if any
41 | warnings()
42 | 
43 | quit(save='no')


--------------------------------------------------------------------------------
/environment/conda/conda_pyscript.yml:
--------------------------------------------------------------------------------
 1 | name: pyscript
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - Python=3.6.*
 7 |   - pip=19.2.3
 8 |   - wget=1.19.4
 9 |   - openssl=1.0.2t
10 |   - libblas=3.8.0
11 |   - libcblas=3.8.0
12 |   - liblapack=3.8.0
13 |   - libopenblas=0.3.6
14 |   - libgfortran-ng=7.3.0
15 |   - libgcc-ng=9.1.0
16 |   - libstdcxx-ng=9.1.0
17 |   - libxml2=2.9.9
18 |   - libcurl=7.64.0
19 |   - lp_solve=5.5.2.5
20 |   - openblas=0.3.6
21 |   - make=4.2.1
22 |   - numpy=1.17.1
23 |   - matplotlib=3.1.1
24 |   - htslib=1.9
25 |   - libdeflate=1.3  # v1.0 may trigger pysam import error - DO NOT DOWNGRADE
26 |   - pysam=0.15.3
27 |   - dnaio=0.4.1
28 |   - biopython=1.76
29 | 


--------------------------------------------------------------------------------
/environment/snakemake/cluster/deep/deep_cluster.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "__default__": {
 3 |     "clusterSpec": " -V -S /bin/bash -l h_vmem=32G,slots_free=4,mem_free=8G",
 4 |     "jobName": "{rule}__defaultSpec__"
 5 |   },
 6 |   "arrow_contig_polishing_pass1": {
 7 |     "clusterSpec": " -V -S /bin/bash -l h_vmem=124G,slots_free=16,mem_free=80G",
 8 |     "jobName": "{rule}_arpol1"
 9 |   },
10 |   "racon_contig_polishing_pass1": {
11 |     "clusterSpec": " -V -S /bin/bash -l h_vmem=124G,slots_free=8,mem_free=80G",
12 |     "jobName": "{rule}_rcpol1"
13 |   },
14 |   "quast_analysis_reference_strandseq_polished_haploid_assembly": {
15 |     "clusterSpec": " -V -S /bin/bash -l h_vmem=124G,slots_free=12,mem_free=80G",
16 |     "jobName": "{rule}_quast"
17 |   }
18 | }
19 | 
20 | 


--------------------------------------------------------------------------------
/scripts/eval/response/response-reviewer3-comment12.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import matplotlib.pyplot as plt
 3 | import pandas as pd
 4 | import upsetplot
 5 | 
 6 | # plotting script for response letter
 7 | # created by Tobias Marschall
 8 | 
 9 | d = pd.read_csv('pete-zenodo/variants_freeze3_sv_insdel.tsv.gz', sep='\t')
10 | 
11 | d = d.assign(PAV=lambda df: True)
12 | d = d.assign(PANGENIE_STRICT=lambda df: df.PG_CONF==4)
13 | d = d.assign(PANGENIE_LENIENT=lambda df: df.PG_CONF>0)
14 | d = d.assign(ILLUMINA=lambda df: df['1KGHC_OVERLAP']=='OVR')
15 | 
16 | counts = d.groupby(by=['PAV', 'PANGENIE_STRICT','PANGENIE_LENIENT','ILLUMINA']).size()
17 | upsetplot.plot(counts, sort_by='cardinality')
18 | plt.savefig('response-reviewer3-comment12.pdf')
19 | 
20 | print(counts)
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/scripts/eval/response/response-reviewer3-comment2.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import matplotlib.pyplot as plt
 3 | import pandas as pd
 4 | import upsetplot
 5 | 
 6 | # plotting script for response letter
 7 | # created by Tobias Marschall
 8 | 
 9 | d = pd.read_csv('pete-zenodo/variants_freeze3_sv_insdel.tsv.gz', sep='\t')
10 | 
11 | d = d.assign(PAV=lambda df: True)
12 | d = d.assign(IN_AUDANO=lambda df: ~df.AUDANO2019.isna())
13 | d = d.assign(IN_CHAISSON=lambda df: ~df.HGSVC1.isna())
14 | 
15 | print(d.groupby(by=['PAV', 'IN_AUDANO']).size())
16 | print(d.groupby(by=['PAV', 'IN_CHAISSON']).size())
17 | 
18 | counts = d.groupby(by=['PAV', 'IN_AUDANO','IN_CHAISSON']).size()
19 | print(counts)
20 | upsetplot.plot(counts, sort_by='cardinality')
21 | plt.savefig('response-reviewer3-comment2.pdf')
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/environment/snakemake/cluster/denbi_tu_pbs/config.yaml:
--------------------------------------------------------------------------------
 1 | # Default runtime profile for deNBI Tuebingen infrastructure / PBS-TORQUE cluster
 2 | 
 3 | cluster: >-
 4 |   qsub
 5 |   -d ../run_folder
 6 |   -l walltime={cluster.run_hrs}:{cluster.run_min}:00,nodes=1:ppn={cluster.cores},mem={cluster.mem_mb}M
 7 |   -N {cluster.name} -o {cluster.output} -e {cluster.error}
 8 | cluster-config: environment/snakemake/cluster/denbi_tu_pbs/denbi_cluster.json
 9 | local-cores: 2
10 | jobs: 40
11 | latency-wait: 300
12 | keep-going: True
13 | rerun-incomplete: True
14 | keep-incomplete: False
15 | restart-times: 1
16 | use-conda: True
17 | use-envmodules: False
18 | nolock: True
19 | max-status-checks-per-second: 0.01
20 | default-resources:
21 |   - runtime_hrs=1
22 |   - runtime_min=59
23 |   - mem_per_cpu_mb=2048
24 |   - mem_total_mb=4096


--------------------------------------------------------------------------------
/notes/align_ccs_racon.md:
--------------------------------------------------------------------------------
 1 | # Align CCS reads to reference for Racon polishing
 2 | 
 3 | Sent by Aaron Wenger via e-mail on 2019-06-10
 4 | 
 5 | Relevance:
 6 |   - Racon accepts minimap's PAF output, but does not seem to perform polishing
 7 |   - Generally, for CCS reads, preset asm20 is suggested; not used here, see the following:
 8 | 
 9 | ```
10 | Use "-x map-pb" to minimap2 even when aligning CCS reads.
11 | Racon will clip out segments that have no coverage, and map-pb does
12 | a better job than `-x asm5` of avoiding alignment clipping at
13 | quality dropouts in the draft assembly.
14 | ```
15 | 
16 | ```
17 | The parameters we use are:
18 | $ minimap2 -a -x map-pb --eqx -m 5000 --secondary=no draft-asm.fa reads.fastq | samtools sort | samtools view -q 10 -F0x704 - > draft-asm.reads.sam
19 | $ racon reads.fastq draft-asm.reads.sam draft-asm.fa -u > polished-asm.fa
20 | ```
21 | 
22 | 


--------------------------------------------------------------------------------
/environment/snakemake/cluster/hhu_pbs/config.yaml:
--------------------------------------------------------------------------------
 1 | # Default runtime profile for HHU HILBERT infrastructure / PBS Professional
 2 | 
 3 | cluster: >-
 4 |   qsub
 5 |   -A {cluster.account} -l walltime={cluster.run_hrs}:{cluster.run_min}:00
 6 |   -l select=1:ncpus={cluster.cores}:mem={cluster.mem_mb}mb{cluster.arch}
 7 |   -N {cluster.name} -o {cluster.output} -e {cluster.error}
 8 | cluster-config: environment/snakemake/cluster/hhu_pbs/hilbert_cluster.json
 9 | local-cores: 1
10 | jobs: 50
11 | latency-wait: 300
12 | keep-going: True
13 | keep-incomplete: False
14 | rerun-incomplete: True
15 | restart-times: 1
16 | max-status-checks-per-second: 0.001
17 | use-conda: True
18 | use-envmodules: True
19 | conda-prefix: /gpfs/project/ebertp/projects/conda_envs
20 | nolock: False
21 | default-resources:
22 |   - mem_per_cpu_mb=1024
23 |   - mem_total_mb=1024
24 |   - runtime_hrs=1
25 |   - runtime_min=59
26 | 


--------------------------------------------------------------------------------
/annotation/1kg_hgsvc_colors.csv:
--------------------------------------------------------------------------------
 1 | super_pop	population	hex	rgb
 2 | AFR	ACB	f4971d	244,151,29
 3 | AFR	AFR	DB7D27	219,125,39
 4 | AFR	ASW	e9651e	233,101,30
 5 | AFR	ESN	fecf0d	254,207,13
 6 | AFR	GWD	fbeb09	251,235,9
 7 | AFR	LWK	cb9a31	203,154,49
 8 | AFR	MSL	dfb819	223,184,25
 9 | AFR	YRI	feca6a	254,202,106
10 | AMR	AMR	D72519	215,37,25
11 | AMR	CLM	cc3133	204,49,51
12 | AMR	MXL	df0036	223,0,54
13 | AMR	PEL	e61420	230,20,32
14 | AMR	PUR	cb3413	203,52,19
15 | EAS	CDX	369934	54,153,52
16 | EAS	CHB	aeca10	174,202,16
17 | EAS	CHS	66b42a	102,180,42
18 | EAS	EAS	41A22F	65,162,47
19 | EAS	JPT	158d34	21,141,52
20 | EAS	KHV	4dad38	77,173,56
21 | EUR	ASK	2930de	41,48,222
22 | EUR	CEU	264999	38,73,153
23 | EUR	EUR	2D6F91	45,111,145
24 | EUR	FIN	32bac5	50,186,197
25 | EUR	GBR	70c0d2	112,192,210
26 | EUR	IBS	6385af	99,133,175
27 | EUR	TSI	293065	41,48,101
28 | SAS	BEB	831b82	131,27,130
29 | SAS	GIH	6b3f94	107,63,148
30 | SAS	ITU	b12e60	177,46,96
31 | SAS	PJL	dd1384	221,19,132
32 | SAS	SAS	782B8A	120,43,138
33 | SAS	STU	a7529b	167,82,155
34 | 


--------------------------------------------------------------------------------
/scripts/install_breakpointr.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # (Re-)install breakpointR.
 4 | #
 5 | # Positional arguments:
 6 | #   1) git reference or number: if the value cannot be parsed as a number,
 7 | #      it is used as git reference for an installation from GitHub;
 8 | #      otherwise, breakpointR is installed via BiocManager
 9 | 
10 | if (is.element('breakpointR', installed.packages()[,1])) {
11 |     print('Removing previously installed version of breakpointR')
12 |     remove.packages('breakpointR')
13 | }
14 | 
15 | args = commandArgs(trailingOnly=TRUE)
16 | 
17 | git.commit = args[1]
18 | 
19 | # as.numeric() warns about "NAs introduced by coercion" for any
20 | # non-numeric reference; that is the expected case here, so silence it
21 | if (is.na(suppressWarnings(as.numeric(git.commit)))) {
22 |     # means proper git tag
23 | 
24 |     # if dev version is installed, breakpointRdata
25 |     # is not automatically included as a dependency,
26 |     # so trigger setup manually
27 |     # NB: GitHub no longer serves the unauthenticated git:// protocol,
28 |     # hence use https:// URLs
29 |     devtools::install_git(
30 |         "https://github.com/daewoooo/breakpointRdata.git",
31 |         dependencies=FALSE,
32 |         upgrade=FALSE
33 |     )
34 | 
35 |     devtools::install_git(
36 |         "https://github.com/daewoooo/breakpointR.git",
37 |         ref = git.commit,
38 |         dependencies=FALSE,
39 |         upgrade=FALSE
40 |     )
41 | } else {
42 |     # numeric argument: install via BiocManager
43 |     BiocManager::install(
44 |         c("breakpointR"),
45 |         update=FALSE
46 |     )
47 | }
48 | 
49 | quit(save="no")


--------------------------------------------------------------------------------
/annotation/grch38/known_regions/Modeled_regions_for_GRCh38.tsv:
--------------------------------------------------------------------------------
 1 | #region_name	chr	start	stop	length
 2 | CEN1	1	122026460	125184587	3158128
 3 | CEN2	2	92188146	94090557	1902412
 4 | CEN3	3	90772459	93655574	2883116
 5 | CEN4	4	49708101	51743951	2035851
 6 | CEN5	5	46485901	50059807	3573907
 7 | CEN6	6	58553889	59829934	1276046
 8 | CEN7	7	58169654	60828234	2658581
 9 | HET7	7	61377789	61528020	150232
10 | CEN8	8	44033745	45877265	1843521
11 | CEN9	9	43236168	45518558	2282391
12 | CEN10	10	39686683	41593521	1906839
13 | CEN11	11	51078349	54425074	3346726
14 | CEN12	12	34769408	37185252	2415845
15 | CEN13	13	16000001	18051248	2051248
16 | CEN14	14	16000001	18173523	2173523
17 | CEN15	15	17000001	19725254	2725254
18 | CEN16	16	36311159	38280682	1969524
19 | CEN17	17	22813680	26885980	4072301
20 | CEN18	18	15460900	20861206	5400307
21 | CEN19	19	24498981	27190874	2691894
22 | CEN20	20	26436233	30038348	3602116
23 | CEN21	21	10864561	12915808	2051248
24 | CEN22	22	12954789	15054318	2099530
25 | CENX	X	58605580	62412542	3806963
26 | CENY	Y	10316945	10544039	227095


--------------------------------------------------------------------------------
/environment/sync/sync_commands.md:
--------------------------------------------------------------------------------
 1 | ### Sync from deNBI cloud cluster to MMCI/MPI
 2 | 
 3 | Executed as daily cronjob at 6am on d3compute09
 4 | 
 5 | ```bash
 6 | rsync --recursive --delete-before --prune-empty-dirs \
 7 |     --exclude-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/exclude.txt \
 8 |     --include-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/include.txt \
 9 |     centos@valet:/mnt/vol/beeond_backup/projects/diploid-assembly \
10 |     /scratch/bioinf/projects/diploid-genome-assembly/sync/denbi
11 | ```
12 | 
13 | Executed as daily cronjob at 6pm on lap-13-72
14 | 
15 | ```bash
16 | rsync --recursive --delete-before --prune-empty-dirs \
17 |     -e "ssh contact.mpi-inf.mpg.de ssh" \
18 |     --exclude-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/exclude.txt \
19 |     --include-from=/home/pebert/work/code/github/project-diploid-assembly/environment/sync/include.txt \
20 |     /mnt/sshfs/hhu/project/ebertp/projects/rfdga \
21 |     /scratch/bioinf/projects/diploid-genome-assembly/sync/hhu
22 | ```
23 | 


--------------------------------------------------------------------------------
/smk_include/module_includes.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | # modules w/o other dependencies
 3 | include: 'constraints.smk'
 4 | include: 'aux_utilities.smk'
 5 | include: 'environments.smk'
 6 | include: 'link_data_sources.smk'
 7 | include: 'scrape_data_sources.smk'
 8 | include: 'query_data_repos.smk'
 9 | include: 'handle_reference_download.smk'
10 | 
11 | # input preparation stage, one or two dependencies to above modules
12 | include: 'handle_data_download.smk'
13 | include: 'preprocess_input.smk'
14 | include: 'preprocess_references.smk'
15 | 
16 | # actual pipeline processing steps
17 | include: 'variant_calling.smk'
18 | include: 'integrative_phasing.smk'
19 | 
20 | include: 'strandseq_dga_split.smk'
21 | include: 'strandseq_dga_joint.smk'
22 | 
23 | include: 'collect_statistics.smk'
24 | include: 'run_alignments.smk'
25 | include: 'run_assemblies.smk'
26 | include: 'prepare_custom_references.smk'
27 | 
28 | include: 'run_polishing.smk'
29 | 
30 | include: 'haploid_assembly_clustering.smk'
31 | include: 'haploid_read_coverage.smk'
32 | 
33 | include: 'create_plots.smk'
34 | include: 'eval_known_reference.smk'
35 | 
36 | include: 'targets.smk'


--------------------------------------------------------------------------------
/scripts/fb-parallel-timeout.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run freebayes in parallel over the regions listed in a file; each freebayes
 4 | # call is wrapped in "timeout". The merged VCF stream is written to stdout.
 5 | 
 6 | # four positional arguments are consumed below (regions file, ncpus, timeout,
 7 | # logfile); everything after that is passed on to freebayes
 8 | if [[ $# -lt 4 ]];
 9 | then
10 |     # usage goes to stderr because stdout is reserved for the VCF stream
11 |     {
12 |     echo "usage: $0 [regions file] [ncpus] [timeout] [logfile] [freebayes arguments]"
13 |     echo
14 |     echo "Run freebayes in parallel over regions listed in regions file, using ncpus processors."
15 |     echo "Will merge and sort output, producing a uniform VCF stream on stdout.  Flags to freebayes"
16 |     echo "which would write to e.g. a particular file will obviously cause problems, so caution is"
17 |     echo "encouraged when using this script."
18 |     echo
19 |     echo "This script: adapted by Tobias Marschall"
20 |     echo
21 |     echo "For original version, see this github repo:"
22 |     echo
23 |     echo "https://github.com/ekg/freebayes/blob/master/scripts/freebayes-parallel"
24 |     echo
25 |     } >&2
26 |     # signal the usage error to the caller instead of exiting with status 0
27 |     exit 1
28 | fi
29 | 
30 | regionsfile=$1
31 | shift
32 | ncpus=$1
33 | shift
34 | timeout=$1
35 | shift
36 | logfile=$1
37 | shift
38 | 
39 | # all remaining arguments are freebayes arguments
40 | command=("freebayes" "$@")
41 | 
42 | ( cat "$regionsfile" | parallel -k --joblog "$logfile" -j "$ncpus" "timeout ${timeout} ${command[@]}" --region {} ) | vcffirstheader | vcfstreamsort -w 10000 | vcfuniq


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Peter Ebert
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/annotation/grch38/20200723_GRCh38_p13_regions.bed:
--------------------------------------------------------------------------------
 1 | #chrom	start	end	name	score
 2 | chr1	122026460	125184588	CEN1	1000
 3 | chr10	39686683	41593522	CEN10	1000
 4 | chr11	51078349	54425075	CEN11	1000
 5 | chr12	34769408	37185253	CEN12	1000
 6 | chr13	16000001	18051249	CEN13	1000
 7 | chr14	16000001	18173524	CEN14	1000
 8 | chr15	17000001	19725255	CEN15	1000
 9 | chr16	36311159	38280683	CEN16	1000
10 | chr17	22813680	26885981	CEN17	1000
11 | chr18	15460900	20861207	CEN18	1000
12 | chr19	24498981	27190875	CEN19	1000
13 | chr2	92188146	94090558	CEN2	1000
14 | chr20	26436233	30038349	CEN20	1000
15 | chr21	10864561	12915809	CEN21	1000
16 | chr22	12954789	15054319	CEN22	1000
17 | chr3	90772459	93655575	CEN3	1000
18 | chr4	49708101	51743952	CEN4	1000
19 | chr5	46485901	50059808	CEN5	1000
20 | chr6	58553889	59829935	CEN6	1000
21 | chr7	58169654	60828235	CEN7	1000
22 | chr7	61377789	61528021	HET7	750
23 | chr8	44033745	45877266	CEN8	1000
24 | chr9	43236168	45518559	CEN9	1000
25 | chrX	10001	2781480	PAR1X	500
26 | chrX	58605580	62412543	CENX	1000
27 | chrX	155701383	156030896	PAR2X	500
28 | chrY	10001	2781480	PAR1Y	500
29 | chrY	10316945	10544040	CENY	1000
30 | chrY	56887903	57217416	PAR2Y	500
31 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/ESN/hg03125.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03125:
 3 |   individual: HG03125
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: ESN
 7 |   family: NG34
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03125_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03125_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03125_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG03125:
28 |   - aliases:
29 |       1: &ccs_reads HG03125_hgsvc_pbsq2-ccs_1000
30 |   - defaults:
31 |       hap_reads: *ccs_reads
32 |       vc_reads: *ccs_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *ccs_reads
35 |       pol_pass: racon-p2
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: pereg
42 |       hap_assembler: pereg
43 |       var_caller: deepvar


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/GWD/hg02818.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG02818:
 3 |   individual: HG02818
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: GWD
 7 |   family: GB66
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG02818_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG02818_hgsvc_ilnxs-80pe_sseq
17 |         source_type: local
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG02818_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG02818:
28 |   - aliases:
29 |       1: &ccs_reads HG02818_hgsvc_pbsq2-ccs_1000
30 |   - defaults:
31 |       hap_reads: *ccs_reads
32 |       vc_reads: *ccs_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *ccs_reads
35 |       pol_pass: racon-p2
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: pereg
42 |       hap_assembler: pereg
43 |       var_caller: deepvar


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/MSL/hg03486.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03486:
 3 |   individual: HG03486
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: MSL
 7 |   family: SL61
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03486_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03486_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03486_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG03486:
28 |   - aliases:
29 |       1: &ccs_reads HG03486_hgsvc_pbsq2-ccs_1000
30 |   - defaults:
31 |       hap_reads: *ccs_reads
32 |       vc_reads: *ccs_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *ccs_reads
35 |       pol_pass: racon-p2
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: pereg
42 |       hap_assembler: pereg
43 |       var_caller: deepvar


--------------------------------------------------------------------------------
/scripts/utilities/mem_profiler.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import time
 5 | import psutil
 6 | 
 7 | 
 8 | workdir = os.getcwd()
 9 | logfile = os.path.join(workdir, 'memlog.txt')
10 | 
11 | bytes_to_gigabytes = 1024 ** 3
12 | 
13 | time_limit = 86400
14 | 
15 | with open(logfile, 'w') as foo:
16 |     pass
17 | 
18 | sleep_time = 0
19 | 
20 | with open(logfile, 'a') as log:
21 |     header = '\t'.join(['#time', 'threads', 'load', 'mem_tot', 'mem_free', 'swap_tot', 'swap_free'])
22 |     _ = log.write(header + '\n')
23 |     while sleep_time < time_limit:
24 |         now = str(time.ctime()).replace(' ', '_')
25 |         threads = str(psutil.cpu_count(logical=True))
26 |         pct_cpu = str(round(psutil.cpu_percent(), 2))
27 |         mem = psutil.virtual_memory()
28 |         mem_tot = str(round(mem.total / bytes_to_gigabytes, 2))
29 |         mem_av = str(round(mem.available / bytes_to_gigabytes, 2))
30 |         swap = psutil.swap_memory()
31 |         swap_tot = str(round(swap.total / bytes_to_gigabytes, 2))
32 |         swap_free = str(round(swap.free / bytes_to_gigabytes, 2))
33 | 
34 |         logline = '\t'.join([now, threads, pct_cpu, mem_tot, mem_av, swap_tot, swap_free])
35 |         _ = log.write(logline + '\n')
36 | 
37 |         sleep_time += 60
38 |         time.sleep(60)
39 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AMR/PEL/hg01573.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG01573:
 3 |   individual: HG01573
 4 |   sex: female
 5 |   super_population: AMR
 6 |   population: PEL
 7 |   family: PEL003
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG01573_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "no squashed assembly possible"
16 |     - strandseq:
17 |         readset: &sseq_reads HG01573_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads HG01573_hgsvc_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB36890
24 |         load_type: complete
25 |         comment: "698 cohort"
26 | 
27 | 
28 | sample_targets_HG01573:
29 |   - aliases:
30 |       2: &clr_hgsvc HG01573_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_hgsvc
33 |       vc_reads: *clr_hgsvc
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_hgsvc
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/LWK/na19036.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA19036:
 3 |   individual: NA19036
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: LWK
 7 |   family: NA19036
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA19036_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "no squashed assembly possible"
16 |     - strandseq:
17 |         readset: &sseq_reads NA19036_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads NA19036_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB31736
24 |         load_type: complete
25 |         comment: "2504 cohort"
26 | 
27 | 
28 | sample_targets_NA19036:
29 |   - aliases:
30 |       2: &clr_hgsvc NA19036_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_hgsvc
33 |       vc_reads: *clr_hgsvc
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_hgsvc
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |         - h1
42 |         - h2
43 |   - target:
44 |       nhr_assembler: flye
45 |       hap_assembler: flye
46 |       var_caller: longshot
47 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/SAS/PJL/hg02492.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG02492:
 3 |   individual: HG02492
 4 |   sex: male
 5 |   super_population: SAS
 6 |   population: PJL
 7 |   family: PK06
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG02492_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG02492_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG02492_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG02492:
28 |   - aliases:
29 |       1: &clr_reads HG02492_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: uw27
46 |       hap_assembler: flye
47 |       var_caller: longshot


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EUR/CEU/na12329.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA12329:
 3 |   individual: NA12329
 4 |   sex: female
 5 |   super_population: EUR
 6 |   population: CEU
 7 |   family: 1328
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA12329_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads NA12329_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads NA12329_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_NA12329:
28 |   - aliases:
29 |       1: &clr_reads NA12329_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: jax27
46 |       hap_assembler: flye
47 |       var_caller: longshot


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/SAS/BEB/hg03009.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03009:
 3 |   individual: HG03009
 4 |   sex: male
 5 |   super_population: SAS
 6 |   population: BEB
 7 |   family: HG03009
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03009_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03009_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03009_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB31736
23 |         load_type: complete
24 |         comment: "2504 cohort"
25 | 
26 | 
27 | sample_targets_HG03009:
28 |   - aliases:
29 |       1: &clr_reads HG03009_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: jax27
46 |       hap_assembler: flye
47 |       var_caller: longshot


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/SAS/ITU/hg03721.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03721:
 3 |   individual: HG03721
 4 |   sex: female
 5 |   super_population: SAS
 6 |   population: ITU
 7 |   family: IT003
 8 |   member: parent
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03721_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03721_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03721_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG03721:
28 |   - aliases:
29 |       1: &clr_reads HG03721_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: hhu26
46 |       hap_assembler: flye
47 |       var_caller: longshot
48 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/SAS/STU/hg03683.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03683:
 3 |   individual: HG03683
 4 |   sex: female
 5 |   super_population: SAS
 6 |   population: STU
 7 |   family: ST012
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03683_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03683_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03683_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG03683:
28 |   - aliases:
29 |       1: &clr_reads HG03683_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: jax27
46 |       hap_assembler: flye
47 |       var_caller: longshot
48 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/KHV/hg01596.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG01596:
 3 |   individual: HG01596
 4 |   sex: male
 5 |   super_population: EAS
 6 |   population: KHV
 7 |   family: VN002
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG01596_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG01596_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG01596_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB31736
23 |         load_type: complete
24 |         comment: "2504 cohort"
25 | 
26 | 
27 | sample_targets_HG01596:
28 |   - aliases:
29 |       1: &clr_reads HG01596_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: hhu27
46 |       hap_assembler: flye
47 |       var_caller: longshot
48 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/SAS/ITU/hg03732.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03732:
 3 |   individual: HG03732
 4 |   sex: male
 5 |   super_population: SAS
 6 |   population: ITU
 7 |   family: IT003
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03732_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03732_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03732_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG03732:
28 |   - aliases:
29 |       1: &clr_reads HG03732_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: hhu26
46 |       hap_assembler: flye
47 |       var_caller: longshot
48 | 
49 | 


--------------------------------------------------------------------------------
/smk_config/data_sources/hgsvc_local_hhu.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | data_source_pacbio_hifi_hhu_local:
 3 |   comment: "HGSVC local HHU-HILBERT source for Sequel2 PacBio CCS data"
 4 |   output: 'hgsvc_local_hhu_hifi.json'
 5 |   server: 'localhost'
 6 |   data_source: '/gpfs/project/ebertp/data/globus/sequence_data/HiFi'
 7 |   collect_files:
 8 |     - 'fastq.gz'
 9 |   sort_into:
10 |     - 'fastq'
11 |   file_infix: 'hgsvc_pbsq2-'
12 |   fix_tech: 'ccs'
13 |   local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}'
14 | 
15 | 
16 | data_source_pacbio_clr_hhu_local:
17 |   comment: "HGSVC local HHU-HILBERT source for Sequel2 PacBio CLR data"
18 |   output: 'hgsvc_local_hhu_clr.json'
19 |   server: 'localhost'
20 |   data_source: '/gpfs/project/ebertp/data/globus/sequence_data/CLR'
21 |   collect_files:
22 |     - 'bam'
23 |   sort_into:
24 |     - 'bam'
25 |   file_infix: 'hgsvc_pbsq2-'
26 |   fix_tech: 'clr'
27 |   assume_pacbio_native: True
28 |   local_path_suffix: '{{individual}}_{{file_infix}}{{tech}}'
29 | 
30 | 
31 | data_source_strandseq_hhu_local:
32 |   comment: "HHU local Strand-seq source for NA24385 and sub-sampled HG00733"
33 |   output: 'strandseq_local_hhu.json'
34 |   server: 'localhost'
35 |   data_source: '/gpfs/project/ebertp/data/local_source/strandseq'
36 |   collect_files:
37 |     - 'fastq.gz'
38 |   sort_into:
39 |     - 'fastq'
40 |   assume_correct_filenames: True
41 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/CDX/hg00864.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00864:
 3 |   individual: HG00864
 4 |   sex: female
 5 |   super_population: EAS
 6 |   population: CDX
 7 |   family: HG00864
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00864_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG00864_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG00864_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB31736
23 |         load_type: complete
24 |         comment: "2504 cohort"
25 | 
26 | 
27 | sample_targets_HG00864:
28 |   - aliases:
29 |       1: &clr_reads HG00864_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: jax27
46 |       hap_assembler: flye
47 |       var_caller: longshot
48 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EUR/FIN/hg00171.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00171:
 3 |   individual: HG00171
 4 |   sex: female
 5 |   super_population: EUR
 6 |   population: FIN
 7 |   family: HG00171
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00171_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG00171_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG00171_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB31736
23 |         load_type: complete
24 |         comment: "2504 cohort"
25 | 
26 | 
27 | sample_targets_HG00171:
28 |   - aliases:
29 |       1: &clr_reads HG00171_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_reads
32 |       vc_reads: *clr_reads
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_reads
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - target:
45 |       nhr_assembler: hhu26
46 |       hap_assembler: flye
47 |       var_caller: longshot
48 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EUR/IBS/hg01505.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG01505:
 3 |   individual: HG01505
 4 |   sex: male
 5 |   super_population: EUR
 6 |   population: IBS
 7 |   family: IBS002
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG01505_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "seq_center:UMIGS"
16 |     - strandseq:
17 |         readset: &sseq_reads HG01505_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads HG01505_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB36890
24 |         load_type: complete
25 |         comment: "698 cohort"
26 | 
27 | 
28 | sample_targets_HG01505:
29 |   - aliases:
30 |       1: &clr_reads HG01505_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - target:
46 |       nhr_assembler: jax27
47 |       hap_assembler: flye
48 |       var_caller: longshot
49 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/JPT/na18939.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA18939:
 3 |   individual: NA18939
 4 |   sex: female
 5 |   super_population: EAS
 6 |   population: JPT
 7 |   family: NA18939
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA18939_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "seq_center:UMIGS"
16 |     - strandseq:
17 |         readset: &sseq_reads NA18939_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads NA18939_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB31736
24 |         load_type: complete
25 |         comment: "2504 cohort"
26 | 
27 | 
28 | sample_targets_NA18939:
29 |   - aliases:
30 |       1: &clr_reads NA18939_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - target:
46 |       nhr_assembler: hhu26
47 |       hap_assembler: flye
48 |       var_caller: longshot


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EUR/GBR/hg00096.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00096:
 3 |   individual: HG00096
 4 |   sex: male
 5 |   super_population: EUR
 6 |   population: GBR
 7 |   family: HG00096
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00096_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "seq_center:UMIGS"
16 |     - strandseq:
17 |         readset: &sseq_reads HG00096_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads HG00096_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB31736
24 |         load_type: complete
25 |         comment: "2504 cohort"
26 | 
27 | 
28 | sample_targets_HG00096:
29 |   - aliases:
30 |       1: &clr_reads HG00096_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - target:
46 |       nhr_assembler: jax27
47 |       hap_assembler: flye
48 |       var_caller: longshot
49 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EUR/TSI/na20509.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA20509:
 3 |   individual: NA20509
 4 |   sex: male
 5 |   super_population: EUR
 6 |   population: TSI
 7 |   family: NA20509
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA20509_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "seq_center:UMIGS"
16 |     - strandseq:
17 |         readset: &sseq_reads NA20509_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads NA20509_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB31736
24 |         load_type: complete
25 |         comment: "2504 cohort"
26 | 
27 | 
28 | sample_targets_NA20509:
29 |   - aliases:
30 |       1: &clr_reads NA20509_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - target:
46 |       nhr_assembler: jax27
47 |       hap_assembler: flye
48 |       var_caller: longshot
49 | 


--------------------------------------------------------------------------------
/scripts/utilities/check_scripts/fastq_checker.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import io
 5 | import argparse
 6 | 
 7 | import dnaio
 8 | 
 9 | def parse_args():
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument('fastq1', type=str)
12 |     parser.add_argument('fastq2', type=str)
13 |     args = parser.parse_args()
14 |     return args
15 | 
16 | 
17 | def collect_read_names(fastq_path):
18 | 
19 |     reads = []
20 | 
21 |     with dnaio.open(fastq_path) as fastx:
22 |         for record in fastx:
23 |             reads.append(record.name)
24 | 
25 |     total_reads = len(reads)
26 |     reads = set(reads)
27 | 
28 |     if not total_reads == len(reads):
29 |         print('error: read duplicates {}: {} out of {}'.format(fastq_path, total_reads - len(reads), total_reads))
30 | 
31 |     return reads
32 | 
33 | 
34 | def main():
35 |     args = parse_args()
36 |     fq1_reads = collect_read_names(args.fastq1)
37 |     fq2_reads = collect_read_names(args.fastq2)
38 | 
39 |     intersect = fq1_reads.intersection(fq2_reads)
40 |     if len(intersect) > 0:
41 |         print('error: read sets not disjoined: {} out of {} / {}'.format(len(intersect), len(fq1_reads), len(fq2_reads)))
42 | 
43 |     print('fq1 reads {}'.format(len(fq1_reads)))
44 |     print('fq2 reads {}'.format(len(fq2_reads)))
45 | 
46 | 
47 |     return 0
48 | 
49 | 
50 | if __name__ == '__main__':
51 |     main()
52 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/SAS/GIH/na20847.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA20847:
 3 |   individual: NA20847
 4 |   sex: female
 5 |   super_population: SAS
 6 |   population: GIH
 7 |   family: NA20847
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA20847_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "seq_center:UMIGS"
16 |     - strandseq:
17 |         readset: &sseq_reads NA20847_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads NA20847_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB31736
24 |         load_type: complete
25 |         comment: "2504 cohort"
26 | 
27 | 
28 | sample_targets_NA20847:
29 |   - aliases:
30 |       1: &clr_reads NA20847_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - target:
46 |       nhr_assembler: hhu26
47 |       hap_assembler: flye
48 |       var_caller: longshot
49 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/CHB/na18534.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA18534:
 3 |   individual: NA18534
 4 |   sex: male
 5 |   super_population: EAS
 6 |   population: CHB
 7 |   family: NA18534
 8 |   member: unrelated
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA18534_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "seq_center:UMIGS"
16 |     - strandseq:
17 |         readset: &sseq_reads NA18534_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads NA18534_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB31736
24 |         load_type: complete
25 |         comment: "2504 cohort"
26 | 
27 | 
28 | sample_targets_NA18534:
29 |   - aliases:
30 |       1: &clr_reads NA18534_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - target:
46 |       nhr_assembler: jax27
47 |       hap_assembler: flye
48 |       var_caller: longshot
49 | 
50 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/KHV/hg02018.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG02018:
 3 |   individual: HG02018
 4 |   sex: female
 5 |   super_population: EAS
 6 |   population: KHV
 7 |   family: VN047
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG02018_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "no phased assembly possible"
16 |     - strandseq:
17 |         readset: &sseq_reads HG02018_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads HG02018_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB36890
24 |         load_type: complete
25 |         comment: "698 cohort"
26 | 
27 | 
28 | sample_targets_HG02018:
29 |   - aliases:
30 |       1: &clr_reads HG02018_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - target:
46 |       nhr_assembler: jax27
47 |       hap_assembler: flye
48 |       var_caller: longshot
49 | 


--------------------------------------------------------------------------------
/scripts/utilities/check_scripts/tagging_checker.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import io
 5 | import argparse
 6 | 
 7 | import pandas as pd
 8 | 
 9 | 
10 | def parse_args():
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('tags', type=str)
13 |     args = parser.parse_args()
14 |     return args
15 | 
16 | 
17 | def check_haplotags(file_path):
18 | 
19 |     names = ['readname', 'haplotype', 'phaseset', 'chromosome']
20 |     df = pd.read_csv(file_path, sep='\t', comment='#', header=None, names=names)
21 |     
22 |     hap_counts = df['haplotype'].value_counts()
23 |     total = hap_counts.sum()
24 |    
25 |     print('--- percent tagged')
26 | 
27 |     for hap, count in hap_counts.items():
28 |         print(hap, round(count / total * 100, 2))
29 | 
30 | 
31 |     h1_reads = set(df.loc[df['haplotype'] == 'H1', 'readname'].values)
32 |     h2_reads = set(df.loc[df['haplotype'] == 'H2', 'readname'].values)
33 |     untagged = set(df.loc[df['haplotype'] == 'none', 'readname'].values)
34 | 
35 |     print('--- intersect')
36 | 
37 |     print('h1 v h2 ', len(h1_reads.intersection(h2_reads)))
38 |     print('h1 v h0 ', len(h1_reads.intersection(untagged)))
39 |     print('h2 v h0 ', len(h2_reads.intersection(untagged)))
40 | 
41 |     return
42 | 
43 | 
44 | def main():
45 |     args = parse_args()
46 |     check_haplotags(args.tags)
47 | 
48 |     return 0
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     main()
53 | 


--------------------------------------------------------------------------------
/scripts/dev/ref_phasing/prep_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import argparse
 5 | import re
 6 | import io
 7 | 
 8 | import pysam
 9 | 
10 | def parse_args():
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('--vcf-in', '-i', dest='input', type=str)
13 |     parser.add_argument('--out-pattern', '-o', dest='output', type=str)
14 |     parser.add_argument('--chromosomes', '-c', dest='chrom', default='"^chr[0-9]+$"')
15 |     args = parser.parse_args()
16 |     return args
17 | 
18 | 
19 | def main():
20 |     args = parse_args()
21 | 
22 |     chrom_match = re.compile(args.chrom.strip('"'))
23 | 
24 |     with pysam.VariantFile(args.input) as vcf:
25 |         collected_chroms = [c for c in vcf.header.contigs if chrom_match(c) is not None]
26 | 
27 |     call = 'bcftools view --regions {} --output-type v --output-file {} {}'
28 |     for c in collected_chroms:
29 |         out_file = args.output.strip('"').format(c)
30 |         out_path = os.path.dirname(out_file)
31 |         os.makedirs(out_path, exist_ok=True)
32 |         tmp = call.format(c, out_file, args.input)
33 |         try:
34 |             out = subprocess.check_output(tmp, stderr=subprocess.STDOUT, shell=True, executable='/bin/bash')
35 |         except subprocess.CalledProcessError as spe:
36 |             raise RuntimeError(spe.output.decode('utf-8'))
37 | 
38 |     print(sorted(collected_chroms))
39 |     return 0
40 | 
41 | if __name__ == '__main__':
42 |     main()


--------------------------------------------------------------------------------
/environment/conda/conda_biotools.yml:
--------------------------------------------------------------------------------
 1 | name: biotools
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - python=3.6.*
 7 |   - pip=19.2.3
 8 |   - openssl=1.0.2t
 9 |   - gxx_impl_linux-64=7.3.0
10 |   - gxx_linux-64=7.3.0
11 |   - ld_impl_linux-64=2.33.1
12 |   - libblas=3.8.0
13 |   - libcblas=3.8.0
14 |   - liblapack=3.8.0
15 |   - libopenblas=0.3.6
16 |   - libgfortran-ng=7.3.0
17 |   - libgcc-ng=9.1.0
18 |   - libstdcxx-ng=9.1.0
19 |   - libxml2=2.9.9
20 |   - lp_solve=5.5.2.5
21 |   - openblas=0.3.6
22 |   - make=4.2.1
23 |   - bwa=0.7.17
24 |   - minimap2=2.17
25 |   - mummer4
26 |   - sambamba=0.7.1  # FIX github.com/biod/sambamba/issues/393 - DO NOT DOWNGRADE
27 |   - ldc=1.13.0=h02c9852_1  # FIX https://github.com/bcbio/bcbio-nextgen/issues/3032 - DO NOT UP- or DOWNGRADE
28 |   - htslib=1.9
29 |   - libdeflate=1.3  # v1.0 may trigger pysam import error - DO NOT DOWNGRADE
30 |   - pysam=0.15.3
31 |   - samtools=1.9
32 |   - bamtools=2.5.1
33 |   - bedtools=2.29.0
34 |   - bedops=2.4.37
35 |   - bcftools=1.9
36 |   - fastqc=0.11.8
37 |   - freebayes=1.3.1
38 |   - longshot=0.4.0
39 |   - wtdbg=2.5
40 |   - flye=2.7
41 |   - canu=2.0
42 |   - racon=1.4.10
43 |   - lighter=1.1.2
44 |   - bcalm=2.2.2
45 |   - dnaio=0.4.2
46 |   - cutadapt=2.10
47 |   - trim-galore=0.6.5
48 | #  - bifrost=1.0.3
49 |   - graphaligner=1.0.11
50 |   - ucsc-bedgraphtobigwig=377
51 |   - pigz=2.4
52 |   - pip:
53 |     - git+https://bitbucket.org/whatshap/whatshap@a3f8c91
54 | 


--------------------------------------------------------------------------------
/smk_config/demo/na12878.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA12878:
 3 |   individual: NA12878
 4 |   sex: female
 5 |   super_population: EUR
 6 |   population: CEU
 7 |   family: 1463
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA12878_demo_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: complete
15 |     - strandseq:
16 |         readset: NA12878_demo_il25k-100pe_sseq
17 |         library_fractions: one
18 | 
19 | 
20 | sample_targets_NA12878:
21 |   - aliases:
22 |       1: &ccs_reads NA12878_demo_pbsq2-ccs_1000
23 |       2: &sseq_reads NA12878_demo_il25k-100pe_sseq
24 |   - defaults:
25 |       hap_reads: *ccs_reads
26 |       sseq_reads: *sseq_reads
27 |       vc_reads: *ccs_reads
28 |       pol_reads: *ccs_reads
29 |       hap_assm_mode: split
30 |       hap:
31 |         - h1-un
32 |         - h2-un
33 |   - target:
34 |       nhr_assembler: flye
35 |       hap_assembler: flye
36 |       var_caller: freebayes
37 |       pol_pass: racon-p2
38 | 
39 | 
40 | data_source_NA12878_demo:
41 |   output: 'na12878_demo_local.json'
42 |   server: 'localhost'
43 |   data_source: '../demo_data/'
44 |   collect_files:
45 |     - 'fastq.gz'
46 |   sort_into:
47 |     - 'fastq'
48 |   assume_correct_filenames: True
49 | 
50 | force_local_copy: False
51 | 
52 | link_data_input:
53 |   - '../demo_data/NA12878_demo_reference.fasta'
54 | 
55 | link_data_output:
56 |   - 'references/assemblies/NA12878_demo_reference.fasta'


--------------------------------------------------------------------------------
/smk_include/link_data_sources.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | CONFIG_FORCE_LOCAL_COPY = bool(config.get('force_local_copy', False))
 3 | 
 4 | if not CONFIG_FORCE_LOCAL_COPY:
 5 |     # symlinking is cheap, so the rule is declared local; making copies can be
 6 |     # I/O intensive and should not run on a cluster submit node
 7 |     localrules: master_link_data_sources
 8 | 
 9 | 
10 | rule master_link_data_sources:
11 |     """
12 |     Inject external data into the pipeline: each input file is symlinked to
13 |     its output path, or copied if 'force_local_copy' is set in the config
14 |     """
15 |     input:
16 |         ancient([os.path.abspath(fp) for fp in config.get('link_data_input', [])])
17 |     output:
18 |         config.get('link_data_output', [])
19 |     run:
20 |         input_files = list(input)
21 |         output_links = list(output)
22 | 
23 |         if len(input_files) != len(output_links):
24 |             raise RuntimeError('Cannot inject data via sym linking, no 1-to-1 correspondence '
25 |                                'between input and output: {} vs {}'.format(len(input_files), len(output_links)))
26 | 
27 |         import os
28 |         import shutil
29 |         for input_file, output_link in zip(input_files, output_links):  # inputs and outputs are paired by position
30 |             assert os.path.isfile(input_file), 'Invalid path to input file for linking/copying: {}'.format(input_file)
31 |             os.makedirs(os.path.dirname(output_link), exist_ok=True)
32 |             if CONFIG_FORCE_LOCAL_COPY:
33 |                 shutil.copy(input_file, output_link)
34 |             else:
35 |                 os.symlink(input_file, output_link)


--------------------------------------------------------------------------------
/smk_config/samples/na12878.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA12878:
 3 |   individual: NA12878
 4 |   sex: female
 5 |   super_population: EUR
 6 |   population: CEU
 7 |   family: 1463
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA12878_giab_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |         source_type: ena
16 |         bioproject: PRJNA540705
17 |     - strandseq:
18 |         readset: &sseq_reads NA12878_eriba_il25k-100pe_sseq
19 |         source_type: ena
20 |         bioproject: PRJEB14185
21 |         library_fractions: one
22 |     - short_reads:
23 |         readset: NA12878_ptg_il2k-100pe_short
24 |         source_type: ena
25 |         bioproject: PRJEB3381
26 |         load_type: complete
27 |     - short_reads:
28 |         readset: NA12878_1kg_ilnvs-150pe_short
29 |         source_type: ena
30 |         bioproject: PRJEB31736
31 |         load_type: complete
32 |         comment: "2504 cohort"
33 | 
34 | 
35 | sample_targets_NA12878:
36 |   - aliases:
37 |       1: &ccs_reads NA12878_giab_pbsq2-ccs_1000
38 |   - defaults:
39 |       hap_reads: *ccs_reads
40 |       vc_reads: *ccs_reads
41 |       sseq_reads: *sseq_reads
42 |       pol_reads: *ccs_reads
43 |       pol_pass: racon-p2
44 |       hap_assm_mode: split
45 |       hap:
46 |         - h1-un
47 |         - h2-un
48 |   - target:
49 |       nhr_assembler: pereg
50 |       hap_assembler: pereg
51 |       var_caller: deepvar
52 | 


--------------------------------------------------------------------------------
/scripts/dev/ref_phasing/prep_ref.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import argparse
 5 | import re
 6 | import io
 7 | 
 8 | 
 9 | def parse_args():
10 |     parser = argparse.ArgumentParser()
11 |     parser.add_argument('--fasta-in', '-i', dest='input', type=str)
12 |     parser.add_argument('--fasta-out', '-o', dest='output', type=str)
13 |     parser.add_argument('--chromosomes', '-c', dest='chrom', default='"^chr[0-9]+$"')
14 |     args = parser.parse_args()
15 |     return args
16 | 
17 | 
18 | def main():
19 |     args = parse_args()
20 | 
21 |     chrom_match = re.compile(args.chrom.strip('"'))
22 | 
23 |     out_buffer = io.StringIO()
24 | 
25 |     collected_chroms = []
26 | 
27 |     collect = False
28 |     with open(args.input, 'r') as fasta:
29 |         for line in fasta:
30 |             if line.startswith('>'):
31 |                 chrom = chrom_match.match(line.strip().strip('>'))
32 |                 if chrom is None:
33 |                     collect = False
34 |                     continue
35 |                 collected_chroms.append(chrom.group(0))
36 |                 out_buffer.write(line)
37 |                 collect = True
38 |                 continue
39 |             elif collect:
40 |                 out_buffer.write(line)
41 |             else:
42 |                 continue
43 | 
44 |     out_path = os.path.dirname(args.output)
45 |     os.makedirs(out_path, exist_ok=True)
46 |     with open(args.output, 'w') as dump:
47 |         _ = dump.write(out_buffer.getvalue())
48 | 
49 |     print(sorted(collected_chroms))
50 |     return 0
51 | 
52 | if __name__ == '__main__':
53 |     main()  # NOTE: the return value of main() is not passed on as exit status


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/ACB/hg02011.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG02011:
 3 |   individual: HG02011
 4 |   sex: male
 5 |   super_population: AFR
 6 |   population: ACB
 7 |   family: BB13
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG02011_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG02011_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG02011_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG02011:
28 |   - aliases:
29 |       2: &clr_hgsvc HG02011_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_hgsvc
32 |       vc_reads: *clr_hgsvc
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_hgsvc
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - defaults:
45 |       hap_reads: *clr_hgsvc
46 |       vc_reads: *clr_hgsvc
47 |       sseq_reads: *sseq_reads
48 |       pol_reads: *clr_hgsvc
49 |       pol_pass: arrow-p1
50 |       hap_assm_mode: split
51 |       hap:
52 |         - h1-un
53 |         - h2-un
54 |   - target:
55 |       nhr_assembler: hhu26
56 |       hap_assembler: flye
57 |       var_caller: longshot
58 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/ASW/na19983.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA19983:
 3 |   individual: NA19983
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: ASW
 7 |   family: 2436
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA19983_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads NA19983_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads NA19983_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_NA19983:
28 |   - aliases:
29 |       2: &clr_hgsvc NA19983_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_hgsvc
32 |       vc_reads: *clr_hgsvc
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_hgsvc
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - defaults:
45 |       hap_reads: *clr_hgsvc
46 |       vc_reads: *clr_hgsvc
47 |       sseq_reads: *sseq_reads
48 |       pol_reads: *clr_hgsvc
49 |       pol_pass: arrow-p1
50 |       hap_assm_mode: split
51 |       hap:
52 |         - h1-un
53 |         - h2-un
54 |   - target:
55 |       nhr_assembler: hhu26
56 |       hap_assembler: flye
57 |       var_caller: longshot
58 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/ESN/hg03371.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03371:
 3 |   individual: HG03371
 4 |   sex: male
 5 |   super_population: AFR
 6 |   population: ESN
 7 |   family: NG98
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03371_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03371_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03371_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG03371:
28 |   - aliases:
29 |       2: &clr_hgsvc HG03371_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_hgsvc
32 |       vc_reads: *clr_hgsvc
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_hgsvc
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - defaults:
45 |       hap_reads: *clr_hgsvc
46 |       vc_reads: *clr_hgsvc
47 |       sseq_reads: *sseq_reads
48 |       pol_reads: *clr_hgsvc
49 |       pol_pass: arrow-p1
50 |       hap_assm_mode: split
51 |       hap:
52 |         - h1-un
53 |         - h2-un
54 |   - target:
55 |       nhr_assembler: hhu26
56 |       hap_assembler: flye
57 |       var_caller: longshot
58 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/GWD/hg02587.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG02587:
 3 |   individual: HG02587
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: GWD
 7 |   family: GB24
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG02587_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG02587_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG02587_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG02587:
28 |   - aliases:
29 |       2: &clr_hgsvc HG02587_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_hgsvc
32 |       vc_reads: *clr_hgsvc
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_hgsvc
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - defaults:
45 |       hap_reads: *clr_hgsvc
46 |       vc_reads: *clr_hgsvc
47 |       sseq_reads: *sseq_reads
48 |       pol_reads: *clr_hgsvc
49 |       pol_pass: arrow-p1
50 |       hap_assm_mode: split
51 |       hap:
52 |         - h1-un
53 |         - h2-un
54 |   - target:
55 |       nhr_assembler: hhu26
56 |       hap_assembler: flye
57 |       var_caller: longshot
58 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/MSL/hg03065.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG03065:
 3 |   individual: HG03065
 4 |   sex: male
 5 |   super_population: AFR
 6 |   population: MSL
 7 |   family: SL05
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG03065_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG03065_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG03065_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG03065:
28 |   - aliases:
29 |       2: &clr_hgsvc HG03065_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_hgsvc
32 |       vc_reads: *clr_hgsvc
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_hgsvc
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - defaults:
45 |       hap_reads: *clr_hgsvc
46 |       vc_reads: *clr_hgsvc
47 |       sseq_reads: *sseq_reads
48 |       pol_reads: *clr_hgsvc
49 |       pol_pass: arrow-p1
50 |       hap_assm_mode: split
51 |       hap:
52 |         - h1-un
53 |         - h2-un
54 |   - target:
55 |       nhr_assembler: hhu26
56 |       hap_assembler: flye
57 |       var_caller: longshot
58 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AMR/CLM/hg01114.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG01114:
 3 |   individual: HG01114
 4 |   sex: female
 5 |   super_population: AMR
 6 |   population: CLM
 7 |   family: CLM03
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG01114_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |     - strandseq:
16 |         readset: &sseq_reads HG01114_hgsvc_ilnxs-80pe_sseq
17 |         source_type: ftp
18 |         library_fractions: one
19 |     - short_reads:
20 |         readset: &short_reads HG01114_1kg_ilnvs-150pe_short
21 |         source_type: ena
22 |         bioproject: PRJEB36890
23 |         load_type: complete
24 |         comment: "698 cohort"
25 | 
26 | 
27 | sample_targets_HG01114:
28 |   - aliases:
29 |       2: &clr_hgsvc HG01114_hgsvc_pbsq2-clr_1000
30 |   - defaults:
31 |       hap_reads: *clr_hgsvc
32 |       vc_reads: *clr_hgsvc
33 |       sseq_reads: *sseq_reads
34 |       pol_reads: *clr_hgsvc
35 |       pol_pass: arrow-p1
36 |       hap_assm_mode: split
37 |       hap:
38 |         - h1-un
39 |         - h2-un
40 |   - target:
41 |       nhr_assembler: flye
42 |       hap_assembler: flye
43 |       var_caller: longshot
44 |   - defaults:
45 |       hap_reads: *clr_hgsvc
46 |       vc_reads: *clr_hgsvc
47 |       sseq_reads: *sseq_reads
48 |       pol_reads: *clr_hgsvc
49 |       pol_pass: arrow-p1
50 |       hap_assm_mode: split
51 |       hap:
52 |         - h1-un
53 |         - h2-un
54 |   - target:
55 |       nhr_assembler: jax27
56 |       hap_assembler: flye
57 |       var_caller: longshot
58 | 


--------------------------------------------------------------------------------
/environment/conda/conda_evaltools.yml:
--------------------------------------------------------------------------------
 1 | name: evaltools
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 | dependencies:
 6 |   - Python=3.6.*
 7 |   - pip=19.2.3
 8 |   - openssl=1.0.2t
 9 |   - gxx_impl_linux-64=7.3.0
10 |   - gxx_linux-64=7.3.0
11 |   - ld_impl_linux-64=2.33.1
12 |   - libblas=3.8.0
13 |   - libcblas=3.8.0
14 |   - liblapack=3.8.0
15 |   - libopenblas=0.3.6
16 |   - libgfortran-ng=7.3.0
17 |   - libgcc-ng=9.1.0
18 |   - libstdcxx-ng=9.1.0
19 |   - libxml2=2.9.9
20 |   - lp_solve=5.5.2.5
21 |   - openblas=0.3.6
22 |   - make=4.2.1
23 |   - bwa=0.7.17
24 |   - minimap2=2.17
25 |   - mummer4
26 |   - sambamba=0.7.1  # FIX https://github.com/biod/sambamba/issues/393 - DO NOT DOWNGRADE
27 |   - ldc=1.13.0=h02c9852_1  # FIX https://github.com/bcbio/bcbio-nextgen/issues/3032 - DO NOT UP- or DOWNGRADE
28 |   - htslib=1.9
29 |   - libdeflate=1.3  # v1.0 may trigger pysam import error - DO NOT DOWNGRADE
30 |   - pysam=0.15.3
31 |   - samtools=1.9
32 |   - bamtools=2.5.1
33 |   - bedtools=2.29.2  # FIX https://github.com/arq5x/bedtools2/issues/779 - DO NOT DOWNGRADE
34 |   - bedops=2.4.37
35 |   - bcftools=1.9
36 |   - fastqc=0.11.8
37 |   - freebayes=1.3.1
38 |   - longshot=0.4.0
39 |   - wtdbg=2.5
40 |   - flye=2.7
41 |   - canu=2.0
42 |   - racon=1.4.10
43 |   - lighter=1.1.2
44 |   - bcalm=2.2.2
45 |   - dnaio=0.4.2
46 |   - cutadapt=2.10
47 |   - trim-galore=0.6.5
48 |   - graphaligner=1.0.11
49 |   - ucsc-bedgraphtobigwig=377
50 |   - ucsc-bigwigaverageoverbed=377
51 |   - ucsc-bigwigcorrelate=377
52 |   - ucsc-bigwigmerge=377
53 |   - ucsc-bigwigcluster=377
54 |   - pigz=2.4
55 |   - pandas=1.0.5
56 |   - pytables=3.6.1
57 |   - intervaltree=3.0.2
58 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AMR/MXL/na19650.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA19650:
 3 |   individual: NA19650
 4 |   sex: male
 5 |   super_population: AMR
 6 |   population: MXL
 7 |   family: m001
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA19650_hgsvc_pbsq2-clr
12 |         technology: pacbio
13 |         data_type: pacbio_native
14 |         load_type: parts
15 |         comment: "seq_center:UMIGS"
16 |     - strandseq:
17 |         readset: &sseq_reads NA19650_hgsvc_ilnxs-80pe_sseq
18 |         source_type: ftp
19 |         library_fractions: one
20 |     - short_reads:
21 |         readset: &short_reads NA19650_1kg_ilnvs-150pe_short
22 |         source_type: ena
23 |         bioproject: PRJEB36890
24 |         load_type: complete
25 |         comment: "698 cohort"
26 | 
27 | 
28 | sample_targets_NA19650:
29 |   - aliases:
30 |       1: &clr_reads NA19650_hgsvc_pbsq2-clr_1000
31 |   - defaults:
32 |       hap_reads: *clr_reads
33 |       vc_reads: *clr_reads
34 |       sseq_reads: *sseq_reads
35 |       pol_reads: *clr_reads
36 |       pol_pass: arrow-p1
37 |       hap_assm_mode: split
38 |       hap:
39 |         - h1-un
40 |         - h2-un
41 |   - target:
42 |       nhr_assembler: flye
43 |       hap_assembler: flye
44 |       var_caller: longshot
45 |   - defaults:
46 |       hap_reads: *clr_reads
47 |       vc_reads: *clr_reads
48 |       sseq_reads: *sseq_reads
49 |       pol_reads: *clr_reads
50 |       pol_pass: arrow-p1
51 |       hap_assm_mode: split
52 |       hap:
53 |         - h1-un
54 |         - h2-un
55 |   - target:
56 |       nhr_assembler: uw27
57 |       hap_assembler: flye
58 |       var_caller: longshot
59 | 


--------------------------------------------------------------------------------
/annotation/sample_table.tsv:
--------------------------------------------------------------------------------
 1 | individual	sex	super_population	population	family	member	HiFi	CLR	2020_SKIP
 2 | HG02011	male	AFR	ACB	BB13	child	0	1	0
 3 | NA19983	female	AFR	ASW	2436	child	0	1	0
 4 | HG03125	female	AFR	ESN	NG34	child	1	0	0
 5 | HG03371	male	AFR	ESN	NG98	child	0	1	0
 6 | HG02587	female	AFR	GWD	GB24	child	0	1	0
 7 | HG02818	female	AFR	GWD	GB66	child	1	0	0
 8 | NA19036	female	AFR	LWK	NA19036	unrelated	0	1	1
 9 | HG03065	male	AFR	MSL	SL05	child	0	1	0
10 | HG03486	female	AFR	MSL	SL61	child	1	0	0
11 | NA19238	female	AFR	YRI	Y117	parent	1	1	0
12 | NA19239	male	AFR	YRI	Y117	parent	1	1	0
13 | NA19240	female	AFR	YRI	Y117	child	1	1	0
14 | HG01114	female	AMR	CLM	CLM03	child	0	1	0
15 | NA19650	male	AMR	MXL	m001	child	0	1	0
16 | HG01573	female	AMR	PEL	PEL003	child	0	1	1
17 | HG00731	male	AMR	PUR	PR05	parent	1	1	0
18 | HG00732	female	AMR	PUR	PR05	parent	1	1	0
19 | HG00733	female	AMR	PUR	PR05	child	1	1	0
20 | HG00864	female	EAS	CDX	HG00864	unrelated	0	1	0
21 | NA18534	male	EAS	CHB	NA18534	unrelated	0	1	0
22 | HG00512	male	EAS	CHS	SH032	parent	1	1	0
23 | HG00513	female	EAS	CHS	SH032	parent	1	1	0
24 | HG00514	female	EAS	CHS	SH032	child	1	1	0
25 | NA18939	female	EAS	JPT	NA18939	unrelated	0	1	0
26 | HG01596	male	EAS	KHV	VN002	unrelated	0	1	0
27 | HG02018	female	EAS	KHV	VN047	child	0	1	1
28 | NA24385	male	EUR	ASK	3140	child	1	0	0
29 | NA12329	female	EUR	CEU	1328	child	0	1	0
30 | NA12878	female	EUR	CEU	1463	child	1	0	0
31 | HG00171	female	EUR	FIN	HG00171	unrelated	0	1	0
32 | HG00096	male	EUR	GBR	HG00096	unrelated	0	1	0
33 | HG01505	male	EUR	IBS	IBS002	child	0	1	0
34 | NA20509	male	EUR	TSI	NA20509	unrelated	0	1	0
35 | HG03009	male	SAS	BEB	HG03009	unrelated	0	1	0
36 | NA20847	female	SAS	GIH	NA20847	unrelated	0	1	0
37 | HG03721	female	SAS	ITU	IT003	parent	0	1	1
38 | HG03732	male	SAS	ITU	IT003	child	0	1	0
39 | HG02492	male	SAS	PJL	PK06	child	0	1	0
40 | HG03683	female	SAS	STU	ST012	child	0	1	0
41 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # PyCharm idea folder
107 | .idea/
108 | 
109 | # Cached data
110 | cache*
111 | .cache*


--------------------------------------------------------------------------------
/docs/demo.md:
--------------------------------------------------------------------------------
 1 | # DEMO
 2 | 
 3 | ## Running the pipeline demo data
 4 | 
 5 | The following instructions assume that you have read the [tutorial](tutorial.md)
 6 | at least up to the point "*Snakemake execution environment*". Your working directory
 7 | should thus look as follows:
 8 | 
 9 | ```bash
10 | /work_dir$ ls -1
11 | project-diploid-assembly/
12 | smk_env/
13 | ```
14 | 
15 | Please download the [demo data (DOI: 10.5281/zenodo.3746293)](https://doi.org/10.5281/zenodo.3746293)
16 | into your working directory (~ 6.2 GB), and extract the gzipped tar:
17 | 
18 | ```bash
19 | /work_dir$ tar xzvf pipeline_demo.tar.gz
20 | ```
21 | 
22 | After this operation, your working directory should look like this:
23 | 
24 | ```bash
25 | /work_dir$ ls -1
26 | demo_data/
27 | pipeline_demo.tar.gz
28 | project-diploid-assembly/
29 | smk_env/
30 | ```
31 | 
32 | The pipeline repository contains a Snakemake *profile* that specifies a compute environment
33 | with **24 CPU cores** and **64 GB of main memory**. You can either use the Snakemake *profile* and the
34 | pipeline run environment configuration that are shipped with the pipeline code in the repository,
35 | or you can use your own based on the information given in the [tutorial](tutorial.md).
36 | In both cases, please proceed to the instructions on how to [execute the pipeline](execute.md).
37 | 
38 | ## How to interpret the results of the demo
39 | 
40 | In all brevity, just don't. The demo data is a heavily downsampled version of a publicly
41 | available PacBio Sequel-2 HiFi/CCS dataset retrieved from EBI/ENA (PRJNA540705),
42 | and of the respective Strand-seq data (PRJEB14185). The objective was to create a dataset
43 | that could be processed from start to finish with moderate resources and within a reasonable
44 | amount of time (less than 24 hours). A successful run of the demo data is a "proof of function"
45 | for the pipeline, but it does not generate "biologically interesting" results.


--------------------------------------------------------------------------------
/environment/snakemake/cluster/denbi_tu_pbs/denbi_cluster.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "__default__": {
 3 |     "cores": "{threads}",
 4 |     "mem_mb": "{resources.mem_total_mb}",
 5 |     "name": "{jobid}_{rule}",
 6 |     "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout",
 7 |     "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr",
 8 |     "run_hrs": "{resources.runtime_hrs}",
 9 |     "run_min": "{resources.runtime_min}"
10 |   },
11 |   "handle_partial_fastq_download_request": {
12 |     "cores": "36",
13 |     "mem_mb": "{resources.mem_total_mb}",
14 |     "name": "{jobid}_{rule}",
15 |     "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout",
16 |     "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr",
17 |     "run_hrs": "{resources.runtime_hrs}",
18 |     "run_min": "{resources.runtime_min}"
19 |   },
20 |   "merge_fastq_input_parts": {
21 |     "cores": "36",
22 |     "mem_mb": "{resources.mem_total_mb}",
23 |     "name": "{jobid}_{rule}",
24 |     "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout",
25 |     "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr",
26 |     "run_hrs": "{resources.runtime_hrs}",
27 |     "run_min": "{resources.runtime_min}"
28 |   },
29 |   "strandseq_dga_split_merge_tag_groups": {
30 |     "cores": "18",
31 |     "mem_mb": "{resources.mem_total_mb}",
32 |     "name": "{jobid}_{rule}",
33 |     "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout",
34 |     "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr",
35 |     "run_hrs": "{resources.runtime_hrs}",
36 |     "run_min": "{resources.runtime_min}"
37 |   },
38 |   "strandseq_dga_split_haplo_tagging": {
39 |     "cores": "36",
40 |     "mem_mb": "{resources.mem_total_mb}",
41 |     "name": "{jobid}_{rule}",
42 |     "output": "log/cluster_jobs/stdout/{jobid}_{rule}.stdout",
43 |     "error": "log/cluster_jobs/stderr/{jobid}_{rule}.stderr",
44 |     "run_hrs": "{resources.runtime_hrs}",
45 |     "run_min": "{resources.runtime_min}"
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/smk_include/results/run_eur_trios.smk:
--------------------------------------------------------------------------------
 1 | # Target rules for the EUR samples; all rules of this file are declared as local rules.
 2 | localrules: run_hg00171_individual,
 3 |             run_na20509_individual,
 4 |             run_hg00096_individual,
 5 |             run_eur_trios,
 6 |             run_ceu_trio,
 7 |             run_ceu_child,
 8 |             run_ibs_trio,
 9 |             run_ibs_child,
10 | 
11 | # Target rule: requests the .fofn target of individual HG00171 (EUR_FIN).
12 | rule run_hg00171_individual:
13 |     input:
14 |         'output/targets/EUR_FIN_HG00171/HG00171.fofn'
15 |     message: 'Running EUR-FIN-HG00171 individual'
16 | 
17 | #######################################################
18 | # Target rule: requests the .fofn target of individual NA20509 (EUR_TSI).
19 | rule run_na20509_individual:
20 |     input:
21 |         'output/targets/EUR_TSI_NA20509/NA20509.fofn'
22 |     message: 'Running EUR-TSI-NA20509 individual'
23 | 
24 | #######################################################
25 | # Target rule: requests the .fofn target of individual HG00096 (EUR_GBR).
26 | rule run_hg00096_individual:
27 |     input:
28 |         'output/targets/EUR_GBR_HG00096/HG00096.fofn'
29 |     message: 'Running EUR-GBR-HG00096 individual'
30 | 
31 | #######################################################
32 | # Target rule: requests the .fofn target of NA12329 (EUR_CEU, family 1328).
33 | rule run_ceu_child:
34 |     input:
35 |         'output/targets/EUR_CEU_1328/NA12329.fofn'
36 |     message: 'Running EUR-CEU-1328 child'
37 | # 'Trio' target: forwards only the input of run_ceu_child; no parent targets are listed.
38 | rule run_ceu_trio:
39 |     input:
40 |         rules.run_ceu_child.input,
41 |     message: 'Running EUR-CEU-1328 trio'
42 | 
43 | ########################################################
44 | # Target rule: requests the .fofn target of HG01505 (EUR_IBS, family IBS002).
45 | rule run_ibs_child:
46 |     input:
47 |         'output/targets/EUR_IBS_IBS002/HG01505.fofn'
48 |     message: 'Running EUR-IBS-IBS002 child'
49 | # 'Trio' target: forwards only the input of run_ibs_child; no parent targets are listed.
50 | rule run_ibs_trio:
51 |     input:
52 |         rules.run_ibs_child.input,
53 |     message: 'Running EUR-IBS-IBS002 trio'
54 | 
55 | ########################################################
56 | # Collects the inputs of all EUR rules: three individuals plus the CEU and IBS trio rules.
57 | rule run_eur_trios:
58 |     input:
59 |         rules.run_hg00171_individual.input,
60 |         rules.run_na20509_individual.input,
61 |         rules.run_hg00096_individual.input,
62 |         rules.run_ceu_trio.input,
63 |         rules.run_ibs_trio.input,
64 |     message: 'Running EUR trios'
65 | 


--------------------------------------------------------------------------------
/annotation/NA24385_selected_libraries_sseq.csv:
--------------------------------------------------------------------------------
 1 | HG002x01PE20301
 2 | HG002x01PE20303
 3 | HG002x01PE20305
 4 | HG002x01PE20306
 5 | HG002x01PE20307
 6 | HG002x01PE20308
 7 | HG002x01PE20313
 8 | HG002x01PE20315
 9 | HG002x01PE20318
10 | HG002x01PE20319
11 | HG002x01PE20325
12 | HG002x01PE20327
13 | HG002x01PE20328
14 | HG002x01PE20329
15 | HG002x01PE20331
16 | HG002x01PE20332
17 | HG002x01PE20334
18 | HG002x01PE20335
19 | HG002x01PE20336
20 | HG002x01PE20337
21 | HG002x01PE20339
22 | HG002x01PE20340
23 | HG002x01PE20341
24 | HG002x01PE20342
25 | HG002x01PE20343
26 | HG002x01PE20345
27 | HG002x01PE20347
28 | HG002x01PE20350
29 | HG002x01PE20351
30 | HG002x01PE20352
31 | HG002x01PE20353
32 | HG002x01PE20355
33 | HG002x01PE20356
34 | HG002x01PE20357
35 | HG002x01PE20358
36 | HG002x01PE20359
37 | HG002x01PE20361
38 | HG002x01PE20362
39 | HG002x01PE20363
40 | HG002x01PE20364
41 | HG002x01PE20367
42 | HG002x01PE20368
43 | HG002x01PE20374
44 | HG002x01PE20376
45 | HG002x01PE20377
46 | HG002x01PE20378
47 | HG002x01PE20379
48 | HG002x01PE20381
49 | HG002x01PE20387
50 | HG002x01PE20388
51 | HG002x01PE20389
52 | HG002x01PE20391
53 | HG002x01PE20392
54 | HG002x01PE20393
55 | HG002x02PE20403
56 | HG002x02PE20405
57 | HG002x02PE20407
58 | HG002x02PE20414
59 | HG002x02PE20416
60 | HG002x02PE20417
61 | HG002x02PE20419
62 | HG002x02PE20421
63 | HG002x02PE20422
64 | HG002x02PE20424
65 | HG002x02PE20425
66 | HG002x02PE20428
67 | HG002x02PE20430
68 | HG002x02PE20431
69 | HG002x02PE20432
70 | HG002x02PE20434
71 | HG002x02PE20435
72 | HG002x02PE20439
73 | HG002x02PE20440
74 | HG002x02PE20443
75 | HG002x02PE20445
76 | HG002x02PE20450
77 | HG002x02PE20452
78 | HG002x02PE20454
79 | HG002x02PE20456
80 | HG002x02PE20458
81 | HG002x02PE20464
82 | HG002x02PE20466
83 | HG002x02PE20467
84 | HG002x02PE20469
85 | HG002x02PE20473
86 | HG002x02PE20479
87 | HG002x02PE20483
88 | HG002x02PE20484
89 | HG002x02PE20485
90 | HG002x02PE20486
91 | HG002x02PE20487
92 | HG002x02PE20488
93 | HG002x02PE20490
94 | HG002x02PE20491
95 | HG002x02PE20492
96 | HG002x02PE20494
97 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/YRI/na19240.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA19240:
 3 |   individual: NA19240
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: YRI
 7 |   family: Y117
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA19240_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: NA19240_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |     - strandseq:
21 |         readset: &sseq_reads NA19240_1kg_il25k-npe_sseq
22 |         source_type: ena
23 |         bioproject: PRJEB12849
24 |         library_fractions: two
25 |     - short_reads:
26 |         readset: NA19240_1kg_il25k-125pe_short
27 |         source_type: ena
28 |         bioproject: PRJEB9396
29 |         load_type: parts
30 |     - short_reads:
31 |         readset: NA19240_1kg_ilnvs-150pe_short
32 |         source_type: ena
33 |         bioproject: PRJEB36890
34 |         load_type: complete
35 |         comment: "698 cohort"
36 | 
37 | 
38 | sample_targets_NA19240:
39 |   - aliases:
40 |       1: &ccs_reads NA19240_hgsvc_pbsq2-ccs_1000
41 |       2: &clr_reads NA19240_hgsvc_pbsq2-clr_1000
42 |   - defaults:
43 |       hap_reads: *ccs_reads
44 |       vc_reads: *ccs_reads
45 |       sseq_reads: *sseq_reads
46 |       pol_reads: *ccs_reads
47 |       pol_pass: racon-p2
48 |       hap_assm_mode: split
49 |       hap:
50 |         - h1-un
51 |         - h2-un
52 |   - target:
53 |       nhr_assembler: pereg
54 |       hap_assembler: pereg
55 |       var_caller: deepvar
56 |   - defaults:
57 |       hap_reads: *clr_reads
58 |       vc_reads: *clr_reads
59 |       sseq_reads: *sseq_reads
60 |       pol_reads: *clr_reads
61 |       pol_pass: arrow-p1
62 |       hap_assm_mode: split
63 |       hap:
64 |         - h1-un
65 |         - h2-un
66 |   - target:
67 |       nhr_assembler: uw27
68 |       hap_assembler: flye
69 |       var_caller: longshot


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Project repository: Phased Genome Assembly using Strand-seq (PGAS)
 2 | 
 3 | ## Citation
 4 | 
 5 | If you use this pipeline or extract and reuse original code/rules from this repository,
 6 | please cite the following two papers:
 7 | 
 8 | > Porubsky and Ebert et al.  
 9 | > "Fully Phased Human Genome Assembly without Parental Data Using Single-Cell Strand Sequencing and Long Reads."  
10 | > Nature Biotechnology, December 2020  
11 | > [DOI: 10.1038/s41587-020-0719-5](https://doi.org/10.1038/s41587-020-0719-5)
12 | 
13 | > Ebert, Audano, Zhu and Rodriguez-Martin et al.  
14 | > "Haplotype-resolved diverse human genomes and integrated analysis of structural variation"  
15 | > Science, February 2021  
16 | > [DOI: 10.1126/science.abf7117](https://doi.org/10.1126/science.abf7117)
17 | 
18 | #### Deprecated citations
19 | 
20 | Please do not reference the preprints ([10.1101/855049](https://doi.org/10.1101/855049) and [10.1101/2020.12.16.423102](https://doi.org/10.1101/2020.12.16.423102)) anymore.
21 | 
22 | ## Scope of this repository
23 | 
24 | This repository contains the Snakemake pipeline code plus some auxiliary scripts to go from raw
25 | input data to polished haploid assemblies. Any self-contained, general purpose software tool used in
26 | the pipeline is either available via conda/bioconda, or via github. In any case, the pipeline
27 | implementation covers the entire software setup required for a complete pipeline run. 
28 | 
29 | In particular, the code for the `SaaRclust`, `StrandPhaseR` and `breakpointR` R packages is
30 | available in [David Porubsky's GitHub account](https://github.com/daewoooo).
31 | 
32 | ## Documentation
33 | 
34 | There are several step-by-step manuals available that describe all use cases currently supported
35 | for this pipeline. First-time users should start by reading the [tutorial](docs/tutorial.md).
36 | If you encounter any problems or "strange behaviour" during pipeline execution, please check
37 | the [FAQ](docs/faq.md) for explanations and solutions. If this does not help, please open a
38 | [github issue](https://guides.github.com/features/issues).


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/YRI/na19238.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA19238:
 3 |   individual: NA19238
 4 |   sex: female
 5 |   super_population: AFR
 6 |   population: YRI
 7 |   family: Y117
 8 |   member: parent
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA19238_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: NA19238_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |     - strandseq:
21 |         readset: &sseq_reads NA19238_1kg_il25k-npe_sseq
22 |         source_type: ena
23 |         bioproject: PRJEB12849
24 |         library_fractions: two
25 |     - short_reads:
26 |         readset: NA19238_1kg_il25k-125pe_short
27 |         source_type: ena
28 |         bioproject: PRJEB9396
29 |         load_type: parts
30 |     - short_reads:
31 |         readset: NA19238_1kg_ilnvs-150pe_short
32 |         source_type: ena
33 |         bioproject: PRJEB31736
34 |         load_type: complete
35 |         comment: "2504 cohort"
36 | 
37 | 
38 | sample_targets_NA19238:
39 |   - aliases:
40 |       1: &ccs_reads NA19238_hgsvc_pbsq2-ccs_1000
41 |       2: &clr_reads NA19238_hgsvc_pbsq2-clr_1000
42 |   - defaults:
43 |       hap_reads: *ccs_reads
44 |       vc_reads: *ccs_reads
45 |       sseq_reads: *sseq_reads
46 |       pol_reads: *ccs_reads
47 |       pol_pass: racon-p2
48 |       hap_assm_mode: split
49 |       hap:
50 |         - h1-un
51 |         - h2-un
52 |   - target:
53 |       nhr_assembler: pereg
54 |       hap_assembler: pereg
55 |       var_caller: deepvar
56 |   - target:
57 |       nhr_assembler: flye
58 |       hap_assembler: flye
59 |       var_caller: freebayes
60 |   - defaults:
61 |       hap_reads: *clr_reads
62 |       vc_reads: *clr_reads
63 |       sseq_reads: *sseq_reads
64 |       pol_reads: *clr_reads
65 |       pol_pass: arrow-p1
66 |       hap_assm_mode: split
67 |       hap:
68 |         - h1-un
69 |         - h2-un
70 |   - target:
71 |       nhr_assembler: jax27
72 |       hap_assembler: flye
73 |       var_caller: longshot
74 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AMR/PUR/hg00732.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00732:
 3 |   individual: HG00732
 4 |   sex: female
 5 |   super_population: AMR
 6 |   population: PUR
 7 |   family: PR05
 8 |   member: parent
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00732_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: HG00732_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |     - strandseq:
21 |         readset: &sseq_reads HG00732_1kg_il25k-npe_sseq
22 |         source_type: ena
23 |         bioproject: PRJEB12849
24 |         library_fractions: two
25 |     - short_reads:
26 |         readset: &short_reads HG00732_1kg_il25k-125pe_short
27 |         source_type: ena
28 |         bioproject: PRJEB9396
29 |         load_type: parts
30 |     - short_reads:
31 |         readset: HG00732_1kg_ilnvs-150pe_short
32 |         source_type: ena
33 |         bioproject: PRJEB31736
34 |         load_type: complete
35 |         comment: "2504 cohort"
36 | 
37 | 
38 | sample_targets_HG00732:
39 |   - aliases:
40 |       1: &ccs_reads HG00732_hgsvc_pbsq2-ccs_1000
41 |       2: &clr_reads HG00732_hgsvc_pbsq2-clr_1000
42 |   - defaults:
43 |       hap_reads: *ccs_reads
44 |       vc_reads: *ccs_reads
45 |       sseq_reads: *sseq_reads
46 |       pol_reads: *ccs_reads
47 |       pol_pass: racon-p2
48 |       hap_assm_mode: split
49 |       hap:
50 |         - h1-un
51 |         - h2-un
52 |   - target:
53 |       nhr_assembler: pereg
54 |       hap_assembler: pereg
55 |       var_caller: deepvar
56 |   - defaults:
57 |       hap_reads: *clr_reads
58 |       vc_reads: *clr_reads
59 |       sseq_reads: *sseq_reads
60 |       pol_reads: *clr_reads
61 |       pol_pass: arrow-p1
62 |       hap_assm_mode: split
63 |       hap:
64 |         - h1-un
65 |         - h2-un
66 |   - target:
67 |       nhr_assembler: flye
68 |       hap_assembler: flye
69 |       var_caller: longshot
70 |   - target:
71 |       nhr_assembler: uw27
72 |       hap_assembler: flye
73 |       var_caller: longshot
74 | 


--------------------------------------------------------------------------------
/scripts/dev/cluster_splitter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import io
 5 | import dnaio
 6 | 
 7 | infile = 'HG03486_hgsvc_pbsq2-ccs_1000_scV12-pereg.fasta.backup'
 8 | outfile = 'HG03486_hgsvc_pbsq2-ccs_1000_scV12-pereg.fasta'
 9 | 
10 | target_cluster = 'cluster1'
11 | 
12 | head_buffer = io.StringIO()
13 | tail_buffer = io.StringIO()
14 | 
15 | current_buffer = head_buffer
16 | 
17 | cluster_buffer = io.StringIO()
18 | 
19 | with open(infile, 'r') as fasta:
20 |     for line in fasta:
21 |         if line.startswith('>'):
22 |             if line.strip() == '>{}'.format(target_cluster):
23 |                 current_buffer = cluster_buffer
24 |                 continue
25 |             elif cluster_buffer.tell() > 0:
26 |                 current_buffer = tail_buffer
27 |             else:
28 |                 current_buffer = head_buffer
29 |         current_buffer.write(line)
30 | 
31 | splitter = 'N' * 100
32 | 
33 | cluster_seq = cluster_buffer.getvalue().replace('\n', '').split(splitter)
34 | print('num contigs: {}'.format(len(cluster_seq)))
35 | seq_sizes = [len(s) for s in cluster_seq]
36 | print(seq_sizes)
37 | 
38 | print(head_buffer.tell())
39 | print(tail_buffer.tell())
40 | 
41 | cluster_buffer = []
42 | 
43 | suffices = ['A', 'B', 'C']
44 | suffix_idx = 0
45 | 
46 | with dnaio.FastaWriter(outfile, line_length=80) as fasta:
47 |     current_block = 0
48 |     for seq_size, seq in zip(seq_sizes, cluster_seq):
49 |         current_block += seq_size
50 |         cluster_buffer.append(seq)
51 |         if current_block > 150e6:
52 |             print('Writing block size: {}'.format(current_block))
53 |             block_name = target_cluster + suffices[suffix_idx]
54 |             fasta.write(block_name, splitter.join(cluster_buffer))
55 |             cluster_buffer = []
56 |             current_block = 0
57 |             suffix_idx += 1
58 |         
59 | 
60 |     print('Writing block size: {}'.format(current_block))
61 |     block_name = target_cluster + suffices[suffix_idx]
62 |     fasta.write(block_name, splitter.join(cluster_buffer))
63 |     
64 | with open(outfile, 'a') as fasta:
65 |     _ = fasta.write(tail_buffer.getvalue())


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/CHS/hg00514.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00514:
 3 |   individual: HG00514
 4 |   sex: female
 5 |   super_population: EAS
 6 |   population: CHS
 7 |   family: SH032
 8 |   member: child
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00514_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: HG00514_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |         comment: "seq_center:UMIGS"
21 |     - strandseq:
22 |         readset: &sseq_reads HG00514_1kg_il25k-npe_sseq
23 |         source_type: ena
24 |         bioproject: PRJEB12849
25 |         library_fractions: two
26 |     - short_reads:
27 |         readset: HG00514_1kg_il25k-125pe_short
28 |         source_type: ena
29 |         bioproject: PRJEB9396
30 |         load_type: parts
31 |     - short_reads:
32 |         readset: HG00514_1kg_ilnvs-150pe_short
33 |         source_type: ena
34 |         bioproject: PRJEB36890
35 |         load_type: complete
36 |         comment: "698 cohort"
37 | 
38 | 
39 | sample_targets_HG00514:
40 |   - aliases:
41 |       1: &ccs_reads HG00514_hgsvc_pbsq2-ccs_1000
42 |       3: &clr_hgsvc HG00514_hgsvc_pbsq2-clr_1000
43 |   - defaults:
44 |       hap_reads: *ccs_reads
45 |       vc_reads: *ccs_reads
46 |       sseq_reads: *sseq_reads
47 |       pol_reads: *ccs_reads
48 |       pol_pass: racon-p2
49 |       hap_assm_mode: split
50 |       hap:
51 |         - h1-un
52 |         - h2-un
53 |   - target:
54 |       nhr_assembler: pereg
55 |       hap_assembler: pereg
56 |       var_caller: deepvar
57 |   - target:
58 |       nhr_assembler: flye
59 |       hap_assembler: flye
60 |       var_caller: freebayes
61 |   - defaults:
62 |       hap_reads: *clr_hgsvc
63 |       vc_reads: *clr_hgsvc
64 |       sseq_reads: *sseq_reads
65 |       pol_reads: *clr_hgsvc
66 |       pol_pass: arrow-p1
67 |       hap_assm_mode: split
68 |       hap:
69 |         - h1-un
70 |         - h2-un
71 |   - target:
72 |       nhr_assembler: flye
73 |       hap_assembler: flye
74 |       var_caller: longshot


--------------------------------------------------------------------------------
/scripts/utilities/check_scripts/fasta_checker.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import os
 4 | import io
 5 | import argparse
 6 | import hashlib
 7 | 
 8 | import dnaio
 9 | 
def parse_args():
    """
    Parse the command line, which consists of exactly two positional
    string arguments: 'fasta1' and 'fasta2'.

    Returns the populated argparse namespace.
    """
    cli = argparse.ArgumentParser()
    for positional in ('fasta1', 'fasta2'):
        cli.add_argument(positional, type=str)
    return cli.parse_args()
16 | 
17 | 
def collect_contig_hashes(fasta_path):
    """
    Read all records of a sequence file and index them by the BLAKE2b
    digest of their (ASCII-encoded) sequence.

    Returns a dict mapping digest -> (record name, sequence length).
    If several records share the same sequence, only the last one read
    is kept in the dict; the names of the later duplicates are reported
    on stdout. The printed contig count is the number of distinct
    sequences.
    """
    hashed_contigs = {}
    repeated_names = []

    with dnaio.open(fasta_path) as records:
        for rec in records:
            sequence = rec.sequence
            digest = hashlib.blake2b(sequence.encode('ascii')).digest()
            if digest in hashed_contigs:
                # happens for flye assembler
                repeated_names.append(rec.name)
            hashed_contigs[digest] = (rec.name, len(sequence))

    print('assembled contigs {}'.format(len(hashed_contigs)))
    if repeated_names:
        print('duplicate contig sequences: {}'.format(sorted(repeated_names)))

    return hashed_contigs
38 | 
39 | 
def main():
    """
    Compare two sequence files by sequence identity.

    Sequences are matched via their hash digests. If the two files
    share at least one identical sequence, print (per file) the
    percentage of contigs and the percentage of total base pairs that
    is made up by shared sequences (labelled "HOM" in the output).
    Nothing is printed if no sequence is shared.

    Always returns 0.
    """
    args = parse_args()
    hap1 = collect_contig_hashes(args.fasta1)
    hap2 = collect_contig_hashes(args.fasta2)

    total_bp_h1 = sum(length for _, length in hap1.values())
    total_bp_h2 = sum(length for _, length in hap2.values())

    shared = set(hap1.keys()).intersection(hap2.keys())
    if not shared:
        return 0

    # total length of the shared sequences, looked up per file
    shared_bp_h1 = sum(hap1[digest][1] for digest in shared)
    shared_bp_h2 = sum(hap2[digest][1] for digest in shared)

    num_shared = len(shared)
    report = [
        ('HOM frac h1 #ctg', round(num_shared / len(hap1) * 100, 2)),
        ('HOM frac h1 pct. bp', round(shared_bp_h1 / total_bp_h1 * 100, 2)),
        ('HOM frac h2 #ctg', round(num_shared / len(hap2) * 100, 2)),
        ('HOM frac h2 pct. bp', round(shared_bp_h2 / total_bp_h2 * 100, 2)),
    ]
    for label, value in report:
        print(label, value)

    return 0
69 | 
70 | 
71 | if __name__ == '__main__':
72 |     main()
73 | 


--------------------------------------------------------------------------------
/smk_config/demo/params.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | # === Software setup settings ===
 3 | # Specify git commits for SaaRclust
 4 | # and StrandPhaseR setup
 5 | git_commit_saarclust: ba65b53
 6 | git_commit_strandphaser: e608407
 7 | git_commit_breakpointr: 268d99d
 8 | # arbitrarily tying a version number
 9 | # to the git commits to avoid additional
10 | # wildcards - increment this number when
11 | # git commits are changed!
12 | git_commit_version: 9
13 | 
14 | peregrine_version: 0.1.5.5
15 | deepvariant_version: 0.9.0
16 | shasta_version: 0.4.0
17 | 
18 | # Assembler settings
19 | shasta_target_coverage: 60  # tech-independent recommendation: cov between 40x and 80x
20 | flye_target_coverage: 50  # dev recommendation: ~30x, but we have enough RAM to go a bit higher
21 | 
22 | # SaaRclust parameter sets
23 | # goal is to obtain 24 clusters
24 | min_contig_size: 50000
25 | min_region_to_order: 500000
26 | bin_size: 100000
27 | step_size: 100000
28 | prob_threshold: 0.25
29 | init_clusters: 25
30 | desired_clusters: 10
31 | min_mapq: 60
32 | 
33 | # VARIANT CALLING
34 | # Postprocessing parameters
35 | filter_vcf_qual: 10
36 | filter_vcf_gq: 100
37 | 
38 | freebayes_timeout_sec: 3600
39 | 
40 | # not primary alignment || supplementary alignment
41 | bwa_strandseq_aln_discard: 2304
42 | 
43 | # read unmapped || not primary alignment || failed QC || PCR dup
44 | minimap_readref_aln_discard: 1796
45 | 
46 | # read unmapped || not primary alignment
47 | minimap_contigref_aln_discard: 260
48 | 
49 | # read unmapped || not primary alignment || failed QC || PCR dup
50 | minimap_racon_aln_discard: 1796  # same as 0x704
51 | minimap_racon_aln_min_qual: 10
52 | 
53 | # main chromosomes to be used
54 | # for known references for main
55 | # pipeline steps (i.e., everything
56 | # before evaluation)
57 | eval_known_ref: GRCh38_GCA_p13
58 | eval_align_ref: hg38_GCA_p13
59 | eval_gene_model: GRCh38_GENCODEv31_basic
60 | use_genome_size: NA12878_demo_reference
61 | main_chromosomes:
62 |   - chr1
63 |   - chr2
64 |   - chr3
65 |   - chr4
66 |   - chr5
67 |   - chr6
68 |   - chr7
69 |   - chr8
70 |   - chr9
71 |   - chr10
72 |   - chr11
73 |   - chr12
74 |   - chr13
75 |   - chr14
76 |   - chr15
77 |   - chr16
78 |   - chr17
79 |   - chr18
80 |   - chr19
81 |   - chr20
82 |   - chr21
83 |   - chr22
84 |   - chrX
85 |   - chrY
86 | 


--------------------------------------------------------------------------------
/smk_include/dev/run_all_eval.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | include: 'prep_custom_references.smk'
 3 | include: 'run_kmer_analysis.smk'
 4 | include: 'run_illumina_qv.smk'
 5 | include: 'run_tech_comparison.smk'
 6 | include: 'run_contig_remap.smk'
 7 | include: 'run_bng_hybrids.smk'
 8 | 
 9 | 
10 | localrules: master_eval
11 | 
12 | wildcard_constraints:
13 |     folder_path = '[A-Za-z0-9\-_\/]+',  # note: "." is NOT allowed in a folder path
14 |     file_name = '[A-Za-z0-9\-_\.]+',
15 |     known_ref = 'GRCh3[78][A-Za-z0-9_]+',
16 |     genemodel = 'GRCh38[A-Za-z0-9_]+'
17 | 
18 | 
def quast_busco_determine_targets(wildcards):
    """
    Rerun QUAST-LG with manually fixed BUSCO database
    to get BUSCO stats as requested
    ODB source:
    https://busco-data.ezlab.org/v4/data/lineages/eukaryota_odb10.2020-09-10.tar.gz

    NB: this requires a manual fix for QUAST/BUSCO, i.e. adding the above database
    to the correct path in QUAST

    Input function: lists all '*.fasta' files in
    'output/evaluation/phased_assemblies' and returns one QUAST report
    path (report.pdf) per file, sorted by file name. The 'wildcards'
    argument is not used. Raises the usual OSError of os.listdir() if
    the assembly folder does not exist.
    """

    genemodel = 'GRCh38_GENCODEv31_basic'
    refgenome = 'GRCh38_HGSVC2_noalt'
    folder_path = 'evaluation/phased_assemblies'
    fasta_ext = '.fasta'

    fixed_wildcards = {
        'known_ref': refgenome,
        'genemodel': genemodel,
        'folder_path': folder_path
    }

    # output/{folder_path}/{file_name}.fasta'
    # The doubled braces keep {file_name} as placeholder for the second format() call below
    target_path = 'output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{{file_name}}/report.pdf'.format(**fixed_wildcards)

    load_path = os.path.join('output', folder_path)

    phased_assemblies = sorted(f for f in os.listdir(load_path) if f.endswith(fasta_ext))

    compute_targets = []
    for ps_assm in phased_assemblies:
        # Remove the file extension by slicing: str.strip('.fasta') would remove
        # any of the characters ". f a s t" from BOTH ends of the name, and thus
        # mangle file names such as "sample_fast.fasta"
        file_name = ps_assm[:-len(fasta_ext)]
        compute_targets.append(target_path.format(**{'file_name': file_name}))

    return compute_targets
53 | 
54 | 
55 | rule master_quast_busco:
56 |     input:
57 |         quast_busco_determine_targets
58 | 
59 | 
60 | rule master_eval:
61 |     input:
62 |         tech_comparison_determine_targets,
63 |         kmer_analysis_determine_targets,
64 |         illumina_qv_determine_targets,
65 |         contig_remap_determine_targets,
66 |         bng_hybrids_determine_targets,
67 |         quast_busco_determine_targets


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/CHS/hg00513.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00513:
 3 |   individual: HG00513
 4 |   sex: female
 5 |   super_population: EAS
 6 |   population: CHS
 7 |   family: SH032
 8 |   member: parent
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00513_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: HG00513_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |         comment: "seq_center:UMIGS"
21 |     - strandseq:
22 |         readset: &sseq_reads HG00513_1kg_il25k-npe_sseq
23 |         source_type: ena
24 |         bioproject: PRJEB12849
25 |         library_fractions: two
26 |     - short_reads:
27 |         readset: HG00513_1kg_il25k-125pe_short
28 |         source_type: ena
29 |         bioproject: PRJEB9396
30 |         load_type: parts
31 |     - short_reads:
32 |         readset: HG00513_1kg_ilnvs-150pe_short
33 |         source_type: ena
34 |         bioproject: PRJEB31736
35 |         load_type: complete
36 |         comment: "2504 cohort"
37 | 
38 | 
39 | sample_targets_HG00513:
40 |   - aliases:
41 |       1: &ccs_reads HG00513_hgsvc_pbsq2-ccs_1000
42 |       2: &clr_reads HG00513_hgsvc_pbsq2-clr_1000
43 |   - defaults:
44 |       hap_reads: *ccs_reads
45 |       vc_reads: *ccs_reads
46 |       sseq_reads: *sseq_reads
47 |       pol_reads: *ccs_reads
48 |       pol_pass: racon-p2
49 |       hap_assm_mode: split
50 |       hap:
51 |         - h1-un
52 |         - h2-un
53 |   - target:
54 |       nhr_assembler: pereg
55 |       hap_assembler: pereg
56 |       var_caller: deepvar
57 |   - target:
58 |       nhr_assembler: flye
59 |       hap_assembler: flye
60 |       var_caller: freebayes
61 |   - defaults:
62 |       hap_reads: *clr_reads
63 |       vc_reads: *clr_reads
64 |       sseq_reads: *sseq_reads
65 |       pol_reads: *clr_reads
66 |       pol_pass: arrow-p1
67 |       hap_assm_mode: split
68 |       hap:
69 |         - h1-un
70 |         - h2-un
71 |   - target:
72 |       nhr_assembler: flye
73 |       hap_assembler: flye
74 |       var_caller: longshot
75 |   - target:
76 |       nhr_assembler: hhu26
77 |       hap_assembler: flye
78 |       var_caller: longshot
79 | 
80 | 


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/EAS/CHS/hg00512.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00512:
 3 |   individual: HG00512
 4 |   sex: male
 5 |   super_population: EAS
 6 |   population: CHS
 7 |   family: SH032
 8 |   member: parent
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00512_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: HG00512_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |         comment: "seq_center:UMIGS"
21 |     - strandseq:
22 |         readset: &sseq_reads HG00512_1kg_il25k-npe_sseq
23 |         source_type: ena
24 |         bioproject: PRJEB12849
25 |         library_fractions: two
26 |     - short_reads:
27 |         readset: &short_reads HG00512_1kg_il25k-125pe_short
28 |         source_type: ena
29 |         bioproject: PRJEB9396
30 |         load_type: parts
31 |     - short_reads:
32 |         readset: HG00512_1kg_ilnvs-150pe_short
33 |         source_type: ena
34 |         bioproject: PRJEB36890
35 |         load_type: complete
36 |         comment: "698 cohort"
37 | 
38 | sample_targets_HG00512:
39 |   - aliases:
40 |       1: &ccs_reads HG00512_hgsvc_pbsq2-ccs_1000
41 |       2: &clr_reads HG00512_hgsvc_pbsq2-clr_1000
42 |   - defaults:
43 |       hap_reads: *ccs_reads
44 |       vc_reads: *ccs_reads
45 |       sseq_reads: *sseq_reads
46 |       pol_reads: *ccs_reads
47 |       pol_pass: racon-p2
48 |       hap_assm_mode: split
49 |       hap:
50 |         - h1-un
51 |         - h2-un
52 |   - target:
53 |       nhr_assembler: pereg
54 |       hap_assembler: pereg
55 |       var_caller: deepvar
56 |   - target:
57 |       nhr_assembler: flye
58 |       hap_assembler: flye
59 |       var_caller: freebayes
60 |   - defaults:
61 |       hap_reads: *clr_reads
62 |       vc_reads: *clr_reads
63 |       sseq_reads: *sseq_reads
64 |       pol_reads: *clr_reads
65 |       pol_pass: arrow-p1
66 |       hap_assm_mode: split
67 |       hap:
68 |         - h1-un
69 |         - h2-un
70 |   - target:
71 |       nhr_assembler: flye
72 |       hap_assembler: flye
73 |       var_caller: longshot
74 |   - target:
75 |       nhr_assembler: hhu26
76 |       hap_assembler: flye
77 |       var_caller: longshot
78 | 
79 | 


--------------------------------------------------------------------------------
/docs/autoconf.md:
--------------------------------------------------------------------------------
 1 | # Autoconf
 2 | 
 3 | ## Using the autoconf.py script
 4 | 
 5 | Probably the easiest way to start a pipeline run for your own data is to use the `autoconf.py` script
 6 | to generate the necessary configuration file. Since all pipeline configuration is realized with simple
 7 | textual [YAML](https://yaml.org/) files, you can edit the auto-generated configuration files if you
 8 | need more flexibility. Please note that the `autoconf.py` script only supports generating config files
 9 | for one sample at a time.
10 | 
11 | Run the `autoconf.py` script as follows:
12 | 
13 | ```bash
14 | /work_dir$ conda activate ./smk_env
15 | (smk_env)/work_dir$ cd project-diploid-assembly
16 | (smk_env)/work_dir/project-diploid-assembly$ ./autoconf.py
17 | ```
18 | 
19 | The script interactively guides you through the configuration process by asking a series of
20 | basic questions about your data, e.g., the local storage path or the type of long reads. You can
21 | always accept the default value (if one is provided!) by hitting `<enter>`. You can reduce
22 | the number of questions to the bare minimum by accepting all defaults:
23 | 
24 | ```bash
25 | (smk_env)/work_dir/project-diploid-assembly$ ./autoconf.py --accept-defaults
26 | ```
27 | 
28 | After you have successfully completed the autoconf process, you will find two additional folders in your working
29 | directory:
30 | 
31 | ```bash
32 | /work_dir$ ls -1
33 | autoconf_config/
34 | autoconf_linked_data/
35 | project-diploid-assembly/
36 | smk_env/
37 | ```
38 | 
39 | The `autoconf_config` folder contains the generated configuration file, and the `autoconf_linked_data`
40 | folder contains symbolic links to the input data. The symbolic links are named following the pattern
41 | required by the pipeline to process your data correctly.
42 | 
43 | **Caveat**: if the `autoconf.py` script fails at deriving well-behaved names for your input files, please
44 | open a github issue showing a handful of examples of the file names that cannot be processed. However, since
45 | file names are a matter of personal preference, or sometimes of project requirements, the worst case would be
46 | that you have to create appropriately named symbolic links to your input files yourself.
47 | 
48 | Next, please proceed to the documentation on how to [execute the pipeline](execute.md).


--------------------------------------------------------------------------------
/notebooks/2020_project/processing/clean_segdups_annotation.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 20,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "import pandas as pd\n",
10 |     "\n",
11 |     "\"\"\"\n",
12 |     "What does this do?\n",
13 |     "Clean up SD annotation downloaded from UCSC\n",
14 |     "Rescales pct. id. into \"score\" BED column (0-1000), and creates combined name\n",
15 |     "for output file\n",
16 |     "\"\"\"\n",
17 |     "\n",
18 |     "sd_file = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/known_regions/segdups_hg38.tsv.gz'\n",
19 |     "\n",
20 |     "df = pd.read_csv(sd_file, sep='\\t')\n",
21 |     "df['chrom'] = df['#chrom']\n",
22 |     "df.drop('#chrom', axis=1, inplace=True)\n",
23 |     "df['score'] = (df['fracMatch'] * 1000).round(0).astype(int)\n",
24 |     "df['name'] = df['uid'].astype(str) + '@' + df['score'].astype(str) + '@' + df['name'].astype(str)\n",
25 |     "\n",
26 |     "\n",
27 |     "new_sort_order = ['chrom'] + list(df.columns[:-1])\n",
28 |     "df = df[new_sort_order]\n",
29 |     "df.sort_values(['chrom', 'chromStart', 'chromEnd'], inplace=True)\n",
30 |     "\n",
31 |     "tsv_output = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/known_regions/ucsc_segdups.tsv.gz'\n",
32 |     "df.to_csv(tsv_output, sep='\\t', header=True, index=False)\n",
33 |     "\n",
34 |     "bed_output = '/home/local/work/code/github/project-diploid-assembly/annotation/grch38/GRCh38_segdups.bed'\n",
35 |     "with open(bed_output, 'w') as dump:\n",
36 |     "    dump.write('#')\n",
37 |     "    df[['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand']].to_csv(dump, sep='\\t', header=True, index=False)"
38 |    ]
39 |   }
40 |  ],
41 |  "metadata": {
42 |   "kernelspec": {
43 |    "display_name": "Python 3",
44 |    "language": "python",
45 |    "name": "python3"
46 |   },
47 |   "language_info": {
48 |    "codemirror_mode": {
49 |     "name": "ipython",
50 |     "version": 3
51 |    },
52 |    "file_extension": ".py",
53 |    "mimetype": "text/x-python",
54 |    "name": "python",
55 |    "nbconvert_exporter": "python",
56 |    "pygments_lexer": "ipython3",
57 |    "version": "3.7.6"
58 |   }
59 |  },
60 |  "nbformat": 4,
61 |  "nbformat_minor": 4
62 | }
63 | 


--------------------------------------------------------------------------------
/scripts/utilities/process_logger.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import time
 4 | 
 5 | import psutil
 6 | 
 7 | attributes = [
 8 |     'cmdline',
 9 |     'cpu_percent',
10 |     'create_time',
11 |     'cwd',
12 |     'exe',
13 |     'memory_info',
14 |     'name',
15 |     'pid',
16 |     'ppid',
17 |     'status',
18 |     'threads',
19 |     'terminal',
20 |     'username',
21 |     'uids'
22 | ]
23 | 
24 | system_exe = [
25 |     '/bin',
26 |     '/usr/bin',
27 |     '/lib',
28 |     '/usr/lib',
29 |     '/usr/sbin',
30 |     '/opt/pbs'
31 | ]
32 | 
33 | special_processes = [
34 |     'sd-pam',
35 |     'ssh-agent',
36 |     'ssh',
37 |     'sshd',
38 |     'screen',
39 |     'SCREEN'
40 | ]
41 | 
42 | whitelist = [
43 |     '/smk_env/',
44 |     '/globus/'
45 | ]
46 | 
47 | 
def main(logfile_path='/home/ebertp/process.log', username='ebertp', poll_interval=30):
    """
    Endlessly scan the process table and log "suspicious" processes.

    A process owned by `username` is logged unless
    - its executable path starts with one of the `system_exe` prefixes, or
    - the first item of its command line contains one of the
      `special_processes` names, or
    - its executable path contains one of the `whitelist` fragments.

    For each logged process, the executable and command line of its
    parent process are written as well, followed by all collected
    process attributes.

    Args:
        logfile_path: log file; truncated at startup, then appended to
        username: only processes of this user are examined
        poll_interval: seconds to sleep between two scans

    This function does not return; stop it by terminating the process.
    """
    # create the log file or discard the content of a previous run
    with open(logfile_path, 'w'):
        pass

    while True:
        # pid -> (exe, cmdline) for ALL processes, used to look up parents
        cache = dict()
        suspects = []
        for process in psutil.process_iter(attrs=attributes, ad_value='N/A'):
            info = process.info
            cache[info['pid']] = info['exe'], info['cmdline']
            if info['username'] != username:
                continue
            if any(info['exe'].startswith(se) for se in system_exe):
                continue
            try:
                if any(sp in info['cmdline'][0] for sp in special_processes):
                    continue
            except IndexError:
                # process has no cmdline
                pass
            if any(wl in info['exe'] for wl in whitelist):
                continue
            suspects.append(info)

        with open(logfile_path, 'a') as logfile:
            for p_info in suspects:
                # The parent may be missing from this scan (e.g., it already
                # terminated); a plain dict lookup would raise a KeyError
                # and stop the logger.
                parent_exe, parent_cmd = cache.get(p_info['ppid'], ('N/A', 'N/A'))
                _ = logfile.write('PARENT: {} / {}\n'.format(parent_exe, parent_cmd))
                _ = logfile.write('OFFENDER\n')
                block = '\n'.join(['{}\t{}'.format(k, p_info[k]) for k in sorted(p_info.keys())])
                _ = logfile.write(block + '\n')
                _ = logfile.write('========\n')

        time.sleep(poll_interval)
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     main()
89 | 


--------------------------------------------------------------------------------
/smk_config/params/smk_cfg_params_RV7.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | # === Software setup settings ===
 3 | # Specify git commits for SaaRclust
 4 | # and StrandPhaseR setup
 5 | git_commit_saarclust: c0eb57f
 6 | git_commit_strandphaser: e608407
 7 | # arbitrarily tying a version number
 8 | # to the git commits to avoid additional
 9 | # wildcards - increment this number when
10 | # git commits are changed!
11 | git_commit_version: 7
12 | 
13 | peregrine_version: 0.1.5.5
14 | deepvariant_version: 0.9.0
15 | shasta_version: 0.4.0
16 | 
17 | # Assembler settings
18 | shasta_target_coverage: 60  # tech-independent recommendation: cov between 40x and 80x
19 | flye_target_coverage: 50  # dev recommendation: ~30x, but we have enough RAM to go a bit higher
20 | 
21 | # SaaRclust parameter sets
22 | # goal is to obtain 24 clusters
23 | min_contig_size: 100000
24 | min_region_to_order: 500000
25 | bin_size: 200000
26 | step_size: 200000
27 | prob_threshold: 0.25
28 | init_clusters: 100
29 | 
30 | # this solves a known HET inversion located on chr8
31 | sample_non_default_parameters:
32 |   HG00733:
33 |     init_clusters: 150
34 | 
35 | # VARIANT CALLING
36 | # Postprocessing parameters
37 | filter_vcf_qual: 10
38 | filter_vcf_gq: 100
39 | 
40 | freebayes_timeout_sec: 3600
41 | 
42 | # not primary alignment || supplementary alignment
43 | bwa_strandseq_aln_discard: 2304
44 | 
45 | # read unmapped || not primary alignment || failed QC || PCR dup
46 | minimap_readref_aln_discard: 1796
47 | 
48 | # read unmapped || not primary alignment
49 | minimap_contigref_aln_discard: 260
50 | 
51 | # read unmapped || not primary alignment || failed QC || PCR dup
52 | minimap_racon_aln_discard: 1796  # same as 0x704
53 | minimap_racon_aln_min_qual: 10
54 | 
55 | # main chromosomes to be used
56 | # for known references for main
57 | # pipeline steps (i.e., everything
58 | # before evaluation)
59 | eval_known_ref: GRCh38_GCA_p13
60 | eval_align_ref: hg38_GCA_p13
61 | eval_gene_model: GRCh38_GENCODEv31_basic
62 | use_genome_size: hg38_GCA_p13
63 | main_chromosomes:
64 |   - chr1
65 |   - chr2
66 |   - chr3
67 |   - chr4
68 |   - chr5
69 |   - chr6
70 |   - chr7
71 |   - chr8
72 |   - chr9
73 |   - chr10
74 |   - chr11
75 |   - chr12
76 |   - chr13
77 |   - chr14
78 |   - chr15
79 |   - chr16
80 |   - chr17
81 |   - chr18
82 |   - chr19
83 |   - chr20
84 |   - chr21
85 |   - chr22
86 |   - chrX
87 |   - chrY
88 | 


--------------------------------------------------------------------------------
/smk_include/results/run_sas_trios.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | localrules: run_hg03009_individual,
 3 |             run_na20847_individual,
 4 |             run_sas_trios,
 5 |             run_itu_trio,
 6 |             run_itu_mother,
 7 |             run_itu_child,
 8 |             run_stu_trio,
 9 |             run_stu_child,
10 |             run_pjl_trio,
11 |             run_pjl_child
12 | 
13 | 
14 | rule run_hg03009_individual:
15 |     input:
16 |         'output/targets/SAS_BEB_HG03009/HG03009.fofn'
17 |     message: 'Running SAS-BEB-HG03009 individual'
18 | 
19 | ########################################################
20 | 
21 | rule run_na20847_individual:
22 |     input:
23 |         'output/targets/SAS_GIH_NA20847/NA20847.fofn'
24 |     message: 'Running SAS-GIH-NA20847 individual'
25 | 
26 | ########################################################
27 | 
28 | rule run_itu_mother:
29 |     input:
30 |         'output/targets/SAS_ITU_IT003/HG03721.fofn'
31 |     message: 'Running SAS-ITU-IT003 mother'
32 | 
33 | rule run_itu_child:
34 |     input:
35 |         'output/targets/SAS_ITU_IT003/HG03732.fofn'
36 |     message: 'Running SAS-ITU-IT003 child'
37 | 
38 | rule run_itu_trio:
39 |     input:
40 |         rules.run_itu_mother.input,
41 |         rules.run_itu_child.input
42 |     message: 'Running SAS-ITU-IT003 trio'
43 | 
44 | #########################################################
45 | 
46 | rule run_stu_child:
47 |     input:
48 |         'output/targets/SAS_STU_ST012/HG03683.fofn'
49 |     message: 'Running SAS-STU-ST012 child'
50 | 
51 | rule run_stu_trio:
52 |     input:
53 |         rules.run_stu_child.input
54 |     message: 'Running SAS-STU-ST012 trio'
55 | 
56 | #########################################################
57 | 
58 | rule run_pjl_child:
59 |     input:
60 |         'output/targets/SAS_PJL_PK06/HG02492.fofn'
61 |     message: 'Running SAS-PJL-PK06 child'
62 | 
63 | rule run_pjl_trio:
64 |     input:
65 |         rules.run_pjl_child.input
66 |     message: 'Running SAS-PJL-PK06 trio'
67 | 
68 | #########################################################
69 | 
70 | rule run_sas_trios:
71 |     input:
72 |         rules.run_hg03009_individual.input,
73 |         rules.run_na20847_individual.input,
74 |         rules.run_itu_trio.input,
75 |         rules.run_stu_trio.input,
76 |         rules.run_pjl_trio.input
77 |     message: 'Running SAS trios'
78 | 


--------------------------------------------------------------------------------
/smk_include/results/run_amr_trios.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | localrules: run_amr_trios,
 3 |             run_pur_trio,
 4 |             run_pur_father,
 5 |             run_pur_mother,
 6 |             run_pur_child,
 7 |             run_clm_trio,
 8 |             run_clm_child,
 9 |             run_mxl_trio,
10 |             run_mxl_child,
11 |             run_pel_trio,
12 |             run_pel_child
13 | 
14 | 
15 | rule run_pur_father:
16 |     input:
17 |         'output/targets/AMR_PUR_PR05/HG00731.fofn'
18 |     message: 'Running AMR-PUR-PR05 father'
19 | 
20 | 
21 | rule run_pur_mother:
22 |     input:
23 |         'output/targets/AMR_PUR_PR05/HG00732.fofn'
24 |     message: 'Running AMR-PUR-PR05 mother'
25 | 
26 | 
27 | rule run_pur_child:
28 |     input:
29 |         'output/targets/AMR_PUR_PR05/HG00733.fofn'
30 |     message: 'Running AMR-PUR-PR05 child'
31 | 
32 | 
33 | rule run_pur_trio:
34 |     input:
35 |         rules.run_pur_father.input,
36 |         rules.run_pur_mother.input,
37 |         rules.run_pur_child.input
38 |     message: 'Running AMR-PUR-PR05 trio'
39 | 
40 | #############################################
41 | 
42 | rule run_clm_child:
43 |     input:
44 |          'output/targets/AMR_CLM_CLM03/HG01114.fofn'
45 |     message: 'Running AMR-CLM-CLM03 child'
46 | 
47 | rule run_clm_trio:
48 |     input:
49 |          rules.run_clm_child.input
50 |     message: 'Running AMR-CLM-CLM03 trio'
51 | 
52 | ##############################################
53 | 
54 | rule run_mxl_child:
55 |     input:
56 |          'output/targets/AMR_MXL_m001/NA19650.fofn'
57 |     message: 'Running AMR-MXL-m001 child'
58 | 
59 | rule run_mxl_trio:
60 |     input:
61 |          rules.run_mxl_child.input
62 |     message: 'Running AMR-MXL-m001 trio'
63 | 
64 | ##############################################
65 | 
66 | rule run_pel_child:
67 |     input:
68 |          'output/targets/AMR_PEL_PEL003/HG01573.fofn'
69 |     message: 'Running AMR-PEL-PEL003 child'
70 | 
71 | rule run_pel_trio:
72 |     input:
73 |          rules.run_pel_child.input
74 |     message: 'Running AMR-PEL-PEL003 trio'
75 | 
76 | ##############################################
77 | 
78 | rule run_amr_trios:
79 |     input:
80 |         rules.run_pur_trio.input,
81 |         rules.run_clm_trio.input,
82 |         rules.run_mxl_trio.input,
83 |         rules.run_pel_trio.input
84 |     message: 'Running AMR trios'
85 | 
86 | 


--------------------------------------------------------------------------------
/environment/snakemake/cluster/hhu_pbs/hilbert_queues.md:
--------------------------------------------------------------------------------
 1 | 
 2 | Queue: default
 3 |     queue_type = Route
 4 |     total_jobs = 0
 5 |     state_count = Transit:0 Queued:0 Held:0 Waiting:0 Running:0 Exiting:0 Begun
 6 |         :0 
 7 |     resources_max.walltime = 167:59:59
 8 |     route_destinations = CUDA,short,workq,long
 9 |     route_retry_time = 30
10 |     enabled = True
11 |     started = True
12 | 
13 | 
14 | Queue: short
15 |     queue_type = Execution
16 |     Priority = 120
17 |     total_jobs = -14
18 |     state_count = Transit:0 Queued:0 Held:1 Waiting:0 Running:4 Exiting:0 Begun
19 |         :0 
20 |     max_queued = [u:PBS_GENERIC=1024]
21 |     from_route_only = True
22 |     resources_max.ngpus = 0
23 |     resources_max.walltime = 02:00:00
24 |     resources_default.preempt_targets = NONE
25 |     resources_assigned.mem = 11534336kb
26 |     resources_assigned.mpiprocs = 0
27 |     resources_assigned.ncpus = 4
28 |     resources_assigned.nodect = 4
29 |     max_run = [u:PBS_GENERIC=512]
30 |     enabled = True
31 |     started = True
32 | 
33 | 
34 | Queue: workq
35 |     queue_type = Execution
36 |     Priority = 100
37 |     total_jobs = 182
38 |     state_count = Transit:0 Queued:0 Held:11 Waiting:0 Running:105 Exiting:0 Be
39 |         gun:1 
40 |     max_queued = [u:PBS_GENERIC=1024]
41 |     from_route_only = True
42 |     resources_max.ngpus = 0
43 |     resources_max.walltime = 72:00:00
44 |     resources_min.walltime = 02:00:01
45 |     resources_default.preempt_targets = NONE
46 |     resources_assigned.mem = 4672716800kb
47 |     resources_assigned.mpiprocs = 2396
48 |     resources_assigned.ncpus = 2626
49 |     resources_assigned.nodect = 141
50 |     enabled = True
51 |     started = True
52 | 
53 | 
54 | Queue: long
55 |     queue_type = Execution
56 |     Priority = 80
57 |     total_jobs = 82
58 |     state_count = Transit:0 Queued:26 Held:2 Waiting:0 Running:95 Exiting:0 Beg
59 |         un:4 
60 |     max_queued = [u:PBS_GENERIC=1024]
61 |     from_route_only = True
62 |     resources_max.ngpus = 0
63 |     resources_max.walltime = 167:00:00
64 |     resources_min.walltime = 72:00:01
65 |     resources_default.preempt_targets = NONE
66 |     resources_assigned.mem = 1525022720kb
67 |     resources_assigned.mpiprocs = 24
68 |     resources_assigned.ncpus = 272
69 |     resources_assigned.nodect = 100
70 |     max_run_res.ncpus = [o:PBS_ALL=1024]
71 |     enabled = True
72 |     started = True


--------------------------------------------------------------------------------
/annotation/in_preparation/bl_supp_HG02818_HG03125_HG03486_NA19434.txt:
--------------------------------------------------------------------------------
  1 | HG02818x02PE20320
  2 | HG02818x02PE20338
  3 | HG02818x02PE20341
  4 | HG02818x02PE20359
  5 | HG02818x02PE20362
  6 | HG02818x02PE20383
  7 | HG02818x02PE20385
  8 | HG02818x02PE20386
  9 | HG02818x02PE20387
 10 | HG02818x02PE20391
 11 | HG02818x02PE20393
 12 | HG02818x02PE20395
 13 | HG02818x02PE20396
 14 | HG03125x02PE20301
 15 | HG03125x02PE20302
 16 | HG03125x02PE20305
 17 | HG03125x02PE20306
 18 | HG03125x02PE20308
 19 | HG03125x02PE20309
 20 | HG03125x02PE20324
 21 | HG03125x02PE20325
 22 | HG03125x02PE20328
 23 | HG03125x02PE20331
 24 | HG03125x02PE20333
 25 | HG03125x02PE20336
 26 | HG03125x02PE20337
 27 | HG03125x02PE20342
 28 | HG03125x02PE20344
 29 | HG03125x02PE20347
 30 | HG03125x02PE20352
 31 | HG03125x02PE20354
 32 | HG03125x02PE20363
 33 | HG03125x02PE20366
 34 | HG03125x02PE20367
 35 | HG03125x02PE20373
 36 | HG03125x02PE20374
 37 | HG03125x02PE20379
 38 | HG03125x02PE20380
 39 | HG03125x02PE20382
 40 | HG03125x02PE20384
 41 | HG03125x02PE20386
 42 | HG03125x02PE20388
 43 | HG03125x02PE20389
 44 | HG03125x02PE20393
 45 | HG03486x02PE20504
 46 | HG03486x02PE20509
 47 | HG03486x02PE20513
 48 | HG03486x02PE20516
 49 | HG03486x02PE20520
 50 | HG03486x02PE20521
 51 | HG03486x02PE20527
 52 | HG03486x02PE20528
 53 | HG03486x02PE20531
 54 | HG03486x02PE20535
 55 | HG03486x02PE20539
 56 | HG03486x02PE20550
 57 | HG03486x02PE20553
 58 | HG03486x02PE20554
 59 | HG03486x02PE20557
 60 | HG03486x02PE20561
 61 | HG03486x02PE20570
 62 | HG03486x02PE20574
 63 | HG03486x02PE20581
 64 | HG03486x02PE20588
 65 | NA19434x02PE20501
 66 | NA19434x02PE20503
 67 | NA19434x02PE20505
 68 | NA19434x02PE20507
 69 | NA19434x02PE20508
 70 | NA19434x02PE20514
 71 | NA19434x02PE20516
 72 | NA19434x02PE20523
 73 | NA19434x02PE20524
 74 | NA19434x02PE20525
 75 | NA19434x02PE20532
 76 | NA19434x02PE20533
 77 | NA19434x02PE20534
 78 | NA19434x02PE20535
 79 | NA19434x02PE20539
 80 | NA19434x02PE20540
 81 | NA19434x02PE20541
 82 | NA19434x02PE20546
 83 | NA19434x02PE20548
 84 | NA19434x02PE20551
 85 | NA19434x02PE20554
 86 | NA19434x02PE20556
 87 | NA19434x02PE20561
 88 | NA19434x02PE20563
 89 | NA19434x02PE20564
 90 | NA19434x02PE20567
 91 | NA19434x02PE20571
 92 | NA19434x02PE20572
 93 | NA19434x02PE20579
 94 | NA19434x02PE20580
 95 | NA19434x02PE20582
 96 | NA19434x02PE20584
 97 | NA19434x02PE20587
 98 | NA19434x02PE20588
 99 | NA19434x02PE20594
100 | NA19434x02PE20595
101 | NA19434x02PE20596


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AFR/YRI/na19239.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA19239:
 3 |   individual: NA19239
 4 |   sex: male
 5 |   super_population: AFR
 6 |   population: YRI
 7 |   family: Y117
 8 |   member: parent
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: NA19239_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: NA19239_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |     - strandseq:
21 |         readset: &sseq_reads NA19239_1kg_il25k-npe_sseq
22 |         source_type: ena
23 |         bioproject: PRJEB12849
24 |         library_fractions: two
25 |     - short_reads:
26 |         readset: NA19239_1kg_il25k-125pe_short
27 |         source_type: ena
28 |         bioproject: PRJEB9396
29 |         load_type: parts
30 |     - short_reads:
31 |         readset: NA19239_1kg_ilnvs-150pe_short
32 |         source_type: ena
33 |         bioproject: PRJEB31736
34 |         load_type: complete
35 |         comment: "2504 cohort"
36 | 
37 | 
38 | sample_targets_NA19239:
39 |   - aliases:
40 |       1: &ccs_reads NA19239_hgsvc_pbsq2-ccs_1000
41 |       2: &clr_reads NA19239_hgsvc_pbsq2-clr_1000
42 |   - defaults:
43 |       hap_reads: *ccs_reads
44 |       vc_reads: *ccs_reads
45 |       sseq_reads: *sseq_reads
46 |       pol_reads: *ccs_reads
47 |       pol_pass: racon-p2
48 |       hap_assm_mode: split
49 |       hap:
50 |         - h1-un
51 |         - h2-un
52 |   - target:
53 |       nhr_assembler: pereg
54 |       hap_assembler: pereg
55 |       var_caller: deepvar
56 |   - target:
57 |       nhr_assembler: flye
58 |       hap_assembler: flye
59 |       var_caller: freebayes
60 |   - defaults:
61 |       hap_reads: *clr_reads
62 |       vc_reads: *clr_reads
63 |       sseq_reads: *sseq_reads
64 |       pol_reads: *clr_reads
65 |       pol_pass: arrow-p1
66 |       hap_assm_mode: split
67 |       hap:
68 |         - h1-un
69 |         - h2-un
70 |   - target:
71 |       nhr_assembler: flye
72 |       hap_assembler: flye
73 |       var_caller: longshot
74 |   - defaults:
75 |       hap_reads: *clr_reads
76 |       vc_reads: *clr_reads
77 |       sseq_reads: *sseq_reads
78 |       pol_reads: *clr_reads
79 |       pol_pass: arrow-p1
80 |       hap_assm_mode: split
81 |       hap:
82 |         - h1-un
83 |         - h2-un
84 |   - target:
85 |       nhr_assembler: hhu26
86 |       hap_assembler: flye
87 |       var_caller: longshot


--------------------------------------------------------------------------------
/smk_config/samples/hgsvc/AMR/PUR/hg00731.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_HG00731:
 3 |   individual: HG00731
 4 |   sex: male
 5 |   super_population: AMR
 6 |   population: PUR
 7 |   family: PR05
 8 |   member: parent
 9 |   data_sources:
10 |     - long_reads:
11 |         readset: HG00731_hgsvc_pbsq2-ccs
12 |         technology: pacbio
13 |         data_type: fastq
14 |         load_type: parts
15 |     - long_reads:
16 |         readset: HG00731_hgsvc_pbsq2-clr
17 |         technology: pacbio
18 |         data_type: pacbio_native
19 |         load_type: parts
20 |     - strandseq:
21 |         readset: &sseq_reads HG00731_1kg_il25k-npe_sseq
22 |         source_type: ena
23 |         bioproject: PRJEB12849
24 |         library_fractions: two
25 |     - short_reads:
26 |         readset: HG00731_1kg_il25k-125pe_short
27 |         source_type: ena
28 |         bioproject: PRJEB9396
29 |         load_type: parts
30 |     - short_reads:
31 |         readset: HG00731_1kg_ilnvs-150pe_short
32 |         source_type: ena
33 |         bioproject: PRJEB31736
34 |         load_type: complete
35 |         comment: "2504 cohort"
36 | 
37 | 
38 | sample_targets_HG00731:
39 |   - aliases:
40 |       1: &ccs_reads HG00731_hgsvc_pbsq2-ccs_1000
41 |       2: &clr_reads HG00731_hgsvc_pbsq2-clr_1000
42 |   - defaults:
43 |       hap_reads: *ccs_reads
44 |       vc_reads: *ccs_reads
45 |       sseq_reads: *sseq_reads
46 |       pol_reads: *ccs_reads
47 |       pol_pass: racon-p2
48 |       hap_assm_mode: split
49 |       hap:
50 |         - h1-un
51 |         - h2-un
52 |   - target:
53 |       nhr_assembler: pereg
54 |       hap_assembler: pereg
55 |       var_caller: deepvar
56 |   - target:
57 |       nhr_assembler: flye
58 |       hap_assembler: flye
59 |       var_caller: freebayes
60 |   - defaults:
61 |       hap_reads: *clr_reads
62 |       vc_reads: *clr_reads
63 |       sseq_reads: *sseq_reads
64 |       pol_reads: *clr_reads
65 |       pol_pass: arrow-p1
66 |       hap_assm_mode: split
67 |       hap:
68 |         - h1-un
69 |         - h2-un
70 |   - target:
71 |       nhr_assembler: flye
72 |       hap_assembler: flye
73 |       var_caller: longshot
74 |   - defaults:
75 |       hap_reads: *clr_reads
76 |       vc_reads: *clr_reads
77 |       sseq_reads: *sseq_reads
78 |       pol_reads: *clr_reads
79 |       pol_pass: arrow-p1
80 |       hap_assm_mode: split
81 |       hap:
82 |         - h1-un
83 |         - h2-un
84 |   - target:
85 |       nhr_assembler: hhu26
86 |       hap_assembler: flye
87 |       var_caller: longshot
88 | 


--------------------------------------------------------------------------------
/smk_config/params/smk_cfg_params_RV8.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | # === Software setup settings ===
 3 | # Specify git commits for SaaRclust
 4 | # and StrandPhaseR setup
 5 | git_commit_saarclust: 71b3763
 6 | git_commit_strandphaser: e608407
 7 | git_commit_breakpointr: 268d99d
 8 | # arbitrarily tying a version number
 9 | # to the git commits to avoid additional
10 | # wildcards - increment this number when
11 | # git commits are changed!
12 | git_commit_version: 8
13 | 
14 | peregrine_version: 0.1.5.5
15 | deepvariant_version: 0.9.0
16 | shasta_version: 0.4.0
17 | 
18 | # Assembler settings
19 | shasta_target_coverage: 60  # tech-independent recommendation: cov between 40x and 80x
20 | flye_target_coverage: 50  # dev recommendation: ~30x, but we have enough RAM to go a bit higher
21 | 
22 | # SaaRclust parameter sets
23 | # goal is to obtain 24 clusters
24 | min_contig_size: 100000
25 | min_region_to_order: 500000
26 | bin_size: 200000
27 | step_size: 200000
28 | prob_threshold: 0.25
29 | init_clusters: 100
30 | desired_clusters: 24
31 | 
32 | # this solves a known HET inversion located on chr8
33 | sample_non_default_parameters:
34 |   HG00733:
35 |     use_only_in:
36 |       write_saarclust_config_file:
37 |         init_clusters: 150
38 |         desired_clusters: 25
39 | 
40 | 
41 | # VARIANT CALLING
42 | # Postprocessing parameters
43 | filter_vcf_qual: 10
44 | filter_vcf_gq: 100
45 | 
46 | freebayes_timeout_sec: 3600
47 | 
48 | # not primary alignment || supplementary alignment
49 | bwa_strandseq_aln_discard: 2304
50 | 
51 | # read unmapped || not primary alignment || failed QC || PCR dup
52 | minimap_readref_aln_discard: 1796
53 | 
54 | # read unmapped || not primary alignment
55 | minimap_contigref_aln_discard: 260
56 | 
57 | # read unmapped || not primary alignment || failed QC || PCR dup
58 | minimap_racon_aln_discard: 1796  # same as 0x704
59 | minimap_racon_aln_min_qual: 10
60 | 
61 | # main chromosomes to be used
62 | # for known references for main
63 | # pipeline steps (i.e., everything
64 | # before evaluation)
65 | eval_known_ref: GRCh38_GCA_p13
66 | eval_align_ref: hg38_GCA_p13
67 | eval_gene_model: GRCh38_GENCODEv31_basic
68 | use_genome_size: hg38_GCA_p13
69 | main_chromosomes:
70 |   - chr1
71 |   - chr2
72 |   - chr3
73 |   - chr4
74 |   - chr5
75 |   - chr6
76 |   - chr7
77 |   - chr8
78 |   - chr9
79 |   - chr10
80 |   - chr11
81 |   - chr12
82 |   - chr13
83 |   - chr14
84 |   - chr15
85 |   - chr16
86 |   - chr17
87 |   - chr18
88 |   - chr19
89 |   - chr20
90 |   - chr21
91 |   - chr22
92 |   - chrX
93 |   - chrY
94 | 


--------------------------------------------------------------------------------
/scripts/utilities/inspect_environment.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import sys
 5 | import argparse
 6 | import subprocess as sp
 7 | 
 8 | 
 9 | def main():
10 | 
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('--outfile', '-o', type=str, dest='outfile')
13 |     parser.add_argument('--logfile', '-l', type=str, dest='logfile')
14 |     parser.add_argument('--export-conda-env', '-e', action='store_true', default=False, dest='export')
15 | 
16 |     args = parser.parse_args()
17 | 
18 |     outfile = args.outfile
19 |     logfile = args.logfile
20 | 
21 |     try:
22 |         os.makedirs(os.path.dirname(os.path.abspath(outfile)), exist_ok=True)
23 |         os.makedirs(os.path.dirname(os.path.abspath(logfile)), exist_ok=True)
24 |     except TypeError:
25 |         # since Conda environments (or the Singularity module on Hilbert)
26 |         # only support Python2 (...), exist_ok may cause an exception
27 |         # Ignore that and hope that Snakemake creates everything...
28 |         pass
29 | 
30 |     my_env = dict(os.environ)
31 | 
32 |     env_vars = sorted(my_env.keys())
33 | 
34 |     conda_env = None
35 | 
36 |     with open(logfile, 'w') as log:
37 | 
38 |         _ = log.write('\n===== Accessible environment:\n')
39 | 
40 |         for k in env_vars:
41 |             _ = log.write('{} - {}\n'.format(k, my_env[k]))
42 |             if k == 'CONDA_PREFIX':
43 |                 conda_env = my_env[k]
44 | 
45 |         _ = log.write('\nDone\n')
46 | 
47 |         if args.export and conda_env is None:
48 |             _ = logfile.write('\nERROR: cannot export CONDA env, no prefix path found in environment (see above)\n')
49 |         elif args.export:
50 |             _ = log.write('\n===== Export of active CONDA environment\n\n')
51 | 
52 |             try:
53 |                 out = sp.check_output('conda env export --prefix {}'.format(conda_env),
54 |                                       stderr=sp.STDOUT,
55 |                                       shell=True,
56 |                                       env=None)
57 |                 out = out.decode('utf-8')
58 |                 _ = log.write(out + '\n\n')
59 |             except sp.CalledProcessError as spe:
60 |                 _ = log.write('Exporting Conda env failed with code {}: {}\n'.format(spe.returncode, spe.output))
61 |         else:
62 |             pass
63 | 
64 |     with open(outfile, 'w') as touch:
65 |         _ = touch.write('ENV OK\n')
66 | 
67 |     return
68 | 
69 | 
70 | if __name__ == '__main__':
71 |     main()
72 |     sys.exit(0)
73 | 


--------------------------------------------------------------------------------
/scripts/utilities/version_checker.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import sys
 5 | import argparse
 6 | import re
 7 | 
 8 | 
 9 | def main():
10 | 
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument('--outfile', '-o', type=str, dest='outfile')
13 |     parser.add_argument('--at-least', '-a', type=str, dest='atleast')
14 |     parser.add_argument('--logfile', '-l', type=str, dest='logfile')
15 | 
16 |     args = parser.parse_args()
17 | 
18 |     outfile = args.outfile
19 |     logfile = args.logfile
20 | 
21 |     try:
22 |         os.makedirs(os.path.dirname(os.path.abspath(outfile)), exist_ok=True)
23 |         os.makedirs(os.path.dirname(os.path.abspath(logfile)), exist_ok=True)
24 |     except TypeError:
25 |         # since Conda environments (or the Singularity module on Hilbert)
26 |         # only support Python2 (...), exist_ok may cause an exception
27 |         # Ignore that and hope that Snakemake creates everything...
28 |         pass
29 | 
30 |     req_version = [int(v) for v in args.atleast.split('.')]
31 | 
32 |     version_pattern = re.compile('[0-9]+\\.[0-9]+(\\.[0-9]+)?')
33 | 
34 |     match_found = False
35 | 
36 |     with open(logfile, 'w') as log:
37 |         _ = log.write('Minimum version required: {}\n'.format(args.atleast))
38 |         for line in sys.stdin.readlines():
39 |             _ = log.write('Processing line: {}\n'.format(line.strip()))
40 |             mobj = version_pattern.search(line.strip())
41 |             if mobj is not None:
42 |                 version_info = mobj.group(0)
43 |                 _ = log.write('Potential version info found: {}\n'.format(version_info))
44 |                 tool_version = [int(v) for v in version_info.split('.')]
45 |                 for min_v, is_v in zip(req_version, tool_version):
46 |                     if is_v > min_v and not is_v < min_v:
47 |                         _ = log.write('Minimum version matched...\n')
48 |                         match_found = True
49 |                         break
50 |                 if match_found:
51 |                     break
52 |                 else:
53 |                     _ = log.write('Version info did not match...\n')
54 | 
55 |         if match_found:
56 |             exit_code = 0
57 |             with open(outfile, 'w') as touch:
58 |                 _ = touch.write('Version confirmed: {}\n'.format('.'.join([str(v) for v in tool_version])))
59 |         else:
60 |             exit_code = 1
61 |             _ = log.write('No match found')
62 | 
63 |     return exit_code
64 | 
65 | 
66 | if __name__ == '__main__':
67 |     sys.exit(main())
68 | 


--------------------------------------------------------------------------------
/smk_include/haploid_read_coverage.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | rule dump_haploid_read_coverage:
 4 |     """
 5 |     "Since recently", UCSC tools require old "ASCII" sort order for the big* indices
 6 |     to be correct. This is incompatible with default locale (UTF-8) on many Linux systems
 7 |     """
 8 |     input:
 9 |         'output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.psort.sam.bam'
10 |     output:
11 |         'output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.sorted.bedGraph'
12 |     log:
13 |         bedtools = 'log/output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.bg.log',
14 |         sort = 'log/output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.sort.log',
15 |     benchmark:
16 |         os.path.join('run/output/alignments/hap_reads_to_reference',
17 |                      '{folder_path}',
18 |                      '{file_name}_map-to_{aln_reference}.{hap}.bg' + '.t{}.rsrc'.format(config['num_cpu_medium']))
19 |     conda:
20 |         '../environment/conda/conda_biotools.yml'
21 |     threads: config['num_cpu_medium']
22 |     resources:
23 |         runtime_hrs = lambda wildcards, attempt: attempt * 6,
24 |         mem_total_mb = lambda wildcards, attempt: attempt * 24576 + 49152,
25 |         mem_per_cpu_mb = lambda wildcards, attempt: int((attempt * 24576 + 49152) / config['num_cpu_medium'])
26 |     shell:
27 |         'bedtools genomecov -bg -ibam {input} 2> {log.bedtools}'
28 |         ' | '
29 |         'LC_COLLATE=C sort --buffer-size={resources.mem_total_mb}M --parallel={threads} '
30 |         '-k1,1 -k2,2n > {output} 2> {log.sort}'
31 | 
32 | 
33 | rule convert_hap_read_coverage:
34 |     input:
35 |        bg_track = 'output/alignments/hap_reads_to_reference/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.sorted.bedGraph',
36 |        sizes = 'references/assemblies/{aln_reference}.sizes'
37 |     output:
38 |        'output/cov_tracks/hap_reads/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.bigWig'
39 |     log:
40 |        'log/output/cov_tracks/hap_reads/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.log'
41 |     benchmark:
42 |        'run/output/cov_tracks/hap_reads/{folder_path}/{file_name}_map-to_{aln_reference}.{hap}.rsrc'
43 |     conda:
44 |         '../environment/conda/conda_biotools.yml'
45 |     resources:
46 |         runtime_hrs = lambda wildcards, attempt: attempt * 6,
47 |         mem_total_mb = lambda wildcards, attempt: attempt * 16384,
48 |         mem_per_cpu_mb = lambda wildcards, attempt: attempt * 16384
49 |     shell:
50 |          'bedGraphToBigWig {input.bg_track} {input.sizes} {output} 2> {log}'
51 | 


--------------------------------------------------------------------------------
/smk_include/results/run_eas_trios.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | localrules: run_hg01596_individual,
 3 |             run_na18534_individual,
 4 |             run_na18939_individual,
 5 |             run_hg00864_individual,
 6 |             run_eas_trios,
 7 |             run_chs_trio,
 8 |             run_chs_father,
 9 |             run_chs_mother,
10 |             run_chs_child,
11 |             run_khv_trio,
12 |             run_khv_child
13 | 
14 | 
15 | rule run_hg01596_individual:
16 |     input:
17 |         'output/targets/EAS_KHV_HG01596/HG01596.fofn'
18 |     message: 'Running EAS-KHV-HG01596 individual'
19 | 
20 | #####################################################
21 | 
22 | rule run_na18534_individual:
23 |     input:
24 |         'output/targets/EAS_CHB_NA18534/NA18534.fofn'
25 |     message: 'Running EAS-CHB-NA18534 individual'
26 | 
27 | #####################################################
28 | 
29 | rule run_na18939_individual:
30 |     input:
31 |         'output/targets/EAS_JPT_NA18939/NA18939.fofn'
32 |     message: 'Running EAS-JPT-NA18939 individual'
33 | 
34 | #####################################################
35 | 
36 | rule run_hg00864_individual:
37 |     input:
38 |         'output/targets/EAS_CDX_HG00864/HG00864.fofn'
39 |     message: 'Running EAS-CDX-HG00864 individual'
40 | 
41 | #####################################################
42 | 
43 | rule run_chs_mother:
44 |     input:
45 |         'output/targets/EAS_CHS_SH032/HG00512.fofn'
46 |     message: 'Running EAS-CHS-SH032 mother'
47 | 
48 | 
49 | rule run_chs_father:
50 |     input:
51 |         'output/targets/EAS_CHS_SH032/HG00513.fofn'
52 |     message: 'Running EAS-CHS-SH032 father'
53 | 
54 | 
55 | rule run_chs_child:
56 |     input:
57 |         'output/targets/EAS_CHS_SH032/HG00514.fofn'
58 |     message: 'Running EAS-CHS-SH032 child'
59 | 
60 | 
61 | rule run_chs_trio:
62 |     input:
63 |         rules.run_chs_father.input,
64 |         rules.run_chs_mother.input,
65 |         rules.run_chs_child.input
66 |     message: 'Running EAS-CHS-SH032 trio'
67 | 
68 | #######################################################
69 | 
70 | rule run_khv_child:
71 |     input:
72 |         'output/targets/EAS_KHV_VN047/HG02018.fofn'
73 |     message: 'Running EAS-KHV-VN047 child'
74 | 
75 | 
76 | rule run_khv_trio:
77 |     input:
78 |         rules.run_khv_child.input
79 |     message: 'Running EAS-KHV-VN047 trio'
80 | 
81 | ########################################################
82 | 
83 | rule run_eas_trios:
84 |     input:
85 |         rules.run_hg01596_individual.input,
86 |         rules.run_na18534_individual.input,
87 |         rules.run_na18939_individual.input,
88 |         rules.run_hg00864_individual.input,
89 |         rules.run_chs_trio.input,
90 |         rules.run_khv_trio.input
91 |     message: 'Running EAS trios'
92 | 


--------------------------------------------------------------------------------
/smk_config/params/smk_cfg_params_RV10.yml:
--------------------------------------------------------------------------------
  1 | 
  2 | # === Software setup settings ===
  3 | # Specify git commits for SaaRclust
  4 | # and StrandPhaseR setup
  5 | git_commit_saarclust: 9b4aa00
  6 | git_commit_strandphaser: e608407
  7 | git_commit_breakpointr: 268d99d
  8 | # arbitrarily tying a version number
  9 | # to the git commits to avoid additional
 10 | # wildcards - increment this number when
 11 | # git commits are changed!
 12 | git_commit_version: 10
 13 | 
 14 | peregrine_version: 0.1.5.5
 15 | deepvariant_version: 0.9.0
 16 | shasta_version: 0.4.0
 17 | 
 18 | # Assembler settings
 19 | shasta_target_coverage: 60  # tech-independent recommendation: cov between 40x and 80x
 20 | flye_target_coverage: 50  # dev recommendation: ~30x, but we have enough RAM to go a bit higher
 21 | 
 22 | # SaaRclust parameter sets
 23 | # goal is to obtain approx.
 24 | # 24 clusters (for human)
 25 | min_contig_size: 100000
 26 | min_region_to_order: 500000
 27 | bin_size: 200000
 28 | step_size: 200000
 29 | prob_threshold: 0.25
 30 | init_clusters: 100
 31 | desired_clusters: 24
 32 | min_mapq: 10
 33 | 
 34 | # this solves a known HET inversion located on chr8
 35 | sample_non_default_parameters:
 36 |   HG00733:
 37 |     use_only_in:
 38 |       write_saarclust_config_file:
 39 |         init_clusters: 150
 40 |         desired_clusters: 25
 41 |   NA24385:
 42 |     use_only_in:
 43 |       write_saarclust_config_file:
 44 |         init_clusters: 150
 45 |         desired_clusters: 25
 46 | 
 47 | # VARIANT CALLING
 48 | # Postprocessing parameters
 49 | filter_vcf_qual: 10
 50 | filter_vcf_gq: 100
 51 | 
 52 | freebayes_timeout_sec: 3600
 53 | 
 54 | # not primary alignment || supplementary alignment
 55 | bwa_strandseq_aln_discard: 2304
 56 | 
 57 | # read unmapped || not primary alignment || failed QC || PCR dup
 58 | minimap_readref_aln_discard: 1796
 59 | 
 60 | # read unmapped || not primary alignment
 61 | minimap_contigref_aln_discard: 260
 62 | 
 63 | # read unmapped || not primary alignment || failed QC || PCR dup
 64 | minimap_racon_aln_discard: 1796  # same as 0x704
 65 | minimap_racon_aln_min_qual: 10
 66 | 
 67 | # main chromosomes to be used
 68 | # for known references for main
 69 | # pipeline steps (i.e., everything
 70 | # before evaluation)
 71 | eval_known_ref: GRCh38_GCA_p13
 72 | eval_align_ref: hg38_GCA_p13
 73 | eval_gene_model: GRCh38_GENCODEv31_basic
 74 | use_genome_size: hg38_GCA_p13
 75 | main_chromosomes:
 76 |   - chr1
 77 |   - chr2
 78 |   - chr3
 79 |   - chr4
 80 |   - chr5
 81 |   - chr6
 82 |   - chr7
 83 |   - chr8
 84 |   - chr9
 85 |   - chr10
 86 |   - chr11
 87 |   - chr12
 88 |   - chr13
 89 |   - chr14
 90 |   - chr15
 91 |   - chr16
 92 |   - chr17
 93 |   - chr18
 94 |   - chr19
 95 |   - chr20
 96 |   - chr21
 97 |   - chr22
 98 |   - chrX
 99 |   - chrY
100 | 


--------------------------------------------------------------------------------
/smk_config/params/smk_cfg_params_RV9.yml:
--------------------------------------------------------------------------------
  1 | 
  2 | # === Software setup settings ===
  3 | # Specify git commits for SaaRclust
  4 | # and StrandPhaseR setup
  5 | git_commit_saarclust: ba65b53
  6 | git_commit_strandphaser: e608407
  7 | git_commit_breakpointr: 268d99d
  8 | # arbitrarily tying a version number
  9 | # to the git commits to avoid additional
 10 | # wildcards - increment this number when
 11 | # git commits are changed!
 12 | git_commit_version: 9
 13 | 
 14 | peregrine_version: 0.1.5.5
 15 | deepvariant_version: 0.9.0
 16 | shasta_version: 0.4.0
 17 | 
 18 | # Assembler settings
 19 | shasta_target_coverage: 60  # tech-independent recommendation: cov between 40x and 80x
 20 | flye_target_coverage: 50  # dev recommendation: ~30x, but we have enough RAM to go a bit higher
 21 | 
 22 | # SaaRclust parameter sets
 23 | # goal is to obtain approx.
 24 | # 24 clusters (for human)
 25 | min_contig_size: 100000
 26 | min_region_to_order: 500000
 27 | bin_size: 200000
 28 | step_size: 200000
 29 | prob_threshold: 0.25
 30 | init_clusters: 100
 31 | desired_clusters: 24
 32 | min_mapq: 10
 33 | 
 34 | # this solves a known HET inversion located on chr8
 35 | sample_non_default_parameters:
 36 |   HG00733:
 37 |     use_only_in:
 38 |       write_saarclust_config_file:
 39 |         init_clusters: 150
 40 |         desired_clusters: 25
 41 |   NA24385:
 42 |     use_only_in:
 43 |       write_saarclust_config_file:
 44 |         init_clusters: 150
 45 |         desired_clusters: 25
 46 | 
 47 | # VARIANT CALLING
 48 | # Postprocessing parameters
 49 | filter_vcf_qual: 10
 50 | filter_vcf_gq: 100
 51 | 
 52 | freebayes_timeout_sec: 3600
 53 | 
 54 | # not primary alignment || supplementary alignment
 55 | bwa_strandseq_aln_discard: 2304
 56 | 
 57 | # read unmapped || not primary alignment || failed QC || PCR dup
 58 | minimap_readref_aln_discard: 1796
 59 | 
 60 | # read unmapped || not primary alignment
 61 | minimap_contigref_aln_discard: 260
 62 | 
 63 | # read unmapped || not primary alignment || failed QC || PCR dup
 64 | minimap_racon_aln_discard: 1796  # same as 0x704
 65 | minimap_racon_aln_min_qual: 10
 66 | 
 67 | # main chromosomes to be used
 68 | # for known references for main
 69 | # pipeline steps (i.e., everything
 70 | # before evaluation)
 71 | eval_known_ref: GRCh38_GCA_p13
 72 | eval_align_ref: hg38_GCA_p13
 73 | eval_gene_model: GRCh38_GENCODEv31_basic
 74 | use_genome_size: hg38_GCA_p13
 75 | main_chromosomes:
 76 |   - chr1
 77 |   - chr2
 78 |   - chr3
 79 |   - chr4
 80 |   - chr5
 81 |   - chr6
 82 |   - chr7
 83 |   - chr8
 84 |   - chr9
 85 |   - chr10
 86 |   - chr11
 87 |   - chr12
 88 |   - chr13
 89 |   - chr14
 90 |   - chr15
 91 |   - chr16
 92 |   - chr17
 93 |   - chr18
 94 |   - chr19
 95 |   - chr20
 96 |   - chr21
 97 |   - chr22
 98 |   - chrX
 99 |   - chrY
100 | 


--------------------------------------------------------------------------------
/smk_config/params/smk_cfg_params_RV11.yml:
--------------------------------------------------------------------------------
  1 | 
  2 | # === Software setup settings ===
  3 | # Specify git commits for SaaRclust
  4 | # and StrandPhaseR setup
  5 | git_commit_saarclust: aac02ed
  6 | git_commit_strandphaser: e608407
  7 | git_commit_breakpointr: 268d99d
  8 | # arbitrarily tying a version number
  9 | # to the git commits to avoid additional
 10 | # wildcards - increment this number when
 11 | # git commits are changed!
 12 | git_commit_version: 11
 13 | 
 14 | peregrine_version: 0.1.6.1
 15 | deepvariant_version: 0.10.0
 16 | shasta_version: 0.4.0
 17 | 
 18 | # Assembler settings
 19 | shasta_target_coverage: 60  # tech-independent recommendation: cov between 40x and 80x
 20 | flye_target_coverage: 50  # dev recommendation: ~30x, but we have enough RAM to go a bit higher
 21 | 
 22 | # SaaRclust parameter sets
 23 | # goal is to obtain approx.
 24 | # 24 clusters (for human)
 25 | min_contig_size: 100000
 26 | min_region_to_order: 500000
 27 | bin_size: 200000
 28 | step_size: 200000
 29 | prob_threshold: 0.25
 30 | init_clusters: 100
 31 | desired_clusters: 24
 32 | min_mapq: 10
 33 | 
 34 | # this solves a known HET inversion located on chr8
 35 | sample_non_default_parameters:
 36 |   HG00733:
 37 |     use_only_in:
 38 |       write_saarclust_config_file:
 39 |         init_clusters: 150
 40 |         desired_clusters: 25
 41 |   NA24385:
 42 |     use_only_in:
 43 |       write_saarclust_config_file:
 44 |         init_clusters: 150
 45 |         desired_clusters: 25
 46 | 
 47 | # VARIANT CALLING
 48 | # Postprocessing parameters
 49 | filter_vcf_qual: 10
 50 | filter_vcf_gq: 100
 51 | 
 52 | freebayes_timeout_sec: 3600
 53 | 
 54 | # not primary alignment || supplementary alignment
 55 | bwa_strandseq_aln_discard: 2304
 56 | 
 57 | # read unmapped || not primary alignment || failed QC || PCR dup
 58 | minimap_readref_aln_discard: 1796
 59 | 
 60 | # read unmapped || not primary alignment
 61 | minimap_contigref_aln_discard: 260
 62 | 
 63 | # read unmapped || not primary alignment || failed QC || PCR dup
 64 | minimap_racon_aln_discard: 1796  # same as 0x704
 65 | minimap_racon_aln_min_qual: 10
 66 | 
 67 | # main chromosomes to be used
 68 | # for known references for main
 69 | # pipeline steps (i.e., everything
 70 | # before evaluation)
 71 | eval_known_ref: GRCh38_GCA_p13
 72 | eval_align_ref: hg38_GCA_p13
 73 | eval_gene_model: GRCh38_GENCODEv31_basic
 74 | use_genome_size: hg38_GCA_p13
 75 | main_chromosomes:
 76 |   - chr1
 77 |   - chr2
 78 |   - chr3
 79 |   - chr4
 80 |   - chr5
 81 |   - chr6
 82 |   - chr7
 83 |   - chr8
 84 |   - chr9
 85 |   - chr10
 86 |   - chr11
 87 |   - chr12
 88 |   - chr13
 89 |   - chr14
 90 |   - chr15
 91 |   - chr16
 92 |   - chr17
 93 |   - chr18
 94 |   - chr19
 95 |   - chr20
 96 |   - chr21
 97 |   - chr22
 98 |   - chrX
 99 |   - chrY
100 | 


--------------------------------------------------------------------------------
/scripts/dev/hybrid_renamer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import os
 4 | import shutil
 5 | 
 6 | 
 7 | def get_haplotype(file_name):
 8 |     """Return 'h1-un' or 'h2-un' depending on the haplotype marker in file_name; raise ValueError if none is found."""
 9 |     if 'h1-un' in file_name or '_h1_' in file_name:
10 |         return 'h1-un'
11 |     elif 'h2-un' in file_name or '_h2_' in file_name:
12 |         return 'h2-un'
13 |     else:
14 |         raise ValueError('Unrecognized haplotype: {}'.format(file_name))
15 | 
16 | 
17 | def get_read_info(sample, file_name):  # returns the read-set label that is embedded in the new file name
18 |     if sample == 'NA24385':  # fixed label for this sample; file_name is not consulted
19 |         return 'hpg_pbsq2-ccs_1000'
20 |     elif sample == 'NA12878':  # fixed label for this sample; file_name is not consulted
21 |         return 'giab_pbsq2-ccs_1000'
22 |     else:
23 |         if 'ccs' in file_name.lower():  # all other samples: read type is looked up case-insensitively in the file name
24 |             return 'hgsvc_pbsq2-ccs_1000'
25 |         elif 'clr' in file_name.lower():
26 |             return 'hgsvc_pbsq2-clr_1000'
27 |         else:
28 |             raise ValueError('Unrecognized read type: {}'.format(file_name))  # neither 'ccs' nor 'clr' in the name
29 | 
30 | 
31 | def get_assembler_info(read_info):  # returns an (assembler, polisher) tuple chosen from the read type in read_info
32 |     if 'ccs' in read_info:  # 'ccs' is tested first, so it wins if both substrings are present
33 |         return 'pereg', 'racon-p2'
34 |     elif 'clr' in read_info:
35 |         return 'flye', 'arrow-p1'
36 |     else:
37 |         raise ValueError('Cannot match assembler to reads: {}'.format(read_info))  # neither 'ccs' nor 'clr' found
38 | 
39 | 
40 | def get_new_file_ext(file_name):
41 |     """Return the new file suffix for file_name; a 'not_scaffolded' name takes precedence over the .agp/.fasta extension."""
42 |     if 'not_scaffolded' in file_name.lower():
43 |         return 'bng-unsupported.fasta'
44 |     elif file_name.endswith('.agp'):
45 |         return 'bng-hybrid.agp'
46 |     elif file_name.endswith('.fasta'):
47 |         return 'bng-scaffolds.fasta'
48 |     else:
49 |         raise ValueError('Cannot handle file name: {}'.format(file_name))
50 |     
51 | 
52 | def build_new_name(file_name):
53 |     """Build the new file name '<sample>_<read_info>-<assembler>.<hap>.<polisher>.<ext>' from the parts of file_name."""
54 |     sample = file_name.split('_', 1)[0]  # sample ID is everything before the first underscore
55 |     if sample.startswith('GM'):  # 'GM' IDs are rewritten: GM00864 -> HG00864, every other GM* -> NA*
56 |         if sample == 'GM00864':
57 |             sample = sample.replace('GM', 'HG')
58 |         else:
59 |             sample = sample.replace('GM', 'NA')
60 |     read_info = get_read_info(sample, file_name)
61 |     hap = get_haplotype(file_name)
62 |     assembler, polisher = get_assembler_info(read_info)
63 |     new_file_ext = get_new_file_ext(file_name)
64 |     # any ValueError raised by the helpers above propagates to the caller
65 |     new_name = '{}_{}-{}.{}.{}.{}'.format(sample, read_info, assembler, hap, polisher, new_file_ext)
66 |     return new_name
67 | 
68 | 
69 | target_path = '/gpfs/project/ebertp/projects/rfdga/production/EVAL/run_folder/output/evaluation/scaffolded_assemblies'
70 | # Copy every .fasta/.agp file found below the current working directory into target_path under its new name.
71 | for root, dirs, files in os.walk(os.getcwd()):
72 |     if not files:
73 |         continue
74 |     for f in files:
75 |         if not (f.endswith('.fasta') or f.endswith('.agp')):  # all other file types are ignored
76 |             continue
77 |         old_path = os.path.join(root, f)
78 |         new_path = os.path.join(target_path, build_new_name(f))  # build_new_name raises ValueError for unrecognized names
79 |         if os.path.isfile(new_path):  # never overwrite a file that already exists at the target
80 |             continue
81 |         shutil.copy(old_path, new_path)
82 | 
83 |         
84 | 


--------------------------------------------------------------------------------
/smk_include/eval_known_reference.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | localrules: master_eval_known_reference
 3 | 
 4 | rule master_eval_known_reference:
 5 |     input:
 6 |         []
 7 | 
 8 | # Align an assembly (query) against a known reference with nucmer and write the alignment as a .delta file.
 9 | rule compute_delta_assembly_reference:
10 |     input:
11 |         known_ref = 'references/assemblies/{known_ref}.fasta',
12 |         assembly = 'output/{folder_path}/{file_name}.fasta'
13 |     output:
14 |         delta = 'output/evaluation/mummer_delta/{known_ref}/{folder_path}/{file_name}.delta'
15 |     log:
16 |         'log/output/evaluation/mummer_delta/{known_ref}/{folder_path}/{file_name}.mummer.log'
17 |     benchmark:
18 |         'run/output/evaluation/mummer_delta/{{known_ref}}/{{folder_path}}/{{file_name}}.mummer.t{}.rsrc'.format(config['num_cpu_medium'])
19 |     conda:
20 |          '../environment/conda/conda_biotools.yml'
21 |     threads: config['num_cpu_medium']
22 |     resources:
23 |         mem_per_cpu_mb = lambda wildcards, attempt: int((24576 + attempt * 24576) / config['num_cpu_medium']),
24 |         mem_total_mb = lambda wildcards, attempt: 24576 + attempt * 24576,
25 |         runtime_hrs = lambda wildcards, attempt: 6 * attempt
26 |     shell:
27 |         'nucmer --maxmatch -l 100 -c 500 --threads={threads} {input.known_ref} {input.assembly} --delta={output.delta} &> {log}'
28 | 
29 | 
30 | rule quast_analysis_assembly:
31 |     input:
32 |         dl_chk = 'output/check_files/quast-lg/busco_db_download.ok',
33 |         known_ref = 'references/assemblies/{known_ref}.fasta',
34 |         genes = 'references/downloads/{genemodel}.gff3.gz',
35 |         assembly = 'output/{folder_path}/{file_name}.fasta',
36 |     output:
37 |         pdf_report = 'output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{file_name}/report.pdf',
38 |         html_icarus = 'output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{file_name}/icarus.html',
39 |     log:
40 |         'log/output/evaluation/quastlg_busco/{known_ref}-{genemodel}/{folder_path}/{file_name}/quast_run.log',
41 |     benchmark:
42 |         'run/output/evaluation/quastlg_busco/{{known_ref}}-{{genemodel}}/{{folder_path}}/{{file_name}}/quast_run.t{}.rsrc'.format(config['num_cpu_medium'])
43 |     conda:
44 |          '../environment/conda/conda_rtools.yml'
45 |     threads: config['num_cpu_medium']
46 |     resources:
47 |         mem_per_cpu_mb = lambda wildcards, attempt: int((36864 + attempt * 36864) / config['num_cpu_medium']),
48 |         mem_total_mb = lambda wildcards, attempt: 36864 + attempt * 36864,
49 |         runtime_hrs = lambda wildcards, attempt: 16 * attempt
50 |     params:
51 |         output_dir = lambda wildcards, output: os.path.dirname(output.pdf_report)
52 |     priority: 100
53 |     shell:
54 |         'quast-lg.py --threads {threads} -r {input.known_ref}'
55 |             ' --features gene:{input.genes} --conserved-genes-finding'
56 |             ' --output-dir {params.output_dir} {input.assembly}'
57 |             ' &> {log}'
58 | 


--------------------------------------------------------------------------------
/annotation/20200507_ASanders_wgs_cells.txt:
--------------------------------------------------------------------------------
  1 | GM19650Ax02PE20523
  2 | GM20847Bx02PE20410
  3 | HG01114x02PE20328
  4 | HG01505x02PE20494
  5 | HG01596x02PE20501
  6 | HG02018x01PE20491
  7 | HG02587x02PE20340
  8 | HG00096x02PE20385
  9 | HG00171Ax02PE20490
 10 | HG03065x02PE20587
 11 | HG03371x02PE20572
 12 | GM18939x02PE20464
 13 | GM19036Bx02PE20369
 14 | GM12329x02PE20440
 15 | GM12329x02PE20472
 16 | GM18939x02PE20440
 17 | GM19036Bx02PE20387
 18 | HG00171Ax02PE20432
 19 | HG00171Ax02PE20440
 20 | HG01573x02PE20332
 21 | HG01573x02PE20356
 22 | HG01573x02PE20380
 23 | HG01573x02PE20388
 24 | HG02011x02PE20571
 25 | HG02011x02PE20595
 26 | HG02011x02PE20596
 27 | HG02492x02PE20456
 28 | HG02587x02PE20348
 29 | HG02587x02PE20372
 30 | HG02587x02PE20388
 31 | HG02587x02PE20395
 32 | HG03732x02PE20571
 33 | GM18939x02PE20456
 34 | GM12329x02PE20448
 35 | GM12329x02PE20455
 36 | GM12329x02PE20456
 37 | GM12329x02PE20479
 38 | GM12329x02PE20480
 39 | GM12329x02PE20488
 40 | GM12329x02PE20490
 41 | GM12329x02PE20495
 42 | GM12329x02PE20496
 43 | GM18939x02PE20423
 44 | GM18939x02PE20448
 45 | GM18939x02PE20463
 46 | GM18939x02PE20472
 47 | GM18939x02PE20479
 48 | GM18939x02PE20487
 49 | GM18939x02PE20488
 50 | GM18939x02PE20494
 51 | GM18939x02PE20495
 52 | HG00171Ax02PE20424
 53 | HG00171Ax02PE20439
 54 | HG00171Ax02PE20448
 55 | HG00171Ax02PE20456
 56 | HG00171Ax02PE20472
 57 | HG00171Ax02PE20479
 58 | HG00171Ax02PE20480
 59 | HG00171Ax02PE20487
 60 | HG00171Ax02PE20488
 61 | HG00171Ax02PE20495
 62 | HG00171Ax02PE20496
 63 | HG01505x02PE20424
 64 | HG01505x02PE20432
 65 | HG01505x02PE20440
 66 | HG01505x02PE20487
 67 | HG01505x02PE20496
 68 | HG02011x02PE20524
 69 | HG02011x02PE20532
 70 | HG02011x02PE20540
 71 | HG02011x02PE20548
 72 | HG02011x02PE20556
 73 | HG02011x02PE20580
 74 | HG02011x02PE20587
 75 | HG02011x02PE20588
 76 | HG02492x02PE20423
 77 | HG02492x02PE20439
 78 | HG02492x02PE20440
 79 | HG02492x02PE20472
 80 | HG02492x02PE20479
 81 | HG02492x02PE20480
 82 | HG02492x02PE20487
 83 | HG02492x02PE20488
 84 | HG02492x02PE20495
 85 | HG02492x02PE20496
 86 | HG02587x02PE20323
 87 | HG02587x02PE20324
 88 | HG02587x02PE20332
 89 | HG02587x02PE20339
 90 | HG02587x02PE20355
 91 | HG02587x02PE20356
 92 | HG02587x02PE20371
 93 | HG02587x02PE20379
 94 | HG02587x02PE20380
 95 | HG02587x02PE20387
 96 | HG02587x02PE20390
 97 | HG03009x02PE20384
 98 | HG03065x02PE20524
 99 | HG03065x02PE20532
100 | HG03065x02PE20572
101 | HG03065x02PE20580
102 | HG03065x02PE20588
103 | HG03065x02PE20596
104 | HG03683x01PE20425
105 | HG03683x01PE20461
106 | HG03732x02PE20512
107 | HG03732x02PE20548
108 | HG03732x02PE20572
109 | HG03732x02PE20580
110 | HG03732x02PE20587
111 | HG03732x02PE20588
112 | HG03732x02PE20594
113 | HG03732x02PE20595
114 | HG03732x02PE20596
115 | GM19036Bx02PE20372
116 | HG01573x02PE20324
117 | HG01573x02PE20331
118 | HG01573x02PE20340
119 | HG01573x02PE20396
120 | GM18534Bx02PE20392
121 | HG03065x02PE20556
122 | GM20509Bx01PE20515
123 | GM20509Bx01PE20580
124 | GM20509Bx01PE20504


--------------------------------------------------------------------------------
/notes/minimap_ctg_ref.md:
--------------------------------------------------------------------------------
 1 | # Snakefile: minimap contig to reference assembly alignment
 2 | 
 3 | Sent via e-mail by David on 2019-10-08
 4 | 
 5 | Relevance:
 6 |   - minimap2 parameters to get reasonable alignments between de novo and reference
 7 |   - reference (`ref` below) refers here to, e.g., GRCh38
 8 | 
 9 | 
10 | ```
11 |     ## Snakefile to align denovo contigs to the reference genome
12 | 
13 |     ## Set config file
14 |     configfile: "Snake.config.json"
15 | 
16 |     FASTA, = glob_wildcards("clustered_assembly/{fasta}.fasta")
17 | 
18 |     rule all:
19 |         input:
20 |             #expand("alignments/{fasta}.bed", fasta=FASTA)
21 |             "alignments/HG00733_sra_pbsq1-clr_sqa_clustered_v2.bed"
22 | 
23 |     rule align_fasta:
24 |         input:
25 |             fasta = "clustered_assembly/{fasta}.fasta",
26 |             ref = config["reference"]
27 |         output:
28 |             "alignments/{fasta}.bam"
29 |         log:
30 |             "log/{fasta}.bam.log"
31 |         threads:
32 |             8
33 |         shell:
34 |             #"minimap2 -ax asm20 --eqx -r 20000 -s 30000 -t {threads} {input.ref} {input.fasta} | samtools view -F 260 -b - | samtools sort - > {output} 2> {log}"
35 |             "minimap2 --secondary=no --eqx -Y -ax asm20 -m 10000 -z 10000,50 -r 50000 --end-bonus=100 -O 5,56 -E 4,1 -B 5 -t {threads} {input.ref} {input.fasta} | samtools view -F 260 -b - | samtools sort - > {output} 2> {log}"
36 | 
37 |     rule bam2bed:
38 |         input:
39 |             "alignments/{fasta}.bam"
40 |         output:
41 |             "alignments/{fasta}.bed"
42 |         log:
43 |             "log/{fasta}.bed.log"
44 |         shell:
45 |             "bedtools bamtobed -i {input} > {output} 2> {log}"
46 | 
47 | ```
48 | 
49 | Diagnostic plot of the BED output file can be produced as follows:
50 | 
51 | ```R
52 | library(SaaRclust)
53 | library(BSgenome.Hsapiens.UCSC.hg38)
54 | 
55 | bedfile <- "bedfile with aligned contigs to the reference"
56 | plt1 <- plotClusteredContigs(bedfile = bedfile, min.mapq = 10, bsgenome = BSgenome.Hsapiens.UCSC.hg38, report = 'clustering')
57 | plt2 <- plotClusteredContigs(bedfile = bedfile, min.mapq = 10, bsgenome = BSgenome.Hsapiens.UCSC.hg38, report = 'ordering')
58 | plt3 <- plotClusteredContigs(bedfile = bedfile, min.mapq = 10, bsgenome = BSgenome.Hsapiens.UCSC.hg38, report = 'orienting')
59 | 
60 | #To save the plots:
61 | plot.destination <- "folder where the plots should be saved"
62 | ggsave(filename = file.path(plot.destination, "clustering.pdf"), plot = plt1, width = 12, height = 6)
63 | ggsave(filename = file.path(plot.destination, "ordering.pdf"), plot = plt2, width = 12, height = 6)
64 | ggsave(filename = file.path(plot.destination, "orienting.pdf"), plot = plt3, width = 12, height = 6)
65 | ```
66 | 
67 | ## Note
68 | 
69 | Add following packages to `rtools` environment after bug fixing stage.
70 | 
71 | ```yaml
72 |   - bioconductor-bsgenome=1.50.0=r351_0
73 |   - bioconductor-bsgenome.hsapiens.ucsc.hg38=1.4.1=r351_5
74 | ```


--------------------------------------------------------------------------------
/smk_config/params/smk_cfg_params_RV12.yml:
--------------------------------------------------------------------------------
  1 | 
  2 | # === Software setup settings ===
  3 | # Specify git commits for SaaRclust
  4 | # and StrandPhaseR setup
  5 | git_commit_saarclust: d51c66f
  6 | git_commit_strandphaser: e608407
  7 | git_commit_breakpointr: 268d99d
  8 | # arbitrarily tying a version number
  9 | # to the git commits to avoid additional
 10 | # wildcards - increment this number when
 11 | # git commits are changed!
 12 | git_commit_version: 12
 13 | 
 14 | peregrine_version: 0.1.6.1
 15 | deepvariant_version: 0.10.0
 16 | shasta_version: 0.4.0
 17 | 
 18 | # Assembler settings
 19 | shasta_target_coverage: 60  # tech-independent recommendation: cov between 40x and 80x
 20 | flye_target_coverage: 50  # dev recommendation: ~30x, but we have enough RAM to go a bit higher
 21 | 
 22 | # SaaRclust parameter sets
 23 | # goal is to obtain approx.
 24 | # 24 clusters (for human)
 25 | min_contig_size: 100000
 26 | min_region_to_order: 500000
 27 | bin_size: 200000
 28 | step_size: 200000
 29 | prob_threshold: 0.25
 30 | init_clusters: 100
 31 | desired_clusters: 24
 32 | min_mapq: 10
 33 | 
 34 | # this solves a known HET inversion located on chr8
 35 | sample_non_default_parameters:
 36 |   HG00733:
 37 |     use_only_in:
 38 |       write_saarclust_config_file:
 39 |         init_clusters: 150
 40 |         desired_clusters: 25
 41 |   NA24385:
 42 |     use_only_in:
 43 |       write_saarclust_config_file:
 44 |         init_clusters: 150
 45 |         desired_clusters: 25
 46 |   NA20847:
 47 |     use_only_in:
 48 |       write_saarclust_config_file:
 49 |         desired_clusters: 23
 50 |   HG00864:
 51 |     use_only_in:
 52 |       write_saarclust_config_file:
 53 |         desired_clusters: 23
 54 | 
 55 | # VARIANT CALLING
 56 | # Postprocessing parameters
 57 | filter_vcf_qual: 10
 58 | filter_vcf_gq: 100
 59 | 
 60 | freebayes_timeout_sec: 3600
 61 | 
 62 | # not primary alignment || supplementary alignment
 63 | bwa_strandseq_aln_discard: 2304
 64 | 
 65 | # read unmapped || not primary alignment || failed QC || PCR dup
 66 | minimap_readref_aln_discard: 1796
 67 | 
 68 | # read unmapped || not primary alignment
 69 | minimap_contigref_aln_discard: 260
 70 | 
 71 | # read unmapped || not primary alignment || failed QC || PCR dup
 72 | minimap_racon_aln_discard: 1796  # same as 0x704
 73 | minimap_racon_aln_min_qual: 10
 74 | 
 75 | # main chromosomes to be used
 76 | # for known references for main
 77 | # pipeline steps (i.e., everything
 78 | # before evaluation)
 79 | eval_known_ref: GRCh38_GCA_p13
 80 | eval_align_ref: hg38_GCA_p13
 81 | eval_gene_model: GRCh38_GENCODEv31_basic
 82 | use_genome_size: hg38_GCA_p13
 83 | main_chromosomes:
 84 |   - chr1
 85 |   - chr2
 86 |   - chr3
 87 |   - chr4
 88 |   - chr5
 89 |   - chr6
 90 |   - chr7
 91 |   - chr8
 92 |   - chr9
 93 |   - chr10
 94 |   - chr11
 95 |   - chr12
 96 |   - chr13
 97 |   - chr14
 98 |   - chr15
 99 |   - chr16
100 |   - chr17
101 |   - chr18
102 |   - chr19
103 |   - chr20
104 |   - chr21
105 |   - chr22
106 |   - chrX
107 |   - chrY
108 | 


--------------------------------------------------------------------------------
/notebooks/dev/merge_numpy_aln_dumps.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 7,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "2846211526\n",
13 |       "num_hap\n",
14 |       "1        2945021\n",
15 |       "2        3464902\n",
16 |       "3        2505441\n",
17 |       "4        2542069\n",
18 |       "5        2043097\n",
19 |       "         ...    \n",
20 |       "60       5065712\n",
21 |       "61       5569952\n",
22 |       "62      21048551\n",
23 |       "63      65643213\n",
24 |       "64    2636716984\n",
25 |       "Name: length, Length: 64, dtype: int64\n"
26 |      ]
27 |     }
28 |    ],
29 |    "source": [
30 |     "import os\n",
31 |     "import numpy as np\n",
32 |     "import numpy.ma as ma\n",
33 |     "import pandas as pd\n",
34 |     "import collections as col\n",
35 |     "\n",
36 |     "input_path = '/home/local/work/data/hgsvc/aln_summary'\n",
37 |     "\n",
38 |     "\n",
39 |     "# chrom_regions = []\n",
40 |     "\n",
41 |     "# for dump in os.listdir(input_path):\n",
42 |     "#     if 'hifi-prio' not in dump or not dump.endswith('.npy'):\n",
43 |     "#         continue\n",
44 |     "#     a = np.load(os.path.join(input_path, dump))\n",
45 |     "#     chrom = dump.rsplit('.', 2)[-2]\n",
46 |     "#     print(chrom)\n",
47 |     "#     genomic_coordinates = np.arange(a.size, dtype=np.int32)\n",
48 |     "#     for i in range(1, 65, 1):\n",
49 |     "#         select_regions = ma.masked_array(genomic_coordinates, mask=(a == i))\n",
50 |     "#         df = pd.DataFrame(\n",
51 |     "#             [(s.start, s.stop) for s in  ma.clump_masked(select_regions)],\n",
52 |     "#             columns=['start', 'end'],\n",
53 |     "#             dtype='int32'\n",
54 |     "#         )\n",
55 |     "#         df['chrom'] = chrom\n",
56 |     "#         df['num_hap'] = i\n",
57 |     "#         chrom_regions.append(df)\n",
58 |     "\n",
59 |     "# chrom_regions = pd.concat(chrom_regions, axis=0, ignore_index=False)\n",
60 |     "# chrom_regions.sort_values(['chrom', 'start'], inplace=True, axis=0)\n",
61 |     "    \n",
62 |     "# with open(os.path.join(input_path, 'aln_64hap_hifi-prio.mapq60.bed'), 'w') as bed:\n",
63 |     "#     _ = bed.write('#')\n",
64 |     "#     chrom_regions.to_csv(\n",
65 |     "#         bed,\n",
66 |     "#         header=True,\n",
67 |     "#         index=False,\n",
68 |     "#         sep='\\t',\n",
69 |     "#         columns=['chrom', 'start', 'end', 'num_hap']\n",
70 |     "#     )\n"
71 |    ]
72 |   }
73 |  ],
74 |  "metadata": {
75 |   "kernelspec": {
76 |    "display_name": "Python 3",
77 |    "language": "python",
78 |    "name": "python3"
79 |   },
80 |   "language_info": {
81 |    "codemirror_mode": {
82 |     "name": "ipython",
83 |     "version": 3
84 |    },
85 |    "file_extension": ".py",
86 |    "mimetype": "text/x-python",
87 |    "name": "python",
88 |    "nbconvert_exporter": "python",
89 |    "pygments_lexer": "ipython3",
90 |    "version": "3.7.6"
91 |   }
92 |  },
93 |  "nbformat": 4,
94 |  "nbformat_minor": 4
95 | }
96 | 


--------------------------------------------------------------------------------
/scripts/plot_saarclust_diagnostics.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressMessages(library(SaaRclust))
 4 | 
 5 | args <- commandArgs(trailingOnly=TRUE)
 6 | 
 7 | bed.file <- args[1]
 8 | ref.genome <- args[2]
 9 | output.folder <- args[3]
10 | plot.title <- args[4]
11 | haploid.assembly <- args[5]
12 | 
13 | if (is.na(haploid.assembly)) {
14 |     haploid.assembly <- FALSE
15 | } else {
16 |     haploid.assembly <- as.logical(haploid.assembly)
17 | }
18 | 
19 | stopifnot(ref.genome == 'hg38')
20 | 
21 | suppressMessages(library(BSgenome.Hsapiens.UCSC.hg38))
22 | 
23 | plot.clustering <- NULL
24 | plot.ordering <- NULL
25 | plot.orienting <- NULL
26 | 
27 | if (!haploid.assembly) {
28 | 
29 |     plot.clustering <- plotClusteredContigs(
30 |         bedfile = bed.file,
31 |         min.mapq = 10,
32 |         bsgenome = BSgenome.Hsapiens.UCSC.hg38,
33 |         report = 'clustering',
34 |         title = paste('Clustering', plot.title, sep=': '),
35 |         chromosomes = paste0('chr', c(1:22, 'X'))
36 |     )
37 | 
38 |     plot.orienting <- plotClusteredContigs(
39 |         bedfile = bed.file,
40 |         min.mapq = 10,
41 |         bsgenome = BSgenome.Hsapiens.UCSC.hg38,
42 |         report = 'orienting',
43 |         title = paste('Orientation', plot.title, sep=': '),
44 |         chromosomes = paste0('chr', c(1:22, 'X'))
45 |     )
46 | } else {
47 | 
48 |     plot.clustering <- plotClusteredContigs(
49 |         bedfile = bed.file,
50 |         min.mapq = 10,
51 |         bsgenome = BSgenome.Hsapiens.UCSC.hg38,
52 |         report = 'clustering',
53 |         info.delim = '_',
54 |         info.fields = c('cluster.SRC', 'contig.ID', 'order', 'cluster.ID'),
55 |         col.by = 'cluster.ID',
56 |         title = paste('Clustering', plot.title, sep=': '),
57 |         chromosomes = paste0('chr', c(1:22, 'X'))
58 |     )
59 | 
60 |     plot.ordering <- plotClusteredContigs(
61 |         bedfile = bed.file,
62 |         min.mapq = 10,
63 |         bsgenome = BSgenome.Hsapiens.UCSC.hg38,
64 |         report = 'ordering',
65 |         info.delim = '_',
66 |         info.fields = c('cluster.SRC', 'contig.ID', 'order', 'cluster.ID'),
67 |         title = paste('Ordering', plot.title, sep=': '),
68 |         chromosomes = paste0('chr', c(1:22, 'X'))
69 |     )
70 | 
71 |     plot.orienting <- plotClusteredContigs(
72 |         bedfile = bed.file,
73 |         min.mapq = 10,
74 |         bsgenome = BSgenome.Hsapiens.UCSC.hg38,
75 |         report = 'orienting',
76 |         info.delim = '_',
77 |         info.fields = c('cluster.SRC', 'contig.ID', 'order', 'cluster.ID'),
78 |         title = paste('Orientation', plot.title, sep=': '),
79 |         chromosomes = paste0('chr', c(1:22, 'X'))
80 |     )
81 | }
82 | 
83 | if (!is.null(plot.clustering)) {
84 |     ggsave(filename = paste(output.folder, 'clustering.pdf', sep='.'), plot = plot.clustering, width = 16, height = 8)
85 | }
86 | 
87 | if (!is.null(plot.ordering)) {
88 |     ggsave(filename = paste(output.folder, 'ordering.pdf', sep='.'), plot = plot.ordering, width = 16, height = 8)
89 | }
90 | 
91 | if (!is.null(plot.orienting)) {
92 |     ggsave(filename = paste(output.folder, 'orienting.pdf', sep='.'), plot = plot.orienting, width = 16, height = 8)
93 | }
94 | 
95 | warnings()
96 | 
97 | quit(save='no')


--------------------------------------------------------------------------------
/notes/align_strandseq.md:
--------------------------------------------------------------------------------
 1 | # Align Strand-seq reads to reference or custom assembly
 2 | 
 3 | Sent by David via e-mail on 2019-09-17
 4 | 
 5 | Relevance:
 6 |   - default alignment and preprocessing commands for strand-seq data
 7 |   - merge of mono- and dinucleotide fractions is data-specific
 8 | 
 9 | ```
# NOTE: defaultdict is imported but not used anywhere in this snippet
from collections import defaultdict

# Collect the wildcard values of every read-1 FASTQ file found below rawData/;
# the five lists are parallel (one entry per matching file)
SAMPLE_DIR, SAMPLE, PLATE, LIBUM, NUCL = glob_wildcards(
    "rawData/{sample_dir}/{sample}_{plate}_{libum}_{nucl}_1.fastq.gz"
)

## Take unique ID for each sample
# NOTE: SAMPLES is not referenced by any rule in this snippet
SAMPLES = sorted(set(SAMPLE))

# Default target: the index (.bai) of the merged, sorted and duplicate-marked
# BAM for every (sample_dir, sample, plate, libum) combination found above.
# NUCL is not part of the target name because both fractions are merged.
rule all:
    input:
        bam=lambda wildcards: [
            "alignments/{}/{}.{}.{}.monodi.srt.mdup.bam.bai".format(
                sample_dir, sample, plate, libum
            ) for sample_dir, sample, plate, libum in zip(SAMPLE_DIR, SAMPLE, PLATE, LIBUM)
        ]
26 | 
# Align one paired-end library fraction ({nucl}) with bwa mem and write a
# temporary BAM. Reads carrying any of the flags in 2304 (not primary /
# supplementary alignment) are dropped by samtools view.
rule align_bwa:
    input:
        read1="rawData/{sample_dir}/{sample}_{plate}_{libum}_{nucl}_1.fastq.gz",
        read2="rawData/{sample_dir}/{sample}_{plate}_{libum}_{nucl}_2.fastq.gz",
        ref = config["reference"]
    output:
        temp("alignments/{sample_dir}/{nucl}/{sample}.{plate}.{libum}.{nucl}.bam")
    log:
        # '.log' suffix added: consistent with the other rules and avoids a
        # log file that looks like a BAM
        "log/{sample_dir}/{nucl}/{sample}.{plate}.{libum}.{nucl}.bam.log"
    threads:
        8
    params:
        rg="@RG\\tID:{sample}_{nucl}\\tPL:Illumina\\tSM:{sample}_{nucl}"
    shell:
        # the pipeline is grouped in a subshell so that stderr of BOTH
        # bwa mem and samtools view ends up in the log file
        """
        (bwa mem -t {threads} \
                -R '{params.rg}' \
                {input.ref} {input.read1} {input.read2} | samtools view -Sb -F 2304 - > {output}) 2> {log}
        """
46 | 
# Merge the 'mono' and 'di' fraction BAMs of one library into a single
# temporary BAM ({input} expands to both input files).
rule merge_mono_di:
    input:
        bam1="alignments/{sample_dir}/mono/{sample}.{plate}.{libum}.mono.bam",
        bam2="alignments/{sample_dir}/di/{sample}.{plate}.{libum}.di.bam"
    output:
        temp("alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.bam")
    log:
        "log/{sample_dir}/{sample}.{plate}.{libum}.monodi.bam.log"
    threads:
        8
    shell:
        "samtools merge -@ {threads} -O BAM {output} {input} 2> {log}"
59 | 
# Sort the merged BAM with samtools sort (no explicit sort order given, so
# the samtools default applies); the output is temporary.
rule sort_mono_di:
    input:
        "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.bam"
    output:
        temp("alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.bam")
    log:
        "log/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.bam.log"
    threads:
        8
    shell:
        "samtools sort -@ {threads} -O BAM {input} -o {output} 2> {log}"
71 | 
72 | 
# Run sambamba markdup on the sorted BAM; this is the first non-temporary
# output of the workflow.
# NOTE(review): unlike the other rules, the log is written next to the BAM
# under alignments/ instead of log/ - confirm whether this is intended.
rule markDuplicates:
    input:
        "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.bam"
    output:
        "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam"
    log:
        "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam.log"
    threads:
        8
    shell:
        "sambamba markdup -t {threads} {input} {output} 2> {log}"
84 | 
85 | 
# Create the .bai index for the final BAM (the file requested by rule all);
# this rule defines no log file.
rule index_bam:
    input:
        "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam"
    output:
        "alignments/{sample_dir}/{sample}.{plate}.{libum}.monodi.srt.mdup.bam.bai"
    threads:
        8
    shell:
        "samtools index -@ {threads} {input} {output}"
95 | 
96 | ```
97 | 


--------------------------------------------------------------------------------
/scripts/utilities/summarize_vcf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import os
  4 | import argparse as argp
  5 | 
  6 | import pandas as pd
  7 | import pysam
  8 | 
  9 | 
# Zygosity label per genotype tuple: identical allele indices (including the
# reference-only call 0/0) are labelled 'HOM', differing indices 'HET'.
# Only genotypes built from the alleles 0 and 1 are listed; any other
# genotype tuple raises a KeyError on lookup.
GT_MAP = {
    (0, 0): 'HOM',
    (1, 1): 'HOM',
    (0, 1): 'HET',
    (1, 0): 'HET'
}
 16 | 
 17 | 
 18 | def parse_command_line():
 19 | 
 20 |     parser = argp.ArgumentParser()
 21 |     parser.add_argument(
 22 |         '--input',
 23 |         '-i',
 24 |         type=str,
 25 |         dest='input',
 26 |         nargs='+'
 27 |         )
 28 |     parser.add_argument(
 29 |         '--output',
 30 |         '-o',
 31 |         type=str,
 32 |         dest='output',
 33 |         required=True
 34 |     )
 35 |     args = parser.parse_args()
 36 |     return args
 37 | 
 38 | 
 39 | def parse_filename(file_path):
 40 | 
 41 |     fname = os.path.basename(file_path)
 42 |     short_reads, long_read_assm = fname.split('_map-to_')
 43 |     sample = short_reads.split('_')[0]
 44 | 
 45 |     hap = long_read_assm.split('.')[1]
 46 |     if hap == 'h1-un':
 47 |         hap = 10
 48 |     elif hap == 'h2-un':
 49 |         hap = 20
 50 |     else:
 51 |         raise ValueError('Unexpected haplotype: {}'.format(fname))
 52 | 
 53 |     platform = long_read_assm.split('_')[1].split('-')[1]
 54 |     assert platform in ['clr', 'ccs'], 'Unknown long read tech: {}'.format(long_reads)
 55 |     platform = 'HiFi' if platform == 'ccs' else 'CLR'
 56 | 
 57 |     return sample, platform, hap
 58 | 
 59 | 
 60 | def main():
 61 |     args = parse_command_line()
 62 | 
 63 |     out_mode = 'w'
 64 | 
 65 |     for vcf_file in args.input:
 66 |         rows = []
 67 |         index = []
 68 |         sample, platform, hap = parse_filename(vcf_file)
 69 | 
 70 |         with pysam.VariantFile(vcf_file, 'r') as vcf:
 71 |             for record in vcf:
 72 |                 assert record.chrom == record.contig, 'Sequence mismatch: {}'.format(record)
 73 |                 v = {
 74 |                     'sequence': record.chrom,
 75 |                     'pos': record.pos,
 76 |                     'start': record.start,
 77 |                     'stop': record.stop,
 78 |                     'qual': int(round(record.qual, 0)),
 79 |                     'ref_allele': record.ref,
 80 |                     'alt_allele': record.alts[0],
 81 |                     'depth': record.info['DP'],
 82 |                     'region_length': record.rlen,
 83 |                     'variant_length': record.info['LEN'][0]
 84 |                 }
 85 |                 var_type = record.info['TYPE'][0].upper()
 86 |                 if var_type == 'SNP':
 87 |                     var_type = 'SNV'
 88 |                 genotype = record.samples[sample]['GT']
 89 |                 gt = GT_MAP[genotype]
 90 |                 index.append((sample, platform, hap, var_type, gt))
 91 |                 rows.append(v)
 92 |     
 93 |         df = pd.DataFrame.from_records(
 94 |             rows,
 95 |             index=pd.MultiIndex.from_tuples(
 96 |                 index,
 97 |                 names=['sample', 'platform', 'hap', 'var_type', 'genotype']
 98 |             )
 99 |         )
100 |         store_key = os.path.join(sample, platform, 'HAP' + str(hap))
101 |         df.to_hdf(args.output, store_key, mode=out_mode, format='fixed', complevel=9)
102 |         out_mode = 'a'
103 | 
104 |     return 0
105 | 
106 | 
107 | if __name__ == '__main__':
108 |     main()


--------------------------------------------------------------------------------
/scripts/eval/extract_contigs.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import os
  4 | import io
  5 | import argparse
  6 | 
  7 | import pandas as pd
  8 | 
  9 | 
 10 | def parse_command_line():
 11 | 
 12 |     parser = argparse.ArgumentParser()
 13 |     parser.add_argument(
 14 |         '--contig-table',
 15 |         '-ct',
 16 |         type=str,
 17 |         dest='contig_table'
 18 |     )
 19 |     parser.add_argument(
 20 |         '--fasta-folder',
 21 |         '-ff',
 22 |         type=str,
 23 |         dest='fasta_folder'
 24 |     )
 25 |     parser.add_argument(
 26 |         '--output-folder',
 27 |         '-of',
 28 |         type=str,
 29 |         dest='output_folder'
 30 |     )
 31 |     args = parser.parse_args()
 32 |     return args
 33 | 
 34 | 
 35 | def read_seqs_from_fasta(fasta_path, contigs):
 36 | 
 37 |     seq_buffer = io.StringIO()
 38 | 
 39 |     buffer = False
 40 |     with open(fasta_path, 'r') as fasta:
 41 |         for line in fasta:
 42 |             if line.startswith('>'):
 43 |                 this_contig = line.strip().strip('>')
 44 |                 if this_contig in contigs:
 45 |                     _ = seq_buffer.write('\n')
 46 |                     _ = seq_buffer.write(line)
 47 |                     buffer = True
 48 |                 else:
 49 |                     buffer = False
 50 |                 continue
 51 |             if buffer:
 52 |                 _ = seq_buffer.write(line)
 53 |     return seq_buffer
 54 | 
 55 | 
 56 | def cache_fasta_paths(fasta_folder):
 57 | 
 58 |     cache = dict()
 59 |     for filename in os.listdir(fasta_folder):
 60 |         if not filename.endswith('.fasta'):
 61 |             continue
 62 |         sample, _, platform, _ = filename.split('_', 3)
 63 |         if platform == 'pbsq2-clr':
 64 |             tech = 'CLR'
 65 |         elif platform == 'pbsq2-ccs':
 66 |             tech = 'CCS'
 67 |         else:
 68 |             raise ValueError(filename)
 69 |         if 'h1-un' in filename:
 70 |             hap = 'H1'
 71 |         elif 'h2-un' in filename:
 72 |             hap = 'H2'
 73 |         else:
 74 |             raise ValueError(filename)
 75 |         cache[(sample, hap, tech)] = filename
 76 |     return cache
 77 | 
 78 | 
 79 | def main():
 80 | 
 81 |     args = parse_command_line()
 82 |     os.makedirs(args.output_folder, exist_ok=True)
 83 | 
 84 |     df = pd.read_csv(args.contig_table, sep='\t', header=0)
 85 | 
 86 |     fasta_cache = cache_fasta_paths(args.fasta_folder)
 87 | 
 88 |     for (sample, hap, tech), contigs in df.groupby(['sample', 'haplotype', 'platform']):
 89 |         try:
 90 |             fasta_file = fasta_cache[(sample, hap, tech)]
 91 |         except KeyError:
 92 |             print('skipping ', sample, hap, tech)
 93 |             continue
 94 |         contig_names = set(contigs['contig_id'])
 95 |         fasta_path = os.path.join(args.fasta_folder, fasta_file)
 96 |         contig_seqs = read_seqs_from_fasta(fasta_path, contig_names)
 97 | 
 98 |         outname = fasta_file.replace('.fasta', '.ctg3q29.fasta')
 99 |         outpath = os.path.join(args.output_folder, outname)
100 |         if os.path.isfile(outpath):
101 |             continue
102 | 
103 |         with open(outpath, 'w') as dump:
104 |             _ = dump.write(contig_seqs.getvalue())
105 |         print('done ', sample, hap, tech)
106 |         
107 |     return 0
108 | 
109 | 
110 | if __name__ == '__main__':
111 |     main()


--------------------------------------------------------------------------------
/notebooks/subsample_hg00733_strandseq.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 88,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "copying\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "import yaml\n",
 18 |     "import json\n",
 19 |     "import os\n",
 20 |     "import random\n",
 21 |     "import shutil\n",
 22 |     "\n",
 23 |     "\n",
 24 |     "folder = '/scratch/bioinf/projects/diploid-genome-assembly/pebert/test_clr/run_folder/input/fastq/HG00733_1kg_il25k-npe_sseq'\n",
 25 |     "\n",
 26 |     "out_folder = '/scratch/bioinf/projects/diploid-genome-assembly/pebert/subsampling'\n",
 27 |     "os.makedirs(out_folder, exist_ok=True)\n",
 28 |     "\n",
 29 |     "all_files = os.listdir(folder)\n",
 30 |     "\n",
 31 |     "all_libs = set([x.split('_')[3] for x in all_files if x.endswith('.fastq.gz')])\n",
 32 |     "\n",
 33 |     "total_libs = len(all_libs)\n",
 34 |     "\n",
 35 |     "all_subsets = []\n",
 36 |     "\n",
 37 |     "while 1:\n",
 38 |     "\n",
 39 |     "    for factor in [0.8, 0.6, 0.4, 0.2]:\n",
 40 |     "        select_num = int(round(total_libs * factor))\n",
 41 |     "        selected_libs = set(random.sample(all_libs, select_num))\n",
 42 |     "        all_subsets.append(selected_libs)\n",
 43 |     "\n",
 44 |     "    all_jaccards = []\n",
 45 |     "\n",
 46 |     "    for i in all_subsets:\n",
 47 |     "        for j in all_subsets:\n",
 48 |     "            if i == j:\n",
 49 |     "                continue\n",
 50 |     "            isect = i.intersection(j)\n",
 51 |     "            union = i.union(j)\n",
 52 |     "            jaccard = len(isect) / len(union)\n",
 53 |     "            all_jaccards.append(jaccard)\n",
 54 |     "\n",
 55 |     "    if all([j < 0.5 for j in all_jaccards]):\n",
 56 |     "        print('copying')\n",
 57 |     "        for subset, label in zip(all_subsets, ['sub80', 'sub60', 'sub40', 'sub20']):\n",
 58 |     "            out_path = os.path.join(out_folder, label)\n",
 59 |     "            os.makedirs(out_path, exist_ok=True)\n",
 60 |     "            for lib in subset:\n",
 61 |     "                subset_files = [f for f in all_files if lib in f]\n",
 62 |     "                for sf in subset_files:\n",
 63 |     "                    new_file = sf.replace('1kg', label)\n",
 64 |     "                    old_path = os.path.join(folder, sf)\n",
 65 |     "                    new_path = os.path.join(out_path, new_file)\n",
 66 |     "                    shutil.copy(old_path, new_path)\n",
 67 |     "        break\n",
 68 |     "    else:\n",
 69 |     "        print('max j ', max(all_jaccards))\n",
 70 |     "                \n",
 71 |     "    \n",
 72 |     "    \n",
 73 |     "\n",
 74 |     "\n",
 75 |     "\n"
 76 |    ]
 77 |   }
 78 |  ],
 79 |  "metadata": {
 80 |   "kernelspec": {
 81 |    "display_name": "Python 3",
 82 |    "language": "python",
 83 |    "name": "python3"
 84 |   },
 85 |   "language_info": {
 86 |    "codemirror_mode": {
 87 |     "name": "ipython",
 88 |     "version": 3
 89 |    },
 90 |    "file_extension": ".py",
 91 |    "mimetype": "text/x-python",
 92 |    "name": "python",
 93 |    "nbconvert_exporter": "python",
 94 |    "pygments_lexer": "ipython3",
 95 |    "version": "3.6.7"
 96 |   }
 97 |  },
 98 |  "nbformat": 4,
 99 |  "nbformat_minor": 2
100 | }
101 | 


--------------------------------------------------------------------------------
/smk_config/samples/na24143.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA24143:
 3 |   individual: NA24143
 4 |   sex: female
 5 |   super_population: EUR
 6 |   population: AKJ
 7 |   family: 3140
 8 |   member: parent
 9 |   comment: "Sample alias: HG004"
10 |   data_sources:
11 |     - long_reads:
12 |         readset: NA24143_hpg_pbsq2-ccs
13 |         technology: pacbio
14 |         data_type: fastq
15 |         load_type: parts
16 |         source_type: amazon
17 |         comment: "https://github.com/human-pangenomics/HG002_Data_Freeze_v1.0#motherhg004na24143"
18 |     - strandseq:
19 |         readset: &sseq_reads NA24143_bccrc_ilany-75pe_sseq
20 |         source_type: local
21 |         library_fractions: one
22 |         comment: "Lansdorp collaboration data"
23 | 
24 | 
25 | sample_targets_NA24143:
26 |   - aliases:
27 |       1: &ccs_reads NA24143_hpg_pbsq2-ccs_1000
28 |   - defaults:
29 |       hap_reads: *ccs_reads
30 |       vc_reads: *ccs_reads
31 |       sseq_reads: *sseq_reads
32 |       pol_reads: *ccs_reads
33 |       pol_pass: racon-p2
34 |       hap_assm_mode: split
35 |       hap:
36 |         - h1-un
37 |         - h2-un
38 |   - target:
39 |       name: Lansdorp
40 |       nhr_assembler: hifiasm
41 |       hap_assembler: hifiasm
42 |       var_caller: deepvar
43 | 
44 | 
45 | sample_data_sources_NA24143:
46 |   input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part1:
47 |     local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part1.fastq.gz
48 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel733_2_B01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191124_055423.fastq.gz
49 |   input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part2:
50 |     local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part2.fastq.gz
51 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel735_1_A01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191126_155613.fastq.gz
52 |   input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part3:
53 |     local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part3.fastq.gz
54 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel735_2_B01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191127_220906.fastq.gz
55 |   input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part4:
56 |     local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part4.fastq.gz
57 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel735_3_C01_PBSU_30hours_15kbV2PD_70pM_HumanHG004_CCS/m64017_191129_043425.fastq.gz
58 |   input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part5:
59 |     local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part5.fastq.gz
60 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_21kb/PBmixSequel724_1_A01_PBTA_30hours_21kbV2PD_70pM_HumanHG004_CCS/m64017_191115_211223.fastq.gz
61 |   input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part6:
62 |     local_path: input/fastq/NA24143_hpg_pbsq2-ccs/NA24143_hpg_pbsq2-ccs.part6.fastq.gz
63 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG004/PacBio_HiFi/HudsonAlpha_21kb/PBmixSequel725_1_A01_PBTA_30hours_21kbV2PD_70pM_HumanHG004_CCS/m64017_191118_150849.fastq.gz
64 | 
65 | 


--------------------------------------------------------------------------------
/smk_config/samples/na24149.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | sample_description_NA24149:
 3 |   individual: NA24149
 4 |   sex: male
 5 |   super_population: EUR
 6 |   population: AKJ
 7 |   family: 3140
 8 |   member: parent
 9 |   comment: "Sample alias: HG003"
10 |   data_sources:
11 |     - long_reads:
12 |         readset: NA24149_hpg_pbsq2-ccs
13 |         technology: pacbio
14 |         data_type: fastq
15 |         load_type: parts
16 |         source_type: amazon
17 |         comment: "https://github.com/human-pangenomics/HG002_Data_Freeze_v1.0#fatherhg003na24149"
18 |     - strandseq:
19 |         readset: &sseq_reads NA24149_bccrc_ilany-75pe_sseq
20 |         source_type: local
21 |         library_fractions: one
22 |         comment: "Lansdorp collaboration data"
23 | 
24 | 
25 | sample_targets_NA24149:
26 |   - aliases:
27 |       1: &ccs_reads NA24149_hpg_pbsq2-ccs_1000
28 |   - defaults:
29 |       hap_reads: *ccs_reads
30 |       vc_reads: *ccs_reads
31 |       sseq_reads: *sseq_reads
32 |       pol_reads: *ccs_reads
33 |       pol_pass: racon-p2
34 |       hap_assm_mode: split
35 |       hap:
36 |         - h1-un
37 |         - h2-un
38 |   - target:
39 |       name: Lansdorp
40 |       nhr_assembler: hifiasm
41 |       hap_assembler: hifiasm
42 |       var_caller: deepvar
43 | 
44 | 
45 | sample_data_sources_NA24149:
46 |   input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part1:
47 |     local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part1.fastq.gz
48 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_14kb/PBmixSequel740_2_B01_PBST_30hours_14kbV2PD_70pM_HumanHG003_CCS/m64017_191205_225630.fastq.gz
49 |   input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part2:
50 |     local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part2.fastq.gz
51 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_14kb/PBmixSequel740_3_C01_PBST_30hours_14kbV2PD_70pM_HumanHG003_CCS/m64017_191207_052215.fastq.gz
52 |   input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part3:
53 |     local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part3.fastq.gz
54 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel737_1_A01_PBTG_30hours_15kbV2PD_70pM_HumanHG003_CCS/m64017_191202_204405.fastq.gz
55 |   input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part4:
56 |     local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part4.fastq.gz
57 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_15kb/PBmixSequel740_1_A01_PBTG_30hours_15kbV2PD_70pM_HumanHG003_CCS/m64017_191204_164321.fastq.gz
58 |   input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part5:
59 |     local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part5.fastq.gz
60 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_19kb/PBmixSequel729_1_A01_PBTH_30hours_19kbV2PD_70pM_HumanHG003_CCS/m64017_191120_193948.fastq.gz
61 |   input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part6:
62 |     local_path: input/fastq/NA24149_hpg_pbsq2-ccs/NA24149_hpg_pbsq2-ccs.part6.fastq.gz
63 |     remote_path: https://s3-us-west-2.amazonaws.com/human-pangenomics/HG003/PacBio_HiFi/HudsonAlpha_19kb/PBmixSequel733_1_A01_PBTH_30hours_19kbV2PD_70pM_HumanHG003_CCS/m64017_191122_184406.fastq.gz
64 | 
65 | 


--------------------------------------------------------------------------------
/annotation/grch38/issues/grch38_p13_unknown.tsv:
--------------------------------------------------------------------------------
 1 | "HG-2562"	"Unknown"	"chr14:105,171,721-105,336,833"	1	"Open"	""	"Ensembl, NCBI, UCSC"	"HG00733 contains a 7kbp insertion haplotype not seen in the Reference."
 2 | "HG-2552"	"Unknown"	"chrX:149,479,317-149,843,545"	1	"Open"	""	"Ensembl, NCBI, UCSC"	"Possible Inversion in reference components AC244197.3_AC244098.2"
 3 | "HG-2550"	"Unknown"	"chr11:61,949,767-62,116,633"	1	"Open"	""	"Ensembl, NCBI, UCSC"	"Possible Inversion in reference component AP003733.5"
 4 | "HG-2549"	"Unknown"	"chr11:215,457-356,450"	1	"Under Review"	""	"Ensembl, NCBI, UCSC"	"Possible Inversion in reference component AC136475.7"
 5 | "HG-2547"	"Unknown"	"chr9:123,855,313-124,032,767"	1	"Open"	""	"Ensembl, NCBI, UCSC"	"Possible Inversion in reference component AC006450.13"
 6 | "HG-2546"	"Unknown"	"chr6:106,695,378-106,856,632"	1	"Open"	""	"Ensembl, NCBI, UCSC"	"Possible Inversion in reference component AL080314.29"
 7 | "HG-2536"	"Unknown"	"chr21:6,427,259-6,580,181"	1	"Open"	""	"Ensembl, NCBI, UCSC"	"Does GRCh38 placement of FP236240.8 on 21p arm represent a true duplication?"
 8 | "HG-2497"	"Unknown"	"chr13:32,231,913-32,398,469"	1	"Awaiting External Info"	""	"Ensembl, NCBI, UCSC"	"GRCh38 represents rare allele in BRCA2 (rs169547)"
 9 | "HG-2467"	"Unknown"	"chr15:77,667,045-77,764,477"	1	"Awaiting External Info"	""	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AC110607.7"
10 | "HG-2426"	"Unknown"	"chr17:43,196,179-43,329,548"	1	"Under Review"	""	"Ensembl, NCBI, UCSC"	"Two bases (AA) missing from reference assembly in intron of NBR1 gene"
11 | "HG-2425"	"Unknown"	"chr16:21,176,310-22,760,988"	2	"Under Review"	""	"Ensembl, NCBI, UCSC"	"Potential rare variant haplotype at 16p12 in GRCh38"
12 | "HG-2359"	"Unknown"	"chr19:23,998,162-24,111,739"	1	"Awaiting External Info"	""	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AC092279.2"
13 | "HG-2356"	"Unknown"	"chr14:103,846,805-103,934,844"	1	"Under Review"	""	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AL132712.4"
14 | "HG-2165"	"Unknown"	"chr5:157,700,922-157,847,717"	1	"Awaiting Elec Data"	"GRCh39"	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AC026407.4"
15 | "HG-2113"	"Unknown"	"chr11:30,834,079-30,977,299"	1	"Awaiting Elec Data"	""	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AL135932.7"
16 | "HG-2101"	"Unknown"	"chr1:86,959,754-87,131,991"	1	"Awaiting External Info"	""	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AC093155.2"
17 | "HG-2082"	"Unknown"	"chr7:68,728,846-68,835,472"	1	"Awaiting Exptl Data"	""	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AC004929.2"
18 | "HG-2069"	"Unknown"	"chr3:37,553,864-37,716,135"	1	"Under Review"	""	"Ensembl, NCBI, UCSC"	"Possible misassembly or indel variation in GRCh38 within AP006240.1"
19 | "HG-2020"	"Unknown"	"chr10:46,853,171-47,145,966"	1	"Open"	""	"Ensembl, NCBI, UCSC"	"PTPN20A is missing from GRCh38"
20 | "HG-1574"	"Unknown"	"chr22:50,342,656-50,806,138"	1	"Awaiting Exptl Data"	""	"Ensembl, NCBI, UCSC"	"Chr 22 ABC12 pathway"
21 | "HG-994"	"Unknown"	"chr16:88,986,311-89,130,142"	1	"Awaiting External Info"	""	"Ensembl, NCBI, UCSC"	"Possible missing sequence in assembly component AC135782.4"
22 | "HG-675"	"Unknown"	"chr12:40,461,422-40,561,522"	1	"Stalled"	""	"Ensembl, NCBI, UCSC"	"GeneID: 283463 (MUC19) has poor alignment to the Reference"
23 | 


--------------------------------------------------------------------------------
/notebooks/dump_sample_table.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 8,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "import os\n",
10 |     "\n",
11 |     "import yaml\n",
12 |     "import pandas as pd\n",
13 |     "\n",
14 |     "base_path = '/home/local/work/code/github/project-diploid-assembly/smk_config/samples'\n",
15 |     "\n",
16 |     "ignore_samples = set([\n",
17 |     "    'NA19434',\n",
18 |     "    'HG03721',\n",
19 |     "    'HG01573',\n",
20 |     "    'HG02018',\n",
21 |     "    'NA19036',\n",
22 |     "    'NA19320'\n",
23 |     "])\n",
24 |     "\n",
25 |     "samples = []\n",
26 |     "for root, dirs, files in os.walk(base_path):\n",
27 |     "    yaml_configs = [f for f in files if f.endswith('.yml') or f.endswith('.yaml')]\n",
28 |     "    for cfg in yaml_configs:\n",
29 |     "        with open(os.path.join(root, cfg), 'r') as dump:\n",
30 |     "            metadata = yaml.safe_load(dump)\n",
31 |     "            is_sample = [k for k in metadata.keys() if k.startswith('sample_description')]\n",
32 |     "            if not is_sample:\n",
33 |     "                continue\n",
34 |     "            metadata = metadata[is_sample.pop()]\n",
35 |     "            metadata['HiFi'] = 0\n",
36 |     "            metadata['CLR'] = 0\n",
37 |     "            metadata['2020_SKIP'] = 1 if metadata['individual'] in ignore_samples else 0\n",
38 |     "            if metadata['population'] == 'AKJ':\n",
39 |     "                metadata['population'] = 'ASK'\n",
40 |     "            for ds in metadata['data_sources']:\n",
41 |     "                if 'long_reads' not in ds:\n",
42 |     "                    continue\n",
43 |     "                attributes = ds['long_reads']\n",
44 |     "                if 'pbsq2' not in attributes['readset']:\n",
45 |     "                    continue\n",
46 |     "                if '-ccs' in attributes['readset']:\n",
47 |     "                    metadata['HiFi'] = 1\n",
48 |     "                    continue\n",
49 |     "                if '-clr' in attributes['readset']:\n",
50 |     "                    metadata['CLR'] = 1\n",
51 |     "                    continue\n",
52 |     "            del metadata['data_sources']\n",
53 |     "            samples.append(metadata)\n",
54 |     "\n",
55 |     "sample_table = pd.DataFrame(samples)\n",
56 |     "sample_table = sample_table[[\n",
57 |     "    'individual',\n",
58 |     "    'sex',\n",
59 |     "    'super_population',\n",
60 |     "    'population',\n",
61 |     "    'family',\n",
62 |     "    'member',\n",
63 |     "    'HiFi',\n",
64 |     "    'CLR',\n",
65 |     "    '2020_SKIP'\n",
66 |     "]]\n",
67 |     "out_path = '/home/local/work/code/github/project-diploid-assembly/annotation/sample_table.tsv'\n",
68 |     "sample_table.sort_values(['super_population', 'population', 'individual'], inplace=True)\n",
69 |     "sample_table.to_csv(out_path, sep='\\t', header=True, index=False)"
70 |    ]
71 |   }
72 |  ],
73 |  "metadata": {
74 |   "kernelspec": {
75 |    "display_name": "Python 3",
76 |    "language": "python",
77 |    "name": "python3"
78 |   },
79 |   "language_info": {
80 |    "codemirror_mode": {
81 |     "name": "ipython",
82 |     "version": 3
83 |    },
84 |    "file_extension": ".py",
85 |    "mimetype": "text/x-python",
86 |    "name": "python",
87 |    "nbconvert_exporter": "python",
88 |    "pygments_lexer": "ipython3",
89 |    "version": "3.7.6"
90 |   }
91 |  },
92 |  "nbformat": 4,
93 |  "nbformat_minor": 4
94 | }
95 | 


--------------------------------------------------------------------------------