├── .gitattributes ├── dockerized ├── multi_cpu_local.config ├── gwasdata │ ├── .DS_Store │ └── plink │ │ ├── .DS_Store │ │ └── high_LD_regions.txt ├── example_config.config ├── scripts │ ├── hwe_plot_qcplink.R │ ├── maf_plot_qcplink.R │ ├── snpmiss_plot_qcplink.R │ ├── select_diffmiss_qcplink.pl │ ├── diffmiss_plot_qcplink.R │ ├── select_miss_het_qcplink.pl │ ├── run_IBD_QC_qcplink.pl │ └── miss_het_plot_qcplink.R ├── dockerfile_witsgwas_container │ └── Dockerfile └── witsgwas_dockerized_pipeline.nf ├── QuickstartUserInput.py ├── LICENSE ├── README.md ├── pipeline_quickstart_stages_config.py ├── pipeline_quickstart_config.py ├── pipeline_quickstart.py ├── cluster_job.py └── cluster_job_edited_for_witsGWAS.py /.gitattributes: -------------------------------------------------------------------------------- 1 | dockerized/* linguist-vendored 2 | -------------------------------------------------------------------------------- /dockerized/multi_cpu_local.config: -------------------------------------------------------------------------------- 1 | process.cpus = '8' 2 | process.memory = '16 GB' 3 | -------------------------------------------------------------------------------- /dockerized/gwasdata/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magosil86/witsGWAS/HEAD/dockerized/gwasdata/.DS_Store -------------------------------------------------------------------------------- /dockerized/gwasdata/plink/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magosil86/witsGWAS/HEAD/dockerized/gwasdata/plink/.DS_Store -------------------------------------------------------------------------------- /dockerized/example_config.config: -------------------------------------------------------------------------------- 1 | process.executor = 'pbs' 2 | process.queue = 'WitsLong' 3 | process.memory = '16 GB' 4 | process.time = '6h' 5 | 
process.cpus = '8' 6 | -------------------------------------------------------------------------------- /dockerized/scripts/hwe_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load HWE P-value file and generate frequency_distribution 2 | b.frq <- read.table("clean_inds_qcplink_hweu.hwe",header=T) 3 | pdf("qcplink_plots/hwe_plot.pdf") 4 | b.frq$logP = log10(b.frq$P) 5 | plot(ecdf(b.frq$logP), xlim=c(-10,0),ylim=c(0,0.80),pch=20, main="HWE P-value",xlab="logP (HWE)", ylab="Fraction of SNPs",axes=T) 6 | -------------------------------------------------------------------------------- /dockerized/scripts/maf_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load SNP frequency file and generate cumulative frequency distribution 2 | b.frq <- read.table("qced_clean_inds_freq.frq",header=T) 3 | pdf("qcplink_plots/maf_plot.pdf") 4 | plot(ecdf(b.frq$MAF), xlim=c(0,0.10),ylim=c(0,1),pch=20, main="MAF cumulative distribution",xlab="Minor allele frequency (MAF)", ylab="Fraction of SNPs",axes=T) 5 | -------------------------------------------------------------------------------- /dockerized/scripts/snpmiss_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load SNP frequency file and generate histogram 2 | b.frq <- read.table("clean_inds_qcplink_missing.lmiss",header=T) 3 | pdf("qcplink_plots/snpmiss_plot.pdf") 4 | plot(ecdf(b.frq$F_MISS),xlim=c(0,0.10),ylim=c(0,1),pch=20, main="SNP Missingness Distribution", xlab="Missingness Frequency", ylab="Fraction of SNPs",col="blue",axes=T) 5 | -------------------------------------------------------------------------------- /dockerized/scripts/select_diffmiss_qcplink.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | open IN, '<', "clean_inds_qcplink_test_missing.missing" or die "Cannot open missing file 
\n"; 6 | open OUT, '>', "fail_diffmiss_qcplink.txt"; 7 | while(<IN>){ 8 | s/^\s+//; 9 | my @fields = split /\s+/, $_; 10 | unless($fields[0] eq 'CHR'){ 11 | if($fields[4] < $ARGV[0]){ 12 | print OUT "$fields[1]\n"; 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /QuickstartUserInput.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ QuickstartUserInput.py 4 | 5 | -Configuration file for the user to supply the projectname, author, 6 | and cutoffs specific to pipeline_quickstart.py 7 | ============================================================================= 8 | """ 9 | 10 | # settings for pipeline_quickstart.py 11 | #========================================== 12 | 13 | projectname = '' 14 | 15 | author = '' 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /dockerized/gwasdata/plink/high_LD_regions.txt: -------------------------------------------------------------------------------- 1 | 1 48000000 52000000 1 2 | 2 86000000 100500000 2 3 | 2 183000000 190000000 3 4 | 3 47500000 50000000 4 5 | 3 83500000 87000000 5 6 | 5 44500000 50500000 6 7 | 5 129000000 132000000 7 8 | 6 25500000 33500000 8 9 | 6 57000000 64000000 9 10 | 6 140000000 142500000 10 11 | 7 55000000 66000000 11 12 | 8 8000000 12000000 12 13 | 8 43000000 50000000 13 14 | 8 112000000 115000000 14 15 | 10 37000000 43000000 15 16 | 11 87500000 90500000 16 17 | 12 33000000 40000000 17 18 | 20 32000000 34500000 18 -------------------------------------------------------------------------------- /dockerized/scripts/diffmiss_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load SNP differential missingness file and generate distribution 2 | b.frq <- read.table("clean_inds_qcplink_test_missing.missing",header=T) 3 | if (nrow(b.frq) >= 1) { 4 | b.frq$logP = log10(b.frq$P) 5 | 
pdf("qcplink_plots/diffmiss_plot.pdf") 6 | plot(ecdf(b.frq$logP), xlim=c(-10,0),ylim=c(0,1),pch=20, main="Distribution of differential missingness P-values", xlab="logP Differential Missingness", ylab="Fraction of SNPs",col="red",axes=T) 7 | } else { 8 | print("No differential missingness info to plot")} -------------------------------------------------------------------------------- /dockerized/scripts/select_miss_het_qcplink.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | $cut_het_high=$ARGV[0]; 4 | $cut_het_low=$ARGV[1]; 5 | $cut_miss=$ARGV[2]; 6 | 7 | open(MISSFILE,"qcplink_miss.imiss"); 8 | open(HETFILE,"qcplink_het.het"); 9 | @all=<HETFILE>; 10 | chomp(@all); 11 | open(OUT,">fail_miss_het_qcplink.txt"); 12 | 13 | $line=0; 14 | while(<MISSFILE>){ 15 | chomp($_); 16 | 17 | if($line>=1){ 18 | chomp($_); 19 | @parts_miss=split(/\s+/,$_); 20 | $missing=$parts_miss[6]; 21 | 22 | @parts_het=split(/\s+/,$all[$line]); 23 | $meanHet=sprintf("%.3f", ($parts_het[5]-$parts_het[3])/$parts_het[5]); 24 | 25 | if($missing>$cut_miss or $meanHet>$cut_het_high or $meanHet<$cut_het_low){ 26 | print OUT $parts_miss[1],"\t",$parts_miss[2],"\t",$missing,"\t",$meanHet,"\n"; 27 | } 28 | } 29 | 30 | 31 | ++$line; 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Lerato E. 
Magosi and Scott Hazelhurst, Sydney Brenner Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /dockerized/scripts/run_IBD_QC_qcplink.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | my %imiss; 6 | my %removed; 7 | 8 | open IMISS, '<', $ARGV[0].".imiss" 9 | or die "Cannot open genotypes file (".$ARGV[0].".imiss): $!\n"; 10 | print "Reading PLINK .imiss file ".$ARGV[0].".imiss\n"; 11 | while(<IMISS>){ 12 | s/^\s+//; 13 | my @fields = split /\s+/, $_; 14 | $imiss{$fields[0]}{$fields[1]} = $fields[5]; 15 | } 16 | 17 | open GENOME, '<', $ARGV[1].".genome" 18 | or die "Cannot open genotypes file (".$ARGV[1].".genome): $!\n"; 19 | open OUT, '>', "fail_IBD_qcplink.txt"; 20 | print "Reading PLINK .genome file ".$ARGV[1].".genome\n"; 21 | while(<GENOME>){ 22 | s/^\s+//; 23 | my @fields = split /\s+/, $_; 24 | if($fields[9] > 0.185){ 25 | if($imiss{$fields[0]}{$fields[1]}>$imiss{$fields[2]}{$fields[3]}){ 26 | unless($removed{$fields[0]}{$fields[1]}){ 27 | print OUT "$fields[0] $fields[1]\n"; 28 | $removed{$fields[0]}{$fields[1]} = 1; 29 | } 30 | } 31 | elsif($imiss{$fields[0]}{$fields[1]}<$imiss{$fields[2]}{$fields[3]}){ 32 | unless($removed{$fields[2]}{$fields[3]}){ 33 | print OUT "$fields[2] $fields[3]\n"; 34 | $removed{$fields[2]}{$fields[3]} = 1; 35 | } 36 | } 37 | else{ 38 | unless($removed{$fields[0]}{$fields[1]}){ 39 | print OUT "$fields[0] $fields[1]\n"; 40 | $removed{$fields[0]}{$fields[1]} = 1; 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![witsgwas_banner2](https://cloud.githubusercontent.com/assets/8364031/9582190/13b1e182-5004-11e5-9336-8c030414e4bc.png) 2 | 3 | ## Background 4 | 5 | witsGWAS is a simple human GWAS analysis workflow built at the [Sydney Brenner Institute](https://www.wits.ac.za/research/sbimb/) for data quality control 
(QC) and basic association testing. It takes away the need for having to enter individual commands at the unix prompt and rather organizes GWAS tasks sequentially (facilitated via [Ruffus](http://www.ruffus.org.uk/)) for submission to a distributed PBS Torque cluster (managed via [Rubra](https://github.com/bjpop/rubra)). witsGWAS monitors (using flag files) the progress of jobs/tasks submitted to the cluster on behalf of the user, courteously waiting for one job to finish before sending another one 6 | 7 | ## Documentation 8 | 9 | Installation, Examples and tutorials for witsGWAS can be accessed at the [witsGWAS_wiki](https://github.com/magosil86/witsGWAS/wiki) 10 | 11 | ## Features 12 | 13 | **QC of Affymetrix array data** (SNP6 raw .CEL files) 14 | 15 | * genotype calling 16 | * converting birdseed calls to PLINK format 17 | 18 | **Sample and SNP QC of PLINK Binaries** 19 | 20 | Sample QC tasks checking: 21 | 22 | * discordant sex information 23 | * calculating missingness 24 | * heterozygosity scores 25 | * relatedness 26 | * divergent ancestry 27 | 28 | SNP QC tasks checking: 29 | 30 | * minor allele frequencies 31 | * SNP missingness 32 | * differential missingness 33 | * Hardy Weinberg Equilibrium deviations 34 | 35 | **Association testing** 36 | 37 | * Basic PLINK association tests, producing manhattan and qqplots 38 | * CMH association test - Association analysis, accounting for clusters 39 | * permutation testing 40 | * logistic regression 41 | * emmax association testing 42 | 43 | ### Dockerized Pipeline 44 | 45 | The pipeline has been 'dockerized', simplifying its use. See the Dockerized section on the [WitsGWAS 46 | Wiki](https://github.com/magosil86/witsGWAS/wiki) for more information. 47 | 48 | ### Authors 49 | 50 | Lerato E. Magosi, Scott Hazelhurst, Rob Clucas and the WITS Bioinformatics team 51 | 52 | ### License 53 | witsGWAS is offered under the MIT license. See LICENSE.txt. 
54 | 55 | ### Download 56 | [witsGWAS-0.1.0](https://github.com/magosil86/witsGWAS/releases) 57 | 58 | ### References 59 | Anderson, C. et al. Data quality control in genetic case-control association studies. Nature Protocols. 5, 1564-1573, 2010 60 | 61 | Sloggett, Clare; Wakefield, Matthew; Philip, Gayle; Pope, Bernard (2014): 62 | Rubra - flexible distributed pipelines. figshare. http://dx.doi.org/10.6084/m9.figshare.895626 63 | -------------------------------------------------------------------------------- /pipeline_quickstart_stages_config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ pipeline_quickstart_stages_config.py 4 | 5 | -Configuration file to set options specific to each stage/task in pipeline_quickstart.py 6 | ============================================================================= 7 | """ 8 | import os 9 | 10 | import QuickstartUserInput as I 11 | 12 | import WitsgwasSoftware as SW 13 | 14 | # python = SW.python 15 | # plink = SW.plink 16 | # plink1 = SW.plink1 17 | # perl = SW.perl 18 | # R = SW.R 19 | 20 | 21 | # stageDefaults contains the default options which are applied to each stage (command). 22 | # This section is required for every Rubra pipeline. 23 | # These can be overridden by options defined for individual stages, below. 24 | # Stage options which Rubra will recognise are: 25 | # - distributed: a boolean determining whether the task should be submitted to a cluster 26 | # job scheduling system (True) or run on the system local to Rubra (False). 27 | # - walltime: for a distributed PBS job, gives the walltime requested from the job 28 | # queue system; the maximum allowed runtime. For local jobs has no effect. 29 | # - memInGB: for a distributed PBS job, gives the memory in Gigabytes requested from the 30 | # job queue system. For local jobs has no effect. 31 | # - queue: for a distributed PBS job, this is the name of the queue to submit the 32 | # job to. 
For local jobs has no effect. This is currently a mandatory field for 33 | # distributed jobs, but can be set to None. 34 | # - modules: the modules to be loaded before running the task. This is intended for 35 | # systems with environment modules installed. Rubra will call module load on each 36 | # required module before running the task. Note that defining modules for individual 37 | # stages will override (not add to) any modules listed here. This currently only 38 | # works for distributed jobs. 39 | 40 | 41 | 42 | stageDefaults = { 43 | 'distributed': True, 44 | 'queue': 'WitsLong', 45 | 'walltime': "6:00:00", 46 | 'memInGB': 16, 47 | 'name': None, 48 | 'modules': [ 49 | # python, 50 | # plink, 51 | # perl, 52 | # R, 53 | 'gwaspipe', 54 | ] 55 | } 56 | 57 | 58 | 59 | # stages should hold the details of each stage which can be called by runStageCheck. 60 | # This section is required for every Rubra pipeline. 61 | # Calling a stage in this way carries out checkpointing and, if desired, batch job 62 | # submission. 63 | # Each stage must contain a 'command' definition. See stageDefaults above for other 64 | # allowable options. 
65 | 66 | 67 | stages = { 68 | 'task1': { 69 | "command": "" 70 | }, 71 | 'task2': { 72 | "command": "" 73 | }, 74 | 'task3': { 75 | 'command': "" 76 | }, 77 | } 78 | -------------------------------------------------------------------------------- /dockerized/scripts/miss_het_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #--INSPECT MISSINGNESS PATTERNS--# 2 | 3 | #IMPORT PLINK FILES WITH MISSINGNESS INFORMATION 4 | #requires the files qcplink_miss.imiss and qcplink_het.het to be present in the script folder 5 | 6 | imiss <- read.table("qcplink_miss.imiss",header=T) 7 | het <- read.table("qcplink_het.het",header=T) 8 | 9 | #CHECK THAT THE PROPORTION OF MISSING GENOTYPES IS NOT O 10 | #NOTE: IF F_MISS IS ZERO THEN WE ONLY PLOT MEAN HETEROZYGOSITY 11 | 12 | if (!(min(imiss$F_MISS) == 0 && max(imiss$F_MISS) == 0)) { 13 | 14 | 15 | #CALCULATE CALL RATE, LOG10(F_FMISS) and mean heterozygosity 16 | imiss$CALL_RATE <- 1-imiss$F_MISS 17 | imiss$logF_MISS = log10(imiss[,6]) 18 | het$meanHet = (het$N.NM. - het$O.HOM.)/het$N.NM. 
19 | het$meanHet <- ifelse(het$meanHet=="NaN", c(0),c(het$meanHet)) 20 | imiss.het <- merge(het,imiss,by=c("FID","IID")) 21 | 22 | #Print Heterozygosity cutoffs 23 | print(paste("cut_het_low: heterozygosity_mean - 3sd is ",sprintf("%.3f", mean(het$meanHet)-(3*sd(het$meanHet)), sep=""))) 24 | print(paste("cut_het_high: heterozygosity_mean + 3sd is ",sprintf("%.3f", mean(het$meanHet)+(3*sd(het$meanHet))), sep="")) 25 | 26 | #GENERATE CALL RATE BY HETEROZYGOSITY PLOT 27 | colors <- densCols(imiss$logF_MISS,het$meanHet) 28 | pdf("qcplink_plots/pairs.imiss-vs-het.pdf") 29 | #plot(imiss$logF_MISS,het$meanHet, col=colors, xlim=c(-3,0),ylim=c(0.26,0.35),pch=20, xlab="Proportion of missing genotypes", ylab="Heterozygosity rate", axes=F) 30 | plot(imiss$logF_MISS,het$meanHet, col=colors, xlim=c(-3,0),ylim=c(0,0.5), pch=20, xlab="Proportion of missing genotypes", ylab="Heterozygosity rate", axes=F) 31 | #axis(2,at=c(0.26,0.27,0.28, 0.29,0.3,0.31,0.32,0.33,0.34,0.35),tick=T) 32 | axis(2,at=c(0,0.05,0.10,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5),tick=T) 33 | axis(1,at=c(-3,-2,-1,0),labels=c(0.001,0.01,0.1,1)) 34 | #Heterozygosity thresholds (Horizontal Line) 35 | abline(h=mean(het$meanHet)-(3*sd(het$meanHet)),col="RED",lty=2) 36 | abline(h=mean(het$meanHet)+(3*sd(het$meanHet)),col="RED",lty=2) 37 | #Missing Data Thresholds (Vertical Line) 38 | abline(v=-1.30103, col="BLUE", lty=2) #THRESHOLD=0.07 39 | abline(v=-1.522879, col="RED", lty=2) #THRESHOLD=0.05 40 | 41 | } else { 42 | 43 | het$meanHet = (het$N.NM. - het$O.HOM.)/het$N.NM. 
44 | het$meanHet <- ifelse(het$meanHet=="NaN", c(0),c(het$meanHet)) 45 | 46 | #Print Heterozygosity cutoffs 47 | print(paste("cut_het_low: heterozygosity_mean - 3sd is ",sprintf("%.3f", mean(het$meanHet)-(3*sd(het$meanHet)), sep=""))) 48 | print(paste("cut_het_high: heterozygosity_mean + 3sd is ",sprintf("%.3f", mean(het$meanHet)+(3*sd(het$meanHet))), sep="")) 49 | 50 | pdf("qcplink_plots/meanhet_plot.pdf") 51 | plot(het$meanHet) 52 | abline(h=mean(het$meanHet)-(3*sd(het$meanHet)),col="RED",lty=2) 53 | abline(h=mean(het$meanHet)+(3*sd(het$meanHet)),col="RED",lty=2)} -------------------------------------------------------------------------------- /pipeline_quickstart_config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ pipeline_quickstart_config.py 4 | 5 | -Configuration file to set input files, directories and parameters 6 | specific to pipeline_quickstart.py 7 | ============================================================================= 8 | """ 9 | 10 | import os 11 | import WitsgwasScripts as SC 12 | import QuickstartUserInput as I 13 | 14 | 15 | # This section is used by the pipeline_quickstart.py to specify input data and 16 | # working directories. 17 | 18 | # Required inputs: 19 | # 1. path to input1 20 | # 2. path to input2 21 | # 3. input3 22 | 23 | ''' 24 | note: project name will be used by the pipeline to generate a 25 | time stamped output directory ''' 26 | 27 | 28 | working_files = { 29 | } 30 | 31 | 32 | # This OPTIONAL section is used by the pipeline_quickstart.py to submit preselected user cutoffs 33 | 34 | preselected_cutoff = { 35 | } 36 | 37 | 38 | 39 | # This section is used by the pipeline_quickstart.py to specify configuration options 40 | # for itself (pipeline_quickstart.py) as well as Rubra. 41 | 42 | # Rubra variables: 43 | # - logDir: the directory where batch queue scripts, stdout and sterr dumps are stored. 
44 | # - logFile: the file used to log all jobs that are run. 45 | # - style: the default style, one of 'flowchart', 'print', 'run', 'touchfiles'. Can be 46 | # overridden by specifying --style on the command line. 47 | # - procs: the number of python processes to run simultaneously. This determines the 48 | # maximum parallelism of the pipeline. For distributed jobs it also constrains the 49 | # maximum total jobs submitted to the queue at any one time. 50 | # - verbosity: one of 0 (quiet), 1 (normal), 2 (chatty). Can be overridden by specifying 51 | # --verbose on the command line. 52 | # - end: the desired tasks to be run. Rubra will also run all tasks which are dependencies 53 | # of these tasks. Can be overridden by specifying --end on the command line. 54 | # - force: tasks which will be forced to run, regardless of timestamps. Can be overridden 55 | # by supplying --force on the command line. 56 | # - rebuild: one of 'fromstart','fromend'. Whether to calculate which dependencies will 57 | # be rerun by working back from an end task to the latest up-to-date task, or forward 58 | # from the earliest out-of-date task. 'fromstart' is the most conservative and 59 | # commonly used as it brings all intermediate tasks up to date. 
60 | 61 | 62 | # pipeline_quickstart variables: 63 | # nothing at this stage, but could be used to add more features in future 64 | 65 | pipeline = { 66 | 'logDir': os.path.join(SC.CURRENT_PROJECT_DIR, "log_quickstart"), 67 | 'logFile': 'pipeline_quickstart.log', 68 | 'style': 'print', 69 | 'procs': 30, 70 | 'verbose': 1, 71 | 'end': ['quickstart_end_task' 72 | ], 73 | 'force': [], 74 | 'rebuild' : "fromstart", 75 | 76 | 'restrict_samples': False, 77 | 'allowed_samples': [] 78 | 79 | } 80 | -------------------------------------------------------------------------------- /pipeline_quickstart.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ pipeline_quickstart.py 4 | 5 | -One line description of the pipeline. 6 | ============================================================================= 7 | 8 | 9 | Authors: 10 | 11 | 12 | Goal of the pipeline: 13 | This program implements a <> workflow 14 | for human GWAS analysis using <> 15 | 16 | 17 | Pipeline features: 18 | List the features of the pipeline: 19 | 20 | - Feature 1 21 | - Feature 2 22 | - Feature 3 23 | 24 | Assumptions: 25 | This pipeline assumes the following steps have been carried out: 26 | 27 | 28 | Task management: 29 | It employs Rubra for sending jobs to a linux cluster via PBS Torque (version 2.5). 30 | Rubra is a pipeline system for bioinformatics workflows that is built on top 31 | of the Ruffus (http://www.ruffus.org.uk/) Python library (Ruffus version 2.2). 32 | Rubra adds support for running pipeline stages on a distributed computer cluster 33 | (https://github.com/bjpop/rubra) and also supports parallel evaluation of independent 34 | pipeline stages. (Rubra version 0.1.5) 35 | 36 | The pipeline is configured by an options file in a python file, 37 | including the actual commands which are run at each stage. 
38 | 39 | 40 | References: 41 | 42 | """ 43 | 44 | 45 | # system imports 46 | import sys # will use to exit sys if no input files are detected 47 | import os # for changing directories 48 | import datetime # for adding timestamps to directories 49 | import subprocess # for executing shell command, can be used instead of os.system() 50 | 51 | 52 | # rubra and ruffus imports 53 | from ruffus import * 54 | from rubra.utils import pipeline_options 55 | from rubra.utils import (runStageCheck, mkLogFile, mkDir, mkForceLink) 56 | 57 | # witsGWAS banner 58 | from pyfiglet import Figlet 59 | 60 | # user defined module imports 61 | import Filemanager as FM 62 | import WitsgwasSoftware as SW 63 | import WitsgwasScripts as SC 64 | 65 | 66 | 67 | # Shorthand access to options defined in pipeline_quickstart_config.py 68 | #========================================== 69 | 70 | working_files = pipeline_options.working_files 71 | logDir = pipeline_options.pipeline['logDir'] 72 | 73 | 74 | 75 | # Data setup process and input organisation 76 | #========================================== 77 | 78 | f = Figlet(font='standard') 79 | print f.renderText('witsGWAS') 80 | print "(C) 2015 Lerato E. Magosi, Scott Hazelhurst" 81 | print "http://magosil86.github.io/witsGWAS/" 82 | print "witsGWAS v0.1.0 is licensed under the MIT license. See LICENSE.txt" 83 | print "----------------------------------------------------------------" 84 | 85 | 86 | # create a directory for the current project 87 | # note: The pipeline will use this dir. for output and intermediate files. 
88 | SC.CURRENT_PROJECT_DIR = (os.path.join(SC.witsGWAS_PROJECTS_DIR, working_files['projectname']) + 89 | '-pipeline_quickstart-' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/') 90 | 91 | print "Current project directory %s" % SC.CURRENT_PROJECT_DIR 92 | 93 | FM.create_dir(SC.CURRENT_PROJECT_DIR) 94 | 95 | 96 | # path to the witsGWAS directory 97 | global witsGWAS_SCRIPTS_ROOT_DIR 98 | witsGWAS_SCRIPTS_ROOT_DIR = "absolute/path/to/witsGWAS/" 99 | 100 | 101 | # cd into the current project dir. 102 | os.chdir(SC.CURRENT_PROJECT_DIR) 103 | 104 | 105 | # Check current working directory. 106 | curr_work_dir = os.getcwd() 107 | print "Current working directory %s" % curr_work_dir 108 | 109 | 110 | # create a dir. for storing plots 111 | pipeline_quickstart_plots = (os.path.join(witsGWAS_SCRIPTS_ROOT_DIR, SC.CURRENT_PROJECT_DIR, "pipeline_quickstart_plots") + '/') 112 | FM.create_dir(pipeline_quickstart_plots) 113 | 114 | 115 | 116 | 117 | # Paths to intermediate result files 118 | #========================================== 119 | 120 | 121 | 122 | 123 | 124 | 125 | # Print project information 126 | #========================================== 127 | 128 | print "Starting project %s" % working_files['projectname'] 129 | print 130 | print "Intermediate files and output will be stored in %s" % SC.CURRENT_PROJECT_DIR 131 | print "Log dir is %s" % logDir 132 | print "Project author is %s" % working_files['projectauthor'] 133 | print 134 | 135 | 136 | 137 | # Pipeline declarations 138 | #========================================== 139 | 140 | # create a flagfile to start the pipeline as well as permutation association testing 141 | FM.create_emptyfile('pipeline_quickstart.Start') 142 | 143 | 144 | 145 | 146 | 147 | # Pipeline tasks 148 | #========================================== 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- 
/dockerized/dockerfile_witsgwas_container/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build WitsGWAS pipeline as a Docker image 2 | # 3 | # This script just builds a container with the dependancies which are needed 4 | # to run the WitsGWAS pipeline. 5 | # 6 | # The configuration files for the specific pipeline can be modified in the host 7 | # which runs the Docker container and loaded as a volume, as can the data which 8 | # is needed for the processing. This design was chosen to keep the witsgwas 9 | # Docker container small while still allowing the flexibility the pipeline 10 | # provides. 11 | # 12 | # VERSION : 1.0.0 13 | 14 | # Use Ubuntu as the base image - this is probably the most user-friendly 15 | FROM ubuntu:latest 16 | MAINTAINER Rob Clucas 17 | 18 | # Set properties of the image 19 | LABEL Description = "Docker image for WitsGWAS image" \ 20 | Vendor="Bionet" Version="1.0.0" 21 | 22 | # This defines the directory to while are the depedancy executables will be 23 | # linked, creates it and then adds it to the path 24 | ENV GWAS_ROOT /opt/bioinf/gwas 25 | ENV GWAS_BIN $GWAS_ROOT/bin 26 | RUN mkdir -p $GWAS_BIN 27 | ENV PATH=$GWAS_BIN:$PATH 28 | 29 | # Define environment variables 30 | ENV ADMIX_LINK \ 31 | https://www.genetics.ucla.edu/software/admixture/binaries/admixture_linux-1.3.0.tar.gz 32 | ENV AFFYM_LINK \ 33 | http://media.affymetrix.com/Download/updates/apt-1.18.0-x86_64-intel-linux.zip 34 | ENV EIEGN_LINK \ 35 | https://github.com/DReichLab/EIG.git 36 | ENV EMMAX_LINK \ 37 | http://csg.sph.umich.edu/kang/emmax/download/emmax-beta-07Mar2010.tar.gz 38 | ENV PLINK_107_LINK \ 39 | http://pngu.mgh.harvard.edu/~purcell/plink/dist/plink-1.07-x86_64.zip 40 | ENV PLINK_LINK \ 41 | https://www.cog-genomics.org/static/bin/plink160315/plink_linux_x86_64.zip 42 | ENV RUBRA_LINK \ 43 | https://github.com/bjpop/rubra.git 44 | ENV STOOL_LINK \ 45 | https://bootsrap.pypa.io/ez_setup.py 46 | ENV 
WITS_GWAS_LINK \ 47 | https://robclucas@bitbucket.org/robclucas/witsgwas.git 48 | 49 | # Start by updating Ubuntu 50 | RUN apt-get update 51 | 52 | # Install all dependancies for WithGWAS 53 | RUN apt-get install -y \ 54 | build-essential \ 55 | git \ 56 | perl \ 57 | python-pyudev \ 58 | python-pip \ 59 | python-pyfiglet \ 60 | figlet \ 61 | r-base \ 62 | wget 63 | 64 | # Install setuptools for python 65 | RUN wget $STOOL_LINK -O - | python 66 | 67 | # Install Plaink 1.07 -- this is tricky because plink1.9 has the same name 68 | # so rather than creating a link we will just add the dir to the path and make 69 | # a link to plink 1.9 70 | RUN mkdir -p /build/plink_1.07 71 | WORKDIR /build/plink_1.07 72 | RUN wget $PLINK_107_LINK 73 | RUN unzip *.zip 74 | RUN ln -sf /build/plink_1.07/plink-1.07-x86_64/plink /usr/bin/plink1 75 | 76 | # Install Plink 1.9 77 | RUN mkdir -p /build/plink 78 | WORKDIR /build/plink 79 | RUN wget $PLINK_LINK 80 | RUN unzip plink_linux_x86_64.zip 81 | RUN ln -sf /build/plink/plink $GWAS_BIN/plink 82 | 83 | # Install admixture 84 | RUN mkdir /build/admixture 85 | WORKDIR /build/admixture 86 | RUN wget $ADMIX_LINK 87 | RUN tar -xvf admixture_linux-1.3.0.tar.gz 88 | RUN ln -sf /build/admixture/admixture_linux-1.3.0/admixture \ 89 | $GWAS_BIN/admixture 90 | 91 | # Install Emmax 92 | RUN mkdir /build/emmax 93 | WORKDIR /build/emmax 94 | RUN wget $EMMAX_LINK 95 | RUN tar -xvf emmax-beta-07Mar2010.tar.gz 96 | RUN ln -sf /build/emmax/emmax-beta-07Mar2010/emmax $GWAS_BIN/emmax 97 | 98 | # Install Eiegensoft 99 | RUN mkdir /build/eiegensoft 100 | WORKDIR /build/eiegensoft 101 | RUN git clone $EIEGN_LINK 102 | RUN ln -sf /build/eiegensoft/EIG/bin/eiegenstrat $GWAS_BIN/eiegenstrat 103 | 104 | # Install Rubra (which installs Rufus) 105 | RUN mkdir /build/rubra 106 | WORKDIR /build/rubra 107 | RUN git clone $RUBRA_LINK 108 | WORKDIR /build/rubra/rubra 109 | RUN python setup.py install 110 | 111 | # Install Affymetrix power tools -- this link may need to 
be edited 112 | RUN mkdir /build/affymetrix 113 | WORKDIR /build/affymetrix 114 | RUN wget $AFFYM_LINK 115 | RUN unzip apt-1.18.0-x86_64-intel-linux.zip 116 | RUN mv /build/affymetrix/apt-1.18.0-x86_64-intel-linux \ 117 | /build/affymetrix/apt 118 | RUN cp /build/affymetrix/apt/bin/* $GWAS_BIN 119 | 120 | # Get the wits gwas repository 121 | RUN mkdir /witsgwas 122 | WORKDIR /witsgwas 123 | RUN git clone $WITS_GWAS_LINK 124 | 125 | # Go back to the root directory 126 | WORKDIR / 127 | -------------------------------------------------------------------------------- /cluster_job.py: -------------------------------------------------------------------------------- 1 | # Generate a PBS script for a job, and general utilities for 2 | # waiting for a job to complete. 3 | 4 | from shell_command import shellCommand 5 | import sys 6 | from time import sleep 7 | from tempfile import NamedTemporaryFile 8 | import os 9 | 10 | 11 | # this assumes that qstat info for a job will stick around for a while after 12 | # the job has finished. 13 | 14 | class Runnable_Script(object): 15 | def __init__(self, qstat_max_tries = 5, qstat_error_delay = 1, qstat_delay = 10): 16 | self.qstat_max_tries = qstat_max_tries # number of times to try qstat before failing 17 | self.qstat_error_delay = qstat_error_delay # seconds to sleep while waiting for qstat to recover 18 | self.qstat_delay = qstat_delay # seconds to sleep while waiting for job to complete 19 | pass 20 | 21 | def isJobCompleted(self, jobID): 22 | count = 0 23 | while True: 24 | (stdout, stderr, exitStatus) = shellCommand("qstat -f %s" % jobID) 25 | # qstat appears to have worked correctly, we can stop trying. 
26 | if exitStatus == 0 or count >= self.qstat_max_tries: 27 | break 28 | count += 1 29 | sleep(self.qstat_error_delay) 30 | if exitStatus != 0: 31 | raise Exception("qstat -f %s returned non-zero exit status %d times,\ 32 | panicking" % (jobID, count)) 33 | else: 34 | # try to fetch the exit status of the job command from the output of 35 | # qstat. 36 | jobState = None 37 | exitStatus = None 38 | for line in stdout.split('\n'): 39 | ws = line.split() 40 | if len(ws) == 3: 41 | if ws[0] == 'job_state' and ws[1] == '=': 42 | jobState = ws[2] 43 | elif ws[0] == 'exit_status' and ws[1] == '=' and \ 44 | ws[2].isdigit(): 45 | exitStatus = int(ws[2]) 46 | if jobState.upper() == 'C': 47 | # Job has completed. 48 | return (True, exitStatus) 49 | else: 50 | # Job has not completed. 51 | return (False, exitStatus) 52 | 53 | 54 | # returns exit status of job (or None if it can't be determined) 55 | def waitForJobCompletion(self, jobID): 56 | isFinished, exitCode = self.isJobCompleted(jobID) 57 | while(not isFinished): 58 | sleep(self.qstat_delay) 59 | isFinished, exitCode = self.isJobCompleted(jobID) 60 | return exitCode 61 | 62 | 63 | # returns exit status of job (or None if it can't be determined) 64 | def runJobAndWait(self, stage, logDir='', verbose=0): 65 | jobID = self.launch() 66 | prettyJobID = jobID.split('.')[0] 67 | logFilename = os.path.join(logDir, stage + '.' + prettyJobID + '.pbs') 68 | with open(logFilename, 'w') as logFile: 69 | logFile.write(self.__str__()) 70 | if verbose > 0: 71 | print('stage = %s, jobID = %s' % (stage, prettyJobID)) 72 | return self.waitForJobCompletion(jobID) 73 | 74 | 75 | # Generate a PBS script for a job. 
76 | class PBS_Script(Runnable_Script): 77 | def __init__(self, command, walltime=None, name=None, memInGB=None, 78 | queue='batch', moduleList=None, logDir=None, literals=None, **kw): 79 | self.command = command 80 | self.queue = queue 81 | self.name = name 82 | self.memInGB = memInGB 83 | self.walltime = walltime 84 | self.moduleList = moduleList 85 | self.logDir = logDir 86 | self.literals = literals 87 | super(PBS_Script, self).__init__(**kw) 88 | pass 89 | 90 | # render the job script as a string. 91 | def __str__(self): 92 | script = ['#!/bin/bash'] 93 | # XXX fixme 94 | # should include job id in the output name. 95 | # should use the proper log directory. 96 | if self.queue == 'terri-smp': 97 | script.append('#PBS -q terri') 98 | script.append('#PBS -l procs=8,tpn=8') 99 | else: 100 | script.append('#PBS -q %s' % self.queue) 101 | if self.logDir: 102 | script.append('#PBS -o %s' % self.logDir) 103 | script.append('#PBS -e %s' % self.logDir) 104 | # should put the name of the file in here if possible 105 | if self.name: 106 | script.append('#PBS -N %s' % self.name) 107 | if self.memInGB: 108 | if self.queue in ['smp', 'terri-smp']: 109 | script.append('#PBS -l mem=%sgb' % self.memInGB) 110 | else: 111 | script.append('#PBS -l pvmem=%sgb' % self.memInGB) 112 | if self.walltime: 113 | script.append('#PBS -l walltime=%s' % self.walltime) 114 | # copy the literal text verbatim into the end of the PBS options 115 | # section. 116 | if self.literals: 117 | script.append(self.literals) 118 | if type(self.moduleList) == list and len(self.moduleList) > 0: 119 | for item in self.moduleList: 120 | script.append('module load %s' % item) 121 | script.append('cd $PBS_O_WORKDIR') 122 | script.append(self.command) 123 | return '\n'.join(script) + '\n' 124 | 125 | # create a temporary file to store the job script and then 126 | # launch it with qsub. 
127 | def launch(self): 128 | file = NamedTemporaryFile() 129 | file.write(str(self)) 130 | file.flush() 131 | command = 'qsub ' + file.name 132 | (stdout, stderr, returnCode) = shellCommand(command) 133 | file.close() 134 | if returnCode == 0: 135 | return stdout 136 | else: 137 | raise(Exception('qsub command failed with exit status: ' + 138 | str(returnCode))) 139 | 140 | #class SGE_Script(Runnable_Script): 141 | # def __init__(self, command, walltime=None, name=None, memInGB=None, 142 | # queue='batch', moduleList=None, logDir=None, **kw): 143 | # self.command = command 144 | # self.queue = queue 145 | # self.name = name 146 | # self.memInGB = memInGB 147 | # self.walltime = walltime 148 | # self.moduleList = moduleList 149 | # self.logDir = logDir 150 | # self.Runnable_Script.__init__(**kw) 151 | # pass 152 | -------------------------------------------------------------------------------- /cluster_job_edited_for_witsGWAS.py: -------------------------------------------------------------------------------- 1 | # Generate a PBS script for a job, and general utilities for 2 | # waiting for a job to complete. 3 | 4 | from shell_command import shellCommand 5 | import sys 6 | from time import sleep 7 | from tempfile import NamedTemporaryFile 8 | import os 9 | 10 | 11 | # this assumes that qstat info for a job will stick around for a while after 12 | # the job has finished. 
13 | 14 | class Runnable_Script(object): 15 | def __init__(self, qstat_max_tries = 5, qstat_error_delay = 1, qstat_delay = 10): 16 | self.qstat_max_tries = qstat_max_tries # number of times to try qstat before failing 17 | self.qstat_error_delay = qstat_error_delay # seconds to sleep while waiting for qstat to recover 18 | self.qstat_delay = qstat_delay # seconds to sleep while waiting for job to complete 19 | pass 20 | 21 | def isJobCompleted(self, jobID): 22 | count = 0 23 | while True: 24 | (stdout, stderr, exitStatus) = shellCommand("qstat -f %s" % jobID) 25 | # qstat appears to have worked correctly, we can stop trying. 26 | if exitStatus == 0 or count >= self.qstat_max_tries: 27 | break 28 | count += 1 29 | sleep(self.qstat_error_delay) 30 | if exitStatus != 0: 31 | raise Exception("qstat -f %s returned non-zero exit status %d times,\ 32 | panicking" % (jobID, count)) 33 | else: 34 | # try to fetch the exit status of the job command from the output of 35 | # qstat. 36 | jobState = None 37 | exitStatus = None 38 | for line in stdout.split('\n'): 39 | ws = line.split() 40 | if len(ws) == 3: 41 | if ws[0] == 'job_state' and ws[1] == '=': 42 | jobState = ws[2] 43 | elif ws[0] == 'exit_status' and ws[1] == '=' and \ 44 | ws[2].isdigit(): 45 | exitStatus = int(ws[2]) 46 | if jobState.upper() == 'C': 47 | # Job has completed. 48 | return (True, exitStatus) 49 | else: 50 | # Job has not completed. 
51 | return (False, exitStatus) 52 | 53 | 54 | # returns exit status of job (or None if it can't be determined) 55 | def waitForJobCompletion(self, jobID): 56 | isFinished, exitCode = self.isJobCompleted(jobID) 57 | while(not isFinished): 58 | sleep(self.qstat_delay) 59 | isFinished, exitCode = self.isJobCompleted(jobID) 60 | return exitCode 61 | 62 | 63 | # returns exit status of job (or None if it can't be determined) 64 | def runJobAndWait(self, stage, logDir='', verbose=0): 65 | jobID = self.launch() 66 | prettyJobID = jobID.split('.')[0] 67 | logFilename = os.path.join(logDir, stage + '.' + prettyJobID + '.pbs') 68 | with open(logFilename, 'w') as logFile: 69 | logFile.write(self.__str__()) 70 | if verbose > 0: 71 | print('stage = %s, jobID = %s' % (stage, prettyJobID)) 72 | return self.waitForJobCompletion(jobID) 73 | 74 | 75 | # Generate a PBS script for a job. 76 | class PBS_Script(Runnable_Script): 77 | def __init__(self, command, walltime=None, name=None, memInGB=None, 78 | queue='batch', moduleList=None, logDir=None, literals=None, **kw): 79 | self.command = command 80 | self.queue = queue 81 | self.name = name 82 | self.memInGB = memInGB 83 | self.walltime = walltime 84 | self.moduleList = moduleList 85 | self.logDir = logDir 86 | self.literals = literals 87 | super(PBS_Script, self).__init__(**kw) 88 | pass 89 | 90 | # render the job script as a string. 91 | def __str__(self): 92 | script = ['#!/bin/bash'] 93 | # XXX fixme 94 | # should include job id in the output name. 95 | # should use the proper log directory. 
96 | if self.queue == 'WitsLong': 97 | script.append('#PBS -q WitsLong') 98 | script.append('#PBS -l nodes=1:ppn=8') 99 | else: 100 | script.append('#PBS -q %s' % self.queue) 101 | if self.logDir: 102 | script.append('#PBS -o %s' % self.logDir) 103 | script.append('#PBS -e %s' % self.logDir) 104 | # should put the name of the file in here if possible 105 | if self.name: 106 | script.append('#PBS -N %s' % self.name) 107 | if self.memInGB: 108 | if self.queue in ['medium', 'WitsLong']: 109 | script.append('#PBS -l mem=%sGB' % self.memInGB) 110 | else: 111 | script.append('#PBS -l pvmem=%sGB' % self.memInGB) 112 | if self.walltime: 113 | script.append('#PBS -l walltime=%s' % self.walltime) 114 | # copy the literal text verbatim into the end of the PBS options 115 | # section. 116 | if self.literals: 117 | script.append(self.literals) 118 | if type(self.moduleList) == list and len(self.moduleList) > 0: 119 | for item in self.moduleList: 120 | script.append('module load %s' % item) 121 | script.append('cd $PBS_O_WORKDIR') 122 | script.append(self.command) 123 | return '\n'.join(script) + '\n' 124 | 125 | # create a temporary file to store the job script and then 126 | # launch it with qsub. 
127 | def launch(self): 128 | cmd=""" 129 | #PBS -N MergeKGSAHGP 130 | #PBS -l nodes=1:ppn=3,walltime=100:00:00,mem=1GB 131 | #PBS -q WitsLong 132 | 133 | sleep 20 134 | exit 0 135 | """ 136 | file = NamedTemporaryFile() 137 | file.write(str(self)) 138 | file.flush() 139 | command = 'qsub ' + file.name 140 | (stdout, stderr, returnCode) = shellCommand(command) 141 | file.close() 142 | if returnCode == 0: 143 | return stdout 144 | else: 145 | raise(Exception('qsub command failed with exit status: ' + 146 | str(returnCode))) 147 | 148 | #class SGE_Script(Runnable_Script): 149 | # def __init__(self, command, walltime=None, name=None, memInGB=None, 150 | # queue='batch', moduleList=None, logDir=None, **kw): 151 | # self.command = command 152 | # self.queue = queue 153 | # self.name = name 154 | # self.memInGB = memInGB 155 | # self.walltime = walltime 156 | # self.moduleList = moduleList 157 | # self.logDir = logDir 158 | # self.Runnable_Script.__init__(**kw) 159 | # pass 160 | -------------------------------------------------------------------------------- /dockerized/witsgwas_dockerized_pipeline.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * Author : Rob Clucas 5 | * Description : Nextflow pipeline for Wits GWAS. 6 | */ 7 | 8 | //---- General definitions --------------------------------------------------// 9 | 10 | /* Defines the name of the docker container to run the pipeline through. 11 | */ 12 | params.dock_container = 'robclucas/witsgwas' 13 | 14 | /* Defines the name of the mountpoint of the data directories in the docker 15 | * container. This is so that any scripts which run in the container and 16 | * might need this info can run succesfully, and the user can specify the 17 | * directory to each of the scripts. 
18 | * 19 | * NOTE: The mountpoint is mounted in the container from the root directory, 20 | * so specifying 'util' as the mount point mounts the data at '/util' in 21 | * the container. 22 | */ 23 | params.dock_mpoint = 'util' 24 | 25 | /* Defines the directory where the plink 1.07 input binary files are. 26 | * 27 | * NOTE: This must be a relative path, from where the pipeline is run. 28 | */ 29 | params.plink_inputpath = "gwasdata/plink" 30 | 31 | /* Defines the path where any scripts to be executed can be found. 32 | * 33 | * NOTE: This must be a ralative path, from where the pipeline is run. 34 | */ 35 | params.script_path = 'scripts' 36 | 37 | /* Defines the names of the plink binary files in the plink directory 38 | * (.fam, .bed, .bed). 39 | * 40 | * NOTE: This must be without the extension (so if A.fam, A.bed, ... 41 | * then use 'A'). 42 | */ 43 | params.plink_fname = 'raw-GWA-data' 44 | 45 | /* Defines the name of the file with high LD region information. 46 | * 47 | * NOTE: This can have/cannot have the extension, but should be in the 48 | * plink_inputpath specified above. 49 | */ 50 | params.high_ld_regions_fname = 'high_LD_regions.txt' 51 | 52 | /* Defines if sexinfo is available or not, options are: 53 | * - "true" : sexinfo is available 54 | * - "false" : sexinfo is not avalable 55 | */ 56 | params.sexinfo_available = "false" 57 | 58 | //---- Cutoff definitions ---------------------------------------------------// 59 | 60 | /* Defines the cutoffs for the heterozygosity. Standard cutoff +- 3sd from 61 | * mean) 62 | */ 63 | params.cut_het_high = 0.343 64 | params.cut_het_low = 0.254 65 | 66 | /* Defines the cutoff for missingness. Using standard cutoff -- 3 - 7%. 67 | */ 68 | params.cut_miss = 0.05 69 | params.cut_diff_miss = 0.05; 70 | 71 | 72 | /* Defines the cutoff for the SNP minor allele frequency. 73 | */ 74 | params.cut_maf = 0.01 75 | 76 | /* Defines the cutoff for SNP missingness. 
77 | */ 78 | params.cut_genome = 0.01 79 | 80 | /* Defines the cutoff for the SNP Hardy Weinburg deviation. 81 | */ 82 | params.cut_hwe = 0.01 83 | 84 | //---- Modification of variables for pipeline -------------------------------// 85 | 86 | /* Define the command to add for plink depending on whether sexinfo is 87 | * available or not. Command is: 88 | * 89 | * - No sexinfo availabele : "--allow-no-sexinfo" 90 | * - Sexinfo available : "" 91 | */ 92 | if ( params.sexinfo_available == "false" ) { 93 | params.sexinfo_command = "--allow-no-sex" 94 | println "Sexinfo not available, command: " + params.sexinfo_command + "\n" 95 | } else { 96 | params.sexinfo_command = "" 97 | println "Sexinfo availabel command: " + params.sexinfo_command + "\n" 98 | } 99 | 100 | /* Convert the relative data path(s) to absolute, because this is required for 101 | * docker when mounting. 102 | */ 103 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 104 | script_path = Channel.fromPath(params.script_path, type : 'dir') 105 | 106 | //---- Start Pipeline -------------------------------------------------------// 107 | 108 | /* Process to check for duplicates. The process mounts the plink data to the 109 | * docker container and then runs plink 1.07 through the docker container. It 110 | * writes the results to a file results. 111 | * 112 | * Inputs: 113 | * - filename : The name of the plink input files wo extension 114 | * - container : The name of the docker container to use 115 | * - data_path : The path to the plink data 116 | * - mountpoint : The mountpoint of the data in the container 117 | * - sexinfo : The command to add to plink for sexinfo availability 118 | * 119 | * Outputs: 120 | * - results : The file with the stdout from plink. 
121 | */ 122 | process checkDuplicateMarkers { 123 | input: 124 | val filename from params.plink_fname 125 | val container from params.dock_container 126 | val data_path from plink_data_path 127 | val mountpoint from params.dock_mpoint 128 | val sexinfo from params.sexinfo_command 129 | 130 | output: 131 | file 'results' 132 | 133 | script: 134 | """ 135 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 136 | $container plink1 --noweb --bfile $filename $sexinfo --out \ 137 | tmp >> results 138 | """ 139 | } 140 | 141 | //---- Process 2 ------------------------------------------------------------// 142 | 143 | /* Process to filter all the duplicate markers from running plink. 144 | * 145 | * Inputs: 146 | * - results : The file containing the stdout from running plink. 147 | * 148 | * Outputs: 149 | * - duplicate : A file containing all the duplicates from plink. 150 | */ 151 | process filterDuplicateMarkers { 152 | input: 153 | file results 154 | 155 | output: 156 | file 'duplicates' 157 | 158 | script: 159 | """ 160 | if grep 'Duplicates' results > duplicates; then 161 | echo 'Duplicates Found' >> duplicates 162 | echo 'Found Duplicates' 163 | else 164 | echo 'No Duplicates Found' >> duplicates 165 | echo 'Did Not Find Duplicates' 166 | fi 167 | """ 168 | } 169 | 170 | //---- Process 3 ------------------------------------------------------------// 171 | 172 | /* Process to extract all the duplicate RSIDs generated by the plink command. 173 | * 174 | * Inputs: 175 | * - duplicates : The list of duplicates from running plink 176 | * 177 | * Outputs: 178 | * - duplicate_rdis : A file with all the duplicate RSID's 179 | * 180 | * NOTES: The indentation of the inline python script is important because of 181 | * the way python uses indentation. If this has the usual 2 space indent 182 | * as the inline bash scripts do, then there is a python error. This 183 | * could be saved as a script and run through docker as well. 
184 | */ 185 | process extractDuplicateRsids { 186 | input: 187 | file duplicates 188 | 189 | output: 190 | file 'duplicate_rsids' 191 | 192 | script: 193 | """ 194 | #!/usr/bin/env python 195 | 196 | input = open('duplicates', 'r') 197 | output = open('duplicate_rsids', 'w') 198 | 199 | # Remove all duplicates 200 | for line in input: 201 | if (line.startswith('#') or line.startswith('\\n') or 202 | line == 'Duplicates Found' or line == 'No Duplicates Found\\n'): 203 | pass 204 | else: 205 | line = line.split(" ") 206 | print(line) 207 | duplicate_snp = line[5].strip() 208 | print(duplicate_snp) 209 | output.write(duplicate_snp + '\\n') 210 | """ 211 | } 212 | 213 | //---- Process 4 ------------------------------------------------------------// 214 | 215 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 216 | 217 | /* Process to remove all the duplicate markers from the plink output. 218 | * 219 | * Inputs: 220 | * - duplicate_rsids : A file containing all duplicate rsids to remove. 221 | * - filename : The name of the plink input files wo extension 222 | * - container : The name of the docker container to use 223 | * - data_path : The path to the plink data 224 | * - mountpoint : The mountpoint of the data in the container 225 | * - sexinfo : The command to add to plink for sexinfo availability 226 | * 227 | * Outputs: 228 | * - qcplink_log* : Log files from plink with the output, these are the 229 | * input to later processes. 230 | * 231 | * NOTES : Multiple outputs are required so that other processes 232 | * which use the output can be started concurrently. If 233 | * only a single file is output, then the processes will 234 | * execute sequentially, and each one will have to output 235 | * the file. 
236 | */ 237 | process removeDuplicateMarkers { 238 | input: 239 | file duplicate_rsids 240 | val filename from params.plink_fname 241 | val container from params.dock_container 242 | val data_path from plink_data_path 243 | val mountpoint from params.dock_mpoint 244 | val sexinfo from params.sexinfo_command 245 | 246 | output: 247 | file 'qcplink_log' into receiver 248 | file 'qcplink_log1' into receiver 249 | file 'qcplink_log2' into receiver 250 | file 'qcplink_log3' into receiver 251 | 252 | script: 253 | """ 254 | if [[ -s duplicate_rsids ]]; then 255 | # Copy the file to the mount path of the container 256 | cp duplicate_rsids $data_path/duplicate_rsids 257 | 258 | # Remove duplicate ID's, running plinnk through the container 259 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 260 | $container plink1 --noweb --bfile $filename $sexinfo \ 261 | --exclude duplicate_rsids --make-bed --out \ 262 | qcplink >> qcplink_log 263 | else 264 | # There are no duplicate RSID's, so don;t specify exclude file 265 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 266 | $container plink1 --noweb --bfile $filename $sexinfo \ 267 | --make-bed --out qcplink >> qcplink_log 268 | fi 269 | 270 | # Create links for the outputs 271 | ln -s qcplink_log qcplink_log1 272 | ln -s qcplink_log qcplink_log2 273 | ln -s qcplink_log qcplink_log3 274 | """ 275 | } 276 | 277 | //---- Process 5 ------------------------------------------------------------// 278 | 279 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 280 | 281 | /* Process to identify individual discordant sex information. 282 | * 283 | * Inputs: 284 | * - qcplink_log : The log file previously generated from running plink. 
285 | * - filename : The name of the plink input files wo extension 286 | * - container : The name of the docker container to use 287 | * - data_path : The path to the plink data 288 | * - mountpoint : The mountpoint of the data in the container 289 | * - sexinfo : The command to add to plink for sexinfo availability 290 | * 291 | * Outputs: 292 | * -sexstat_problems : All sexinfo results which have problems. 293 | * 294 | * NOTES : The qcplink_log file is used as a 'start parameter', since when a 295 | * nextflow process uses the output of another process as input, the 296 | * process will only run once that input has become available. If we 297 | * did not do this, then the process would try and run concurrently at 298 | * the start, which would not work since the input data would not be 299 | * ready. 300 | */ 301 | process identifyIndivDiscSexinfo { 302 | input: 303 | file qcplink_log from receiver 304 | val filename from params.plink_fname 305 | val container from params.dock_container 306 | val data_path from plink_data_path 307 | val mountpoint from params.dock_mpoint 308 | val sexinfo from params.sexinfo_available 309 | 310 | output: 311 | file 'failed_sexcheck' 312 | 313 | script: 314 | """ 315 | # Check that the input is available. 316 | if [[ -s qcplink_log ]]; then 317 | echo 'Plink log received, can continue!' 318 | fi 319 | 320 | if [[ $sexinfo == 'true' ]]; then 321 | # Generate all the sex info. Because this runs through docker the output 322 | # will be in the workdir of the container ($data_path). 323 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 324 | $container plink --bfile qcplink --check-sex \ 325 | --out sexstat 326 | else 327 | echo 'no sexinfo available for qcplink' > $data_path/sexstat.sexcheck 328 | fi 329 | 330 | # Check for all the "PROBLEM" sex information. 
331 | if grep -Rn 'PROBLEM' $data_path/sexstat.sexcheck > failed_sexcheck; then 332 | echo 'Discordant sex info found' 333 | else 334 | echo 'No discordant sex info found' 335 | fi 336 | """ 337 | } 338 | 339 | //---- Process 6 ------------------------------------------------------------// 340 | 341 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 342 | script_path = Channel.fromPath(params.script_path, type : 'dir') 343 | 344 | /* Process to calculate the sample missingness. 345 | * 346 | * Inputs: 347 | * - qcplink_log1 : The log file previously generated from running plink. 348 | * - filename : The name of the plink input files wo extension 349 | * - container : The name of the docker container to use 350 | * - data_path : The path to the plink data 351 | * - script_path : The path to the scripts 352 | * - mountpoint : The mountpoint of the data in the container 353 | * - sexinfo : The command to add to plink for sexinfo availability 354 | * 355 | * Outputs: 356 | * - qcplink_imiss* : Information for the missingness. Again multiple files so 357 | * that later processes start concurrently. 358 | * 359 | * NOTES : The qcplink_log file is used as a 'start parameter', since when a 360 | * nextflow process uses the output of another process as input, the 361 | * process will only run once that input has become available. If we 362 | * did not do this, then the process would try and run concurrently at 363 | * the start, which would not work since the input data would not be 364 | * ready. 
365 | */ 366 | process calculateSampleMissingness { 367 | input: 368 | file qcplink_log1 from receiver 369 | val container from params.dock_container 370 | val script_dir from script_path 371 | val data_path from plink_data_path 372 | val mountpoint from params.dock_mpoint 373 | val sexinfo from params.sexinfo_command 374 | 375 | output: 376 | file 'qcplink_missing' 377 | file 'qcplink_missing1' 378 | file 'qcplink_missing2' 379 | 380 | script: 381 | """ 382 | if [[ -s qcplink_log1 ]]; then 383 | echo 'Plink log received, can continue!' 384 | fi 385 | 386 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 387 | $container plink --bfile qcplink $sexinfo --missing \ 388 | --out qcplink_missing 389 | 390 | # Create output links 391 | ln $data_path/qcplink_missing.imiss $script_dir/qcplink_miss.imiss 392 | 393 | echo "Complete" > qcplink_missing 394 | echo "Complete" > qcplink_missing1 395 | echo "Complete" > qcplink_missing2 396 | """ 397 | } 398 | 399 | //---- Process 7 ------------------------------------------------------------// 400 | 401 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 402 | script_path = Channel.fromPath(params.script_path, type : 'dir') 403 | 404 | /* Process to calculate the heterozygosity for the samples. 405 | * 406 | * Inputs: 407 | * - qcplink_log2 : The log file previously generated from running plink. 408 | * - filename : The name of the plink input files wo extension 409 | * - container : The name of the docker container to use 410 | * - data_path : The path to the plink data 411 | * - script_path : The path ot the scripts 412 | * - mountpoint : The mountpoint of the data in the container 413 | * - sexinfo : The command to add to plink for sexinfo availability 414 | * 415 | * Outputs: 416 | * - qcplink_het* : Information about the heterozygosity. Again multiple 417 | * so that multiple processes can start. 
418 | * 419 | * NOTES : The qcplink_log file is used as a 'start parameter', since when a 420 | * nextflow process uses the output of another process as input, the 421 | * process will only run once that input has become available. If we 422 | * did not do this, then the process would try and run concurrently at 423 | * the start, which would not work since the input data would not be 424 | * ready. 425 | */ 426 | process calculateSampleHetrozygosity { 427 | input: 428 | file qcplink_log2 from receiver 429 | val container from params.dock_container 430 | val data_path from plink_data_path 431 | val script_dir from script_path 432 | val mountpoint from params.dock_mpoint 433 | val sexinfo from params.sexinfo_command 434 | 435 | output: 436 | file 'qcplink_het' 437 | file 'qcplink_het1' 438 | 439 | script: 440 | """ 441 | if [[ -s qcplink_log2 ]]; then 442 | echo 'Plink log received, can continue!' 443 | fi 444 | 445 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 446 | $container plink --bfile qcplink $sexinfo --het \ 447 | --out qcplink_het 448 | 449 | # Link the result in the data path to the output stream 450 | ln $data_path/qcplink_het.het $script_dir/qcplink_het.het 451 | 452 | echo "Complete" > qcplink_het 453 | echo "Complete" > qcplink_het1 454 | """ 455 | } 456 | 457 | //---- Process 8 ------------------------------------------------------------// 458 | 459 | script_path = Channel.fromPath(params.script_path, type : 'dir') 460 | 461 | /* Process to generate plots for the missingness and heterozygosity. 462 | * 463 | * Inputs: 464 | * - qcplink_missing : Link to the missingness data 465 | * - qcplink_het : Link to the heterozygosity data 466 | * - script_dir : Script directory to find scripts 467 | * - container : Docker container to use 468 | * - mountpoint : Mountpoint in container 469 | * 470 | * Outputs: 471 | * - qcplink_missing : Results for the missingness. 472 | * - qcplink_het : Results for the heterozygosity. 
473 | * - failed_miss_het : Failed results for the missingness and heterozygosity. 474 | */ 475 | process generateMissHetPlot { 476 | errorStrategy 'ignore' 477 | 478 | input: 479 | file qcplink_missing 480 | file qcplink_het 481 | val script_dir from script_path 482 | val container from params.dock_container 483 | val mountpoint from params.dock_mpoint 484 | 485 | output: 486 | file 'failed_miss_het' 487 | 488 | script: 489 | """ 490 | if [[ qcplink_missing ]]; then 491 | echo "Missingness available" 492 | fi 493 | 494 | if [[ qcplink_het ]]; then 495 | echo "Heterozygosity available" 496 | fi 497 | 498 | docker run -v $script_dir:/$mountpoint -w /$mountpoint \ 499 | $container Rscript miss_het_plot_qcplink.R 500 | 501 | # Create a link which is the output file 502 | ln $script_dir/fail_miss_het_qcplink.txt failed_miss_het 503 | """ 504 | } 505 | 506 | //---- Process 9 ------------------------------------------------------------// 507 | 508 | script_path = Channel.fromPath(params.script_path, type : 'dir') 509 | 510 | /* 511 | * Process to find individuals with extreme missingness and heterozygosity 512 | * scores. 513 | * 514 | * Inputs: 515 | * - qcplink_missing1 : A link to the missingness file 516 | * - qcplink_het1 : A link to the heterozygosity file 517 | * - script_dir : The scripts directory 518 | * - container : The docker container to use 519 | * - mountpoint : The mountpoint in the container 520 | * - cut_het_high : The high values for heterozygosity 521 | * - cut_het_low : The low values for heterozygosity 522 | * - cut_miss : The missingness rate 523 | * 524 | * Outputs: 525 | * - None, the results are written to the scripts directory. 
526 | */ 527 | process findIndivWithHighMissExtremeHet { 528 | input: 529 | file qcplink_missing1 530 | file qcplink_het1 531 | val script_dir from script_path 532 | val container from params.dock_container 533 | val mountpoint from params.dock_mpoint 534 | val cut_het_high from params.cut_het_high 535 | val cut_het_low from params.cut_het_low 536 | val cut_miss from params.cut_miss 537 | 538 | output: 539 | stdout 'result' 540 | 541 | script: 542 | """ 543 | if [[ qcplink_missing1 ]]; then 544 | echo "Missing file available" 545 | fi 546 | 547 | if [[ qcplink_het1 ]]; then 548 | echo "Heterozygosity file available" 549 | fi 550 | 551 | docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \ 552 | perl select_miss_het_qcplink.pl $cut_het_high $cut_het_low $cut_miss 553 | """ 554 | } 555 | 556 | //---- Process 10 -----------------------------------------------------------// 557 | 558 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 559 | 560 | /* Process to prune for IBD. 561 | * 562 | * Inputs: 563 | * - qcplink_log3 : File specifying the the plink input files are ready. 564 | * - high_ld_file : File specifying high ld regions to exclude 565 | * - container : The docker container to use 566 | * - data_path : The path where the data is, mounted onto container 567 | * - mountpoint : The location on the container where data is mounted 568 | * - sexinfo : Command to add based on sexinfo availability 569 | * 570 | * Outputs: 571 | * - qcplink_ibd_prune_status* : 572 | * The status of the process, when complete this file is created. 573 | * 574 | * NOTES : Plink data is written to the data_path directory. 
575 | */ 576 | process pruneForIBD { 577 | input: 578 | file qcplink_log3 from receiver 579 | val high_ld_file from params.high_ld_regions_fname 580 | val container from params.dock_container 581 | val data_path from plink_data_path 582 | val mountpoint from params.dock_mpoint 583 | val sexinfo from params.sexinfo_command 584 | 585 | output: 586 | file 'qcplink_ibd_prune_status' 587 | file 'qcplink_ibd_prune_status1' 588 | 589 | script: 590 | """ 591 | if [[ -s qcplink_log3 ]]; then 592 | echo 'Qcplink log received, pruning IBD' 593 | fi 594 | 595 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 596 | $container plink --bfile qcplink $sexinfo --exclude $high_ld_file \ 597 | --range --indep-pairwise 50 5 0.2 --out qcplink_ibd 598 | 599 | echo 'Complete' > qcplink_ibd_prune_status 600 | ln qcplink_ibd_prune_status qcplink_ibd_prune_status1 601 | """ 602 | } 603 | 604 | //---- Process 11 -----------------------------------------------------------// 605 | 606 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 607 | 608 | /* Process to calculate the IBD. 609 | * 610 | * Inputs: 611 | * - qc_plink_ibd_prune_status : The status of the pruning process. 612 | * - container : The docker container to use 613 | * - data_path : The path where the data is, mounted onto container 614 | * - mountpoint : The location on the container where data is mounted 615 | * - sexinfo : Command to add based on sexinfo availability 616 | * 617 | * Outputs: 618 | * - None : Output files are written to the data_path directory. 
 */
process calculateIBD {
  input:
  file qcplink_ibd_prune_status
  val container from params.dock_container
  val data_path from plink_data_path
  val mountpoint from params.dock_mpoint
  val sexinfo from params.sexinfo_command

  output:
  stdout 'result'

  script:
  // Computes pairwise IBD estimates (--genome) over the LD-pruned SNP set
  // produced by pruneForIBD; results land in data_path as qcplink_ibd.genome.
  """
  if [[ -s qcplink_ibd_prune_status ]]; then
    echo "IBD Prune status file received, calculating IBD"
  fi

  docker run -v $data_path:/$mountpoint -w /$mountpoint \
    $container plink --bfile qcplink $sexinfo --extract qcplink_ibd.prune.in \
    --genome --out qcplink_ibd
  """
}

//---- Process 12 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to calculate the IBD with Min Pi Hat.
 *
 * Inputs:
 *  - qc_plink_ibd_prune_status1 : The status of the pruning process.
 *  - container : The docker container to use
 *  - data_path : The path where the data is, mounted onto container
 *  - mountpoint : The location on the container where data is mounted
 *  - sexinfo : Command to add based on sexinfo availability
 *
 * Outputs:
 *  - qcplink_ibd_min_004* : The IBD results from plink.
657 | */ 658 | process calculateIBDMinPiHat { 659 | input: 660 | file qcplink_ibd_prune_status1 661 | val container from params.dock_container 662 | val data_path from plink_data_path 663 | val mountpoint from params.dock_mpoint 664 | val sexinfo from params.sexinfo_command 665 | 666 | output: 667 | file 'qcplink_ibd_min_004' 668 | file 'qcplink_ibd_min_0041' 669 | 670 | script: 671 | """ 672 | if [[ -s qcplink_ibd_prune_status1 ]]; then 673 | echo "IBD prune status recieved" 674 | fi 675 | 676 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 677 | $container plink --bfile qcplink $sexinfo --extract qcplink_ibd.prune.in \ 678 | --genome --min 0.04 --out qcplink_ibd_min_0_04 679 | 680 | ln $data_path/qcplink_ibd_min_0_04.genome qcplink_ibd_min_004 681 | ln $data_path/qcplink_ibd_min_0_04.genome qcplink_ibd_min_0041 682 | """ 683 | } 684 | 685 | //---- Process 13 -----------------------------------------------------------// 686 | 687 | /* Proces to sort the results from runnning IBD Min Pi hat. 688 | * 689 | * Inputs: 690 | * -qcplink_ibd_min_004 : The input file to sort. 691 | * 692 | * Outputs: 693 | * - qc_plink_ibd_min_004_sorted_pihat.txt : The sorted results. 694 | */ 695 | process sortByPiHat { 696 | input: 697 | file qcplink_ibd_min_004 698 | 699 | output: 700 | file 'qcplink_ibd_min_0_04_sorted_pihat.txt' 701 | 702 | """ 703 | sort -k10n qcplink_ibd_min_004 > qcplink_ibd_min_0_04_sorted_pihat.txt 704 | """ 705 | } 706 | 707 | //---- Process 14 -----------------------------------------------------------// 708 | 709 | script_path = Channel.fromPath(params.script_path, type : 'dir') 710 | 711 | /* Filters all the related individuals. 712 | * 713 | * Inputs: 714 | * - qcplink_missing2 : A link to the missingness file 715 | * - qcplink_ibd_min_0041 : A link to the ind file. 716 | * - script_dir : The scripts directory 717 | * - container : The docker container to use 718 | * - mountpoint : The directory in the conmtainer to mount to. 
 *
 * Outputs:
 *  - None : Results are written to the scripts directory.
 */
/* NOTE(review): this process is intentionally disabled (commented out).
 * It links the missingness and IBD results into the scripts directory under
 * the extensions the perl helper expects, then runs run_IBD_QC_qcplink.pl
 * to select one member of each related pair for removal. Re-enable by
 * removing the surrounding comment markers.
 */
/*
process filterRelatedIndiv {
  errorStrategy 'ignore'

  input:
  file qcplink_missing2
  file qcplink_ibd_min_0041
  val script_dir from script_path
  val container from params.dock_container
  val mountpoint from params.dock_mpoint

  output:
  stdout 'result'

  script:
  """
  # Check that there are no old links
  if [[ -e $script_dir/qcplink_missing.imiss ]]; then
    rm $script_dir/qcplink_missing.imiss
  fi

  if [[ -e $script_dir/qcplink_genome.genome ]]; then
    rm $script_dir/qcplink_genome.genome
  fi

  # Make a link for the missing file so that the file has .imiss ext
  if [[ -s qcplink_missing2 ]]; then
    ln qcplink_missing2 $script_dir/qcplink_missing.imiss
  fi

  if [[ -s qcplink_ibd_min_0041 ]]; then
    ln qcplink_ibd_min_0041 $script_dir/qcplink_genome.genome
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    perl run_IBD_QC_qcplink.pl qcplink_missing qcplink_genome
  """
}
*/

//---- Process 15 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to join the failed individuals into a single file.
 *
 * Inputs:
 *  - failed_miss_het : The failed missingness and heterozygosity results.
 *  - failed_sexcheck : The failed sex stat results.
 *
 * Outputs:
 *  - failed_qc_plink_inds : The combined failed results.
 */
process joinQcplinkFailedIndivIntoSingleFile {
  input:
  file failed_miss_het
  file failed_sexcheck
  val data_path from plink_data_path

  output:
  // Status marker only; the de-duplicated list itself is written to
  // $data_path/qcplink_failed_inds for removeQcPlinkFailedIndiv to consume.
  file 'failed_qc_plink_inds'

  script:
  // Concatenates the two failure lists, sorts, and drops duplicate
  // individuals that failed more than one check.
  """
  cat failed_sexcheck failed_miss_het | sort -k1 | \
    uniq > $data_path/qcplink_failed_inds

  echo "Complete" > failed_qc_plink_inds
  """
}


//---- Process 16 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to remove all failed individuals.
 *
 * Inputs:
 *  - failed_qc_plink_inds : The failed individuals to remove.
 *  - script_dir : The scripts directory
 *  - container : The docker container to use
 *  - mountpoint : The directory in the container to mount to.
 *
 * Outputs:
 *  - qced_qcplink_status* : The output file indicating that the process is done.
 */
process removeQcPlinkFailedIndiv {
  input:
  file failed_qc_plink_inds
  val container from params.dock_container
  val data_path from plink_data_path
  val mountpoint from params.dock_mpoint
  val sexinfo from params.sexinfo_command

  output:
  // Six identical status files: a DSL1 output channel can only be consumed
  // once, so one copy is emitted for each downstream SNP-QC process
  // (maf, missingness, differential missingness, hwe, snp removal, xchr).
  file 'qced_qcplink_status1'
  file 'qced_qcplink_status2'
  file 'qced_qcplink_status3'
  file 'qced_qcplink_status4'
  file 'qced_qcplink_status5'
  file 'qced_qcplink_status6'

  script:
  // Removes the individuals listed in qcplink_failed_inds (written by
  // joinQcplinkFailedIndivIntoSingleFile into data_path) and writes the
  // cleaned dataset as qc_plink_clean_inds in data_path.
  """
  # Make a link in the data_path directory for the failed indices
  if [[ -s failed_qc_plink_inds ]]; then
    echo "Failed inds input available"
  fi

  docker run -v $data_path:/$mountpoint -w /$mountpoint \
    $container plink --noweb --bfile qcplink $sexinfo --remove \
    qcplink_failed_inds --make-bed --out qc_plink_clean_inds

  # Create output files
  echo 'Qced complete' > qced_qcplink_status1
  echo 'Qced complete' > qced_qcplink_status2
  echo 'Qced complete' > qced_qcplink_status3
  echo 'Qced complete' > qced_qcplink_status4
  echo 'Qced complete' > qced_qcplink_status5
  echo 'Qced complete' > qced_qcplink_status6
  """
}

//---- Process 17 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')
script_path = Channel.fromPath(params.script_path, type : 'dir')

/* Process to calculate the Maf results.
 *
 * Inputs:
 *  - qced_qcplink_status1 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - script_dir : The directory where the scripts are.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
861 | * 862 | * Outputs: 863 | * - qxced_clean_inds_freq : The output results for Maf calculation. 864 | */ 865 | process calculateMaf { 866 | input: 867 | file qced_qcplink_status1 868 | val container from params.dock_container 869 | val data_path from plink_data_path 870 | val script_dir from script_path 871 | val mountpoint from params.dock_mpoint 872 | val sexinfo from params.sexinfo_command 873 | 874 | output: 875 | file 'qced_clean_inds_freq' 876 | 877 | script: 878 | """ 879 | if [[ -s qced_qcplink_status1 ]]; then 880 | echo "Input available, can calculate maf" 881 | fi 882 | 883 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 884 | $container plink --noweb --bfile qc_plink_clean_inds $sexinfo \ 885 | --freq --out qc_plink_clean_inds_freq 886 | 887 | ln $data_path/qc_plink_clean_inds_freq.frq \ 888 | $script_dir/qced_clean_inds_freq.frq 889 | 890 | echo "Complete" > qced_clean_inds_freq 891 | """ 892 | } 893 | 894 | //---- Process 18 -----------------------------------------------------------// 895 | 896 | script_path = Channel.fromPath(params.script_path, type : 'dir') 897 | 898 | /* Process to generate the Maf plot. 899 | * 900 | * Inputs: 901 | * - qced_clean_inds_freq : A link to the input data from the calculateMaf 902 | * process. 903 | * - container : The docker container to use 904 | * - mountpoint : The directory in the conmtainer to mount to. 905 | * - script_dir : The directory where scripts are. 906 | * 907 | * Outputs: 908 | * - generate_maf_status : The status of the process. 
 */
process generateMafPlot {
  input:
  file qced_clean_inds_freq
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_maf_status'

  script:
  // Runs the maf_plot_qcplink.R script against the .frq file that
  // calculateMaf linked into the scripts directory; the plot is written
  // there as well.
  """
  if [[ -s qced_clean_inds_freq ]]; then
    echo "Input available"
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript maf_plot_qcplink.R

  echo "Complete" > generate_maf_status
  """
}

//---- Process 19 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')
script_path = Channel.fromPath(params.script_path, type : 'dir')

/* Process to calculate the snp missingness.
 *
 * Inputs:
 *  - qced_qcplink_status2 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - script_dir : The directory where the scripts are.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
947 | * 948 | * Outputs: 949 | * - qxced_clean_inds_missing : The output results for missingness calculation 950 | */ 951 | process calculateSnpMissigness { 952 | input: 953 | file qced_qcplink_status2 954 | val container from params.dock_container 955 | val data_path from plink_data_path 956 | val script_dir from script_path 957 | val mountpoint from params.dock_mpoint 958 | val sexinfo from params.sexinfo_command 959 | 960 | output: 961 | file 'qced_clean_inds_missing' 962 | 963 | script: 964 | """ 965 | if [[ -s qced_qcplink_status2 ]]; then 966 | echo "Input available, can calculate missingness" 967 | fi 968 | 969 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 970 | $container plink --bfile qc_plink_clean_inds $sexinfo --missing \ 971 | --out qc_plink_clean_inds_missing 972 | 973 | ln $data_path/qc_plink_clean_inds_missing.lmiss \ 974 | $script_dir/clean_inds_qcplink_missing.lmiss 975 | 976 | echo "Complete" > qced_clean_inds_missing 977 | """ 978 | } 979 | 980 | //---- Process 20 -----------------------------------------------------------// 981 | 982 | script_path = Channel.fromPath(params.script_path, type : 'dir') 983 | 984 | /* Proces to generate a plot of the missingness results. 985 | * 986 | * Inputs: 987 | * - qced_clean_inds_missing : A link to the input data from the missingness 988 | * calculatio process. 989 | * - container : The docker container to use 990 | * - mountpoint : The directory in the conmtainer to mount to. 991 | * - script_dir : The directory where scripts are. 992 | * 993 | * Outputs: 994 | * - generate_missingness_status : The status of the missingness plot 995 | * generation. 
 */
process generateSnpMissingnessPlot {
  input:
  file qced_clean_inds_missing
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_snp_missingness_status'

  script:
  // Runs snpmiss_plot_qcplink.R against the .lmiss file linked into the
  // scripts directory by calculateSnpMissigness.
  """
  if [[ -s qced_clean_inds_missing ]]; then
    echo 'Finished calculating snp missingness, now plotting'
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript snpmiss_plot_qcplink.R

  echo "Complete" > generate_snp_missingness_status
  """
}

//---- Process 21 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')
script_path = Channel.fromPath(params.script_path, type : 'dir')

/* Process to calculate the snp differential missingness.
 *
 * Inputs:
 *  - qced_qcplink_status3 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - script_dir : The path to the scripts
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
 *
 * Outputs:
 *  - qced_clean_inds_test_missing* : The results of the process.
1037 | */ 1038 | process calculateSnpDifferentialMissingness { 1039 | input: 1040 | file qced_qcplink_status3 1041 | val container from params.dock_container 1042 | val data_path from plink_data_path 1043 | val script_dir from script_path 1044 | val mountpoint from params.dock_mpoint 1045 | val sexinfo from params.sexinfo_command 1046 | 1047 | output: 1048 | file 'qced_clean_inds_test_missing1' 1049 | file 'qced_clean_inds_test_missing2' 1050 | 1051 | script: 1052 | """ 1053 | if [[ -s qced_qcplink_status3 ]]; then 1054 | echo "Input available, can calculate differential missingness" 1055 | fi 1056 | 1057 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1058 | $container plink --bfile qc_plink_clean_inds $sexinfo --missing \ 1059 | --out qc_plink_clean_inds_test_missing 1060 | 1061 | ln $data_path/qc_plink_clean_inds_test_missing.lmiss \ 1062 | $script_dir/clean_inds_qcplink_test_missing.missing 1063 | 1064 | echo "Complete" > qced_clean_inds_test_missing1 1065 | echo "Complete" > qced_clean_inds_test_missing2 1066 | """ 1067 | } 1068 | 1069 | //---- Process 22 -----------------------------------------------------------// 1070 | 1071 | script_path = Channel.fromPath(params.script_path, type : 'dir') 1072 | 1073 | /* Process to generate a plot for the differential missngness. 1074 | * 1075 | * Inputs: 1076 | * - qced-clean_inds_test_missing1 : The results to use to generate the plot. 1077 | * - container : The docker container to use 1078 | * - mountpoint : The directory in the conmtainer to mount to. 1079 | * - script_dir : The directory where scripts are. 1080 | * 1081 | * Outputs: 1082 | * - generate_diff_miss_status : The status of the plot generation. 
 *
 * NOTES : Specifying "ignore" for the error strategy allows the pipeline to
 * continue but still reports an error -- remove if this is not desired
 */
process generateDifferentialMissingnessPlot {
  errorStrategy 'ignore'

  input:
  file qced_clean_inds_test_missing1
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_diff_miss_status'

  script:
  // Runs diffmiss_plot_qcplink.R against the differential-missingness
  // results linked into the scripts directory by the previous process.
  """
  if [[ -s qced_clean_inds_test_missing1 ]]; then
    echo "Input data available"
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript diffmiss_plot_qcplink.R

  echo "Complete" > generate_diff_miss_status
  """
}

//---- Process 23 -----------------------------------------------------------//

script_path = Channel.fromPath(params.script_path, type : 'dir')
plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to find snps with extreme differential missingness.
 *
 * Inputs:
 *  - qced_clean_inds_test_staus2 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
 *  - cut_diff_miss : The value to use to evaluate diff miss.
 *
 * Outputs:
 *  - failed_diffmiss : The failed results for the process.
1129 | */ 1130 | process findSnpExtremeDifferentialMissingness { 1131 | input: 1132 | file qced_clean_inds_test_missing2 1133 | val container from params.dock_container 1134 | val mountpoint from params.dock_mpoint 1135 | val data_path from plink_data_path 1136 | val script_dir from script_path 1137 | val cut_diff_miss from params.cut_diff_miss 1138 | 1139 | output: 1140 | file 'failed_diffmiss' 1141 | 1142 | script: 1143 | """ 1144 | if [[ qced_clean_inds_test_missing2 ]]; then 1145 | echo "Input data available" 1146 | fi 1147 | 1148 | docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \ 1149 | perl select_diffmiss_qcplink.pl $cut_diff_miss 1150 | 1151 | ln $script_dir/fail_diffmiss_qcplink.txt \ 1152 | $data_path/fail_diffmiss_qcplink.txt 1153 | 1154 | echo "Complete" > failed_diffmiss 1155 | """ 1156 | } 1157 | 1158 | //---- Process 24 -----------------------------------------------------------// 1159 | 1160 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1161 | 1162 | /* Process to find snps with extreme Hardy Weinburg deviations. 1163 | * 1164 | * Inputs: 1165 | * - qced_qcplink_status4 : The file indicating input data is available. 1166 | * - container : The docker container to use 1167 | * - data_path : The path the input data. 1168 | * - mountpoint : The directory in the conmtainer to mount to. 1169 | * - sexinfo : The command to add for sexinfo. 1170 | * 1171 | * Outputs: 1172 | * - qced_clean_inds_hwe : The results with extreme hwe deviations. 
1173 | */ 1174 | process findSnpsExtremeHweDeviations { 1175 | input: 1176 | file qced_qcplink_status4 1177 | val container from params.dock_container 1178 | val data_path from plink_data_path 1179 | val mountpoint from params.dock_mpoint 1180 | val sexinfo from params.sexinfo_command 1181 | 1182 | output: 1183 | file 'qced_clean_inds_hwe' 1184 | 1185 | script: 1186 | """ 1187 | if [[ -s qced_qcplink_status4 ]]; then 1188 | echo "Input available, can find extreme hew variations" 1189 | fi 1190 | 1191 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1192 | $container plink --bfile qc_plink_clean_inds $sexinfo --hardy \ 1193 | --out qc_plink_clean_inds_hwe 1194 | 1195 | ln $data_path/qc_plink_clean_inds_hwe.hwe qced_clean_inds_hwe 1196 | """ 1197 | } 1198 | 1199 | //---- Process 25 -----------------------------------------------------------// 1200 | 1201 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1202 | script_path = Channel.fromPath(params.script_path, type : 'dir') 1203 | 1204 | /* Process to find unaffected from HWE. 1205 | * 1206 | * Inputs: 1207 | * - qced_clean_inds_hwe : The hwe results from the previous process. 1208 | * - data_path : The path to all data. 1209 | * - script_dir : The directory where the scripts are. 1210 | * 1211 | * Outputs: 1212 | * - qced_clean_inds_hweu : The results for those unaffected from HWE. 
1213 | */ 1214 | process findUnaffectedForHwePlot { 1215 | input: 1216 | file qced_clean_inds_hwe 1217 | val data_path from plink_data_path 1218 | val script_dir from script_path 1219 | 1220 | output: 1221 | file 'qced_clean_inds_hweu' 1222 | 1223 | script: 1224 | """ 1225 | if [[ -s qced_clean_inds_hwe ]]; then 1226 | echo "Prev stage complete, continuing" 1227 | fi 1228 | 1229 | head -1 $data_path/qc_plink_clean_inds_hwe.hwe \ 1230 | > $script_dir/clean_inds_qcplink_hweu.hwe | \ 1231 | grep 'UNAFF' $data_path/qc_plink_clean_inds_hwe.hwe \ 1232 | >> $script_dir/clean_inds_qcplink_hweu.hwe 1233 | 1234 | echo "Complete" > qced_clean_inds_hweu 1235 | """ 1236 | } 1237 | 1238 | //---- Process 26 -----------------------------------------------------------// 1239 | 1240 | script_path = Channel.fromPath(params.script_path, type : 'dir') 1241 | 1242 | /* Process to generate a plot for the HWE results. 1243 | * 1244 | * Inputs: 1245 | * qced_clean_inds_hweu : The result of those unaffected from HWE. 1246 | * - container : The docker container to use 1247 | * - mountpoint : The directory in the conmtainer to mount to. 1248 | * - scipt_dir : The directory where the scripts are. 1249 | * 1250 | * Outputs: 1251 | * - generate_hwe_status : The status of the plot generation. 
 */
process generateHwePlot {
  input:
  file qced_clean_inds_hweu
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_hwe_status'

  script:
  // Runs hwe_plot_qcplink.R against the unaffected-only HWE file written
  // into the scripts directory by findUnaffectedForHwePlot.
  """
  if [[ -s qced_clean_inds_hweu ]]; then
    echo "Input available"
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript hwe_plot_qcplink.R

  echo "Complete" > generate_hwe_status
  """
}

//---- Process 27 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to remove snps which failed QC.
 *
 * Inputs:
 *  - qced_qcplink_status5 : The file indicating input data is available.
 *  - failed_diffmiss : The file with the failed diffmiss results.
 *  - cut_maf : Value for maf cut.
 *  - cut_geno : Value of genome cut.
 *  - cut_hwe : Value for hwe cut.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
 *
 * Outputs:
 *  - None : Results are written to the data_path directory.
1295 | * 1296 | * NOTES : Specifying "ignore" for the error strategy allows the pipeline to 1297 | * continue but still reports an error -- remove if this is not desired 1298 | */ 1299 | process removeSnpsFailingQc { 1300 | errorStrategy 'ignore' 1301 | 1302 | input: 1303 | file qced_qcplink_status5 1304 | file failed_diffmiss 1305 | val cut_maf from params.cut_maf 1306 | val cut_geno from params.cut_genome 1307 | val cut_hwe from params.cut_hwe 1308 | val container from params.dock_container 1309 | val data_path from plink_data_path 1310 | val mountpoint from params.dock_mpoint 1311 | val sexinfo from params.sexinfo_command 1312 | 1313 | output: 1314 | stdout 'result' 1315 | 1316 | script: 1317 | """ 1318 | if [[ -s qced_qcplink_status5 ]]; then 1319 | echo "Input available, can find extreme hew variations" 1320 | fi 1321 | 1322 | if [[ -s failed_diffmis ]]; then 1323 | echo "Dffmiss available" 1324 | fi 1325 | 1326 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1327 | $container plink --bfile qc_plink_clean_inds $sexinfo \ 1328 | --maf $cut_maf --geno $cut_geno --exclude fail_diffmiss_qcplink.txt \ 1329 | --hwe $cut_hwe --make-bed --out qc_plink_cleaned 1330 | """ 1331 | } 1332 | 1333 | //---- Process 28 -----------------------------------------------------------// 1334 | 1335 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1336 | 1337 | /* Process to find Xchr snps. 1338 | * 1339 | * Inputs: 1340 | * - qced_qcplink_status6 : The file indicating input data is available. 1341 | * - container : The docker container to use 1342 | * - data_path : The path the input data. 1343 | * - mountpoint : The directory in the conmtainer to mount to. 1344 | * - sexinfo : The command to add for sexinfo. 1345 | * 1346 | * Outputs: 1347 | * - xsnps_staus : The status of the process. 
1348 | * 1349 | * NOTES : Specifying "ignore" for the error strategy allows the pipeline to 1350 | * continue but still reports an error -- remove if this is not desired 1351 | */ 1352 | process findXchrSnps { 1353 | errorStrategy 'ignore' 1354 | 1355 | input: 1356 | file qced_qcplink_status6 1357 | val container from params.dock_container 1358 | val data_path from plink_data_path 1359 | val mountpoint from params.dock_mpoint 1360 | val sexinfo from params.sexinfo_command 1361 | 1362 | output: 1363 | file "xsnps_status" 1364 | 1365 | script: 1366 | """ 1367 | if [[ -s qced_qcplink_status6 ]]; then 1368 | echo "Input available, can find extreme hew variations" 1369 | fi 1370 | 1371 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1372 | $container plink --bfile qc_plink_clean_inds --chr 23 \ 1373 | --make-bed --out xsnps 1374 | 1375 | echo "Complete" > xsnps_status 1376 | """ 1377 | } 1378 | 1379 | //---- Process 29 -----------------------------------------------------------// 1380 | 1381 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1382 | 1383 | /* Process to remove Xchr snps. 1384 | * 1385 | * Inputs: 1386 | * - xsnps_status : The file indicating that the process can start. 1387 | * - cut_maf : Value for maf cut. 1388 | * - cut_geno : Value of genome cut. 1389 | * - container : The docker container to use 1390 | * - data_path : The path the input data. 1391 | * - mountpoint : The directory in the conmtainer to mount to. 1392 | * - sexinfo : The command to add for sexinfo. 1393 | * 1394 | * Outputs: 1395 | * - None : Results are written to the data_path directory. 
 */
process removeXchrSnps {
  input:
  file xsnps_status
  val cut_maf from params.cut_maf
  val cut_geno from params.cut_genome
  val container from params.dock_container
  val data_path from plink_data_path
  val mountpoint from params.dock_mpoint
  val sexinfo from params.sexinfo_command

  output:
  stdout 'result'

  script:
  // Excludes the X-chromosome SNPs identified by findXchrSnps (via the
  // xsnps.bim variant list) while applying the maf/geno cut-offs; the
  // final dataset is written to data_path as xsnps_removed.
  """
  if [[ -s xsnps_status ]]; then
    echo "Have input data"
  fi

  docker run -v $data_path:/$mountpoint -w /$mountpoint \
    $container plink --bfile qc_plink_clean_inds $sexinfo \
    --maf $cut_maf --geno $cut_geno --exclude xsnps.bim \
    --make-bed --out xsnps_removed
  """
}