├── .gitattributes ├── dockerized ├── multi_cpu_local.config ├── gwasdata │ ├── .DS_Store │ └── plink │ │ ├── .DS_Store │ │ └── high_LD_regions.txt ├── example_config.config ├── scripts │ ├── hwe_plot_qcplink.R │ ├── maf_plot_qcplink.R │ ├── snpmiss_plot_qcplink.R │ ├── select_diffmiss_qcplink.pl │ ├── diffmiss_plot_qcplink.R │ ├── select_miss_het_qcplink.pl │ ├── run_IBD_QC_qcplink.pl │ └── miss_het_plot_qcplink.R ├── dockerfile_witsgwas_container │ └── Dockerfile └── witsgwas_dockerized_pipeline.nf ├── QuickstartUserInput.py ├── LICENSE ├── README.md ├── pipeline_quickstart_stages_config.py ├── pipeline_quickstart_config.py ├── pipeline_quickstart.py ├── cluster_job.py └── cluster_job_edited_for_witsGWAS.py /.gitattributes: -------------------------------------------------------------------------------- 1 | dockerized/* linguist-vendored 2 | -------------------------------------------------------------------------------- /dockerized/multi_cpu_local.config: -------------------------------------------------------------------------------- 1 | process.cpus = '8' 2 | process.memory = '16 GB' 3 | -------------------------------------------------------------------------------- /dockerized/gwasdata/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magosil86/witsGWAS/HEAD/dockerized/gwasdata/.DS_Store -------------------------------------------------------------------------------- /dockerized/gwasdata/plink/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magosil86/witsGWAS/HEAD/dockerized/gwasdata/plink/.DS_Store -------------------------------------------------------------------------------- /dockerized/example_config.config: -------------------------------------------------------------------------------- 1 | process.executor = 'pbs' 2 | process.queue = 'WitsLong' 3 | process.memory = '16 GB' 4 | process.time = '6h' 5 | 
process.cpus = '8' 6 | -------------------------------------------------------------------------------- /dockerized/scripts/hwe_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load HWE P-value file and generate frequency_distribution 2 | b.frq <- read.table("clean_inds_qcplink_hweu.hwe",header=T) 3 | pdf("qcplink_plots/hwe_plot.pdf") 4 | b.frq$logP = log10(b.frq$P) 5 | plot(ecdf(b.frq$logP), xlim=c(-10,0),ylim=c(0,0.80),pch=20, main="HWE P-value",xlab="logP (HWE)", ylab="Fraction of SNPs",axes=T) 6 | -------------------------------------------------------------------------------- /dockerized/scripts/maf_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load SNP frequency file and generate cumulative frequency distribution 2 | b.frq <- read.table("qced_clean_inds_freq.frq",header=T) 3 | pdf("qcplink_plots/maf_plot.pdf") 4 | plot(ecdf(b.frq$MAF), xlim=c(0,0.10),ylim=c(0,1),pch=20, main="MAF cumulative distribution",xlab="Minor allele frequency (MAF)", ylab="Fraction of SNPs",axes=T) 5 | -------------------------------------------------------------------------------- /dockerized/scripts/snpmiss_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load SNP frequency file and generate histogram 2 | b.frq <- read.table("clean_inds_qcplink_missing.lmiss",header=T) 3 | pdf("qcplink_plots/snpmiss_plot.pdf") 4 | plot(ecdf(b.frq$F_MISS),xlim=c(0,0.10),ylim=c(0,1),pch=20, main="SNP Missingness Distribution", xlab="Missingness Frequency", ylab="Fraction of SNPs",col="blue",axes=T) 5 | -------------------------------------------------------------------------------- /dockerized/scripts/select_diffmiss_qcplink.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | open IN, '<', "clean_inds_qcplink_test_missing.missing" or die "Cannot open missing file 
\n"; 6 | open OUT, '>', "fail_diffmiss_qcplink.txt"; 7 | while(<IN>){ 8 | s/^\s+//; 9 | my @fields = split /\s+/, $_; 10 | unless($fields[0] eq 'CHR'){ 11 | if($fields[4] < $ARGV[0]){ 12 | print OUT "$fields[1]\n"; 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /QuickstartUserInput.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ QuickstartUserInput.py 4 | 5 | -Configuration file for the user to supply the projectname, author, 6 | and cutoffs specific to pipeline_quickstart.py 7 | ============================================================================= 8 | """ 9 | 10 | # settings for pipeline_quickstart.py 11 | #========================================== 12 | 13 | projectname = '' 14 | 15 | author = '' 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /dockerized/gwasdata/plink/high_LD_regions.txt: -------------------------------------------------------------------------------- 1 | 1 48000000 52000000 1 2 | 2 86000000 100500000 2 3 | 2 183000000 190000000 3 4 | 3 47500000 50000000 4 5 | 3 83500000 87000000 5 6 | 5 44500000 50500000 6 7 | 5 129000000 132000000 7 8 | 6 25500000 33500000 8 9 | 6 57000000 64000000 9 10 | 6 140000000 142500000 10 11 | 7 55000000 66000000 11 12 | 8 8000000 12000000 12 13 | 8 43000000 50000000 13 14 | 8 112000000 115000000 14 15 | 10 37000000 43000000 15 16 | 11 87500000 90500000 16 17 | 12 33000000 40000000 17 18 | 20 32000000 34500000 18 -------------------------------------------------------------------------------- /dockerized/scripts/diffmiss_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #Load SNP differential missingness file and generate distribution 2 | b.frq <- read.table("clean_inds_qcplink_test_missing.missing",header=T) 3 | if (nrow(b.frq) >= 1) { 4 | b.frq$logP = log10(b.frq$P) 5 | 
pdf("qcplink_plots/diffmiss_plot.pdf") 6 | plot(ecdf(b.frq$logP), xlim=c(-10,0),ylim=c(0,1),pch=20, main="Distribution of differential missingness P-values", xlab="logP Differential Missingness", ylab="Fraction of SNPs",col="red",axes=T) 7 | } else { 8 | print("No differential missingness info to plot")} -------------------------------------------------------------------------------- /dockerized/scripts/select_miss_het_qcplink.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | $cut_het_high=$ARGV[0]; 4 | $cut_het_low=$ARGV[1]; 5 | $cut_miss=$ARGV[2]; 6 | 7 | open(MISSFILE,"qcplink_miss.imiss"); 8 | open(HETFILE,"qcplink_het.het"); 9 | @all=<HETFILE>; 10 | chomp(@all); 11 | open(OUT,">fail_miss_het_qcplink.txt"); 12 | 13 | $line=0; 14 | while(<MISSFILE>){ 15 | chomp($_); 16 | 17 | if($line>=1){ 18 | chomp($_); 19 | @parts_miss=split(/\s+/,$_); 20 | $missing=$parts_miss[6]; 21 | 22 | @parts_het=split(/\s+/,$all[$line]); 23 | $meanHet=sprintf("%.3f", ($parts_het[5]-$parts_het[3])/$parts_het[5]); 24 | 25 | if($missing>$cut_miss or $meanHet>$cut_het_high or $meanHet<$cut_het_low){ 26 | print OUT $parts_miss[1],"\t",$parts_miss[2],"\t",$missing,"\t",$meanHet,"\n"; 27 | } 28 | } 29 | 30 | 31 | ++$line; 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Lerato E. 
Magosi and Scott Hazelhurst, Sydney Brenner Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /dockerized/scripts/run_IBD_QC_qcplink.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | 5 | my %imiss; 6 | my %removed; 7 | 8 | open IMISS, '<', $ARGV[0].".imiss" 9 | or die "Cannot open genotypes file (".$ARGV[0].".imiss): $!\n"; 10 | print "Reading PLINK .imiss file ".$ARGV[0].".imiss\n"; 11 | while(<IMISS>){ 12 | s/^\s+//; 13 | my @fields = split /\s+/, $_; 14 | $imiss{$fields[0]}{$fields[1]} = $fields[5]; 15 | } 16 | 17 | open GENOME, '<', $ARGV[1].".genome" 18 | or die "Cannot open genotypes file (".$ARGV[1].".genome): $!\n"; 19 | open OUT, '>', "fail_IBD_qcplink.txt"; 20 | print "Reading PLINK .genome file ".$ARGV[1].".genome\n"; 21 | while(<GENOME>){ 22 | s/^\s+//; 23 | my @fields = split /\s+/, $_; 24 | if($fields[9] > 0.185){ 25 | if($imiss{$fields[0]}{$fields[1]}>$imiss{$fields[2]}{$fields[3]}){ 26 | unless($removed{$fields[0]}{$fields[1]}){ 27 | print OUT "$fields[0] $fields[1]\n"; 28 | $removed{$fields[0]}{$fields[1]} = 1; 29 | } 30 | } 31 | elsif($imiss{$fields[0]}{$fields[1]}<$imiss{$fields[2]}{$fields[3]}){ 32 | unless($removed{$fields[2]}{$fields[3]}){ 33 | print OUT "$fields[2] $fields[3]\n"; 34 | $removed{$fields[2]}{$fields[3]} = 1; 35 | } 36 | } 37 | else{ 38 | unless($removed{$fields[0]}{$fields[1]}){ 39 | print OUT "$fields[0] $fields[1]\n"; 40 | $removed{$fields[0]}{$fields[1]} = 1; 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![witsgwas_banner2](https://cloud.githubusercontent.com/assets/8364031/9582190/13b1e182-5004-11e5-9336-8c030414e4bc.png) 2 | 3 | ## Background 4 | 5 | witsGWAS is a simple human GWAS analysis workflow built at the [Sydney Brenner Institute](https://www.wits.ac.za/research/sbimb/) for data quality control 
(QC) and basic association testing. It takes away the need for having to enter individual commands at the unix prompt and rather organizes GWAS tasks sequentially (facilitated via [Ruffus](http://www.ruffus.org.uk/)) for submission to a distributed PBS Torque cluster (managed via [Rubra](https://github.com/bjpop/rubra)). witsGWAS monitors (using flag files) the progress of jobs/tasks submitted to the cluster on behalf of the user, courteously waiting for one job to finish before sending another one 6 | 7 | ## Documentation 8 | 9 | Installation, Examples and tutorials for witsGWAS can be accessed at the [witsGWAS_wiki](https://github.com/magosil86/witsGWAS/wiki) 10 | 11 | ## Features 12 | 13 | **QC of Affymetrix array data** (SNP6 raw .CEL files) 14 | 15 | * genotype calling 16 | * converting birdseed calls to PLINK format 17 | 18 | **Sample and SNP QC of PLINK Binaries** 19 | 20 | Sample QC tasks checking: 21 | 22 | * discordant sex information 23 | * calculating missingness 24 | * heterozygosity scores 25 | * relatedness 26 | * divergent ancestry 27 | 28 | SNP QC tasks checking: 29 | 30 | * minor allele frequencies 31 | * SNP missingness 32 | * differential missingness 33 | * Hardy Weinberg Equilibrium deviations 34 | 35 | **Association testing** 36 | 37 | * Basic PLINK association tests, producing manhattan and qqplots 38 | * CMH association test - Association analysis, accounting for clusters 39 | * permutation testing 40 | * logistic regression 41 | * emmax association testing 42 | 43 | ### Dockerized Pipeline 44 | 45 | The pipeline has been 'dockerized', simplifying its use. See the Dockerized section on the [WitsGWAS 46 | Wiki](https://github.com/magosil86/witsGWAS/wiki) for more information. 47 | 48 | ### Authors 49 | 50 | Lerato E. Magosi, Scott Hazelhurst, Rob Clucas and the WITS Bioinformatics team 51 | 52 | ### License 53 | witsGWAS is offered under the MIT license. See LICENSE.txt. 
54 | 55 | ### Download 56 | [witsGWAS-0.1.0](https://github.com/magosil86/witsGWAS/releases) 57 | 58 | ### References 59 | Anderson, C. et al. Data quality control in genetic case-control association studies. Nature Protocols. 5, 1564-1573, 2010 60 | 61 | Sloggett, Clare; Wakefield, Matthew; Philip, Gayle; Pope, Bernard (2014): 62 | Rubra - flexible distributed pipelines. figshare. http://dx.doi.org/10.6084/m9.figshare.895626 63 | -------------------------------------------------------------------------------- /pipeline_quickstart_stages_config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ pipeline_quickstart_stages_config.py 4 | 5 | -Configuration file to set options specific to each stage/task in pipeline_quickstart.py 6 | ============================================================================= 7 | """ 8 | import os 9 | 10 | import QuickstartUserInput as I 11 | 12 | import WitsgwasSoftware as SW 13 | 14 | # python = SW.python 15 | # plink = SW.plink 16 | # plink1 = SW.plink1 17 | # perl = SW.perl 18 | # R = SW.R 19 | 20 | 21 | # stageDefaults contains the default options which are applied to each stage (command). 22 | # This section is required for every Rubra pipeline. 23 | # These can be overridden by options defined for individual stages, below. 24 | # Stage options which Rubra will recognise are: 25 | # - distributed: a boolean determining whether the task should be submitted to a cluster 26 | # job scheduling system (True) or run on the system local to Rubra (False). 27 | # - walltime: for a distributed PBS job, gives the walltime requested from the job 28 | # queue system; the maximum allowed runtime. For local jobs has no effect. 29 | # - memInGB: for a distributed PBS job, gives the memory in Gigabytes requested from the 30 | # job queue system. For local jobs has no effect. 31 | # - queue: for a distributed PBS job, this is the name of the queue to submit the 32 | # job to. 
For local jobs has no effect. This is currently a mandatory field for 33 | # distributed jobs, but can be set to None. 34 | # - modules: the modules to be loaded before running the task. This is intended for 35 | # systems with environment modules installed. Rubra will call module load on each 36 | # required module before running the task. Note that defining modules for individual 37 | # stages will override (not add to) any modules listed here. This currently only 38 | # works for distributed jobs. 39 | 40 | 41 | 42 | stageDefaults = { 43 | 'distributed': True, 44 | 'queue': 'WitsLong', 45 | 'walltime': "6:00:00", 46 | 'memInGB': 16, 47 | 'name': None, 48 | 'modules': [ 49 | # python, 50 | # plink, 51 | # perl, 52 | # R, 53 | 'gwaspipe', 54 | ] 55 | } 56 | 57 | 58 | 59 | # stages should hold the details of each stage which can be called by runStageCheck. 60 | # This section is required for every Rubra pipeline. 61 | # Calling a stage in this way carries out checkpointing and, if desired, batch job 62 | # submission. 63 | # Each stage must contain a 'command' definition. See stageDefaults above for other 64 | # allowable options. 
65 | 66 | 67 | stages = { 68 | 'task1': { 69 | "command": "" 70 | }, 71 | 'task2': { 72 | "command": "" 73 | }, 74 | 'task3': { 75 | 'command': "" 76 | }, 77 | } 78 | -------------------------------------------------------------------------------- /dockerized/scripts/miss_het_plot_qcplink.R: -------------------------------------------------------------------------------- 1 | #--INSPECT MISSINGNESS PATTERNS--# 2 | 3 | #IMPORT PLINK FILES WITH MISSINGNESS INFORMATION 4 | #requires the files qcplink_miss.imiss and qcplink_het.het to be present in the script folder 5 | 6 | imiss <- read.table("qcplink_miss.imiss",header=T) 7 | het <- read.table("qcplink_het.het",header=T) 8 | 9 | #CHECK THAT THE PROPORTION OF MISSING GENOTYPES IS NOT O 10 | #NOTE: IF F_MISS IS ZERO THEN WE ONLY PLOT MEAN HETEROZYGOSITY 11 | 12 | if (!(min(imiss$F_MISS) == 0 && max(imiss$F_MISS) == 0)) { 13 | 14 | 15 | #CALCULATE CALL RATE, LOG10(F_FMISS) and mean heterozygosity 16 | imiss$CALL_RATE <- 1-imiss$F_MISS 17 | imiss$logF_MISS = log10(imiss[,6]) 18 | het$meanHet = (het$N.NM. - het$O.HOM.)/het$N.NM. 
19 | het$meanHet <- ifelse(het$meanHet=="NaN", c(0),c(het$meanHet)) 20 | imiss.het <- merge(het,imiss,by=c("FID","IID")) 21 | 22 | #Print Heterozygosity cutoffs 23 | print(paste("cut_het_low: heterozygosity_mean - 3sd is ",sprintf("%.3f", mean(het$meanHet)-(3*sd(het$meanHet)), sep=""))) 24 | print(paste("cut_het_high: heterozygosity_mean + 3sd is ",sprintf("%.3f", mean(het$meanHet)+(3*sd(het$meanHet))), sep="")) 25 | 26 | #GENERATE CALL RATE BY HETEROZYGOSITY PLOT 27 | colors <- densCols(imiss$logF_MISS,het$meanHet) 28 | pdf("qcplink_plots/pairs.imiss-vs-het.pdf") 29 | #plot(imiss$logF_MISS,het$meanHet, col=colors, xlim=c(-3,0),ylim=c(0.26,0.35),pch=20, xlab="Proportion of missing genotypes", ylab="Heterozygosity rate", axes=F) 30 | plot(imiss$logF_MISS,het$meanHet, col=colors, xlim=c(-3,0),ylim=c(0,0.5), pch=20, xlab="Proportion of missing genotypes", ylab="Heterozygosity rate", axes=F) 31 | #axis(2,at=c(0.26,0.27,0.28, 0.29,0.3,0.31,0.32,0.33,0.34,0.35),tick=T) 32 | axis(2,at=c(0,0.05,0.10,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5),tick=T) 33 | axis(1,at=c(-3,-2,-1,0),labels=c(0.001,0.01,0.1,1)) 34 | #Heterozygosity thresholds (Horizontal Line) 35 | abline(h=mean(het$meanHet)-(3*sd(het$meanHet)),col="RED",lty=2) 36 | abline(h=mean(het$meanHet)+(3*sd(het$meanHet)),col="RED",lty=2) 37 | #Missing Data Thresholds (Vertical Line) 38 | abline(v=-1.30103, col="BLUE", lty=2) #THRESHOLD=0.07 39 | abline(v=-1.522879, col="RED", lty=2) #THRESHOLD=0.05 40 | 41 | } else { 42 | 43 | het$meanHet = (het$N.NM. - het$O.HOM.)/het$N.NM. 
44 | het$meanHet <- ifelse(het$meanHet=="NaN", c(0),c(het$meanHet)) 45 | 46 | #Print Heterozygosity cutoffs 47 | print(paste("cut_het_low: heterozygosity_mean - 3sd is ",sprintf("%.3f", mean(het$meanHet)-(3*sd(het$meanHet)), sep=""))) 48 | print(paste("cut_het_high: heterozygosity_mean + 3sd is ",sprintf("%.3f", mean(het$meanHet)+(3*sd(het$meanHet))), sep="")) 49 | 50 | pdf("qcplink_plots/meanhet_plot.pdf") 51 | plot(het$meanHet) 52 | abline(h=mean(het$meanHet)-(3*sd(het$meanHet)),col="RED",lty=2) 53 | abline(h=mean(het$meanHet)+(3*sd(het$meanHet)),col="RED",lty=2)} -------------------------------------------------------------------------------- /pipeline_quickstart_config.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ pipeline_quickstart_config.py 4 | 5 | -Configuration file to set input files, directories and parameters 6 | specific to pipeline_quickstart.py 7 | ============================================================================= 8 | """ 9 | 10 | import os 11 | import WitsgwasScripts as SC 12 | import QuickstartUserInput as I 13 | 14 | 15 | # This section is used by the pipeline_quickstart.py to specify input data and 16 | # working directories. 17 | 18 | # Required inputs: 19 | # 1. path to input1 20 | # 2. path to input2 21 | # 3. input3 22 | 23 | ''' 24 | note: project name will be used by the pipeline to generate a 25 | time stamped output directory ''' 26 | 27 | 28 | working_files = { 29 | } 30 | 31 | 32 | # This OPTIONAL section is used by the pipeline_quickstart.py to submit preselected user cutoffs 33 | 34 | preselected_cutoff = { 35 | } 36 | 37 | 38 | 39 | # This section is used by the pipeline_quickstart.py to specify configuration options 40 | # for itself (pipeline_quickstart.py) as well as Rubra. 41 | 42 | # Rubra variables: 43 | # - logDir: the directory where batch queue scripts, stdout and sterr dumps are stored. 
44 | # - logFile: the file used to log all jobs that are run. 45 | # - style: the default style, one of 'flowchart', 'print', 'run', 'touchfiles'. Can be 46 | # overridden by specifying --style on the command line. 47 | # - procs: the number of python processes to run simultaneously. This determines the 48 | # maximum parallelism of the pipeline. For distributed jobs it also constrains the 49 | # maximum total jobs submitted to the queue at any one time. 50 | # - verbosity: one of 0 (quiet), 1 (normal), 2 (chatty). Can be overridden by specifying 51 | # --verbose on the command line. 52 | # - end: the desired tasks to be run. Rubra will also run all tasks which are dependencies 53 | # of these tasks. Can be overridden by specifying --end on the command line. 54 | # - force: tasks which will be forced to run, regardless of timestamps. Can be overridden 55 | # by supplying --force on the command line. 56 | # - rebuild: one of 'fromstart','fromend'. Whether to calculate which dependencies will 57 | # be rerun by working back from an end task to the latest up-to-date task, or forward 58 | # from the earliest out-of-date task. 'fromstart' is the most conservative and 59 | # commonly used as it brings all intermediate tasks up to date. 
60 | 61 | 62 | # pipeline_quickstart variables: 63 | # nothing at this stage, but could be used to add more features in future 64 | 65 | pipeline = { 66 | 'logDir': os.path.join(SC.CURRENT_PROJECT_DIR, "log_quickstart"), 67 | 'logFile': 'pipeline_quickstart.log', 68 | 'style': 'print', 69 | 'procs': 30, 70 | 'verbose': 1, 71 | 'end': ['quickstart_end_task' 72 | ], 73 | 'force': [], 74 | 'rebuild' : "fromstart", 75 | 76 | 'restrict_samples': False, 77 | 'allowed_samples': [] 78 | 79 | } 80 | -------------------------------------------------------------------------------- /pipeline_quickstart.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ pipeline_quickstart.py 4 | 5 | -One line description of the pipeline. 6 | ============================================================================= 7 | 8 | 9 | Authors: 10 | 11 | 12 | Goal of the pipeline: 13 | This program implements a <> workflow 14 | for human GWAS analysis using <> 15 | 16 | 17 | Pipeline features: 18 | List the features of the pipeline: 19 | 20 | - Feature 1 21 | - Feature 2 22 | - Feature 3 23 | 24 | Assumptions: 25 | This pipeline assumes the following steps have been carried out: 26 | 27 | 28 | Task management: 29 | It employs Rubra for sending jobs to a linux cluster via PBS Torque (version 2.5). 30 | Rubra is a pipeline system for bioinformatics workflows that is built on top 31 | of the Ruffus (http://www.ruffus.org.uk/) Python library (Ruffus version 2.2). 32 | Rubra adds support for running pipeline stages on a distributed computer cluster 33 | (https://github.com/bjpop/rubra) and also supports parallel evaluation of independent 34 | pipeline stages. (Rubra version 0.1.5) 35 | 36 | The pipeline is configured by an options file in a python file, 37 | including the actual commands which are run at each stage. 
38 | 39 | 40 | References: 41 | 42 | """ 43 | 44 | 45 | # system imports 46 | import sys # will use to exit sys if no input files are detected 47 | import os # for changing directories 48 | import datetime # for adding timestamps to directories 49 | import subprocess # for executing shell command, can be used instead of os.system() 50 | 51 | 52 | # rubra and ruffus imports 53 | from ruffus import * 54 | from rubra.utils import pipeline_options 55 | from rubra.utils import (runStageCheck, mkLogFile, mkDir, mkForceLink) 56 | 57 | # witsGWAS banner 58 | from pyfiglet import Figlet 59 | 60 | # user defined module imports 61 | import Filemanager as FM 62 | import WitsgwasSoftware as SW 63 | import WitsgwasScripts as SC 64 | 65 | 66 | 67 | # Shorthand access to options defined in pipeline_quickstart_config.py 68 | #========================================== 69 | 70 | working_files = pipeline_options.working_files 71 | logDir = pipeline_options.pipeline['logDir'] 72 | 73 | 74 | 75 | # Data setup process and input organisation 76 | #========================================== 77 | 78 | f = Figlet(font='standard') 79 | print f.renderText('witsGWAS') 80 | print "(C) 2015 Lerato E. Magosi, Scott Hazelhurst" 81 | print "http://magosil86.github.io/witsGWAS/" 82 | print "witsGWAS v0.1.0 is licensed under the MIT license. See LICENSE.txt" 83 | print "----------------------------------------------------------------" 84 | 85 | 86 | # create a directory for the current project 87 | # note: The pipeline will use this dir. for output and intermediate files. 
88 | SC.CURRENT_PROJECT_DIR = (os.path.join(SC.witsGWAS_PROJECTS_DIR, working_files['projectname']) + 89 | '-pipeline_quickstart-' + datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '/') 90 | 91 | print "Current project directory %s" % SC.CURRENT_PROJECT_DIR 92 | 93 | FM.create_dir(SC.CURRENT_PROJECT_DIR) 94 | 95 | 96 | # path to the witsGWAS directory 97 | global witsGWAS_SCRIPTS_ROOT_DIR 98 | witsGWAS_SCRIPTS_ROOT_DIR = "absolute/path/to/witsGWAS/" 99 | 100 | 101 | # cd into the current project dir. 102 | os.chdir(SC.CURRENT_PROJECT_DIR) 103 | 104 | 105 | # Check current working directory. 106 | curr_work_dir = os.getcwd() 107 | print "Current working directory %s" % curr_work_dir 108 | 109 | 110 | # create a dir. for storing plots 111 | pipeline_quickstart_plots = (os.path.join(witsGWAS_SCRIPTS_ROOT_DIR, SC.CURRENT_PROJECT_DIR, "pipeline_quickstart_plots") + '/') 112 | FM.create_dir(pipeline_quickstart_plots) 113 | 114 | 115 | 116 | 117 | # Paths to intermediate result files 118 | #========================================== 119 | 120 | 121 | 122 | 123 | 124 | 125 | # Print project information 126 | #========================================== 127 | 128 | print "Starting project %s" % working_files['projectname'] 129 | print 130 | print "Intermediate files and output will be stored in %s" % SC.CURRENT_PROJECT_DIR 131 | print "Log dir is %s" % logDir 132 | print "Project author is %s" % working_files['projectauthor'] 133 | print 134 | 135 | 136 | 137 | # Pipeline declarations 138 | #========================================== 139 | 140 | # create a flagfile to start the pipeline as well as permutation association testing 141 | FM.create_emptyfile('pipeline_quickstart.Start') 142 | 143 | 144 | 145 | 146 | 147 | # Pipeline tasks 148 | #========================================== 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- 
/dockerized/dockerfile_witsgwas_container/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build WitsGWAS pipeline as a Docker image 2 | # 3 | # This script just builds a container with the dependancies which are needed 4 | # to run the WitsGWAS pipeline. 5 | # 6 | # The configuration files for the specific pipeline can be modified in the host 7 | # which runs the Docker container and loaded as a volume, as can the data which 8 | # is needed for the processing. This design was chosen to keep the witsgwas 9 | # Docker container small while still allowing the flexibility the pipeline 10 | # provides. 11 | # 12 | # VERSION : 1.0.0 13 | 14 | # Use Ubuntu as the base image - this is probably the most user-friendly 15 | FROM ubuntu:latest 16 | MAINTAINER Rob Clucas 17 | 18 | # Set properties of the image 19 | LABEL Description = "Docker image for WitsGWAS image" \ 20 | Vendor="Bionet" Version="1.0.0" 21 | 22 | # This defines the directory to while are the depedancy executables will be 23 | # linked, creates it and then adds it to the path 24 | ENV GWAS_ROOT /opt/bioinf/gwas 25 | ENV GWAS_BIN $GWAS_ROOT/bin 26 | RUN mkdir -p $GWAS_BIN 27 | ENV PATH=$GWAS_BIN:$PATH 28 | 29 | # Define environment variables 30 | ENV ADMIX_LINK \ 31 | https://www.genetics.ucla.edu/software/admixture/binaries/admixture_linux-1.3.0.tar.gz 32 | ENV AFFYM_LINK \ 33 | http://media.affymetrix.com/Download/updates/apt-1.18.0-x86_64-intel-linux.zip 34 | ENV EIEGN_LINK \ 35 | https://github.com/DReichLab/EIG.git 36 | ENV EMMAX_LINK \ 37 | http://csg.sph.umich.edu/kang/emmax/download/emmax-beta-07Mar2010.tar.gz 38 | ENV PLINK_107_LINK \ 39 | http://pngu.mgh.harvard.edu/~purcell/plink/dist/plink-1.07-x86_64.zip 40 | ENV PLINK_LINK \ 41 | https://www.cog-genomics.org/static/bin/plink160315/plink_linux_x86_64.zip 42 | ENV RUBRA_LINK \ 43 | https://github.com/bjpop/rubra.git 44 | ENV STOOL_LINK \ 45 | https://bootsrap.pypa.io/ez_setup.py 46 | ENV 
WITS_GWAS_LINK \ 47 | https://robclucas@bitbucket.org/robclucas/witsgwas.git 48 | 49 | # Start by updating Ubuntu 50 | RUN apt-get update 51 | 52 | # Install all dependancies for WithGWAS 53 | RUN apt-get install -y \ 54 | build-essential \ 55 | git \ 56 | perl \ 57 | python-pyudev \ 58 | python-pip \ 59 | python-pyfiglet \ 60 | figlet \ 61 | r-base \ 62 | wget 63 | 64 | # Install setuptools for python 65 | RUN wget $STOOL_LINK -O - | python 66 | 67 | # Install Plaink 1.07 -- this is tricky because plink1.9 has the same name 68 | # so rather than creating a link we will just add the dir to the path and make 69 | # a link to plink 1.9 70 | RUN mkdir -p /build/plink_1.07 71 | WORKDIR /build/plink_1.07 72 | RUN wget $PLINK_107_LINK 73 | RUN unzip *.zip 74 | RUN ln -sf /build/plink_1.07/plink-1.07-x86_64/plink /usr/bin/plink1 75 | 76 | # Install Plink 1.9 77 | RUN mkdir -p /build/plink 78 | WORKDIR /build/plink 79 | RUN wget $PLINK_LINK 80 | RUN unzip plink_linux_x86_64.zip 81 | RUN ln -sf /build/plink/plink $GWAS_BIN/plink 82 | 83 | # Install admixture 84 | RUN mkdir /build/admixture 85 | WORKDIR /build/admixture 86 | RUN wget $ADMIX_LINK 87 | RUN tar -xvf admixture_linux-1.3.0.tar.gz 88 | RUN ln -sf /build/admixture/admixture_linux-1.3.0/admixture \ 89 | $GWAS_BIN/admixture 90 | 91 | # Install Emmax 92 | RUN mkdir /build/emmax 93 | WORKDIR /build/emmax 94 | RUN wget $EMMAX_LINK 95 | RUN tar -xvf emmax-beta-07Mar2010.tar.gz 96 | RUN ln -sf /build/emmax/emmax-beta-07Mar2010/emmax $GWAS_BIN/emmax 97 | 98 | # Install Eiegensoft 99 | RUN mkdir /build/eiegensoft 100 | WORKDIR /build/eiegensoft 101 | RUN git clone $EIEGN_LINK 102 | RUN ln -sf /build/eiegensoft/EIG/bin/eiegenstrat $GWAS_BIN/eiegenstrat 103 | 104 | # Install Rubra (which installs Rufus) 105 | RUN mkdir /build/rubra 106 | WORKDIR /build/rubra 107 | RUN git clone $RUBRA_LINK 108 | WORKDIR /build/rubra/rubra 109 | RUN python setup.py install 110 | 111 | # Install Affymetrix power tools -- this link may need to 
be edited 112 | RUN mkdir /build/affymetrix 113 | WORKDIR /build/affymetrix 114 | RUN wget $AFFYM_LINK 115 | RUN unzip apt-1.18.0-x86_64-intel-linux.zip 116 | RUN mv /build/affymetrix/apt-1.18.0-x86_64-intel-linux \ 117 | /build/affymetrix/apt 118 | RUN cp /build/affymetrix/apt/bin/* $GWAS_BIN 119 | 120 | # Get the wits gwas repository 121 | RUN mkdir /witsgwas 122 | WORKDIR /witsgwas 123 | RUN git clone $WITS_GWAS_LINK 124 | 125 | # Go back to the root directory 126 | WORKDIR / 127 | -------------------------------------------------------------------------------- /cluster_job.py: -------------------------------------------------------------------------------- 1 | # Generate a PBS script for a job, and general utilities for 2 | # waiting for a job to complete. 3 | 4 | from shell_command import shellCommand 5 | import sys 6 | from time import sleep 7 | from tempfile import NamedTemporaryFile 8 | import os 9 | 10 | 11 | # this assumes that qstat info for a job will stick around for a while after 12 | # the job has finished. 13 | 14 | class Runnable_Script(object): 15 | def __init__(self, qstat_max_tries = 5, qstat_error_delay = 1, qstat_delay = 10): 16 | self.qstat_max_tries = qstat_max_tries # number of times to try qstat before failing 17 | self.qstat_error_delay = qstat_error_delay # seconds to sleep while waiting for qstat to recover 18 | self.qstat_delay = qstat_delay # seconds to sleep while waiting for job to complete 19 | pass 20 | 21 | def isJobCompleted(self, jobID): 22 | count = 0 23 | while True: 24 | (stdout, stderr, exitStatus) = shellCommand("qstat -f %s" % jobID) 25 | # qstat appears to have worked correctly, we can stop trying. 
26 | if exitStatus == 0 or count >= self.qstat_max_tries: 27 | break 28 | count += 1 29 | sleep(self.qstat_error_delay) 30 | if exitStatus != 0: 31 | raise Exception("qstat -f %s returned non-zero exit status %d times,\ 32 | panicking" % (jobID, count)) 33 | else: 34 | # try to fetch the exit status of the job command from the output of 35 | # qstat. 36 | jobState = None 37 | exitStatus = None 38 | for line in stdout.split('\n'): 39 | ws = line.split() 40 | if len(ws) == 3: 41 | if ws[0] == 'job_state' and ws[1] == '=': 42 | jobState = ws[2] 43 | elif ws[0] == 'exit_status' and ws[1] == '=' and \ 44 | ws[2].isdigit(): 45 | exitStatus = int(ws[2]) 46 | if jobState.upper() == 'C': 47 | # Job has completed. 48 | return (True, exitStatus) 49 | else: 50 | # Job has not completed. 51 | return (False, exitStatus) 52 | 53 | 54 | # returns exit status of job (or None if it can't be determined) 55 | def waitForJobCompletion(self, jobID): 56 | isFinished, exitCode = self.isJobCompleted(jobID) 57 | while(not isFinished): 58 | sleep(self.qstat_delay) 59 | isFinished, exitCode = self.isJobCompleted(jobID) 60 | return exitCode 61 | 62 | 63 | # returns exit status of job (or None if it can't be determined) 64 | def runJobAndWait(self, stage, logDir='', verbose=0): 65 | jobID = self.launch() 66 | prettyJobID = jobID.split('.')[0] 67 | logFilename = os.path.join(logDir, stage + '.' + prettyJobID + '.pbs') 68 | with open(logFilename, 'w') as logFile: 69 | logFile.write(self.__str__()) 70 | if verbose > 0: 71 | print('stage = %s, jobID = %s' % (stage, prettyJobID)) 72 | return self.waitForJobCompletion(jobID) 73 | 74 | 75 | # Generate a PBS script for a job. 
76 | class PBS_Script(Runnable_Script): 77 | def __init__(self, command, walltime=None, name=None, memInGB=None, 78 | queue='batch', moduleList=None, logDir=None, literals=None, **kw): 79 | self.command = command 80 | self.queue = queue 81 | self.name = name 82 | self.memInGB = memInGB 83 | self.walltime = walltime 84 | self.moduleList = moduleList 85 | self.logDir = logDir 86 | self.literals = literals 87 | super(PBS_Script, self).__init__(**kw) 88 | pass 89 | 90 | # render the job script as a string. 91 | def __str__(self): 92 | script = ['#!/bin/bash'] 93 | # XXX fixme 94 | # should include job id in the output name. 95 | # should use the proper log directory. 96 | if self.queue == 'terri-smp': 97 | script.append('#PBS -q terri') 98 | script.append('#PBS -l procs=8,tpn=8') 99 | else: 100 | script.append('#PBS -q %s' % self.queue) 101 | if self.logDir: 102 | script.append('#PBS -o %s' % self.logDir) 103 | script.append('#PBS -e %s' % self.logDir) 104 | # should put the name of the file in here if possible 105 | if self.name: 106 | script.append('#PBS -N %s' % self.name) 107 | if self.memInGB: 108 | if self.queue in ['smp', 'terri-smp']: 109 | script.append('#PBS -l mem=%sgb' % self.memInGB) 110 | else: 111 | script.append('#PBS -l pvmem=%sgb' % self.memInGB) 112 | if self.walltime: 113 | script.append('#PBS -l walltime=%s' % self.walltime) 114 | # copy the literal text verbatim into the end of the PBS options 115 | # section. 116 | if self.literals: 117 | script.append(self.literals) 118 | if type(self.moduleList) == list and len(self.moduleList) > 0: 119 | for item in self.moduleList: 120 | script.append('module load %s' % item) 121 | script.append('cd $PBS_O_WORKDIR') 122 | script.append(self.command) 123 | return '\n'.join(script) + '\n' 124 | 125 | # create a temporary file to store the job script and then 126 | # launch it with qsub. 
127 | def launch(self): 128 | file = NamedTemporaryFile() 129 | file.write(str(self)) 130 | file.flush() 131 | command = 'qsub ' + file.name 132 | (stdout, stderr, returnCode) = shellCommand(command) 133 | file.close() 134 | if returnCode == 0: 135 | return stdout 136 | else: 137 | raise(Exception('qsub command failed with exit status: ' + 138 | str(returnCode))) 139 | 140 | #class SGE_Script(Runnable_Script): 141 | # def __init__(self, command, walltime=None, name=None, memInGB=None, 142 | # queue='batch', moduleList=None, logDir=None, **kw): 143 | # self.command = command 144 | # self.queue = queue 145 | # self.name = name 146 | # self.memInGB = memInGB 147 | # self.walltime = walltime 148 | # self.moduleList = moduleList 149 | # self.logDir = logDir 150 | # self.Runnable_Script.__init__(**kw) 151 | # pass 152 | -------------------------------------------------------------------------------- /cluster_job_edited_for_witsGWAS.py: -------------------------------------------------------------------------------- 1 | # Generate a PBS script for a job, and general utilities for 2 | # waiting for a job to complete. 3 | 4 | from shell_command import shellCommand 5 | import sys 6 | from time import sleep 7 | from tempfile import NamedTemporaryFile 8 | import os 9 | 10 | 11 | # this assumes that qstat info for a job will stick around for a while after 12 | # the job has finished. 
13 | 14 | class Runnable_Script(object): 15 | def __init__(self, qstat_max_tries = 5, qstat_error_delay = 1, qstat_delay = 10): 16 | self.qstat_max_tries = qstat_max_tries # number of times to try qstat before failing 17 | self.qstat_error_delay = qstat_error_delay # seconds to sleep while waiting for qstat to recover 18 | self.qstat_delay = qstat_delay # seconds to sleep while waiting for job to complete 19 | pass 20 | 21 | def isJobCompleted(self, jobID): 22 | count = 0 23 | while True: 24 | (stdout, stderr, exitStatus) = shellCommand("qstat -f %s" % jobID) 25 | # qstat appears to have worked correctly, we can stop trying. 26 | if exitStatus == 0 or count >= self.qstat_max_tries: 27 | break 28 | count += 1 29 | sleep(self.qstat_error_delay) 30 | if exitStatus != 0: 31 | raise Exception("qstat -f %s returned non-zero exit status %d times,\ 32 | panicking" % (jobID, count)) 33 | else: 34 | # try to fetch the exit status of the job command from the output of 35 | # qstat. 36 | jobState = None 37 | exitStatus = None 38 | for line in stdout.split('\n'): 39 | ws = line.split() 40 | if len(ws) == 3: 41 | if ws[0] == 'job_state' and ws[1] == '=': 42 | jobState = ws[2] 43 | elif ws[0] == 'exit_status' and ws[1] == '=' and \ 44 | ws[2].isdigit(): 45 | exitStatus = int(ws[2]) 46 | if jobState.upper() == 'C': 47 | # Job has completed. 48 | return (True, exitStatus) 49 | else: 50 | # Job has not completed. 
51 | return (False, exitStatus) 52 | 53 | 54 | # returns exit status of job (or None if it can't be determined) 55 | def waitForJobCompletion(self, jobID): 56 | isFinished, exitCode = self.isJobCompleted(jobID) 57 | while(not isFinished): 58 | sleep(self.qstat_delay) 59 | isFinished, exitCode = self.isJobCompleted(jobID) 60 | return exitCode 61 | 62 | 63 | # returns exit status of job (or None if it can't be determined) 64 | def runJobAndWait(self, stage, logDir='', verbose=0): 65 | jobID = self.launch() 66 | prettyJobID = jobID.split('.')[0] 67 | logFilename = os.path.join(logDir, stage + '.' + prettyJobID + '.pbs') 68 | with open(logFilename, 'w') as logFile: 69 | logFile.write(self.__str__()) 70 | if verbose > 0: 71 | print('stage = %s, jobID = %s' % (stage, prettyJobID)) 72 | return self.waitForJobCompletion(jobID) 73 | 74 | 75 | # Generate a PBS script for a job. 76 | class PBS_Script(Runnable_Script): 77 | def __init__(self, command, walltime=None, name=None, memInGB=None, 78 | queue='batch', moduleList=None, logDir=None, literals=None, **kw): 79 | self.command = command 80 | self.queue = queue 81 | self.name = name 82 | self.memInGB = memInGB 83 | self.walltime = walltime 84 | self.moduleList = moduleList 85 | self.logDir = logDir 86 | self.literals = literals 87 | super(PBS_Script, self).__init__(**kw) 88 | pass 89 | 90 | # render the job script as a string. 91 | def __str__(self): 92 | script = ['#!/bin/bash'] 93 | # XXX fixme 94 | # should include job id in the output name. 95 | # should use the proper log directory. 
96 | if self.queue == 'WitsLong': 97 | script.append('#PBS -q WitsLong') 98 | script.append('#PBS -l nodes=1:ppn=8') 99 | else: 100 | script.append('#PBS -q %s' % self.queue) 101 | if self.logDir: 102 | script.append('#PBS -o %s' % self.logDir) 103 | script.append('#PBS -e %s' % self.logDir) 104 | # should put the name of the file in here if possible 105 | if self.name: 106 | script.append('#PBS -N %s' % self.name) 107 | if self.memInGB: 108 | if self.queue in ['medium', 'WitsLong']: 109 | script.append('#PBS -l mem=%sGB' % self.memInGB) 110 | else: 111 | script.append('#PBS -l pvmem=%sGB' % self.memInGB) 112 | if self.walltime: 113 | script.append('#PBS -l walltime=%s' % self.walltime) 114 | # copy the literal text verbatim into the end of the PBS options 115 | # section. 116 | if self.literals: 117 | script.append(self.literals) 118 | if type(self.moduleList) == list and len(self.moduleList) > 0: 119 | for item in self.moduleList: 120 | script.append('module load %s' % item) 121 | script.append('cd $PBS_O_WORKDIR') 122 | script.append(self.command) 123 | return '\n'.join(script) + '\n' 124 | 125 | # create a temporary file to store the job script and then 126 | # launch it with qsub. 
127 | def launch(self): 128 | cmd=""" 129 | #PBS -N MergeKGSAHGP 130 | #PBS -l nodes=1:ppn=3,walltime=100:00:00,mem=1GB 131 | #PBS -q WitsLong 132 | 133 | sleep 20 134 | exit 0 135 | """ 136 | file = NamedTemporaryFile() 137 | file.write(str(self)) 138 | file.flush() 139 | command = 'qsub ' + file.name 140 | (stdout, stderr, returnCode) = shellCommand(command) 141 | file.close() 142 | if returnCode == 0: 143 | return stdout 144 | else: 145 | raise(Exception('qsub command failed with exit status: ' + 146 | str(returnCode))) 147 | 148 | #class SGE_Script(Runnable_Script): 149 | # def __init__(self, command, walltime=None, name=None, memInGB=None, 150 | # queue='batch', moduleList=None, logDir=None, **kw): 151 | # self.command = command 152 | # self.queue = queue 153 | # self.name = name 154 | # self.memInGB = memInGB 155 | # self.walltime = walltime 156 | # self.moduleList = moduleList 157 | # self.logDir = logDir 158 | # self.Runnable_Script.__init__(**kw) 159 | # pass 160 | -------------------------------------------------------------------------------- /dockerized/witsgwas_dockerized_pipeline.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | /* 4 | * Author : Rob Clucas 5 | * Description : Nextflow pipeline for Wits GWAS. 6 | */ 7 | 8 | //---- General definitions --------------------------------------------------// 9 | 10 | /* Defines the name of the docker container to run the pipeline through. 11 | */ 12 | params.dock_container = 'robclucas/witsgwas' 13 | 14 | /* Defines the name of the mountpoint of the data directories in the docker 15 | * container. This is so that any scripts which run in the container and 16 | * might need this info can run succesfully, and the user can specify the 17 | * directory to each of the scripts. 
18 | * 19 | * NOTE: The mountpoint is mounted in the container from the root directory, 20 | * so specifying 'util' as the mount point mounts the data at '/util' in 21 | * the container. 22 | */ 23 | params.dock_mpoint = 'util' 24 | 25 | /* Defines the directory where the plink 1.07 input binary files are. 26 | * 27 | * NOTE: This must be a relative path, from where the pipeline is run. 28 | */ 29 | params.plink_inputpath = "gwasdata/plink" 30 | 31 | /* Defines the path where any scripts to be executed can be found. 32 | * 33 | * NOTE: This must be a ralative path, from where the pipeline is run. 34 | */ 35 | params.script_path = 'scripts' 36 | 37 | /* Defines the names of the plink binary files in the plink directory 38 | * (.fam, .bed, .bed). 39 | * 40 | * NOTE: This must be without the extension (so if A.fam, A.bed, ... 41 | * then use 'A'). 42 | */ 43 | params.plink_fname = 'raw-GWA-data' 44 | 45 | /* Defines the name of the file with high LD region information. 46 | * 47 | * NOTE: This can have/cannot have the extension, but should be in the 48 | * plink_inputpath specified above. 49 | */ 50 | params.high_ld_regions_fname = 'high_LD_regions.txt' 51 | 52 | /* Defines if sexinfo is available or not, options are: 53 | * - "true" : sexinfo is available 54 | * - "false" : sexinfo is not avalable 55 | */ 56 | params.sexinfo_available = "false" 57 | 58 | //---- Cutoff definitions ---------------------------------------------------// 59 | 60 | /* Defines the cutoffs for the heterozygosity. Standard cutoff +- 3sd from 61 | * mean) 62 | */ 63 | params.cut_het_high = 0.343 64 | params.cut_het_low = 0.254 65 | 66 | /* Defines the cutoff for missingness. Using standard cutoff -- 3 - 7%. 67 | */ 68 | params.cut_miss = 0.05 69 | params.cut_diff_miss = 0.05; 70 | 71 | 72 | /* Defines the cutoff for the SNP minor allele frequency. 73 | */ 74 | params.cut_maf = 0.01 75 | 76 | /* Defines the cutoff for SNP missingness. 
77 | */ 78 | params.cut_genome = 0.01 79 | 80 | /* Defines the cutoff for the SNP Hardy Weinburg deviation. 81 | */ 82 | params.cut_hwe = 0.01 83 | 84 | //---- Modification of variables for pipeline -------------------------------// 85 | 86 | /* Define the command to add for plink depending on whether sexinfo is 87 | * available or not. Command is: 88 | * 89 | * - No sexinfo availabele : "--allow-no-sexinfo" 90 | * - Sexinfo available : "" 91 | */ 92 | if ( params.sexinfo_available == "false" ) { 93 | params.sexinfo_command = "--allow-no-sex" 94 | println "Sexinfo not available, command: " + params.sexinfo_command + "\n" 95 | } else { 96 | params.sexinfo_command = "" 97 | println "Sexinfo availabel command: " + params.sexinfo_command + "\n" 98 | } 99 | 100 | /* Convert the relative data path(s) to absolute, because this is required for 101 | * docker when mounting. 102 | */ 103 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 104 | script_path = Channel.fromPath(params.script_path, type : 'dir') 105 | 106 | //---- Start Pipeline -------------------------------------------------------// 107 | 108 | /* Process to check for duplicates. The process mounts the plink data to the 109 | * docker container and then runs plink 1.07 through the docker container. It 110 | * writes the results to a file results. 111 | * 112 | * Inputs: 113 | * - filename : The name of the plink input files wo extension 114 | * - container : The name of the docker container to use 115 | * - data_path : The path to the plink data 116 | * - mountpoint : The mountpoint of the data in the container 117 | * - sexinfo : The command to add to plink for sexinfo availability 118 | * 119 | * Outputs: 120 | * - results : The file with the stdout from plink. 
121 | */ 122 | process checkDuplicateMarkers { 123 | input: 124 | val filename from params.plink_fname 125 | val container from params.dock_container 126 | val data_path from plink_data_path 127 | val mountpoint from params.dock_mpoint 128 | val sexinfo from params.sexinfo_command 129 | 130 | output: 131 | file 'results' 132 | 133 | script: 134 | """ 135 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 136 | $container plink1 --noweb --bfile $filename $sexinfo --out \ 137 | tmp >> results 138 | """ 139 | } 140 | 141 | //---- Process 2 ------------------------------------------------------------// 142 | 143 | /* Process to filter all the duplicate markers from running plink. 144 | * 145 | * Inputs: 146 | * - results : The file containing the stdout from running plink. 147 | * 148 | * Outputs: 149 | * - duplicate : A file containing all the duplicates from plink. 150 | */ 151 | process filterDuplicateMarkers { 152 | input: 153 | file results 154 | 155 | output: 156 | file 'duplicates' 157 | 158 | script: 159 | """ 160 | if grep 'Duplicates' results > duplicates; then 161 | echo 'Duplicates Found' >> duplicates 162 | echo 'Found Duplicates' 163 | else 164 | echo 'No Duplicates Found' >> duplicates 165 | echo 'Did Not Find Duplicates' 166 | fi 167 | """ 168 | } 169 | 170 | //---- Process 3 ------------------------------------------------------------// 171 | 172 | /* Process to extract all the duplicate RSIDs generated by the plink command. 173 | * 174 | * Inputs: 175 | * - duplicates : The list of duplicates from running plink 176 | * 177 | * Outputs: 178 | * - duplicate_rdis : A file with all the duplicate RSID's 179 | * 180 | * NOTES: The indentation of the inline python script is important because of 181 | * the way python uses indentation. If this has the usual 2 space indent 182 | * as the inline bash scripts do, then there is a python error. This 183 | * could be saved as a script and run through docker as well. 
184 | */ 185 | process extractDuplicateRsids { 186 | input: 187 | file duplicates 188 | 189 | output: 190 | file 'duplicate_rsids' 191 | 192 | script: 193 | """ 194 | #!/usr/bin/env python 195 | 196 | input = open('duplicates', 'r') 197 | output = open('duplicate_rsids', 'w') 198 | 199 | # Remove all duplicates 200 | for line in input: 201 | if (line.startswith('#') or line.startswith('\\n') or 202 | line == 'Duplicates Found' or line == 'No Duplicates Found\\n'): 203 | pass 204 | else: 205 | line = line.split(" ") 206 | print(line) 207 | duplicate_snp = line[5].strip() 208 | print(duplicate_snp) 209 | output.write(duplicate_snp + '\\n') 210 | """ 211 | } 212 | 213 | //---- Process 4 ------------------------------------------------------------// 214 | 215 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 216 | 217 | /* Process to remove all the duplicate markers from the plink output. 218 | * 219 | * Inputs: 220 | * - duplicate_rsids : A file containing all duplicate rsids to remove. 221 | * - filename : The name of the plink input files wo extension 222 | * - container : The name of the docker container to use 223 | * - data_path : The path to the plink data 224 | * - mountpoint : The mountpoint of the data in the container 225 | * - sexinfo : The command to add to plink for sexinfo availability 226 | * 227 | * Outputs: 228 | * - qcplink_log* : Log files from plink with the output, these are the 229 | * input to later processes. 230 | * 231 | * NOTES : Multiple outputs are required so that other processes 232 | * which use the output can be started concurrently. If 233 | * only a single file is output, then the processes will 234 | * execute sequentially, and each one will have to output 235 | * the file. 
236 | */ 237 | process removeDuplicateMarkers { 238 | input: 239 | file duplicate_rsids 240 | val filename from params.plink_fname 241 | val container from params.dock_container 242 | val data_path from plink_data_path 243 | val mountpoint from params.dock_mpoint 244 | val sexinfo from params.sexinfo_command 245 | 246 | output: 247 | file 'qcplink_log' into receiver 248 | file 'qcplink_log1' into receiver 249 | file 'qcplink_log2' into receiver 250 | file 'qcplink_log3' into receiver 251 | 252 | script: 253 | """ 254 | if [[ -s duplicate_rsids ]]; then 255 | # Copy the file to the mount path of the container 256 | cp duplicate_rsids $data_path/duplicate_rsids 257 | 258 | # Remove duplicate ID's, running plinnk through the container 259 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 260 | $container plink1 --noweb --bfile $filename $sexinfo \ 261 | --exclude duplicate_rsids --make-bed --out \ 262 | qcplink >> qcplink_log 263 | else 264 | # There are no duplicate RSID's, so don;t specify exclude file 265 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 266 | $container plink1 --noweb --bfile $filename $sexinfo \ 267 | --make-bed --out qcplink >> qcplink_log 268 | fi 269 | 270 | # Create links for the outputs 271 | ln -s qcplink_log qcplink_log1 272 | ln -s qcplink_log qcplink_log2 273 | ln -s qcplink_log qcplink_log3 274 | """ 275 | } 276 | 277 | //---- Process 5 ------------------------------------------------------------// 278 | 279 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 280 | 281 | /* Process to identify individual discordant sex information. 282 | * 283 | * Inputs: 284 | * - qcplink_log : The log file previously generated from running plink. 
285 | * - filename : The name of the plink input files wo extension 286 | * - container : The name of the docker container to use 287 | * - data_path : The path to the plink data 288 | * - mountpoint : The mountpoint of the data in the container 289 | * - sexinfo : The command to add to plink for sexinfo availability 290 | * 291 | * Outputs: 292 | * -sexstat_problems : All sexinfo results which have problems. 293 | * 294 | * NOTES : The qcplink_log file is used as a 'start parameter', since when a 295 | * nextflow process uses the output of another process as input, the 296 | * process will only run once that input has become available. If we 297 | * did not do this, then the process would try and run concurrently at 298 | * the start, which would not work since the input data would not be 299 | * ready. 300 | */ 301 | process identifyIndivDiscSexinfo { 302 | input: 303 | file qcplink_log from receiver 304 | val filename from params.plink_fname 305 | val container from params.dock_container 306 | val data_path from plink_data_path 307 | val mountpoint from params.dock_mpoint 308 | val sexinfo from params.sexinfo_available 309 | 310 | output: 311 | file 'failed_sexcheck' 312 | 313 | script: 314 | """ 315 | # Check that the input is available. 316 | if [[ -s qcplink_log ]]; then 317 | echo 'Plink log received, can continue!' 318 | fi 319 | 320 | if [[ $sexinfo == 'true' ]]; then 321 | # Generate all the sex info. Because this runs through docker the output 322 | # will be in the workdir of the container ($data_path). 323 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 324 | $container plink --bfile qcplink --check-sex \ 325 | --out sexstat 326 | else 327 | echo 'no sexinfo available for qcplink' > $data_path/sexstat.sexcheck 328 | fi 329 | 330 | # Check for all the "PROBLEM" sex information. 
331 | if grep -Rn 'PROBLEM' $data_path/sexstat.sexcheck > failed_sexcheck; then 332 | echo 'Discordant sex info found' 333 | else 334 | echo 'No discordant sex info found' 335 | fi 336 | """ 337 | } 338 | 339 | //---- Process 6 ------------------------------------------------------------// 340 | 341 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 342 | script_path = Channel.fromPath(params.script_path, type : 'dir') 343 | 344 | /* Process to calculate the sample missingness. 345 | * 346 | * Inputs: 347 | * - qcplink_log1 : The log file previously generated from running plink. 348 | * - filename : The name of the plink input files wo extension 349 | * - container : The name of the docker container to use 350 | * - data_path : The path to the plink data 351 | * - script_path : The path to the scripts 352 | * - mountpoint : The mountpoint of the data in the container 353 | * - sexinfo : The command to add to plink for sexinfo availability 354 | * 355 | * Outputs: 356 | * - qcplink_imiss* : Information for the missingness. Again multiple files so 357 | * that later processes start concurrently. 358 | * 359 | * NOTES : The qcplink_log file is used as a 'start parameter', since when a 360 | * nextflow process uses the output of another process as input, the 361 | * process will only run once that input has become available. If we 362 | * did not do this, then the process would try and run concurrently at 363 | * the start, which would not work since the input data would not be 364 | * ready. 
365 | */ 366 | process calculateSampleMissingness { 367 | input: 368 | file qcplink_log1 from receiver 369 | val container from params.dock_container 370 | val script_dir from script_path 371 | val data_path from plink_data_path 372 | val mountpoint from params.dock_mpoint 373 | val sexinfo from params.sexinfo_command 374 | 375 | output: 376 | file 'qcplink_missing' 377 | file 'qcplink_missing1' 378 | file 'qcplink_missing2' 379 | 380 | script: 381 | """ 382 | if [[ -s qcplink_log1 ]]; then 383 | echo 'Plink log received, can continue!' 384 | fi 385 | 386 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 387 | $container plink --bfile qcplink $sexinfo --missing \ 388 | --out qcplink_missing 389 | 390 | # Create output links 391 | ln $data_path/qcplink_missing.imiss $script_dir/qcplink_miss.imiss 392 | 393 | echo "Complete" > qcplink_missing 394 | echo "Complete" > qcplink_missing1 395 | echo "Complete" > qcplink_missing2 396 | """ 397 | } 398 | 399 | //---- Process 7 ------------------------------------------------------------// 400 | 401 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 402 | script_path = Channel.fromPath(params.script_path, type : 'dir') 403 | 404 | /* Process to calculate the heterozygosity for the samples. 405 | * 406 | * Inputs: 407 | * - qcplink_log2 : The log file previously generated from running plink. 408 | * - filename : The name of the plink input files wo extension 409 | * - container : The name of the docker container to use 410 | * - data_path : The path to the plink data 411 | * - script_path : The path ot the scripts 412 | * - mountpoint : The mountpoint of the data in the container 413 | * - sexinfo : The command to add to plink for sexinfo availability 414 | * 415 | * Outputs: 416 | * - qcplink_het* : Information about the heterozygosity. Again multiple 417 | * so that multiple processes can start. 
418 | * 419 | * NOTES : The qcplink_log file is used as a 'start parameter', since when a 420 | * nextflow process uses the output of another process as input, the 421 | * process will only run once that input has become available. If we 422 | * did not do this, then the process would try and run concurrently at 423 | * the start, which would not work since the input data would not be 424 | * ready. 425 | */ 426 | process calculateSampleHetrozygosity { 427 | input: 428 | file qcplink_log2 from receiver 429 | val container from params.dock_container 430 | val data_path from plink_data_path 431 | val script_dir from script_path 432 | val mountpoint from params.dock_mpoint 433 | val sexinfo from params.sexinfo_command 434 | 435 | output: 436 | file 'qcplink_het' 437 | file 'qcplink_het1' 438 | 439 | script: 440 | """ 441 | if [[ -s qcplink_log2 ]]; then 442 | echo 'Plink log received, can continue!' 443 | fi 444 | 445 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 446 | $container plink --bfile qcplink $sexinfo --het \ 447 | --out qcplink_het 448 | 449 | # Link the result in the data path to the output stream 450 | ln $data_path/qcplink_het.het $script_dir/qcplink_het.het 451 | 452 | echo "Complete" > qcplink_het 453 | echo "Complete" > qcplink_het1 454 | """ 455 | } 456 | 457 | //---- Process 8 ------------------------------------------------------------// 458 | 459 | script_path = Channel.fromPath(params.script_path, type : 'dir') 460 | 461 | /* Process to generate plots for the missingness and heterozygosity. 462 | * 463 | * Inputs: 464 | * - qcplink_missing : Link to the missingness data 465 | * - qcplink_het : Link to the heterozygosity data 466 | * - script_dir : Script directory to find scripts 467 | * - container : Docker container to use 468 | * - mountpoint : Mountpoint in container 469 | * 470 | * Outputs: 471 | * - qcplink_missing : Results for the missingness. 472 | * - qcplink_het : Results for the heterozygosity. 
473 | * - failed_miss_het : Failed results for the missingness and heterozygosity. 474 | */ 475 | process generateMissHetPlot { 476 | errorStrategy 'ignore' 477 | 478 | input: 479 | file qcplink_missing 480 | file qcplink_het 481 | val script_dir from script_path 482 | val container from params.dock_container 483 | val mountpoint from params.dock_mpoint 484 | 485 | output: 486 | file 'failed_miss_het' 487 | 488 | script: 489 | """ 490 | if [[ qcplink_missing ]]; then 491 | echo "Missingness available" 492 | fi 493 | 494 | if [[ qcplink_het ]]; then 495 | echo "Heterozygosity available" 496 | fi 497 | 498 | docker run -v $script_dir:/$mountpoint -w /$mountpoint \ 499 | $container Rscript miss_het_plot_qcplink.R 500 | 501 | # Create a link which is the output file 502 | ln $script_dir/fail_miss_het_qcplink.txt failed_miss_het 503 | """ 504 | } 505 | 506 | //---- Process 9 ------------------------------------------------------------// 507 | 508 | script_path = Channel.fromPath(params.script_path, type : 'dir') 509 | 510 | /* 511 | * Process to find individuals with extreme missingness and heterozygosity 512 | * scores. 513 | * 514 | * Inputs: 515 | * - qcplink_missing1 : A link to the missingness file 516 | * - qcplink_het1 : A link to the heterozygosity file 517 | * - script_dir : The scripts directory 518 | * - container : The docker container to use 519 | * - mountpoint : The mountpoint in the container 520 | * - cut_het_high : The high values for heterozygosity 521 | * - cut_het_low : The low values for heterozygosity 522 | * - cut_miss : The missingness rate 523 | * 524 | * Outputs: 525 | * - None, the results are written to the scripts directory. 
526 | */ 527 | process findIndivWithHighMissExtremeHet { 528 | input: 529 | file qcplink_missing1 530 | file qcplink_het1 531 | val script_dir from script_path 532 | val container from params.dock_container 533 | val mountpoint from params.dock_mpoint 534 | val cut_het_high from params.cut_het_high 535 | val cut_het_low from params.cut_het_low 536 | val cut_miss from params.cut_miss 537 | 538 | output: 539 | stdout 'result' 540 | 541 | script: 542 | """ 543 | if [[ qcplink_missing1 ]]; then 544 | echo "Missing file available" 545 | fi 546 | 547 | if [[ qcplink_het1 ]]; then 548 | echo "Heterozygosity file available" 549 | fi 550 | 551 | docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \ 552 | perl select_miss_het_qcplink.pl $cut_het_high $cut_het_low $cut_miss 553 | """ 554 | } 555 | 556 | //---- Process 10 -----------------------------------------------------------// 557 | 558 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 559 | 560 | /* Process to prune for IBD. 561 | * 562 | * Inputs: 563 | * - qcplink_log3 : File specifying the the plink input files are ready. 564 | * - high_ld_file : File specifying high ld regions to exclude 565 | * - container : The docker container to use 566 | * - data_path : The path where the data is, mounted onto container 567 | * - mountpoint : The location on the container where data is mounted 568 | * - sexinfo : Command to add based on sexinfo availability 569 | * 570 | * Outputs: 571 | * - qcplink_ibd_prune_status* : 572 | * The status of the process, when complete this file is created. 573 | * 574 | * NOTES : Plink data is written to the data_path directory. 
575 | */ 576 | process pruneForIBD { 577 | input: 578 | file qcplink_log3 from receiver 579 | val high_ld_file from params.high_ld_regions_fname 580 | val container from params.dock_container 581 | val data_path from plink_data_path 582 | val mountpoint from params.dock_mpoint 583 | val sexinfo from params.sexinfo_command 584 | 585 | output: 586 | file 'qcplink_ibd_prune_status' 587 | file 'qcplink_ibd_prune_status1' 588 | 589 | script: 590 | """ 591 | if [[ -s qcplink_log3 ]]; then 592 | echo 'Qcplink log received, pruning IBD' 593 | fi 594 | 595 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 596 | $container plink --bfile qcplink $sexinfo --exclude $high_ld_file \ 597 | --range --indep-pairwise 50 5 0.2 --out qcplink_ibd 598 | 599 | echo 'Complete' > qcplink_ibd_prune_status 600 | ln qcplink_ibd_prune_status qcplink_ibd_prune_status1 601 | """ 602 | } 603 | 604 | //---- Process 11 -----------------------------------------------------------// 605 | 606 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 607 | 608 | /* Process to calculate the IBD. 609 | * 610 | * Inputs: 611 | * - qc_plink_ibd_prune_status : The status of the pruning process. 612 | * - container : The docker container to use 613 | * - data_path : The path where the data is, mounted onto container 614 | * - mountpoint : The location on the container where data is mounted 615 | * - sexinfo : Command to add based on sexinfo availability 616 | * 617 | * Outputs: 618 | * - None : Output files are written to the data_path directory. 
 */
process calculateIBD {
  input:
  file qcplink_ibd_prune_status
  val container from params.dock_container
  val data_path from plink_data_path
  val mountpoint from params.dock_mpoint
  val sexinfo from params.sexinfo_command

  output:
  stdout 'result'

  script:
  // Computes pairwise IBD estimates (--genome) over the LD-pruned SNP set
  // produced by pruneForIBD; results land in data_path as qcplink_ibd.genome.
  """
  if [[ -s qcplink_ibd_prune_status ]]; then
    echo "IBD Prune status file received, calculating IBD"
  fi

  docker run -v $data_path:/$mountpoint -w /$mountpoint \
    $container plink --bfile qcplink $sexinfo --extract qcplink_ibd.prune.in \
    --genome --out qcplink_ibd
  """
}

//---- Process 12 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to calculate the IBD with Min Pi Hat.
 *
 * Inputs:
 *  - qc_plink_ibd_prune_status1 : The status of the pruning process.
 *  - container : The docker container to use
 *  - data_path : The path where the data is, mounted onto container
 *  - mountpoint : The location on the container where data is mounted
 *  - sexinfo : Command to add based on sexinfo availability
 *
 * Outputs:
 *  - qcplink_ibd_min_004* : The IBD results from plink.
657 | */ 658 | process calculateIBDMinPiHat { 659 | input: 660 | file qcplink_ibd_prune_status1 661 | val container from params.dock_container 662 | val data_path from plink_data_path 663 | val mountpoint from params.dock_mpoint 664 | val sexinfo from params.sexinfo_command 665 | 666 | output: 667 | file 'qcplink_ibd_min_004' 668 | file 'qcplink_ibd_min_0041' 669 | 670 | script: 671 | """ 672 | if [[ -s qcplink_ibd_prune_status1 ]]; then 673 | echo "IBD prune status recieved" 674 | fi 675 | 676 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 677 | $container plink --bfile qcplink $sexinfo --extract qcplink_ibd.prune.in \ 678 | --genome --min 0.04 --out qcplink_ibd_min_0_04 679 | 680 | ln $data_path/qcplink_ibd_min_0_04.genome qcplink_ibd_min_004 681 | ln $data_path/qcplink_ibd_min_0_04.genome qcplink_ibd_min_0041 682 | """ 683 | } 684 | 685 | //---- Process 13 -----------------------------------------------------------// 686 | 687 | /* Proces to sort the results from runnning IBD Min Pi hat. 688 | * 689 | * Inputs: 690 | * -qcplink_ibd_min_004 : The input file to sort. 691 | * 692 | * Outputs: 693 | * - qc_plink_ibd_min_004_sorted_pihat.txt : The sorted results. 694 | */ 695 | process sortByPiHat { 696 | input: 697 | file qcplink_ibd_min_004 698 | 699 | output: 700 | file 'qcplink_ibd_min_0_04_sorted_pihat.txt' 701 | 702 | """ 703 | sort -k10n qcplink_ibd_min_004 > qcplink_ibd_min_0_04_sorted_pihat.txt 704 | """ 705 | } 706 | 707 | //---- Process 14 -----------------------------------------------------------// 708 | 709 | script_path = Channel.fromPath(params.script_path, type : 'dir') 710 | 711 | /* Filters all the related individuals. 712 | * 713 | * Inputs: 714 | * - qcplink_missing2 : A link to the missingness file 715 | * - qcplink_ibd_min_0041 : A link to the ind file. 716 | * - script_dir : The scripts directory 717 | * - container : The docker container to use 718 | * - mountpoint : The directory in the conmtainer to mount to. 
 *
 * Outputs:
 *  - None : Results are written to the scripts directory.
 */
/* NOTE(review): this process is intentionally disabled (commented out).
 * It links the missingness and IBD results into the scripts directory under
 * the extensions the perl helper expects, then runs run_IBD_QC_qcplink.pl
 * to select one member of each related pair for removal. Re-enable by
 * removing the surrounding comment markers.
 */
/*
process filterRelatedIndiv {
  errorStrategy 'ignore'

  input:
  file qcplink_missing2
  file qcplink_ibd_min_0041
  val script_dir from script_path
  val container from params.dock_container
  val mountpoint from params.dock_mpoint

  output:
  stdout 'result'

  script:
  """
  # Check that there are no old links
  if [[ -e $script_dir/qcplink_missing.imiss ]]; then
    rm $script_dir/qcplink_missing.imiss
  fi

  if [[ -e $script_dir/qcplink_genome.genome ]]; then
    rm $script_dir/qcplink_genome.genome
  fi

  # Make a link for the missing file so that the file has .imiss ext
  if [[ -s qcplink_missing2 ]]; then
    ln qcplink_missing2 $script_dir/qcplink_missing.imiss
  fi

  if [[ -s qcplink_ibd_min_0041 ]]; then
    ln qcplink_ibd_min_0041 $script_dir/qcplink_genome.genome
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    perl run_IBD_QC_qcplink.pl qcplink_missing qcplink_genome
  """
}
*/

//---- Process 15 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to join the failed individuals into a single file.
 *
 * Inputs:
 *  - failed_miss_het : The failed missingness and heterozygosity results.
 *  - failed_sexcheck : The failed sex stat results.
 *
 * Outputs:
 *  - failed_qc_plink_inds : The combined failed results.
 */
process joinQcplinkFailedIndivIntoSingleFile {
  input:
  file failed_miss_het
  file failed_sexcheck
  val data_path from plink_data_path

  output:
  // Status marker only; the de-duplicated list itself is written to
  // $data_path/qcplink_failed_inds for removeQcPlinkFailedIndiv to consume.
  file 'failed_qc_plink_inds'

  script:
  // Concatenates the two failure lists, sorts, and drops duplicate
  // individuals that failed more than one check.
  """
  cat failed_sexcheck failed_miss_het | sort -k1 | \
    uniq > $data_path/qcplink_failed_inds

  echo "Complete" > failed_qc_plink_inds
  """
}


//---- Process 16 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to remove all failed individuals.
 *
 * Inputs:
 *  - failed_qc_plink_inds : The failed individuals to remove.
 *  - script_dir : The scripts directory
 *  - container : The docker container to use
 *  - mountpoint : The directory in the container to mount to.
 *
 * Outputs:
 *  - qced_qcplink_status* : The output file indicating that the process is done.
 */
process removeQcPlinkFailedIndiv {
  input:
  file failed_qc_plink_inds
  val container from params.dock_container
  val data_path from plink_data_path
  val mountpoint from params.dock_mpoint
  val sexinfo from params.sexinfo_command

  output:
  // Six identical status files: a DSL1 output channel can only be consumed
  // once, so one copy is emitted for each downstream SNP-QC process
  // (maf, missingness, differential missingness, hwe, snp removal, xchr).
  file 'qced_qcplink_status1'
  file 'qced_qcplink_status2'
  file 'qced_qcplink_status3'
  file 'qced_qcplink_status4'
  file 'qced_qcplink_status5'
  file 'qced_qcplink_status6'

  script:
  // Removes the individuals listed in qcplink_failed_inds (written by
  // joinQcplinkFailedIndivIntoSingleFile into data_path) and writes the
  // cleaned dataset as qc_plink_clean_inds in data_path.
  """
  # Make a link in the data_path directory for the failed indices
  if [[ -s failed_qc_plink_inds ]]; then
    echo "Failed inds input available"
  fi

  docker run -v $data_path:/$mountpoint -w /$mountpoint \
    $container plink --noweb --bfile qcplink $sexinfo --remove \
    qcplink_failed_inds --make-bed --out qc_plink_clean_inds

  # Create output files
  echo 'Qced complete' > qced_qcplink_status1
  echo 'Qced complete' > qced_qcplink_status2
  echo 'Qced complete' > qced_qcplink_status3
  echo 'Qced complete' > qced_qcplink_status4
  echo 'Qced complete' > qced_qcplink_status5
  echo 'Qced complete' > qced_qcplink_status6
  """
}

//---- Process 17 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')
script_path = Channel.fromPath(params.script_path, type : 'dir')

/* Process to calculate the Maf results.
 *
 * Inputs:
 *  - qced_qcplink_status1 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - script_dir : The directory where the scripts are.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
861 | * 862 | * Outputs: 863 | * - qxced_clean_inds_freq : The output results for Maf calculation. 864 | */ 865 | process calculateMaf { 866 | input: 867 | file qced_qcplink_status1 868 | val container from params.dock_container 869 | val data_path from plink_data_path 870 | val script_dir from script_path 871 | val mountpoint from params.dock_mpoint 872 | val sexinfo from params.sexinfo_command 873 | 874 | output: 875 | file 'qced_clean_inds_freq' 876 | 877 | script: 878 | """ 879 | if [[ -s qced_qcplink_status1 ]]; then 880 | echo "Input available, can calculate maf" 881 | fi 882 | 883 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 884 | $container plink --noweb --bfile qc_plink_clean_inds $sexinfo \ 885 | --freq --out qc_plink_clean_inds_freq 886 | 887 | ln $data_path/qc_plink_clean_inds_freq.frq \ 888 | $script_dir/qced_clean_inds_freq.frq 889 | 890 | echo "Complete" > qced_clean_inds_freq 891 | """ 892 | } 893 | 894 | //---- Process 18 -----------------------------------------------------------// 895 | 896 | script_path = Channel.fromPath(params.script_path, type : 'dir') 897 | 898 | /* Process to generate the Maf plot. 899 | * 900 | * Inputs: 901 | * - qced_clean_inds_freq : A link to the input data from the calculateMaf 902 | * process. 903 | * - container : The docker container to use 904 | * - mountpoint : The directory in the conmtainer to mount to. 905 | * - script_dir : The directory where scripts are. 906 | * 907 | * Outputs: 908 | * - generate_maf_status : The status of the process. 
 */
process generateMafPlot {
  input:
  file qced_clean_inds_freq
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_maf_status'

  script:
  // Runs the maf_plot_qcplink.R script against the .frq file that
  // calculateMaf linked into the scripts directory; the plot is written
  // there as well.
  """
  if [[ -s qced_clean_inds_freq ]]; then
    echo "Input available"
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript maf_plot_qcplink.R

  echo "Complete" > generate_maf_status
  """
}

//---- Process 19 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')
script_path = Channel.fromPath(params.script_path, type : 'dir')

/* Process to calculate the snp missingness.
 *
 * Inputs:
 *  - qced_qcplink_status2 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - script_dir : The directory where the scripts are.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
947 | * 948 | * Outputs: 949 | * - qxced_clean_inds_missing : The output results for missingness calculation 950 | */ 951 | process calculateSnpMissigness { 952 | input: 953 | file qced_qcplink_status2 954 | val container from params.dock_container 955 | val data_path from plink_data_path 956 | val script_dir from script_path 957 | val mountpoint from params.dock_mpoint 958 | val sexinfo from params.sexinfo_command 959 | 960 | output: 961 | file 'qced_clean_inds_missing' 962 | 963 | script: 964 | """ 965 | if [[ -s qced_qcplink_status2 ]]; then 966 | echo "Input available, can calculate missingness" 967 | fi 968 | 969 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 970 | $container plink --bfile qc_plink_clean_inds $sexinfo --missing \ 971 | --out qc_plink_clean_inds_missing 972 | 973 | ln $data_path/qc_plink_clean_inds_missing.lmiss \ 974 | $script_dir/clean_inds_qcplink_missing.lmiss 975 | 976 | echo "Complete" > qced_clean_inds_missing 977 | """ 978 | } 979 | 980 | //---- Process 20 -----------------------------------------------------------// 981 | 982 | script_path = Channel.fromPath(params.script_path, type : 'dir') 983 | 984 | /* Proces to generate a plot of the missingness results. 985 | * 986 | * Inputs: 987 | * - qced_clean_inds_missing : A link to the input data from the missingness 988 | * calculatio process. 989 | * - container : The docker container to use 990 | * - mountpoint : The directory in the conmtainer to mount to. 991 | * - script_dir : The directory where scripts are. 992 | * 993 | * Outputs: 994 | * - generate_missingness_status : The status of the missingness plot 995 | * generation. 
 */
process generateSnpMissingnessPlot {
  input:
  file qced_clean_inds_missing
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_snp_missingness_status'

  script:
  // Runs snpmiss_plot_qcplink.R against the .lmiss file linked into the
  // scripts directory by calculateSnpMissigness.
  """
  if [[ -s qced_clean_inds_missing ]]; then
    echo 'Finished calculating snp missingness, now plotting'
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript snpmiss_plot_qcplink.R

  echo "Complete" > generate_snp_missingness_status
  """
}

//---- Process 21 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')
script_path = Channel.fromPath(params.script_path, type : 'dir')

/* Process to calculate the snp differential missingness.
 *
 * Inputs:
 *  - qced_qcplink_status3 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - script_dir : The path to the scripts
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
 *
 * Outputs:
 *  - qced_clean_inds_test_missing* : The results of the process.
1037 | */ 1038 | process calculateSnpDifferentialMissingness { 1039 | input: 1040 | file qced_qcplink_status3 1041 | val container from params.dock_container 1042 | val data_path from plink_data_path 1043 | val script_dir from script_path 1044 | val mountpoint from params.dock_mpoint 1045 | val sexinfo from params.sexinfo_command 1046 | 1047 | output: 1048 | file 'qced_clean_inds_test_missing1' 1049 | file 'qced_clean_inds_test_missing2' 1050 | 1051 | script: 1052 | """ 1053 | if [[ -s qced_qcplink_status3 ]]; then 1054 | echo "Input available, can calculate differential missingness" 1055 | fi 1056 | 1057 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1058 | $container plink --bfile qc_plink_clean_inds $sexinfo --missing \ 1059 | --out qc_plink_clean_inds_test_missing 1060 | 1061 | ln $data_path/qc_plink_clean_inds_test_missing.lmiss \ 1062 | $script_dir/clean_inds_qcplink_test_missing.missing 1063 | 1064 | echo "Complete" > qced_clean_inds_test_missing1 1065 | echo "Complete" > qced_clean_inds_test_missing2 1066 | """ 1067 | } 1068 | 1069 | //---- Process 22 -----------------------------------------------------------// 1070 | 1071 | script_path = Channel.fromPath(params.script_path, type : 'dir') 1072 | 1073 | /* Process to generate a plot for the differential missngness. 1074 | * 1075 | * Inputs: 1076 | * - qced-clean_inds_test_missing1 : The results to use to generate the plot. 1077 | * - container : The docker container to use 1078 | * - mountpoint : The directory in the conmtainer to mount to. 1079 | * - script_dir : The directory where scripts are. 1080 | * 1081 | * Outputs: 1082 | * - generate_diff_miss_status : The status of the plot generation. 
 *
 * NOTES : Specifying "ignore" for the error strategy allows the pipeline to
 * continue but still reports an error -- remove if this is not desired
 */
process generateDifferentialMissingnessPlot {
  errorStrategy 'ignore'

  input:
  file qced_clean_inds_test_missing1
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_diff_miss_status'

  script:
  // Runs diffmiss_plot_qcplink.R against the differential-missingness
  // results linked into the scripts directory by the previous process.
  """
  if [[ -s qced_clean_inds_test_missing1 ]]; then
    echo "Input data available"
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript diffmiss_plot_qcplink.R

  echo "Complete" > generate_diff_miss_status
  """
}

//---- Process 23 -----------------------------------------------------------//

script_path = Channel.fromPath(params.script_path, type : 'dir')
plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to find snps with extreme differential missingness.
 *
 * Inputs:
 *  - qced_clean_inds_test_staus2 : The file indicating input data is available.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
 *  - cut_diff_miss : The value to use to evaluate diff miss.
 *
 * Outputs:
 *  - failed_diffmiss : The failed results for the process.
1129 | */ 1130 | process findSnpExtremeDifferentialMissingness { 1131 | input: 1132 | file qced_clean_inds_test_missing2 1133 | val container from params.dock_container 1134 | val mountpoint from params.dock_mpoint 1135 | val data_path from plink_data_path 1136 | val script_dir from script_path 1137 | val cut_diff_miss from params.cut_diff_miss 1138 | 1139 | output: 1140 | file 'failed_diffmiss' 1141 | 1142 | script: 1143 | """ 1144 | if [[ qced_clean_inds_test_missing2 ]]; then 1145 | echo "Input data available" 1146 | fi 1147 | 1148 | docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \ 1149 | perl select_diffmiss_qcplink.pl $cut_diff_miss 1150 | 1151 | ln $script_dir/fail_diffmiss_qcplink.txt \ 1152 | $data_path/fail_diffmiss_qcplink.txt 1153 | 1154 | echo "Complete" > failed_diffmiss 1155 | """ 1156 | } 1157 | 1158 | //---- Process 24 -----------------------------------------------------------// 1159 | 1160 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1161 | 1162 | /* Process to find snps with extreme Hardy Weinburg deviations. 1163 | * 1164 | * Inputs: 1165 | * - qced_qcplink_status4 : The file indicating input data is available. 1166 | * - container : The docker container to use 1167 | * - data_path : The path the input data. 1168 | * - mountpoint : The directory in the conmtainer to mount to. 1169 | * - sexinfo : The command to add for sexinfo. 1170 | * 1171 | * Outputs: 1172 | * - qced_clean_inds_hwe : The results with extreme hwe deviations. 
1173 | */ 1174 | process findSnpsExtremeHweDeviations { 1175 | input: 1176 | file qced_qcplink_status4 1177 | val container from params.dock_container 1178 | val data_path from plink_data_path 1179 | val mountpoint from params.dock_mpoint 1180 | val sexinfo from params.sexinfo_command 1181 | 1182 | output: 1183 | file 'qced_clean_inds_hwe' 1184 | 1185 | script: 1186 | """ 1187 | if [[ -s qced_qcplink_status4 ]]; then 1188 | echo "Input available, can find extreme hew variations" 1189 | fi 1190 | 1191 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1192 | $container plink --bfile qc_plink_clean_inds $sexinfo --hardy \ 1193 | --out qc_plink_clean_inds_hwe 1194 | 1195 | ln $data_path/qc_plink_clean_inds_hwe.hwe qced_clean_inds_hwe 1196 | """ 1197 | } 1198 | 1199 | //---- Process 25 -----------------------------------------------------------// 1200 | 1201 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1202 | script_path = Channel.fromPath(params.script_path, type : 'dir') 1203 | 1204 | /* Process to find unaffected from HWE. 1205 | * 1206 | * Inputs: 1207 | * - qced_clean_inds_hwe : The hwe results from the previous process. 1208 | * - data_path : The path to all data. 1209 | * - script_dir : The directory where the scripts are. 1210 | * 1211 | * Outputs: 1212 | * - qced_clean_inds_hweu : The results for those unaffected from HWE. 
1213 | */ 1214 | process findUnaffectedForHwePlot { 1215 | input: 1216 | file qced_clean_inds_hwe 1217 | val data_path from plink_data_path 1218 | val script_dir from script_path 1219 | 1220 | output: 1221 | file 'qced_clean_inds_hweu' 1222 | 1223 | script: 1224 | """ 1225 | if [[ -s qced_clean_inds_hwe ]]; then 1226 | echo "Prev stage complete, continuing" 1227 | fi 1228 | 1229 | head -1 $data_path/qc_plink_clean_inds_hwe.hwe \ 1230 | > $script_dir/clean_inds_qcplink_hweu.hwe | \ 1231 | grep 'UNAFF' $data_path/qc_plink_clean_inds_hwe.hwe \ 1232 | >> $script_dir/clean_inds_qcplink_hweu.hwe 1233 | 1234 | echo "Complete" > qced_clean_inds_hweu 1235 | """ 1236 | } 1237 | 1238 | //---- Process 26 -----------------------------------------------------------// 1239 | 1240 | script_path = Channel.fromPath(params.script_path, type : 'dir') 1241 | 1242 | /* Process to generate a plot for the HWE results. 1243 | * 1244 | * Inputs: 1245 | * qced_clean_inds_hweu : The result of those unaffected from HWE. 1246 | * - container : The docker container to use 1247 | * - mountpoint : The directory in the conmtainer to mount to. 1248 | * - scipt_dir : The directory where the scripts are. 1249 | * 1250 | * Outputs: 1251 | * - generate_hwe_status : The status of the plot generation. 
 */
process generateHwePlot {
  input:
  file qced_clean_inds_hweu
  val container from params.dock_container
  val mountpoint from params.dock_mpoint
  val script_dir from script_path

  output:
  file 'generate_hwe_status'

  script:
  // Runs hwe_plot_qcplink.R against the unaffected-only HWE file written
  // into the scripts directory by findUnaffectedForHwePlot.
  """
  if [[ -s qced_clean_inds_hweu ]]; then
    echo "Input available"
  fi

  docker run -v $script_dir:/$mountpoint -w /$mountpoint $container \
    Rscript hwe_plot_qcplink.R

  echo "Complete" > generate_hwe_status
  """
}

//---- Process 27 -----------------------------------------------------------//

plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir')

/* Process to remove snps which failed QC.
 *
 * Inputs:
 *  - qced_qcplink_status5 : The file indicating input data is available.
 *  - failed_diffmiss : The file with the failed diffmiss results.
 *  - cut_maf : Value for maf cut.
 *  - cut_geno : Value of genome cut.
 *  - cut_hwe : Value for hwe cut.
 *  - container : The docker container to use
 *  - data_path : The path the input data.
 *  - mountpoint : The directory in the container to mount to.
 *  - sexinfo : The command to add for sexinfo.
 *
 * Outputs:
 *  - None : Results are written to the data_path directory.
1295 | * 1296 | * NOTES : Specifying "ignore" for the error strategy allows the pipeline to 1297 | * continue but still reports an error -- remove if this is not desired 1298 | */ 1299 | process removeSnpsFailingQc { 1300 | errorStrategy 'ignore' 1301 | 1302 | input: 1303 | file qced_qcplink_status5 1304 | file failed_diffmiss 1305 | val cut_maf from params.cut_maf 1306 | val cut_geno from params.cut_genome 1307 | val cut_hwe from params.cut_hwe 1308 | val container from params.dock_container 1309 | val data_path from plink_data_path 1310 | val mountpoint from params.dock_mpoint 1311 | val sexinfo from params.sexinfo_command 1312 | 1313 | output: 1314 | stdout 'result' 1315 | 1316 | script: 1317 | """ 1318 | if [[ -s qced_qcplink_status5 ]]; then 1319 | echo "Input available, can find extreme hew variations" 1320 | fi 1321 | 1322 | if [[ -s failed_diffmis ]]; then 1323 | echo "Dffmiss available" 1324 | fi 1325 | 1326 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1327 | $container plink --bfile qc_plink_clean_inds $sexinfo \ 1328 | --maf $cut_maf --geno $cut_geno --exclude fail_diffmiss_qcplink.txt \ 1329 | --hwe $cut_hwe --make-bed --out qc_plink_cleaned 1330 | """ 1331 | } 1332 | 1333 | //---- Process 28 -----------------------------------------------------------// 1334 | 1335 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1336 | 1337 | /* Process to find Xchr snps. 1338 | * 1339 | * Inputs: 1340 | * - qced_qcplink_status6 : The file indicating input data is available. 1341 | * - container : The docker container to use 1342 | * - data_path : The path the input data. 1343 | * - mountpoint : The directory in the conmtainer to mount to. 1344 | * - sexinfo : The command to add for sexinfo. 1345 | * 1346 | * Outputs: 1347 | * - xsnps_staus : The status of the process. 
1348 | * 1349 | * NOTES : Specifying "ignore" for the error strategy allows the pipeline to 1350 | * continue but still reports an error -- remove if this is not desired 1351 | */ 1352 | process findXchrSnps { 1353 | errorStrategy 'ignore' 1354 | 1355 | input: 1356 | file qced_qcplink_status6 1357 | val container from params.dock_container 1358 | val data_path from plink_data_path 1359 | val mountpoint from params.dock_mpoint 1360 | val sexinfo from params.sexinfo_command 1361 | 1362 | output: 1363 | file "xsnps_status" 1364 | 1365 | script: 1366 | """ 1367 | if [[ -s qced_qcplink_status6 ]]; then 1368 | echo "Input available, can find extreme hew variations" 1369 | fi 1370 | 1371 | docker run -v $data_path:/$mountpoint -w /$mountpoint \ 1372 | $container plink --bfile qc_plink_clean_inds --chr 23 \ 1373 | --make-bed --out xsnps 1374 | 1375 | echo "Complete" > xsnps_status 1376 | """ 1377 | } 1378 | 1379 | //---- Process 29 -----------------------------------------------------------// 1380 | 1381 | plink_data_path = Channel.fromPath(params.plink_inputpath, type : 'dir') 1382 | 1383 | /* Process to remove Xchr snps. 1384 | * 1385 | * Inputs: 1386 | * - xsnps_status : The file indicating that the process can start. 1387 | * - cut_maf : Value for maf cut. 1388 | * - cut_geno : Value of genome cut. 1389 | * - container : The docker container to use 1390 | * - data_path : The path the input data. 1391 | * - mountpoint : The directory in the conmtainer to mount to. 1392 | * - sexinfo : The command to add for sexinfo. 1393 | * 1394 | * Outputs: 1395 | * - None : Results are written to the data_path directory. 
 */
process removeXchrSnps {
  input:
  file xsnps_status
  val cut_maf from params.cut_maf
  val cut_geno from params.cut_genome
  val container from params.dock_container
  val data_path from plink_data_path
  val mountpoint from params.dock_mpoint
  val sexinfo from params.sexinfo_command

  output:
  stdout 'result'

  script:
  // Excludes the X-chromosome SNPs identified by findXchrSnps (via the
  // xsnps.bim variant list) while applying the maf/geno cut-offs; the
  // final dataset is written to data_path as xsnps_removed.
  """
  if [[ -s xsnps_status ]]; then
    echo "Have input data"
  fi

  docker run -v $data_path:/$mountpoint -w /$mountpoint \
    $container plink --bfile qc_plink_clean_inds $sexinfo \
    --maf $cut_maf --geno $cut_geno --exclude xsnps.bim \
    --make-bed --out xsnps_removed
  """
}