├── .document ├── .gitignore ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── VERSION ├── bin └── pipengine ├── bio-pipengine.gemspec ├── joss ├── paper.bib └── paper.md ├── lib ├── bio-pipengine.rb └── bio │ ├── pipengine.rb │ └── pipengine │ ├── job.rb │ ├── sample.rb │ └── step.rb └── test ├── data ├── mapping.yml ├── pipeline-enh.yml ├── pipeline.yml └── samples.yml ├── examples ├── pipeline.yml ├── pipeline_multi.yml └── samples.yml ├── helper.rb └── test_bioruby-pipengine.rb /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | features/**/*.feature 5 | LICENSE.txt 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # rcov generated 2 | coverage 3 | coverage.data 4 | 5 | # rdoc generated 6 | rdoc 7 | 8 | # yard generated 9 | doc 10 | .yardoc 11 | 12 | # bundler 13 | .bundle 14 | 15 | # jeweler generated 16 | pkg 17 | 18 | # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore: 19 | # 20 | # * Create a file at ~/.gitignore 21 | # * Include files you want ignored 22 | # * Run: git config --global core.excludesfile ~/.gitignore 23 | # 24 | # After doing this, these files will be ignored in all your git projects, 25 | # saving you from having to 'pollute' every project you touch with them 26 | # 27 | # Not sure what to needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line) 28 | # 29 | # For MacOS: 30 | # 31 | .DS_Store 32 | 33 | # For TextMate 34 | #*.tmproj 35 | #tmtags 36 | 37 | # For emacs: 38 | #*~ 39 | #\#* 40 | #.\#* 41 | 42 | # For vim: 43 | *.swp 44 | 45 | # For redcar: 46 | #.redcar 47 | 48 | # For rubinius: 49 | #*.rbc 50 | 51 | *.lock 52 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | ruby ">= 2.1.2" 3 | 4 | # Add dependencies required to use your gem here. 5 | # Example: 6 | # gem "activesupport", ">= 2.3.5" 7 | gem "trollop", ">= 2.1.2" 8 | gem "colorize", ">= 0.8.1" 9 | 10 | group :development do 11 | gem 'jeweler' 12 | end 13 | 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Francesco Strozzi 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PipEngine 2 | ========= 3 | 4 | A simple launcher for complex biological pipelines. 5 | 6 | PipEngine generates runnable shell scripts, already configured for the PBS/Torque job scheduler, for each sample in the pipeline. It allows you to run a complete pipeline or just a single step, depending on your needs. 7 | 8 | PipEngine is best suited for NGS pipelines, but it can be used for any kind of pipeline that can be run on a job scheduling system and which is "sample" centric, i.e. you have on one side a list of samples with their corresponding raw data, and on the other side a pipeline that you would like to apply to them. 9 | 10 | PipEngine was developed to combine the typical flexibility and portability of shell scripts with the concept of pipeline templates that can be easily applied to different input data to reproduce scientific results. The overall improvements over Makefiles or customised ad-hoc shell scripts are: better readability of the pipelines thanks to the YAML format, especially for people with no coding experience; automated script generation, which adds extra functionality such as error controls and logging directly into the job scripts; and an enforced separation between the description of the input data and the pipeline template, which improves the clarity and reusability of analysis protocols. 11 | 12 | 13 | Installation 14 | ============ 15 | 16 | If you already have Ruby, just install PipEngine using RubyGems: 17 | 18 | ```shell 19 | gem install bio-pipengine 20 | ``` 21 | 22 | If you don't have Ruby installed we recommend using the Anaconda Package Manager.
23 | 24 | Download the installer from [here](http://conda.pydata.org/miniconda.html) and once installed you can simply type: 25 | 26 | ```shell 27 | conda install -c bioconda ruby 28 | ``` 29 | 30 | and then install PipEngine using RubyGems: 31 | 32 | ```shell 33 | gem install bio-pipengine 34 | ``` 35 | 36 | Pipengine has been tested and should work with Ruby >= 2.1.2 37 | 38 | :: Topics :: 39 | ============ 40 | 41 | [Usage](https://github.com/bioinformatics-ptp/bioruby-pipengine#-usage-) 42 | 43 | [The Pipeline YAML](https://github.com/bioinformatics-ptp/bioruby-pipengine#-the-pipeline-yaml-) 44 | 45 | [The Samples YAML](https://github.com/bioinformatics-ptp/bioruby-pipengine#-the-samples-yaml-) 46 | 47 | [Input and output conventions](https://github.com/bioinformatics-ptp/bioruby-pipengine#-input-and-output-conventions-) 48 | 49 | [Sample groups and complex steps](https://github.com/bioinformatics-ptp/bioruby-pipengine#-sample-groups-and-complex-steps-) 50 | 51 | [What happens at run-time](https://github.com/bioinformatics-ptp/bioruby-pipengine#-what-happens-at-run-time-) 52 | 53 | [Examples](https://github.com/bioinformatics-ptp/bioruby-pipengine#-examples-) 54 | 55 | [PBS Options](https://github.com/bioinformatics-ptp/bioruby-pipengine#-pbs-options-) 56 | 57 | [Extending and contributing](https://github.com/bioinformatics-ptp/bioruby-pipengine#-extending-and-contributing-) 58 | 59 | :: Usage :: 60 | =========== 61 | 62 | 63 | ```shell 64 | > pipengine -h 65 | List of available commands: 66 | run Submit pipelines to the job scheduler 67 | ``` 68 | 69 | Command line for RUN mode 70 | ------------------------- 71 | 72 | **Command line** 73 | ```shell 74 | > pipengine run -p pipeline.yml -f samples.yml -s mapping --tmp /tmp 75 | ``` 76 | 77 | **Parameters** 78 | ```shell 79 | -p, --pipeline= YAML file with pipeline and sample details (default: pipeline.yml) 80 | -f, --samples-file= YAML file with samples name and directory paths (default: samples.yml) 81 | -l, --samples= List of sample names to run the pipeline 82 | -s, --steps= List of steps to be executed 83 | -d, --dry Dry run. Just create the job script without submitting it to the batch system 84 | -t, --tmp= Temporary output folder 85 | -c, --create-samples= Create samples.yml file from a Sample directory (only for CASAVA projects) 86 | -m, --multi= List of samples to be processed by a given step (the order matters) 87 | -g, --group= Specify the group of samples to run the pipeline steps on (do not specify --multi) 88 | -a, --allgroups Apply the step(s) to all the groups defined into the samples file 89 | -n, --name= Analysis name 90 | -o, --output-dir= Output directory (override standard output directory names) 91 | -b, --pbs-opts= PBS options 92 | -q, --pbs-queue= PBS queue 93 | -i, --inspect-pipeline= Show steps 94 | --log= Log script activities, by default stdin. Options are fluentd (default: stdin) 95 | -e, --log-adapter= (stdin|syslog|fluentd) In case of fluentd use http://destination.hostname:port/yourtag 96 | --tag= Overwrite tags present in samples.yml and pipeline.yml files (e.g. 
tag1=value1 tag2=value2) 97 | -h, --help Show this message 98 | ``` 99 | 100 | PipEngine accepts two input files: 101 | * A YAML file describing the pipeline steps 102 | * A YAML file describing samples names, samples location and other samples-specific information 103 | 104 | 105 | :: The Pipeline YAML :: 106 | ======================= 107 | 108 | The basic structure of a pipeline YAML is divided into three parts: 1) pipeline name, 2) resources, 3) steps. 109 | 110 | An example YAML file is like the following: 111 | 112 | ```yaml 113 | 114 | pipeline: resequencing 115 | 116 | resources: 117 | fastqc: /software/FastQC/fastqc 118 | bwa: /software/bwa 119 | gatk: /software/gatk-lite/GenomeAnalysisTk.jar 120 | samtools: /software/samtools 121 | samsort: /software/picard-tools-1.77/SortSam.jar 122 | mark_dup: /software/picard-tools-1.77/MarkDuplicates.jar 123 | bam: /software/bam 124 | pigz: /software/pigz 125 | 126 | steps: 127 | mapping: 128 | desc: Run BWA on each sample to perform alignment 129 | run: 130 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 131 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 132 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=.sorted.bam SO=coordinate LB= PL=illumina PU=PU SM= TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 133 | - rm -f R1.fastq.gz R2.fastq.gz 134 | cpu: 11 135 | 136 | mark_dup: 137 | run: java -Xmx4g -jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=.sorted.bam OUTPUT=.md.sort.bam METRICS_FILE=.metrics REMOVE_DUPLICATES=false 138 | 139 | realign_target: 140 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 141 | cpu: 8 142 | 143 | realign: 144 | run: java -Xmx4g -jar -T IndelRealigner -LOD 0.4 -model USE_READS --disable_bam_indexing --target_intervals .indels.intervals -R -I .md.sort.bam -o .realigned.bam 145 | 146 | fixtags: 147 | run: calmd -r -E -u .realigned.bam | squeeze --in -.ubam --out .final.bam --rmTags 'XM:i;XG:i;XO:i' --keepDups 148 | 149 | bam_index: 150 | run: index .final.bam 151 | 152 | clean: 153 | run: ls | grep -v final | xargs rm -fr 154 | 155 | ``` 156 | 157 | Resources definition 158 | -------------------- 159 | 160 | PipEngine is entirely based on the placeholder and substitution logic. For example in the Pipeline YAML, each tool is declared under the resources and at run time PipEngine will search for the corresponding placeholder in the command lines. 161 | 162 | So, for instance, if I have declared a software **bwa** under resources, PipEngine will search for a `````` placeholder in all the command lines and will substitute it with the software complete path declared in resources. 163 | 164 | This makes command lines definition shorter and easier to read and avoid problems when moving from one software version to another (i.e. you just need to change the bwa definition once, and not 10 times in 5 different command lines) 165 | 166 | The same thing happens for samples names, input and output directories and intermediate output files. This allows to create true pipelines templates that can be reused and applied to different samples sets. 
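As a minimal sketch of this substitution logic (the `<pigz>` placeholder matches the `pigz` resource key declared above; the `<sample_path>` name is an assumption for the sample-path placeholder described later, and the paths come from the Examples section below), a command line written with placeholders such as:

```yaml
mapping:
  run: ls <sample_path>/*_R1_*.gz | xargs zcat | <pigz> -p 10 >> R1.fastq.gz
```

would be expanded at run-time, for a sample named sampleA stored under /ngs_reads/sampleA, into:

```shell
ls /ngs_reads/sampleA/*_R1_*.gz | xargs zcat | /software/pigz -p 10 >> R1.fastq.gz
```

The same mechanism applies to every other placeholder, so the pipeline YAML never needs to contain hard-coded paths or sample names.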
167 | 168 | Step definition 169 | --------------- 170 | 171 | A step must be defined using standard keys: 172 | 173 | * the first key must be the step name 174 | * under the step name, a **run** key must be defined to hold the actual command line that will be executed 175 | * a **cpu** key must be defined if the command line uses more than 1 CPU at runtime 176 | * a **multi** key must be defined if the command line takes as input more than one sample (more details later) 177 | * a **desc** key can be used to add a short description that will be displayed using the **-i** option of PipEngine 178 | * **nodes** and **mem** keys can be used to specify the resources needed for this job 179 | 180 | A note on the **run** key: if a single step needs more than one command line to execute the required actions, these multiple command lines must be defined as an array in YAML (see the mapping step in the above example). 181 | 182 | 183 | :: The Samples YAML :: 184 | ===================== 185 | 186 | The samples YAML is much simpler than the pipeline YAML: 187 | 188 | ```yaml 189 | resources: 190 | index: /storage/genomes/bwa_index/genome 191 | genome: /storage/genomes/genome.fa 192 | output: /storage/results 193 | 194 | samples: 195 | sampleA: /ngs_reads/sampleA 196 | sampleB: /ngs_reads/sampleB 197 | sampleC: /ngs_reads/sampleC 198 | sampleD: /ngs_reads/sampleD 199 | ``` 200 | 201 | In this YAML there is again a **resources** key, but this time the tags defined here depend on the samples described in the YAML. 202 | 203 | For instance, if I am working with human RNA-seq samples, these data must be aligned to the human genome, so it makes sense that the **genome** tag is defined here and not in the pipeline YAML, which must remain as generic as possible. 204 | 205 | Generally, the tags defined under the samples **resources** depend on the pipeline and analysis one wants to run. So if BWA is used to perform read alignment, an **index** tag must be defined here to set the BWA index prefix, and it will be substituted into the pipeline command lines every time an `<index>` placeholder is found in the pipeline YAML. 206 | 207 | Sample groups 208 | ------------- 209 | 210 | If you want to organize your samples by groups, it is possible to do it directly in the samples.yml file: 211 | 212 | 213 | ```yaml 214 | resources: 215 | index: /storage/genomes/bwa_index/genome 216 | genome: /storage/genomes/genome.fa 217 | output: /storage/results 218 | 219 | samples: 220 | Group1: 221 | sampleA: /ngs_reads/sampleA 222 | sampleB: /ngs_reads/sampleB 223 | Group2: 224 | sampleC: /ngs_reads/sampleC 225 | sampleD: /ngs_reads/sampleD 226 | ``` 227 | 228 | Then, by using the **-g** option of PipEngine, it is possible to run steps and pipelines directly on groups of samples. 229 | 230 | 231 | How to create the Samples file 232 | ------------------------------ 233 | 234 | PipEngine was created to work primarily on NGS pipelines and with Illumina data in mind. So, the easiest thing to do if you have your samples already organized into a typical Illumina run folder is to run: 235 | 236 | ```shell 237 | > pipengine run -c /path/to/illumina/data 238 | ``` 239 | 240 | This will generate a samples.yml file with all the sample names and paths derived from the run folder. The "resources" part is left for you to fill, as sketched below.
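For reference, this is a sketch of what the generated samples.yml could look like (hypothetical sample names and run-folder paths; the output path defaults to the directory where pipengine was launched, and any other resource tags required by your pipeline, such as index or genome, still have to be added by hand):

```yaml
resources:
  output: /current/working/directory

samples:
  Sample_01: /path/to/illumina/data/Project_X/Sample_01
  Sample_02: /path/to/illumina/data/Project_X/Sample_02
```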
241 | 242 | As a plus, if you have your samples scattered thoughout many different run folders, you can specify all the paths that you want to PipEngine and it will combine all the paths in the same samples file. So if you have your samples spread across let's say 3 runs, you can call PipEngine in this way: 243 | 244 | ```shell 245 | > pipengine run -c /path/to/illumina/run1 /path/to/illumina/run2 /path/to/illumina/run3 246 | ``` 247 | 248 | If a sample is repeated in more than one run, all the paths will be combined in the samples.yml and PipEngine will take care of handling the multiple paths correctly. 249 | 250 | 251 | 252 | :: Input and output conventions :: 253 | ================================== 254 | 255 | The inputs in the steps defined in the pipeline YAML are expressed by the `````` placeholder that will be substituted with a sample name and the ``````, which will be changed with the location where initial data (i.e. raw sequencing reads) are stored for that particular sample. Both this information are provided in the sample YAML file. 256 | 257 | The `````` placeholder is a generic one to define the root location for the pipeline outputs. This parameter is also defined in the samples YAML. By default, PipEngine will write jobs scripts and will save stdout and stderr files from PBS in this folder. 258 | 259 | By convention, each sample output is saved under a folder with the sample name and each step is saved in a sub-folder with the step name. 260 | 261 | That is, given a generic /storage/pipeline_results `````` folder, the outputs of the **mapping** step will be organized in this way: 262 | 263 | ```shell 264 | /storage/pipeline_results/SampleA/mapping/SampleA.bam 265 | /SampleB/mapping/SampleB.bam 266 | /SampleC/mapping/SampleC.bam 267 | /SampleD/mapping/SampleD.bam 268 | ``` 269 | 270 | This simple convention keeps things clean and organized. The output file name can be decided during the pipeline creation, but it's a good habit to name it using the sample name. 271 | 272 | When new steps of the same pipeline are run, output folders are updated accordingly. So for example if after the **mapping** step a **mark_dup** step is run, the output folder will look like this: 273 | 274 | ```shell 275 | /storage/pipeline_results/SampleA/mapping 276 | /SampleA/mark_dup 277 | 278 | /storage/pipeline_results/SampleB/mapping 279 | /SampleB/mark_dup 280 | ..... 281 | ``` 282 | 283 | In case you are working with group of samples, specified by the **-g** option, the output folder will be changed to reflect the samples grouping. So for example if a **mapping** step is called on the **Group1** group of samples, all the outputs will be saved under the ```/Group1``` folder and results of mapping for SampleA, will be found under ```/Group1/SampleA/mapping``` . 284 | 285 | 286 | How steps are connected together 287 | -------------------------------- 288 | 289 | One step is connected to another by simply requiring that its input is the output of another preceding step. This is just achived by a combination of `````` and `````` placeholders in the pipeline command line definitions. 290 | 291 | For instance, if I have a resequencing pipeline that will first run BWA to map the reads and then a mark duplicate step, the mark_dup step will be dependent from the BWA output. 
292 | 293 | ```yaml 294 | mapping: 295 | run: 296 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 297 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 298 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz > .sorted.bam 299 | - rm -f R1.fastq.gz R2.fastq.gz 300 | cpu: 11 301 | 302 | mark_dup: 303 | run: java -Xmx4g -jar INPUT=.sorted.bam OUTPUT=.md.sort.bam 304 | ``` 305 | 306 | So in the **mark_dup** step the input placeholder (defined under the **run** key in the pipeline YAML) will be written as: 307 | 308 | ``` 309 | .sorted.bam 310 | ``` 311 | 312 | If the `````` tag is defined for instance as "/storage/results", this will be translated at run-time into: 313 | 314 | ``` 315 | /storage/results/SampleA/mapping/SampleA.sorted.bam 316 | ``` 317 | 318 | for SampleA outputs. Basically the `````` placeholder is a shortcut for ```//{step name, mapping in this case}/``` 319 | 320 | Following the same idea, using a `````` placeholder (note the / at the end) will be translated into ```//{step name, mapping in this case}/``` , to address the scenario when a user wants to point to the previous step output directory, but without having the `````` appended to the end of the path. 321 | 322 | More complex dependences can be defined by combinations of `````` and `````` placeholders, or using the `````` and `````` placeholders, without having to worry about the actual sample name and the complete input and output paths. 323 | 324 | Jobs dependencies 325 | ------------------------- 326 | Steps can also be defined with dependencies so the user can just call the final step and all the upper chain is called automatically. To achieve this task Pipengine requires that the user defines a 327 | ``` 328 | pre: 329 | ``` 330 | tag in the step definition: 331 | 332 | ``` 333 | root_step: 334 | desc: root step to test dependencies 335 | run: 336 | - echo "root" 337 | 338 | child_step: 339 | desc: child step to test dependencies 340 | pre: root_step 341 | run: 342 | - echo "I am the child" 343 | ``` 344 | 345 | 346 | :: Multi-Samples and complex steps :: 347 | ===================================== 348 | 349 | The pipeline steps can be defined to run on a single sample or to take as input more than one sample data, depending on the command line used. 350 | 351 | A typical example is running a differential expression step for example with CuffDiff. This requires to take all the output generated from the previous Cufflinks step (i.e. the gtf files) and process them to generate a unique transcripts reference (CuffCompare) and then perform the differential expression across the samples using the BAM files generated by, let's say, TopHat in a **mapping** step. 352 | 353 | This is an extract of the step definition in the pipeline YAML to describe these two steps: 354 | 355 | ```yaml 356 | diffexp: 357 | multi: 358 | - //cufflinks/transcripts.gtf 359 | - _tophat/accepted_hits.bam 360 | run: 361 | - echo '' | sed -e 's/,/ /g' | xargs ls >> gtf_list.txt 362 | - -s -r -i gtf_list.txt 363 | - -p 12 -N -u -b ./*combined.gtf 364 | cpu: 12 365 | ``` 366 | 367 | In this case we need to combine the outputs of all the samples from the cufflinks step and pass that information to cuffcompare and combine the outputs of the mapping steps and pass them to the cuffdiff command line. 368 | 369 | This is achived in two ways. First, the step definition must include a **multi** key, that simply defines what, for each sample, will be substituted where the `````` placeholder is found. 
370 | 371 | In the example above, the step includes two command lines, one for cuffcompare and the other for cuffdiff. Cuffcompare requires the transcripts.gtf of each sample, while Cuffdiff requires the BAM file of each sample, plus the output of Cuffcompare. 372 | 373 | So the two command lines need two different kinds of files as input from the same set of samples; therefore two **multi** keywords are defined, along with two corresponding placeholders in the command lines (the first **multi** entry fills the placeholder in the Cuffcompare input list, the second fills the one in the Cuffdiff line). 374 | 375 | Once the step has been defined in the pipeline YAML, PipEngine must be invoked using the **-m** parameter, to specify the samples that should be grouped together by this step: 376 | 377 | ```shell 378 | pipengine run -p pipeline.yml -m SampleA,SampleB SampleC,SampleD 379 | ``` 380 | 381 | Note that the use of commas is not arbitrary, since the **-m** parameter specifies not only which samples should be used for this step, but also how they should be organized on the corresponding command line. The **-m** parameter takes the sample names, combines them with the 'multi' definitions and then substitutes the result back into the command line, keeping the samples in the same order as provided with **-m**. 382 | 383 | The above command line will be translated, for the **cuffdiff** command line, into the following: 384 | 385 | ```shell 386 | /software/cuffdiff -p 12 -N -u -b /storage/genome.fa combined.gtf /storage/results/SampleA/cufflinks/transcripts.gtf,/storage/results/SampleB/cufflinks/transcripts.gtf /storage/results/SampleC/cufflinks/transcripts.gtf,/storage/results/SampleD/cufflinks/transcripts.gtf 387 | ``` 388 | 389 | and this corresponds to the way CuffDiff wants biological replicates for each condition to be described on the command line. 390 | 391 | **Note** 392 | 393 | Multi-samples step management is complex and it is a task that cannot be easily generalized, since every software has its own way of requiring and organizing the inputs on the command line. This approach is probably not the most elegant solution, but it works quite well even if there are some drawbacks. For instance, as stated above, the sample grouping is processed and passed to the command lines exactly as it is given with the **-m** parameter. 394 | 395 | So for Cuffdiff the presence of commas is critical to divide biological replicates from different conditions, but for Cuffcompare the commas are not needed and would raise an error on the command line. That is the reason for the: 396 | 397 | ```shell 398 | echo '' | sed -e 's/,/ /g' | xargs ls >> gtf_list.txt 399 | ``` 400 | 401 | This line generates the input file for Cuffcompare with the list of the transcripts.gtf files for each sample, built using the 'multi' definition in the pipeline YAML and the list passed through the **-m** parameter, but getting rid of the commas that separate sample names. It is a workaround rather than a clean solution, but PipEngine aims to be a general tool not bound to specific corner cases, and it always lets the user define custom command lines to manage particular steps, as in this case. 402 | 403 | Composable & Modular steps definition 404 | ------------------------------------ 405 | 406 | Until now, steps have been defined inside a single YAML file. This approach is useful for keeping a stable and reproducible analysis pipeline. But what if multiple users want to collaborate on the same pipeline, improving it and, most importantly, re-using the same steps in different analyses?
What usually happens is a proliferation of highly similar pipelines that are complicated to compare and to maintain over time. 407 | In this scenario, the first thing a developer imagines is the ability to include external files; unfortunately YAML does not implement this feature. A possible workaround, remembering that we are in Ruby land, is to embed some Ruby code into the YAML file and include external steps. 408 | 409 | Create a file `mapping.yml` that describes the mapping step with BWA: 410 | 411 | ``` 412 | mapping: 413 | cpu: 8 414 | desc: Run BWA MEM and generates a sorted BAM file 415 | run: 416 | - mem -t -R '@RG\tID:\tLB:\tPL:ILLUMINA\tPU:\tCN:PTP\tSM:' .trim.fastq | view -bS - > .bam 417 | - sort -@ .bam .sort 418 | - rm -f .bam 419 | ``` 420 | 421 | It is then possible to include the `mapping.yml` file inside your pipeline with a snippet of Ruby code: `<%= include :name_of_the_step, "file_step.yml" %>`. 422 | At the moment it is very important that you place the tag at the very start of the line (no spaces at the beginning of the line): 423 | 424 | ``` 425 | steps: 426 | <%= include :mapping, "./mapping.yml" %> 427 | 428 | index: 429 | desc: Make BAM index 430 | run: index .sort.bam 431 | ``` 432 | 433 | and later run pipengine as usual. 434 | TODO: Dump the whole pipeline file for reproducibility purposes. 435 | 436 | 437 | :: What happens at run-time :: 438 | ============================== 439 | 440 | When invoking PipEngine, the tool will look for the specified pipeline YAML and for the samples YAML file. It will load the list of samples (names and paths of input data) and, for each sample, it will load the information of the steps specified on the command line (**-s** parameter). 441 | 442 | PipEngine will then combine the data from the two YAML files, generating the specific command lines of the selected steps and substituting all the placeholders to generate the final command lines. 443 | 444 | A shell script will finally be generated, for each sample, that will contain all the instructions to run a specific step of the pipeline plus the meta-data for the PBS scheduler. The shell scripts are written inside the directory specified by the ```output:``` key in the ```samples.yml``` file; the directory is created if it does not exist. 445 | 446 | If not invoked with the **-d** option (dry-run), PipEngine will directly submit the jobs to the PBS scheduler using the "qsub" command. 447 | 448 | Dry Run 449 | ------- 450 | 451 | The **-d** parameter lets you create the runnable shell scripts without submitting them to PBS. Use it often to check that the pipeline that will be executed is correct and is doing what you expect. The runnable scripts are saved by default in the `<output>` directory. 452 | 453 | Use it also to learn how the placeholders work, especially the dependency placeholders (i.e. the ones pointing to a previous step output), and to cross-check that all the placeholders in the pipeline command lines were substituted correctly before submitting the jobs. 454 | 455 | Temporary output folder 456 | ------------------- 457 | 458 | By using the '--tmp' option, PipEngine will generate a job script (for each sample) that will save all the output files or folders for a particular step in a directory (e.g. /tmp) that is different from the one provided with the `<output>` tag. 459 | 460 | By default PipEngine will generate output folders directly under the location defined by the `<output>` tag in the Samples YAML.
The --tmp solution instead can be useful when we don't want to save outputs directly to the final location (e.g. a slower network storage) or we don't want to keep all the intermediate files but just the final ones. 461 | 462 | With this option enabled, PipEngine will also generate instructions in the job script to copy, at the end of the job, all the outputs from the temporary directory to the final output folder (i.e. `<output>`) and then to remove the temporary copy. 463 | 464 | When '--tmp' is used, a UUID is generated for each job and prepended to the job name and to the temporary output folder, to avoid possible name collisions and data overwrites if several jobs with the same name (e.g. mapping) are running and writing to the same temporary location. 465 | 466 | One job with multiple steps 467 | --------------------------- 468 | 469 | It is of course possible to aggregate multiple steps of a pipeline and run them in one single job. For instance, let's say I want to run in the same job the steps mapping, mark_dup and realign_target (see the pipeline YAML example above). 470 | 471 | From the command line it's just: 472 | 473 | ```shell 474 | pipengine run -p pipeline.yml -s mapping mark_dup realign_target 475 | ``` 476 | 477 | A single job script, for each sample, will be generated with all the instructions for these steps. If more than one step declares a **cpu** key, the highest cpu value will be assigned to the whole job. 478 | 479 | Each step will save its outputs into a separate folder under the `<output>` directory, exactly as if the steps were run separately. This way, if the job fails for some reason, it is possible to check which steps were already completed and restart from there. 480 | 481 | When multiple steps are run in the same job, by default PipEngine will generate the job name as the concatenation of all the step names. Since this could be a problem when many steps are run together in the same job, a '--name' parameter is available to rename the job in a more convenient way. 482 | 483 | :: Examples :: 484 | ============== 485 | 486 | All these files can be found in the test/examples directory of the repository.
487 | 488 | Example 1: One step and multiple command lines 489 | ---------------------------------------------- 490 | 491 | This is an example on how to prepare the inputs for BWA and run it along with Samtools: 492 | 493 | **pipeline.yml** 494 | ```yaml 495 | pipeline: resequencing 496 | 497 | resources: 498 | bwa: /software/bwa 499 | samtools: /software/samtools 500 | pigz: /software/pigz 501 | 502 | steps: 503 | mapping: 504 | run: 505 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 506 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 507 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Sb - > .bam 508 | - rm -f R1.fastq.gz R2.fastq.gz 509 | cpu: 12 510 | ``` 511 | 512 | **samples.yml** 513 | ```yaml 514 | resources: 515 | index: /storage/genomes/bwa_index/genome 516 | genome: /storage/genomes/genome.fa 517 | output: ./working 518 | 519 | samples: 520 | sampleA: /ngs_reads/sampleA 521 | sampleB: /ngs_reads/sampleB 522 | sampleC: /ngs_reads/sampleC 523 | sampleD: /ngs_reads/sampleD 524 | ``` 525 | 526 | Running PipEngine with the following command line: 527 | 528 | ``` 529 | pipengine run -p pipeline.yml -f samples.yml -s mapping -d 530 | ``` 531 | 532 | will generate a runnable shell script for each sample (available in the ./working directory): 533 | 534 | ```shell 535 | #!/usr/bin/env bash 536 | #PBS -N 2c57c1a853-sampleA-mapping 537 | #PBS -d ./working 538 | #PBS -l nodes=1:ppn=12 539 | if [ ! -f ./working/sampleA/mapping/checkpoint ] 540 | then 541 | echo "mapping 2c57c1a853-sampleA-mapping start `whoami` `hostname` `pwd` `date`." 542 | 543 | mkdir -p ./working/sampleA/mapping 544 | cd ./working/sampleA/mapping 545 | ls /ngs_reads/sampleA/*_R1_*.gz | xargs zcat | /software/pigz -p 10 >> R1.fastq.gz || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 0 `whoami` `hostname` `pwd` `date`."; exit 1; } 546 | ls /ngs_reads/sampleA/*_R2_*.gz | xargs zcat | /software/pigz -p 10 >> R2.fastq.gz || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 1 `whoami` `hostname` `pwd` `date`."; exit 1; } 547 | /software/bwa sampe -P /storage/genomes/bwa_index/genome <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R1.fastq.gz) <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R2.fastq.gz) R1.fastq.gz R2.fastq.gz | /software/samtools view -Sb - > sampleA.bam || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 2 `whoami` `hostname` `pwd` `date`."; exit 1; } 548 | rm -f R1.fastq.gz R2.fastq.gz || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 3 `whoami` `hostname` `pwd` `date`."; exit 1; } 549 | echo "mapping 2c57c1a853-sampleA-mapping finished `whoami` `hostname` `pwd` `date`." 550 | touch ./working/sampleA/mapping/checkpoint 551 | else 552 | echo "mapping 2c57c1a853-sampleA-mapping already executed, skipping this step `whoami` `hostname` `pwd` `date`." 553 | fi 554 | ``` 555 | As you can see the command line described in the pipeline YAML are translated into normal Unix command lines, therefore every solution that works on a standard Unix shell (pipes, bash substitutions) is perfectly acceptable. Pipengine addes extra lines in the script for steps checkpoint controls to avoid re-running already executed steps, and error controls with logging. 556 | 557 | In this case also, the **run** key defines three different command lines, that are described using YAML array (a line prepended with a -). 
This command lines are all part of the same step, since the first two are required to prepare the input for the third command line (BWA), using standard bash commands. 558 | 559 | As a rule of thumb you should put more command line into an array under the same step if these are all logically correlated and required to manipulate intermidiate files. Otherwise if command lines executes conceptually different actions they should go into different steps. 560 | 561 | Example 2: Multiple steps in one job 562 | ------------------------------------ 563 | 564 | Now I want to execute more steps in a single job for each sample. The pipeline YAML is defined in this way: 565 | 566 | ```yaml 567 | 568 | pipeline: resequencing 569 | 570 | resources: 571 | bwa: /software/bwa 572 | samtools: /software/samtools 573 | mark_dup: /software/picard-tools-1.77/MarkDuplicates.jar 574 | gatk: /software/GenomeAnalysisTK/GenomeAnalysisTK.jar 575 | 576 | steps: 577 | mapping: 578 | run: 579 | - ls /*_R1_*.gz | xargs zcat | pigz -p 10 >> R1.fastq.gz 580 | - ls /*_R2_*.gz | xargs zcat | pigz -p 10 >> R2.fastq.gz 581 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=.sorted.bam SO=coordinate LB= PL=illumina PU=PU SM= TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 582 | - rm -f R1.fastq.gz R2.fastq.gz 583 | cpu: 12 584 | 585 | mark_dup: 586 | pre: mapping 587 | run: java -Xmx4g -jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=.sorted.bam OUTPUT=.md.sort.bam METRICS_FILE=.metrics REMOVE_DUPLICATES=false 588 | 589 | realign_target: 590 | pre: mark_dup 591 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 592 | cpu: 8 593 | ``` 594 | 595 | The sample YAML file is the same as the example above. Now to execute together the 3 steps defined in the pipeline, PipEngine can be invoked with this command line: 596 | 597 | ``` 598 | pipengine run -p pipeline_multi.yml -f samples.yml -s realign_target -d 599 | ``` 600 | Since dependencies have been defined for the steps using the ```pre``` key, it is sufficient to invoke Pipengine with the last step and the other two are automatically included in the script. Messages will be prompted in this case since Pipengine just warns that the directories for certain steps, that are needed for other steps in the pipeline, are not yet available (and thus the corresponding steps will be executed to generate the necessary data). The command line will generate the following shell script (one for each sample, available in the ./working directory): 601 | 602 | ```shell 603 | #!/usr/bin/env bash 604 | #PBS -N 6f3c911c49-sampleC-realign_target 605 | #PBS -d ./working 606 | #PBS -l nodes=1:ppn=12 607 | if [ ! -f ./working/sampleC/mapping/checkpoint ] 608 | then 609 | echo "mapping 6f3c911c49-sampleC-realign_target start `whoami` `hostname` `pwd` `date`." 
610 | 611 | mkdir -p ./working/sampleC/mapping 612 | cd ./working/sampleC/mapping 613 | ls /ngs_reads/sampleC/*_R1_*.gz | xargs zcat | pigz -p 10 >> R1.fastq.gz || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 0 `whoami` `hostname` `pwd` `date`."; exit 1; } 614 | ls /ngs_reads/sampleC/*_R2_*.gz | xargs zcat | pigz -p 10 >> R2.fastq.gz || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 1 `whoami` `hostname` `pwd` `date`."; exit 1; } 615 | /software/bwa sampe -P /storage/genomes/bwa_index/genome <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R1.fastq.gz) <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R2.fastq.gz) R1.fastq.gz R2.fastq.gz | /software/samtools view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=sampleC.sorted.bam SO=coordinate LB=sampleC PL=illumina PU=PU SM=sampleC TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 2 `whoami` `hostname` `pwd` `date`."; exit 1; } 616 | rm -f R1.fastq.gz R2.fastq.gz || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 3 `whoami` `hostname` `pwd` `date`."; exit 1; } 617 | echo "mapping 6f3c911c49-sampleC-realign_target finished `whoami` `hostname` `pwd` `date`." 618 | touch ./working/sampleC/mapping/checkpoint 619 | else 620 | echo "mapping 6f3c911c49-sampleC-realign_target already executed, skipping this step `whoami` `hostname` `pwd` `date`." 621 | fi 622 | if [ ! -f ./working/sampleC/mark_dup/checkpoint ] 623 | then 624 | echo "mark_dup 6f3c911c49-sampleC-realign_target start `whoami` `hostname` `pwd` `date`." 625 | 626 | mkdir -p ./working/sampleC/mark_dup 627 | cd ./working/sampleC/mark_dup 628 | java -Xmx4g -jar /software/picard-tools-1.77/MarkDuplicates.jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=./working/sampleC/mapping/sampleC.sorted.bam OUTPUT=sampleC.md.sort.bam METRICS_FILE=sampleC.metrics REMOVE_DUPLICATES=false || { echo "mark_dup 6f3c911c49-sampleC-realign_target FAILED `whoami` `hostname` `pwd` `date`."; exit 1; } 629 | echo "mark_dup 6f3c911c49-sampleC-realign_target finished `whoami` `hostname` `pwd` `date`." 630 | touch ./working/sampleC/mark_dup/checkpoint 631 | else 632 | echo "mark_dup 6f3c911c49-sampleC-realign_target already executed, skipping this step `whoami` `hostname` `pwd` `date`." 633 | fi 634 | if [ ! -f ./working/sampleC/realign_target/checkpoint ] 635 | then 636 | echo "realign_target 6f3c911c49-sampleC-realign_target start `whoami` `hostname` `pwd` `date`." 637 | 638 | mkdir -p ./working/sampleC/realign_target 639 | cd ./working/sampleC/realign_target 640 | java -Xmx4g -jar /software/GenomeAnalysisTK/GenomeAnalysisTK.jar -T RealignerTargetCreator -I ./working/sampleC/mark_dup/sampleC.md.sort.bam -nt 8 -R /storage/genomes/genome.fa -o sampleC.indels.intervals || { echo "realign_target 6f3c911c49-sampleC-realign_target FAILED `whoami` `hostname` `pwd` `date`."; exit 1; } 641 | echo "realign_target 6f3c911c49-sampleC-realign_target finished `whoami` `hostname` `pwd` `date`." 642 | touch ./working/sampleC/realign_target/checkpoint 643 | else 644 | echo "realign_target 6f3c911c49-sampleC-realign_target already executed, skipping this step `whoami` `hostname` `pwd` `date`." 645 | fi 646 | ``` 647 | 648 | 649 | Logging 650 | --------------------------- 651 | 652 | It is always usefult to log activities and collect the output from your software. 
Pipengine can log to: 653 | 654 | * stdin, just print on the terminal 655 | * syslog send the log to the system log using logger 656 | * fluentd send the log to a collector/centralized logging system (http://fluentd.org) 657 | 658 | 659 | :: PBS Options :: 660 | ================= 661 | 662 | If there is the need to pass to PipEngine specific PBS options, the ```--pbs-opts``` parameter can be used. 663 | 664 | This parameter accepts a list of options and each one will be added to the PBS header in the shell script, along with the ```-l``` PBS parameter. 665 | 666 | So for example, the following options passed to ```--pbs-opts```: 667 | 668 | ```shell 669 | --pbs-opts nodes=2:ppn=8 host=node5 670 | ``` 671 | 672 | will become, in the shell script: 673 | 674 | ```shell 675 | #PBS -l nodes=2:ppn=8 676 | #PBS -l host=node5 677 | ``` 678 | 679 | Note also that from version 0.5.2, it is possible to specify common PBS options like "nodes" and "mem" (for memory) directly within a step defition in the Pipeline yaml, exactly as it's done with the "cpu" parameter. So in a step it is possible to write: 680 | 681 | ```yaml 682 | realign_target: 683 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 684 | cpu: 8 685 | nodes: 2 686 | mem: 8G 687 | ``` 688 | 689 | to have PipEngine translate this into: 690 | 691 | ```shell 692 | #PBS -l nodes=2:ppn=8,mem=8G 693 | ``` 694 | 695 | within the job script. 696 | 697 | If a specific queue needs to be selected for sending the jobs to PBS, the ```--pbs-queue``` (short version **-q**) parameter can be used. This will pass to the ```qsub``` command the ```-q ``` taken from the command line. 698 | 699 | :: Extending and contributing :: 700 | ================================ 701 | 702 | Pipengine code is organized around main methods allowing for YAML parsing and command line arguments substitutions that are available in lib/bio/pipengine.rb. Specific logic for jobs, pipeline steps and samples is described in dedicated classes called Bio::Pipengine::Job, Bio::Pipengine::Step and Bio::Pipengine::Sample. 703 | 704 | For instance, in case the support for different schedulers needs to be introduced, it will be sufficient to modify or extend the Job.to_script method, which is the one defining scheduler-specific options in the runnable bash script. 705 | 706 | Copyright 707 | ========= 708 | 709 | ©2017 Francesco Strozzi, Raoul Jean Pierre Bonnal 710 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'rubygems' 4 | require 'bundler' 5 | begin 6 | Bundler.setup(:default, :development) 7 | rescue Bundler::BundlerError => e 8 | $stderr.puts e.message 9 | $stderr.puts "Run `bundle install` to install missing gems" 10 | exit e.status_code 11 | end 12 | require 'rake' 13 | 14 | require 'jeweler' 15 | Jeweler::Tasks.new do |gem| 16 | # gem is a Gem::Specification... 
see http://docs.rubygems.org/read/chapter/20 for more options 17 | gem.name = "bio-pipengine" 18 | gem.homepage = "http://github.com/fstrozzi/bioruby-pipengine" 19 | gem.license = "MIT" 20 | gem.summary = %Q{A pipeline manager} 21 | gem.description = %Q{A pipeline manager} 22 | gem.email = ["francesco.strozzi@gmail.com", "ilpuccio.febo@gmail.com"] 23 | gem.authors = ["Francesco Strozzi", "Raoul Jean Pierre Bonnal"] 24 | gem.files = Dir.glob("lib/**/*.rb") 25 | gem.files << "VERSION" 26 | # dependencies defined in Gemfile 27 | end 28 | Jeweler::RubygemsDotOrgTasks.new 29 | 30 | require 'rake/testtask' 31 | Rake::TestTask.new(:test) do |test| 32 | test.libs << 'lib' << 'test' 33 | test.pattern = 'test/**/test_*.rb' 34 | test.verbose = true 35 | end 36 | 37 | task :default => :test 38 | 39 | require 'rdoc/task' 40 | Rake::RDocTask.new do |rdoc| 41 | version = File.exist?('VERSION') ? File.read('VERSION') : "" 42 | 43 | rdoc.rdoc_dir = 'rdoc' 44 | rdoc.title = "bioruby-pipengine #{version}" 45 | rdoc.rdoc_files.include('README*') 46 | rdoc.rdoc_files.include('lib/**/*.rb') 47 | end 48 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.9.7 -------------------------------------------------------------------------------- /bin/pipengine: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | $:<< File.expand_path(File.join(File.dirname(File.dirname __FILE__),"lib")) 4 | require 'bio-pipengine' 5 | 6 | banner_text = "\nLauncher for Complex Biological Pipelines . Copyright(C) 2012 Francesco Strozzi, Raoul Jean Pierre Bonnal\n\n" 7 | version_text = File.read File.expand_path(File.join(File.dirname(File.dirname __FILE__),"VERSION")) 8 | SUB_COMMANDS = %w(run jobs) 9 | 10 | 11 | #Bio::Pipengine.check_config 12 | 13 | options = {} 14 | cmd = ARGV.first # get the subcommand 15 | opts = case cmd 16 | when "run" 17 | options[:run] = true 18 | ARGV.shift 19 | Trollop::options do 20 | opt :pipeline, "YAML file with pipeline and sample details", :short => "p", :type => :string, :default => "pipeline.yml" 21 | opt :samples_file, "YAML file with samples name and directory paths", :short => "f", :type => :string, :default => "samples.yml" 22 | opt :samples, "List of sample names to run the pipeline", :type => :strings, :short => "l" 23 | opt :steps, "List of steps to be executed", :type => :strings, :short => "s" 24 | opt :dry,"Dry run. 
Just create the job script without submitting it to the batch system", :short => "d" 25 | opt :tmp, "Temporary output folder", :type => :string, :short => "t" 26 | opt :create_samples, "Create samples.yml file from a Sample directory (only for CASAVA projects)", :short => "c", :type => :strings 27 | opt :multi, "List of samples to be processed by a given step (the order matters)", :short => "m", :type => :strings 28 | opt :group, "Specify the group of samples to run the pipeline steps on (do not specify --multi)", :short => "g", :type => :string 29 | opt :allgroups, "Apply the step(s) to all the groups defined into the samples file", :short => "a" 30 | opt :name, "Analysis name", :short => "n", :type => :string 31 | opt :output_dir, "Output directory (override standard output directory names)", :short => "o", :type => :string 32 | opt :pbs_opts, "PBS options", :type => :strings, :short => "b" 33 | opt :pbs_queue, "PBS queue", :type => :string, :short => "q" 34 | opt :inspect_pipeline, "Show steps", :short => "i", :type => :string 35 | opt :log, "Log script activities, by default stdin. Options are fluentd", :type => :string, :default => "stdin" 36 | opt :log_adapter, "(stdin|syslog|fluentd) In case of fluentd use http://destination.hostname:port/yourtag", :type => :string 37 | opt :tag, "Overwrite tags present in samples.yml and pipeline.yml files (e.g. tag1=value1 tag2=value2)", :type => :strings 38 | end 39 | when "-h" 40 | puts banner_text 41 | puts "List of available commands:\n\trun\tSubmit pipelines to the job scheduler\n" 42 | exit 43 | else 44 | global_opts = Trollop::options do 45 | banner banner_text 46 | version "PipEngine v#{version_text}" 47 | puts banner_text 48 | puts "PipEngine v#{version_text}" 49 | puts "List of available commands:\n\trun\tSubmit pipelines to the job scheduler\n\n" 50 | end 51 | end 52 | 53 | 54 | 55 | options = options.merge opts 56 | Trollop::die :multi, "Specifing both --group and --multi is not allowed" if options[:multi] and options[:group] 57 | 58 | if options[:create_samples] 59 | Bio::Pipengine.create_samples options[:create_samples] 60 | #elsif options[:jobs] 61 | # if options[:job_id] 62 | # Bio::Pipengine.show_stats(options[:job_id]) 63 | # elsif options[:job_name] 64 | # warn "Not yet implemented" 65 | # exit 66 | # elsif options[:delete] 67 | # if options[:delete].empty? 68 | # warn "Provide one or more Job IDs or write 'all' to delete all your running jobs".red 69 | # exit 70 | # end 71 | # puts "Warning: this will delete the following running jobs: ".light_blue + "#{options[:delete].join(",")}".green 72 | # print "Are you sure? (y|n):" 73 | # answer = gets.chomp 74 | # if answer == "y" 75 | # Bio::Pipengine.delete_jobs(options[:delete]) 76 | # else 77 | # puts "Aborting..." 78 | # exit 79 | # end 80 | # else 81 | # Bio::Pipengine.show_stats(["all"]) 82 | # end 83 | elsif options[:pipeline] && options[:samples_file] 84 | if options[:inspect_pipeline] 85 | Bio::Pipengine.inspect_steps(options[:inspect_pipeline]) 86 | exit 87 | else 88 | abort("File not found: #{options[:pipeline]}".red) unless File.exists? options[:pipeline] 89 | abort("File not found: #{options[:samples_file]}".red) unless File.exists? 
options[:samples_file] 90 | abort("Please provide a valid step name with the --step parameter".red) unless options[:steps] 91 | if options[:allgroups] 92 | Bio::Pipengine.load_samples_file(options[:samples_file])["samples"].keys.each do |group| 93 | options[:group] = group 94 | Bio::Pipengine.run(options) 95 | end 96 | else 97 | Bio::Pipengine.run(options) 98 | end 99 | end 100 | end 101 | 102 | 103 | -------------------------------------------------------------------------------- /bio-pipengine.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | # stub: bio-pipengine 0.9.7 ruby lib 6 | 7 | Gem::Specification.new do |s| 8 | s.name = "bio-pipengine".freeze 9 | s.version = "0.9.7" 10 | 11 | s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version= 12 | s.require_paths = ["lib".freeze] 13 | s.authors = ["Francesco Strozzi".freeze, "Raoul Jean Pierre Bonnal".freeze] 14 | s.date = "2017-08-28" 15 | s.description = "A pipeline manager".freeze 16 | s.email = ["francesco.strozzi@gmail.com".freeze, "ilpuccio.febo@gmail.com".freeze] 17 | s.executables = ["pipengine".freeze] 18 | s.extra_rdoc_files = [ 19 | "LICENSE.txt", 20 | "README.md" 21 | ] 22 | s.files = [ 23 | "VERSION", 24 | "lib/bio-pipengine.rb", 25 | "lib/bio/pipengine.rb", 26 | "lib/bio/pipengine/job.rb", 27 | "lib/bio/pipengine/sample.rb", 28 | "lib/bio/pipengine/step.rb" 29 | ] 30 | s.homepage = "http://github.com/fstrozzi/bioruby-pipengine".freeze 31 | s.licenses = ["MIT".freeze] 32 | s.rubygems_version = "2.6.11".freeze 33 | s.summary = "A pipeline manager".freeze 34 | 35 | if s.respond_to? 
:specification_version then 36 | s.specification_version = 4 37 | 38 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 39 | s.add_runtime_dependency(%q.freeze, [">= 2.1.2"]) 40 | s.add_runtime_dependency(%q.freeze, [">= 0.8.1"]) 41 | s.add_development_dependency(%q.freeze, [">= 0"]) 42 | else 43 | s.add_dependency(%q.freeze, [">= 2.1.2"]) 44 | s.add_dependency(%q.freeze, [">= 0.8.1"]) 45 | s.add_dependency(%q.freeze, [">= 0"]) 46 | end 47 | else 48 | s.add_dependency(%q.freeze, [">= 2.1.2"]) 49 | s.add_dependency(%q.freeze, [">= 0.8.1"]) 50 | s.add_dependency(%q.freeze, [">= 0"]) 51 | end 52 | end 53 | 54 | -------------------------------------------------------------------------------- /joss/paper.bib: -------------------------------------------------------------------------------- 1 | @online{Torque2017, 2 | author = "Adaptive Computing Inc.", 3 | title = {TORQUE Resource Manager}, 4 | year = {2017}, 5 | url = {http://www.adaptivecomputing.com/products/open-source/torque/} 6 | } 7 | 8 | @online{CWL2017, 9 | author = "Common Workflow Language", 10 | title = {Common Workflow Language}, 11 | year = {2017}, 12 | url = {http://www.commonwl.org}, 13 | doi = {dx.doi.org/10.6084/m9.figshare.3115156.v2} 14 | } 15 | 16 | @article{Goto2010, 17 | title = {Bioinformatics software for the Ruby programming language}, 18 | author = {Goto N, Prins P, Nakao M, Bonnal RJP, Aerts J, Katayama T}, 19 | year = {2010}, 20 | doi = {doi.org/10.1093/bioinformatics/btq475}, 21 | journal = {Bioinformatics} 22 | } 23 | 24 | 25 | @article{Bonnal2012, 26 | title = {Biogem: An effective tool-based approach for scaling up open source software development in bioinformatics}, 27 | author = {Bonnal RJP, Aerts J, Githinji G, Goto N, Maclean D, Miller CA, Mishima H, Pagani M, Ramirez-gonzalez R, Smant G, Strozzi F, Syme R, Vos R, Wennblom TJ, Woodcroft BJ, Katayama T, Prins P}, 28 | year = {2012}, 29 | doi = {doi.org/10.1093/bioinformatics/bts080}, 30 | journal = {Bioinformatics} 31 | } 32 | -------------------------------------------------------------------------------- /joss/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Pipengine: an ultra light YAML-based pipeline execution engine' 3 | tags: 4 | - pipeline 5 | - workflows 6 | - reproducibility 7 | authors: 8 | - name: Francesco Strozzi 9 | orcid: 0000-0002-6845-6982 10 | affiliation: 1 11 | - name: Raoul Jean Pierre Bonnal 12 | orcid: 0000-0002-2123-6536 13 | affiliation: 2 14 | affiliations: 15 | - name: Enterome Bioscience, Paris - France 16 | index: 1 17 | - name: INGM - Isitituto Nazionale Genetica Molecolare "Romeo ed Enrica Invernizzi": Milan, Italy 18 | index: 2 19 | date: 25 July 2017 20 | bibliography: paper.bib 21 | --- 22 | 23 | # Summary 24 | 25 | This is an ultra light YAML-based pipeline execution engine. The tool allows defining a pipeline template in YAML, specifying command lines, resources and software to be used along with pipeline steps dependencies. Pipengine is a sample-centric tool, so the pipeline can then be applied over a single sample or multiple samples data, generating actual runnable bash scripts which can then be submitted automatically to a scheduling system or run locally. 
26 | 27 | The bash scripts generated by Pipengine includes a list of features such as: 28 | 29 | * error controls and logging for each step 30 | 31 | * the automated generation of directories based on sample and pipeline steps names 32 | 33 | * the moving of input and output data across original and temporary folders if needed 34 | 35 | * a simple checkpoint strategy to avoid re-running already completed steps in a pipeline. 36 | 37 | All these features prevent the users to write boiler plate code to perform all these necessary accessory tasks. 38 | 39 | Moreover, Pipengine creates a stable and reproducible working and output tree for each analysis, which transparently stores all the results of each step of a pipeline for each sample analyzed. In this way pipelines' intermediate or final results can be predictably accessed by the analysts and/or easily parsed with other tools. 40 | 41 | The software was developed back in 2012, when more generalized schemas such as for instance the Common Workflow Language [@CWL2017] were not yet defined, and thus was among the firsts utilities to introduce the concept of using simple YAML as a template format to define reusable bioinformatics pipelines. 42 | 43 | Pipengine has been used across several research groups and bioinformatics core facilities since its first appearance. It directly supports the PBS/Torque scheduler [@Torque2017] for submission of jobs, but given that the support for a scheduler is based on specific options written automatically inside the bash scripts generated by the tool, it can be easily adapted to work with other schedulers, if needed. 44 | 45 | Pipengine is written in Ruby and is available for download as a BioRuby Gem [@Goto2010; @Bonnal2012]. 46 | 47 | # References 48 | 49 | 50 | -------------------------------------------------------------------------------- /lib/bio-pipengine.rb: -------------------------------------------------------------------------------- 1 | require 'yaml' 2 | 3 | require 'securerandom' 4 | require 'trollop' 5 | require 'colorize' 6 | #require 'torque_rm' 7 | #require 'terminal-table' 8 | require 'fileutils' 9 | require 'logger' 10 | require 'erb' 11 | 12 | require 'bio/pipengine/sample' 13 | require 'bio/pipengine/step' 14 | require 'bio/pipengine/job' 15 | require 'bio/pipengine' 16 | -------------------------------------------------------------------------------- /lib/bio/pipengine.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | module Pipengine 3 | 4 | def self.include(name, filename) 5 | File.readlines(filename).map {|line| " "+line}.join("\n") 6 | end 7 | 8 | @@logger_error = Logger.new(STDERR) 9 | def self.run(options) 10 | 11 | # reading the yaml files 12 | pipeline = YAML.load ERB.new(File.read(options[:pipeline])).result(binding) 13 | samples_file = load_samples_file options[:samples_file] 14 | 15 | # make sure all sample names are always Strings 16 | converted_samples_list = {} 17 | samples_file["samples"].each_key do |sample| 18 | if samples_file["samples"][sample].kind_of? 
Hash # it's a group of samples 19 | converted_samples_list[sample.to_s] = Hash[samples_file["samples"][sample].map{ |k, v| [k.to_s, v] }] 20 | else 21 | converted_samples_list[sample.to_s] = samples_file["samples"][sample] 22 | end 23 | end 24 | samples_file["samples"] = converted_samples_list # replacing original samples hash with the converted one 25 | 26 | # pre-running checks 27 | check_steps(options[:steps],pipeline) 28 | check_samples(options[:samples],samples_file) if options[:samples] 29 | 30 | # list of samples the jobs will work on 31 | samples_list = nil 32 | # check if a group is specified 33 | if options[:group] 34 | samples_list = options[:samples] ? samples_file["samples"][options[:group]].select {|k,v| options[:samples].include? k} : samples_file["samples"][options[:group]] 35 | options[:multi] = samples_list.keys 36 | samples_file["resources"]["output"] << "/#{options[:group]}" 37 | else # if not, proceed normalizing the sample list to remove groups and get a list of all samples 38 | full_list_samples = {} 39 | samples_file["samples"].each_key do |k| 40 | if samples_file["samples"][k].kind_of? Hash 41 | full_list_samples.merge! samples_file["samples"][k] 42 | else 43 | full_list_samples[k] = samples_file["samples"][k] 44 | end 45 | end 46 | samples_list = options[:samples] ? full_list_samples.select {|k,v| options[:samples].include? k} : full_list_samples 47 | end 48 | 49 | ########### START ########### 50 | 51 | # create output directory (jobs scripts will be saved there) 52 | FileUtils.mkdir_p samples_file["resources"]["output"] #unless options[:dry] #&& options[:spooler]!="pbs" 53 | 54 | # check if the requested steps are multi-samples 55 | run_multi = check_and_run_multi(samples_file,pipeline,samples_list,options) 56 | 57 | unless run_multi # there are no multi-samples steps, so iterate on samples and create one job per sample 58 | samples_list.each_key do |sample_name| 59 | sample = Bio::Pipengine::Sample.new(sample_name.to_s,samples_list[sample_name],options[:group]) 60 | create_job(samples_file,pipeline,samples_list,options,sample) 61 | end 62 | end 63 | end 64 | 65 | def self.parse_tag_option(option_tag) 66 | if !option_tag 67 | return {} 68 | else 69 | tags = {} 70 | option_tag.each do |tag| 71 | values = tag.split("=") 72 | if values.empty? 73 | @@logger_error.error "\nAbort! Unrecognized values for tag option, please provide the tags as follows: tag1=value1 tag2=value2".red 74 | exit 75 | else 76 | tags.merge! Hash[*values.flatten] 77 | end 78 | end 79 | return tags 80 | end 81 | end 82 | 83 | # handle steps that run on multiple samples (i.e. sample groups job) 84 | def self.check_and_run_multi(samples_file,pipeline,samples_list,options) 85 | step_multi = options[:steps].map {|s| Bio::Pipengine::Step.new(s,pipeline["steps"][s]).is_multi?} 86 | 87 | if step_multi.include? false 88 | if step_multi.uniq.size > 1 89 | @@logger_error.error "\nAbort! You are trying to run both multi-samples and single sample steps in the same job".red 90 | exit 91 | else 92 | return false 93 | end 94 | else 95 | samples_obj = {} 96 | samples_list.each_key {|sample_name| samples_obj[sample_name] = Bio::Pipengine::Sample.new(sample_name,samples_list[sample_name],options[:group])} 97 | create_job(samples_file,pipeline,samples_list,options,samples_obj) 98 | return true 99 | end 100 | end 101 | 102 | def self.create_job(samples_file,pipeline,samples_list,options,sample) 103 | # getting the sample name (only if this is not a multi samples job) 104 | sample_name = (sample.kind_of? Hash) ? 
nil : sample.name+"-" 105 | # setting the job name 106 | job_name = nil 107 | if options[:name] 108 | job_name = options[:name] 109 | elsif options[:steps].size > 1 110 | job_name = "#{sample_name}#{options[:steps].join("-")}" 111 | else 112 | job_name = "#{sample_name}#{options[:steps].first}" 113 | end 114 | # creating the Job object 115 | job = Bio::Pipengine::Job.new(job_name) 116 | job.local = options[:tmp] 117 | job.custom_output = options[:output_dir] 118 | job.custom_name = (options[:name]) ? options[:name] : nil 119 | # Adding pipeline and samples resources 120 | job.add_resources pipeline["resources"] 121 | job.add_resources samples_file["resources"] 122 | # Adding resource tag from the command line which can overwrite resources defined in the pipeline and samples files 123 | job.add_resources parse_tag_option(options[:tag]) 124 | #setting the logging system 125 | job.log = options[:log] 126 | job.log_adapter = options[:log_adapter] 127 | # setting sample groups either by cli option (if present) or by taking all available samples 128 | job.multi_samples = (options[:multi]) ? options[:multi] : samples_list.keys 129 | job.samples_obj = sample if sample.kind_of? Hash 130 | # cycling through steps and add command lines to the job 131 | options[:steps].each do |step_name| 132 | # TODO WARNING this can add multiple times the same step if the are multi dependencies 133 | self.add_job(job, pipeline, step_name, sample) 134 | end 135 | 136 | if options[:dry] 137 | job.to_script(options) 138 | else 139 | job.to_script(options) 140 | job.submit 141 | end 142 | end 143 | 144 | # check if sample exists 145 | def self.check_samples(passed_samples,samples) 146 | passed_samples.each do |sample| 147 | samples_names = [] 148 | samples["samples"].each_key do |k| 149 | if samples["samples"][k].kind_of? Hash 150 | samples["samples"][k].each_key {|s| samples_names << s} 151 | else 152 | samples_names << k 153 | end 154 | end 155 | unless samples_names.include? sample 156 | @@logger_error.error "Sample \"#{sample}\" does not exist in sample file!".red 157 | exit 158 | end 159 | end 160 | end 161 | 162 | # check if step exists 163 | def self.check_steps(passed_steps,pipeline) 164 | passed_steps.each do |step| 165 | unless pipeline["steps"].keys.include? step 166 | @@logger_error.error "Step \"#{step}\" does not exist in pipeline file!".red 167 | exit 168 | end 169 | end 170 | end 171 | 172 | # load the pipeline file and show a list of available steps 173 | def self.inspect_steps(pipeline_file) 174 | pipeline = YAML.load_file pipeline_file 175 | print "\nPipeline: ".blue 176 | print "#{pipeline["pipeline"]}\n\n".green 177 | puts "List of available steps:".light_blue 178 | pipeline["steps"].each_key do |s| 179 | print "\s\s#{s}:\s\s".blue 180 | print "#{pipeline["steps"][s]["desc"]}\n".green 181 | end 182 | puts "\n" 183 | end 184 | 185 | # create the samples.yml file 186 | def self.create_samples(dir) 187 | File.open("samples.yml","w") do |file| 188 | file.write "resources:\n\soutput: #{`pwd -L`}\n\nsamples:\n" 189 | samples = Hash.new {|hash,key| hash[key] = []} 190 | dir.each do |path| 191 | projects = Dir.glob(path+"/*").sort.select {|folders| folders.split("/")[-1] =~/Project_/} 192 | unless projects.empty? 193 | projects.each do |project_folder| 194 | Dir.glob(project_folder+"/*").sort.each {|s| samples[s.split("/")[-1]] << s} 195 | end 196 | else 197 | Dir.glob(path+"/*").sort.each {|s| samples[s.split("/")[-1]] << s if Dir.exists? 
s} 198 | end 199 | end 200 | samples.each_key do |sample| 201 | file.write "\s"+sample+":\s"+samples[sample].join(",")+"\n" 202 | end 203 | end 204 | end 205 | 206 | def self.add_job(job, pipeline, step_name, sample) 207 | step = Bio::Pipengine::Step.new(step_name,pipeline["steps"][step_name]) # parsing step instructions 208 | self.add_job(job, pipeline, step.pre, sample) if step.has_prerequisite? 209 | job.add_step(step,sample) # adding step command lines to the job 210 | end #add_job 211 | 212 | def self.load_samples_file(file) 213 | samples_file = YAML.load_file file 214 | samples_file["samples"].each do |k,v| 215 | if v.kind_of? Hash 216 | samples_file["samples"][k] = Hash[samples_file["samples"][k].map{ |key, value| [key.to_s, value.to_s] }] 217 | else 218 | samples_file["samples"][k] = v.to_s 219 | end 220 | end 221 | # make sure everything in Samples and Resources is converted to string 222 | #samples_file["samples"] = Hash[samples_file["samples"].map{ |key, value| [key.to_s, value.to_s] }] 223 | samples_file["resources"] = Hash[samples_file["resources"].map {|k,v| [k.to_s, v.to_s]}] 224 | samples_file 225 | end 226 | 227 | 228 | end 229 | end 230 | -------------------------------------------------------------------------------- /lib/bio/pipengine/job.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | 3 | module Pipengine 4 | 5 | class Job 6 | 7 | @@logger = Logger.new(STDOUT) 8 | @@logger_error = Logger.new(STDERR) 9 | # a Job object holds information on a job to be submitted 10 | # samples_groups and samples_obj are used to store information in case of steps that require to combine info 11 | # from multiple samples 12 | attr_accessor :name, :cpus, :nodes, :mem, :resources, :command_line, :local, 13 | :multi_samples, :samples_obj, :custom_output, :custom_name, 14 | :log, :log_adapter 15 | def initialize(name) 16 | @name = generate_uuid + "-" + name 17 | @shortname = name 18 | @command_line = [] 19 | @resources = {} 20 | @cpus = 1 21 | @nodes = "1" 22 | @log = "stdin" 23 | @log_adapter = nil 24 | end 25 | 26 | def add_resources(resources) 27 | self.resources.merge! resources 28 | end 29 | 30 | def output 31 | self.resources["output"] 32 | end 33 | 34 | # add all the command lines for a given step 35 | def add_step(step,sample) 36 | 37 | # setting job working directory 38 | working_dir = "" 39 | if self.local 40 | working_dir = self.local+"/"+self.name 41 | else 42 | working_dir = self.output 43 | 44 | if step.is_multi? 45 | folder = (self.custom_output) ? self.custom_output : @shortname 46 | working_dir += "/#{folder}" 47 | else 48 | folder = 49 | if self.custom_output 50 | self.custom_output 51 | elsif self.custom_name 52 | self.custom_name 53 | else 54 | step.name 55 | end 56 | working_dir += "/#{sample.name}/#{folder}" 57 | end 58 | 59 | end 60 | 61 | # set job cpus number to the higher step cpus (this in case of multiple steps) 62 | self.cpus = step.cpus if step.cpus > self.cpus 63 | 64 | # set number of nodes for job 65 | self.nodes = (step.nodes) ? step.nodes : @nodes 66 | 67 | # set the memory used 68 | self.mem = step.mem 69 | 70 | # adding job working directory 71 | unless step.name.start_with? "_" 72 | self.command_line << "if [ ! 
-f #{working_dir}/checkpoint ]" 73 | self.command_line << "then" 74 | self.command_line << logger(step, "start") 75 | self.command_line << "\nmkdir -p #{working_dir}" 76 | self.command_line << "cd #{working_dir}" 77 | end 78 | 79 | # generate command lines for this step 80 | if step.run.kind_of? Array 81 | step.run.each_with_index do |cmd, i| 82 | command = generate_cmd_line(cmd,sample,step) 83 | # TODO verify that logger works in this case 84 | # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name}:#{i}\" ; exit 1; }" 85 | self.command_line << "#{command} || { #{logger(step, "FAILED #{i}" )}; exit 1; }" 86 | end 87 | else 88 | command = generate_cmd_line(step.run,sample,step) 89 | # TODO verify that logger works in this case 90 | # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name} \" ; exit 1; }" 91 | self.command_line << "#{command} || { #{logger(step, "FAILED" )}; exit 1; }" 92 | end 93 | self.command_line << logger(step, "finished") 94 | self.command_line << "touch #{working_dir}/checkpoint" 95 | self.command_line << "else" 96 | self.command_line << logger(step, "already executed, skipping this step") 97 | self.command_line << "fi" 98 | 99 | # check if a temporary (i.e. different from 'output') directory is set 100 | if self.local 101 | final_output = "" 102 | 103 | if step.is_multi? 104 | folder = (self.custom_output) ? self.custom_output : @shortname 105 | final_output = self.output+"/#{folder}" 106 | else 107 | folder = (self.custom_output) ? self.custom_output : step.name 108 | final_output = self.output+"/#{sample.name}/#{folder}" 109 | end 110 | 111 | self.command_line << "mkdir -p #{final_output}" 112 | self.command_line << "cp -r #{working_dir}/* #{final_output}" 113 | self.command_line << "rm -fr #{working_dir}" 114 | end 115 | 116 | end 117 | 118 | def to_script(options) 119 | File.open(self.output+"/"+self.name+'.pbs','w') do |file| 120 | file.puts "#!/usr/bin/env bash" 121 | file.puts "#PBS -N #{self.name}" 122 | file.puts "#PBS -d #{self.output}" 123 | file.puts "#PBS -q #{options[:pbs_queue]}" if options[:pbs_queue] 124 | if options[:pbs_opts] 125 | file.puts "#PBS -l #{options[:pbs_opts].join(",")}" 126 | else 127 | l_string = [] 128 | l_string << "nodes=#{self.nodes}:ppn=#{self.cpus}" 129 | l_string << "mem=#{self.mem}" if self.mem 130 | file.puts "#PBS -l #{l_string.join(",")}" 131 | end 132 | file.puts self.command_line.join("\n") 133 | end 134 | end 135 | 136 | def submit 137 | job_id = `qsub #{self.output}/#{self.name}.pbs` 138 | @@logger.info "#{job_id}".green 139 | end 140 | 141 | private 142 | 143 | # create a unique ID for each job 144 | def generate_uuid 145 | SecureRandom.hex(5) 146 | end 147 | 148 | # this method call other methods to perform the right substitutions into the command lines 149 | def generate_cmd_line(cmd,sample,step) 150 | if step.is_multi? # if is a multi samples step call a different method 151 | set_multi_cmd(step,self.multi_samples) 152 | cmd = sub_multi(cmd,step) 153 | else 154 | cmd = sub_placeholders(cmd,sample,step) # normal step, perform usual substitutions 155 | end 156 | 157 | # Check that all placeholders have been substituted, if not terminate with an error 158 | cmd.scan(/<\S+>/).each do |unsubstituted_tag| 159 | @@logger_error.error("Found an unsubstituted tag #{unsubstituted_tag} . 
Terminating the execution".red) 160 | exit 161 | end 162 | return cmd 163 | end 164 | 165 | # perform substitutions on all the placeholders 166 | def sub_placeholders(cmd,sample,step=nil) 167 | tmp_cmd = cmd.gsub(/<sample>/,sample.name) 168 | if tmp_cmd =~/<sample_path>/ 169 | sample_path_glob = (tmp_cmd.scan(/<sample_path>(\S+)/).map {|e| e.first}) 170 | if sample_path_glob.empty? 171 | tmp_cmd.gsub!(/<sample_path>/,sample.path.join("\s")) 172 | else 173 | sample_path_glob.each do |append| 174 | tmp_cmd.gsub!(/<sample_path>#{Regexp.quote(append)}/,(sample.path.map {|s| s+append}).join("\s")) 175 | end 176 | end 177 | end 178 | # for resources and cpus 179 | tmp_cmd = sub_resources_and_cpu(tmp_cmd,step) 180 | 181 | # for placeholders like <mapping/sample> 182 | tmp_cmd.scan(/<(\S+)\/sample>/).map {|e| e.first}.each do |input_folder| 183 | @@logger.info "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder 184 | tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/sample>/,self.output+"/"+sample.name+"/"+input_folder+"/"+sample.name) 185 | end 186 | 187 | # for placeholders like <mapping/> 188 | tmp_cmd.scan(/<(\S+)\/>/).map {|e| e.first}.each do |input_folder| 189 | @@logger.info "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder 190 | tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/>/,self.output+"/"+sample.name+"/"+input_folder+"/") 191 | end 192 | return tmp_cmd 193 | end 194 | 195 | def sub_resources_and_cpu(cmd,step) 196 | # for all resource tags like <bwa> etc. 197 | self.resources.each_key do |r| 198 | cmd.gsub!(/<#{r}>/,self.resources[r]) 199 | end 200 | # set number of cpus for this command line 201 | cmd.gsub!(/<cpu>/,step.cpus.to_s) unless step.nil? 202 | return cmd 203 | end 204 | 205 | 206 | # creates the actual multi-samples command lines to be substituted where <multi> placeholders are found 207 | def set_multi_cmd(step,multi_samples) 208 | if step.multi_def.kind_of? Array # in case of many multi-samples command lines 209 | step.multi_cmd = [] 210 | step.multi_def.each do |m_def| 211 | step.multi_cmd << generate_multi_cmd(m_def,multi_samples) 212 | end 213 | else 214 | step.multi_cmd = generate_multi_cmd(step.multi_def,multi_samples) 215 | end 216 | end 217 | 218 | # take the multi_cmd and perform the substitutions into the step command lines 219 | def sub_multi(cmd,step) 220 | cmd = sub_resources_and_cpu(cmd,step) 221 | if step.multi_cmd.kind_of? Array 222 | step.multi_cmd.each_with_index do |m,index| 223 | cmd.gsub!(/<multi#{index+1}>/,m) 224 | end 225 | else 226 | cmd.gsub!(/<multi>/,step.multi_cmd) 227 | end 228 | return cmd 229 | end 230 | 231 | # this method handles the different multi-samples definitions (like comma separated list, space separated etc.) 232 | def generate_multi_cmd(multi_def,multi_samples) 233 | multi_cmd = [] 234 | multi_samples.each do |sample_name| 235 | if sample_name.include? "," 236 | multi_cmd << split_and_sub(",",multi_def,sample_name) 237 | elsif sample_name.include? ";" 238 | multi_cmd << split_and_sub(";",multi_def,sample_name) 239 | else 240 | multi_cmd << sub_placeholders(multi_def,self.samples_obj[sample_name]) 241 | end 242 | end 243 | return multi_cmd.join("\s") 244 | end 245 | 246 | # take a comma or semicolon separated list of samples and perform the substitution with the group definitions 247 | def split_and_sub(sep,multi_def,multi) 248 | cmd_line = [] 249 | multi.split(sep).each do |sample_name| 250 | cmd_line << sub_placeholders(multi_def,self.samples_obj[sample_name]) 251 | end 252 | cmd_line.join(sep) 253 | end 254 | 255 | # log a step according to the selected adapter 256 | def logger(step, message) 257 | case self.log 258 | when "stdin" 259 | "echo \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd` `date`.\"" 260 | when "syslog" 261 | "logger -t PIPENGINE \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd`\"" 262 | when "fluentd" 263 | "curl -X POST -d 'json={\"source\":\"PIPENGINE\", \"step\":\"#{step.name}\", \"message\":\"#{message}\", \"job_id\":\"#{name}\", \"user\":\"\'\"`whoami`\"\'\", \"host\":\"\'\"`hostname`\"\'\", \"pwd\":\"\'\"`pwd`\"\'\"}' #{self.log_adapter}" 264 | end 265 | end #logger 266 | 267 | end 268 | end 269 | end 270 | 271 | -------------------------------------------------------------------------------- /lib/bio/pipengine/sample.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | module Pipengine 3 | class Sample 4 | # Sample holds all the information on a sample and its original input path (or multiple paths) 5 | attr_accessor :path 6 | def initialize(name,path_string,group) 7 | @path = path_string.split(",") 8 | @name = name 9 | @group = group 10 | end 11 | 12 | def name=(name) 13 | @name = name 14 | end 15 | 16 | def group=(group) 17 | @group = group 18 | end 19 | 20 | def group 21 | @group 22 | end 23 | 24 | def x_name 25 | "#{@group}/#{@name}" 26 | end 27 | 28 | def name 29 | @name 30 | end 31 | end 32 | end 33 | end 34 | 35 | -------------------------------------------------------------------------------- /lib/bio/pipengine/step.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | module Pipengine 3 | 4 | # Step holds information for a pipeline step 5 | # multi_def is used to store the multi-samples definition (i.e. a generic cmd line with placeholders) 6 | # multi_cmd is used to store the actual command lines for all the samples to be combined in a multi-samples step 7 | # these are generated by combining the multi_def information with the sample groups information and will be placed 8 | # where the <multi> placeholder is found in the step command lines. 9 | class Step 10 | attr_accessor :name, :run, :cpus, :mem, :nodes, :multi_def, :multi_cmd, :pre 11 | def initialize(name,step_instructions) 12 | @name = name 13 | parse_yaml(step_instructions) 14 | end 15 | 16 | def is_multi? 17 | return (self.multi_def.nil?) ? false : true 18 | end 19 | 20 | def has_prerequisite? 21 | return (self.pre.nil?) ? 
false : true 22 | end 23 | 24 | private 25 | 26 | def parse_yaml(step_instructions) 27 | self.cpus = step_instructions["cpu"].to_i 28 | self.nodes = step_instructions["nodes"] 29 | self.mem = step_instructions["mem"] 30 | self.run = step_instructions["run"] 31 | self.multi_def = step_instructions["multi"] 32 | self.pre = step_instructions["pre"] 33 | end 34 | 35 | end 36 | 37 | end 38 | end 39 | 40 | -------------------------------------------------------------------------------- /test/data/mapping.yml: -------------------------------------------------------------------------------- 1 | mapping: 2 | cpu: 8 3 | desc: Run BWA MEM and generates a sorted BAM file 4 | run: 5 | - mem -t -R '@RG\tID:\tLB:\tPL:ILLUMINA\tPU:\tCN:PTP\tSM:' .trim.fastq | view -bS - > .bam 6 | - sort -@ .bam .sort 7 | - rm -f .bam 8 | -------------------------------------------------------------------------------- /test/data/pipeline-enh.yml: -------------------------------------------------------------------------------- 1 | 2 | pipeline: gbs 3 | 4 | resources: 5 | fastqc: /storage/software/FastQC/fastqc 6 | bwa: /storage/software/bwa-0.7.5a/bwa 7 | samtools: /storage/software/samtools 8 | bgzip: /storage/software/tabix-0.2.6/bgzip 9 | tabix: /storage/software/tabix-0.2.6/tabix 10 | bcftools: /storage/software/bcftools 11 | vcfutils: /storage/software/vcfutils.pl 12 | adapters: /storage/software/Trimmomatic-0.30/adapters/TruSeq3-PE.fa 13 | trimmomatic: /storage/software/Trimmomatic-0.30/trimmomatic-0.30.jar 14 | gatk_jar: /storage/software/GenomeAnalysisTK-2.7-2-g6bda569/GenomeAnalysisTK.jar 15 | 16 | steps: 17 | <%= include :mapping, "./mapping.yml" %> 18 | 19 | index: 20 | desc: Make BAM index 21 | run: index .sort.bam 22 | 23 | gatk: 24 | desc: Run GATK to perform SNP and InDel calling, then compress and index the VCF file 25 | multi: -I .final.bam 26 | run: 27 | - java -Xmx4g -jar -T UnifiedGenotyper -R -mbq 30 -o all.gatk.vcf -nt -glm BOTH 28 | - all.gatk.vcf 29 | - -p vcf all.gatk.vcf.gz 30 | cpu: 20 31 | nodes: 2 32 | 33 | root_step: 34 | desc: root step to test dependencies 35 | run: 36 | - echo "root" 37 | 38 | child_step: 39 | desc: child step to test dependencies 40 | pre: root_step 41 | run: 42 | - echo "I am the child" 43 | -------------------------------------------------------------------------------- /test/data/pipeline.yml: -------------------------------------------------------------------------------- 1 | 2 | pipeline: simple mapping and variation calling 3 | 4 | resources: 5 | trimmomatic: /storage/software/trimmomatic 6 | adapters: /storage/software/trimmomatic/adapters.fa 7 | bwa: /storage/software/bwa 8 | samtools: /storage/software/samtools 9 | samblaster: /storage/software/samblaster 10 | freebayes: /storage/software/freebayes 11 | 12 | steps: 13 | 14 | trim: 15 | desc: Run Trimmomatic to remove adapters and low quality bases from reads 16 | run: 17 | - java -jar PE -threads -phred33 <(ls /*_R1_*.gz | xargs zcat) <(ls /*_R2_*.gz | xargs zcat) _R1_paired.fastq.gz _R1_unpaired.fastq.gz _R2_paired.fastq.gz _R2_unpaired.fastq.gz ILLUMINACLIP::2:30:10 LEADING:5 TRAILING:5 SLIDINGWINDOW:4:20 MINLEN:36 18 | - rm -f _R1_unpaired.fastq _R2_unpaired.fastq 19 | cpu: 8 20 | 21 | mapping: 22 | desc: Run BWA MEM and generates a sorted BAM file 23 | pre: trim 24 | run: 25 | - mem -t -R '@RG\tID:\tLB:\tPL:ILLUMINA\tPU:\tCN:PTP\tSM:' _R1_paired.fastq.gz _R2_paired.fastq.gz | -M | view -bS - > .bam 26 | - sort -@ .bam > .sort.bam 27 | - index .sort.bam 28 | - rm -f .bam 29 | cpu: 8 30 | 31 | freebayes: 32 | 
desc: Run FreeBayes to call variants on all samples 33 | multi: -b .sort.bam 34 | run: 35 | - --min-mapping-quality 30 --min-coverage 5 --min-alternate-fraction 0.5 -f -v all.fb.vcf 36 | cpu: 4 37 | 38 | root_step: 39 | desc: root step to test dependencies 40 | run: 41 | - echo "root" 42 | 43 | child_step: 44 | desc: child step to test dependencies 45 | pre: root_step 46 | run: 47 | - echo "I am the child" 48 | -------------------------------------------------------------------------------- /test/data/samples.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | index: /db/genome/human 3 | flowcell: ILLU_1234 4 | output: ./test 5 | genome: /db/genome/human.fa 6 | 7 | samples: 8 | Group1: 9 | sampleA: ./sampleA 10 | sampleB: ./sampleB 11 | Group2: 12 | sampleC: ./sampleC 13 | sampleD: ./sampleD 14 | -------------------------------------------------------------------------------- /test/examples/pipeline.yml: -------------------------------------------------------------------------------- 1 | pipeline: resequencing 2 | 3 | resources: 4 | bwa: /software/bwa 5 | samtools: /software/samtools 6 | pigz: /software/pigz 7 | 8 | steps: 9 | mapping: 10 | run: 11 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 12 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 13 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Sb - > .bam 14 | - rm -f R1.fastq.gz R2.fastq.gz 15 | cpu: 12 16 | -------------------------------------------------------------------------------- /test/examples/pipeline_multi.yml: -------------------------------------------------------------------------------- 1 | pipeline: resequencing 2 | 3 | resources: 4 | bwa: /software/bwa 5 | samtools: /software/samtools 6 | mark_dup: /software/picard-tools-1.77/MarkDuplicates.jar 7 | gatk: /software/GenomeAnalysisTK/GenomeAnalysisTK.jar 8 | 9 | steps: 10 | mapping: 11 | run: 12 | - ls /*_R1_*.gz | xargs zcat | pigz -p 10 >> R1.fastq.gz 13 | - ls /*_R2_*.gz | xargs zcat | pigz -p 10 >> R2.fastq.gz 14 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=.sorted.bam SO=coordinate LB= PL=illumina PU=PU SM= TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 15 | - rm -f R1.fastq.gz R2.fastq.gz 16 | cpu: 12 17 | 18 | mark_dup: 19 | pre: mapping 20 | run: java -Xmx4g -jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=.sorted.bam OUTPUT=.md.sort.bam METRICS_FILE=.metrics REMOVE_DUPLICATES=false 21 | 22 | realign_target: 23 | pre: mark_dup 24 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 25 | cpu: 8 26 | -------------------------------------------------------------------------------- /test/examples/samples.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | index: /storage/genomes/bwa_index/genome 3 | genome: /storage/genomes/genome.fa 4 | output: ./working 5 | 6 | samples: 7 | sampleA: /ngs_reads/sampleA 8 | sampleB: /ngs_reads/sampleB 9 | sampleC: /ngs_reads/sampleC 10 | sampleD: /ngs_reads/sampleD 11 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'bundler' 3 | begin 4 | 
Bundler.setup(:default, :development) 5 | rescue Bundler::BundlerError => e 6 | $stderr.puts e.message 7 | $stderr.puts "Run `bundle install` to install missing gems" 8 | exit e.status_code 9 | end 10 | require 'test/unit' 11 | require 'shoulda' 12 | 13 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 14 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 15 | require 'bio-pipengine' 16 | 17 | class Test::Unit::TestCase 18 | end 19 | -------------------------------------------------------------------------------- /test/test_bioruby-pipengine.rb: -------------------------------------------------------------------------------- 1 | require 'helper' 2 | 3 | class TestBiorubyPipengine < Test::Unit::TestCase 4 | should "probably rename this file and start testing for real" do 5 | flunk "hey buddy, you should probably rename this file and start testing for real" 6 | end 7 | end 8 | --------------------------------------------------------------------------------