├── .document ├── .gitignore ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── VERSION ├── bin └── pipengine ├── bio-pipengine.gemspec ├── joss ├── paper.bib └── paper.md ├── lib ├── bio-pipengine.rb └── bio │ ├── pipengine.rb │ └── pipengine │ ├── job.rb │ ├── sample.rb │ └── step.rb └── test ├── data ├── mapping.yml ├── pipeline-enh.yml ├── pipeline.yml └── samples.yml ├── examples ├── pipeline.yml ├── pipeline_multi.yml └── samples.yml ├── helper.rb └── test_bioruby-pipengine.rb /.document: -------------------------------------------------------------------------------- 1 | lib/**/*.rb 2 | bin/* 3 | - 4 | features/**/*.feature 5 | LICENSE.txt 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # rcov generated 2 | coverage 3 | coverage.data 4 | 5 | # rdoc generated 6 | rdoc 7 | 8 | # yard generated 9 | doc 10 | .yardoc 11 | 12 | # bundler 13 | .bundle 14 | 15 | # jeweler generated 16 | pkg 17 | 18 | # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore: 19 | # 20 | # * Create a file at ~/.gitignore 21 | # * Include files you want ignored 22 | # * Run: git config --global core.excludesfile ~/.gitignore 23 | # 24 | # After doing this, these files will be ignored in all your git projects, 25 | # saving you from having to 'pollute' every project you touch with them 26 | # 27 | # Not sure what to needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line) 28 | # 29 | # For MacOS: 30 | # 31 | .DS_Store 32 | 33 | # For TextMate 34 | #*.tmproj 35 | #tmtags 36 | 37 | # For emacs: 38 | #*~ 39 | #\#* 40 | #.\#* 41 | 42 | # For vim: 43 | *.swp 44 | 45 | # For redcar: 46 | #.redcar 47 | 48 | # For rubinius: 49 | #*.rbc 50 | 51 | *.lock 52 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | ruby ">= 2.1.2" 3 | 4 | # Add dependencies required to use your gem here. 5 | # Example: 6 | # gem "activesupport", ">= 2.3.5" 7 | gem "trollop", ">= 2.1.2" 8 | gem "colorize", ">= 0.8.1" 9 | 10 | group :development do 11 | gem 'jeweler' 12 | end 13 | 14 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Francesco Strozzi 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PipEngine 2 | ========= 3 | 4 | A simple launcher for complex biological pipelines. 5 | 6 | PipEngine generates runnable shell scripts, already configured for the PBS/Torque job scheduler, for each sample in the pipeline. It allows you to run a complete pipeline or just a single step, depending on your needs. 7 | 8 | PipEngine is best suited for NGS pipelines, but it can be used for any kind of pipeline that can be run on a job scheduling system and which is "sample" centric, i.e. you have on one side a list of samples with their corresponding raw data, and on the other side a pipeline that you would like to apply to them. 9 | 10 | PipEngine was developed to combine the typical flexibility and portability of shell scripts with the concept of pipeline templates that can be easily applied to different input data to reproduce scientific results. The overall improvements over Makefiles or customised ad-hoc shell scripts are: better readability of the pipelines thanks to the YAML format, especially for people with no coding experience; automated script generation, which adds extra functionality such as error controls and logging directly into the job scripts; and an enforced separation between the description of the input data and the pipeline template, which improves the clarity and reusability of analysis protocols. 11 | 12 | 13 | Installation 14 | ============ 15 | 16 | If you already have Ruby, just install PipEngine using RubyGems: 17 | 18 | ```shell 19 | gem install bio-pipengine 20 | ``` 21 | 22 | If you don't have Ruby installed we recommend using the Anaconda Package Manager.
23 | 24 | Download the installer from [here](http://conda.pydata.org/miniconda.html) and once installed you can simply type: 25 | 26 | ```shell 27 | conda install -c bioconda ruby 28 | ``` 29 | 30 | and then install PipEngine using RubyGems: 31 | 32 | ```shell 33 | gem install bio-pipengine 34 | ``` 35 | 36 | Pipengine has been tested and should work with Ruby >= 2.1.2 37 | 38 | :: Topics :: 39 | ============ 40 | 41 | [Usage](https://github.com/bioinformatics-ptp/bioruby-pipengine#-usage-) 42 | 43 | [The Pipeline YAML](https://github.com/bioinformatics-ptp/bioruby-pipengine#-the-pipeline-yaml-) 44 | 45 | [The Samples YAML](https://github.com/bioinformatics-ptp/bioruby-pipengine#-the-samples-yaml-) 46 | 47 | [Input and output conventions](https://github.com/bioinformatics-ptp/bioruby-pipengine#-input-and-output-conventions-) 48 | 49 | [Sample groups and complex steps](https://github.com/bioinformatics-ptp/bioruby-pipengine#-sample-groups-and-complex-steps-) 50 | 51 | [What happens at run-time](https://github.com/bioinformatics-ptp/bioruby-pipengine#-what-happens-at-run-time-) 52 | 53 | [Examples](https://github.com/bioinformatics-ptp/bioruby-pipengine#-examples-) 54 | 55 | [PBS Options](https://github.com/bioinformatics-ptp/bioruby-pipengine#-pbs-options-) 56 | 57 | [Extending and contributing](https://github.com/bioinformatics-ptp/bioruby-pipengine#-extending-and-contributing-) 58 | 59 | :: Usage :: 60 | =========== 61 | 62 | 63 | ```shell 64 | > pipengine -h 65 | List of available commands: 66 | run Submit pipelines to the job scheduler 67 | ``` 68 | 69 | Command line for RUN mode 70 | ------------------------- 71 | 72 | **Command line** 73 | ```shell 74 | > pipengine run -p pipeline.yml -f samples.yml -s mapping --tmp /tmp 75 | ``` 76 | 77 | **Parameters** 78 | ```shell 79 | -p, --pipeline= YAML file with pipeline and sample details (default: pipeline.yml) 80 | -f, --samples-file= YAML file with samples name and directory paths (default: samples.yml) 81 | -l, --samples= List of sample names to run the pipeline 82 | -s, --steps= List of steps to be executed 83 | -d, --dry Dry run. Just create the job script without submitting it to the batch system 84 | -t, --tmp= Temporary output folder 85 | -c, --create-samples= Create samples.yml file from a Sample directory (only for CASAVA projects) 86 | -m, --multi= List of samples to be processed by a given step (the order matters) 87 | -g, --group= Specify the group of samples to run the pipeline steps on (do not specify --multi) 88 | -a, --allgroups Apply the step(s) to all the groups defined into the samples file 89 | -n, --name= Analysis name 90 | -o, --output-dir= Output directory (override standard output directory names) 91 | -b, --pbs-opts= PBS options 92 | -q, --pbs-queue= PBS queue 93 | -i, --inspect-pipeline= Show steps 94 | --log= Log script activities, by default stdin. Options are fluentd (default: stdin) 95 | -e, --log-adapter= (stdin|syslog|fluentd) In case of fluentd use http://destination.hostname:port/yourtag 96 | --tag= Overwrite tags present in samples.yml and pipeline.yml files (e.g. 
tag1=value1 tag2=value2) 97 | -h, --help Show this message 98 | ``` 99 | 100 | PipEngine accepts two input files: 101 | * A YAML file describing the pipeline steps 102 | * A YAML file describing samples names, samples location and other samples-specific information 103 | 104 | 105 | :: The Pipeline YAML :: 106 | ======================= 107 | 108 | The basic structure of a pipeline YAML is divided into three parts: 1) pipeline name, 2) resources, 3) steps. 109 | 110 | An example YAML file is like the following: 111 | 112 | ```yaml 113 | 114 | pipeline: resequencing 115 | 116 | resources: 117 | fastqc: /software/FastQC/fastqc 118 | bwa: /software/bwa 119 | gatk: /software/gatk-lite/GenomeAnalysisTk.jar 120 | samtools: /software/samtools 121 | samsort: /software/picard-tools-1.77/SortSam.jar 122 | mark_dup: /software/picard-tools-1.77/MarkDuplicates.jar 123 | bam: /software/bam 124 | pigz: /software/pigz 125 | 126 | steps: 127 | mapping: 128 | desc: Run BWA on each sample to perform alignment 129 | run: 130 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 131 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 132 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=.sorted.bam SO=coordinate LB= PL=illumina PU=PU SM= TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 133 | - rm -f R1.fastq.gz R2.fastq.gz 134 | cpu: 11 135 | 136 | mark_dup: 137 | run: java -Xmx4g -jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=.sorted.bam OUTPUT=.md.sort.bam METRICS_FILE=.metrics REMOVE_DUPLICATES=false 138 | 139 | realign_target: 140 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 141 | cpu: 8 142 | 143 | realign: 144 | run: java -Xmx4g -jar -T IndelRealigner -LOD 0.4 -model USE_READS --disable_bam_indexing --target_intervals .indels.intervals -R -I .md.sort.bam -o .realigned.bam 145 | 146 | fixtags: 147 | run: calmd -r -E -u .realigned.bam | squeeze --in -.ubam --out .final.bam --rmTags 'XM:i;XG:i;XO:i' --keepDups 148 | 149 | bam_index: 150 | run: index .final.bam 151 | 152 | clean: 153 | run: ls | grep -v final | xargs rm -fr 154 | 155 | ``` 156 | 157 | Resources definition 158 | -------------------- 159 | 160 | PipEngine is entirely based on the placeholder and substitution logic. For example in the Pipeline YAML, each tool is declared under the resources and at run time PipEngine will search for the corresponding placeholder in the command lines. 161 | 162 | So, for instance, if I have declared a software **bwa** under resources, PipEngine will search for a `````` placeholder in all the command lines and will substitute it with the software complete path declared in resources. 163 | 164 | This makes command lines definition shorter and easier to read and avoid problems when moving from one software version to another (i.e. you just need to change the bwa definition once, and not 10 times in 5 different command lines) 165 | 166 | The same thing happens for samples names, input and output directories and intermediate output files. This allows to create true pipelines templates that can be reused and applied to different samples sets. 
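As a minimal sketch of this substitution logic (the `<pigz>` placeholder matches the `pigz` resource key declared above; the `<sample_path>` name is an assumption for the sample-path placeholder described later, and the paths come from the Examples section below), a command line written with placeholders such as:

```yaml
mapping:
  run: ls <sample_path>/*_R1_*.gz | xargs zcat | <pigz> -p 10 >> R1.fastq.gz
```

would be expanded at run-time, for a sample named sampleA stored under /ngs_reads/sampleA, into:

```shell
ls /ngs_reads/sampleA/*_R1_*.gz | xargs zcat | /software/pigz -p 10 >> R1.fastq.gz
```

The same mechanism applies to every other placeholder, so the pipeline YAML never needs to contain hard-coded paths or sample names.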
167 | 168 | Step definition 169 | --------------- 170 | 171 | A step must be defined using standard keys: 172 | 173 | * the first key must be the step name 174 | * under the step name, a **run** key must be defined to hold the actual command line that will be executed 175 | * a **cpu** key must be defined if the command line uses more than 1 CPU at runtime 176 | * a **multi** key must be defined if the command line takes as input more than one sample (more details later) 177 | * a **desc** key can be used to add a short description that will be displayed using the **-i** option of PipEngine 178 | * **nodes** and **mem** keys can be used to specify the resources needed for this job 179 | 180 | A note on the **run** key: if a single step needs more than one command line to execute the required actions, these multiple command lines must be defined as an array in YAML (see the mapping step in the above example). 181 | 182 | 183 | :: The Samples YAML :: 184 | ===================== 185 | 186 | The samples YAML is much simpler than the pipeline YAML: 187 | 188 | ```yaml 189 | resources: 190 | index: /storage/genomes/bwa_index/genome 191 | genome: /storage/genomes/genome.fa 192 | output: /storage/results 193 | 194 | samples: 195 | sampleA: /ngs_reads/sampleA 196 | sampleB: /ngs_reads/sampleB 197 | sampleC: /ngs_reads/sampleC 198 | sampleD: /ngs_reads/sampleD 199 | ``` 200 | 201 | In this YAML there is again a **resources** key, but this time the tags defined here depend on the samples described in the YAML. 202 | 203 | For instance, if I am working with human RNA-seq samples, these data must be aligned to the human genome, so it makes sense that the **genome** tag is defined here and not in the pipeline YAML, which must remain as generic as possible. 204 | 205 | Generally, the tags defined under the samples **resources** depend on the pipeline and analysis one wants to run. So if BWA is used to perform read alignment, an **index** tag must be defined here to set the BWA index prefix, and it will be substituted into the pipeline command lines every time an `<index>` placeholder is found in the pipeline YAML. 206 | 207 | Sample groups 208 | ------------- 209 | 210 | If you want to organize your samples by groups, it is possible to do it directly in the samples.yml file: 211 | 212 | 213 | ```yaml 214 | resources: 215 | index: /storage/genomes/bwa_index/genome 216 | genome: /storage/genomes/genome.fa 217 | output: /storage/results 218 | 219 | samples: 220 | Group1: 221 | sampleA: /ngs_reads/sampleA 222 | sampleB: /ngs_reads/sampleB 223 | Group2: 224 | sampleC: /ngs_reads/sampleC 225 | sampleD: /ngs_reads/sampleD 226 | ``` 227 | 228 | Then, by using the **-g** option of PipEngine, it is possible to run steps and pipelines directly on groups of samples. 229 | 230 | 231 | How to create the Samples file 232 | ------------------------------ 233 | 234 | PipEngine was created to work primarily on NGS pipelines and with Illumina data in mind. So, the easiest thing to do if you have your samples already organized into a typical Illumina run folder is to run: 235 | 236 | ```shell 237 | > pipengine run -c /path/to/illumina/data 238 | ``` 239 | 240 | This will generate a samples.yml file with all the sample names and paths derived from the run folder. The "resources" part is left for you to fill, as sketched below.
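For reference, this is a sketch of what the generated samples.yml could look like (hypothetical sample names and run-folder paths; the output path defaults to the directory where pipengine was launched, and any other resource tags required by your pipeline, such as index or genome, still have to be added by hand):

```yaml
resources:
  output: /current/working/directory

samples:
  Sample_01: /path/to/illumina/data/Project_X/Sample_01
  Sample_02: /path/to/illumina/data/Project_X/Sample_02
```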
241 | 242 | As a plus, if you have your samples scattered thoughout many different run folders, you can specify all the paths that you want to PipEngine and it will combine all the paths in the same samples file. So if you have your samples spread across let's say 3 runs, you can call PipEngine in this way: 243 | 244 | ```shell 245 | > pipengine run -c /path/to/illumina/run1 /path/to/illumina/run2 /path/to/illumina/run3 246 | ``` 247 | 248 | If a sample is repeated in more than one run, all the paths will be combined in the samples.yml and PipEngine will take care of handling the multiple paths correctly. 249 | 250 | 251 | 252 | :: Input and output conventions :: 253 | ================================== 254 | 255 | The inputs in the steps defined in the pipeline YAML are expressed by the `````` placeholder that will be substituted with a sample name and the ``````, which will be changed with the location where initial data (i.e. raw sequencing reads) are stored for that particular sample. Both this information are provided in the sample YAML file. 256 | 257 | The `````` placeholder is a generic one to define the root location for the pipeline outputs. This parameter is also defined in the samples YAML. By default, PipEngine will write jobs scripts and will save stdout and stderr files from PBS in this folder. 258 | 259 | By convention, each sample output is saved under a folder with the sample name and each step is saved in a sub-folder with the step name. 260 | 261 | That is, given a generic /storage/pipeline_results `````` folder, the outputs of the **mapping** step will be organized in this way: 262 | 263 | ```shell 264 | /storage/pipeline_results/SampleA/mapping/SampleA.bam 265 | /SampleB/mapping/SampleB.bam 266 | /SampleC/mapping/SampleC.bam 267 | /SampleD/mapping/SampleD.bam 268 | ``` 269 | 270 | This simple convention keeps things clean and organized. The output file name can be decided during the pipeline creation, but it's a good habit to name it using the sample name. 271 | 272 | When new steps of the same pipeline are run, output folders are updated accordingly. So for example if after the **mapping** step a **mark_dup** step is run, the output folder will look like this: 273 | 274 | ```shell 275 | /storage/pipeline_results/SampleA/mapping 276 | /SampleA/mark_dup 277 | 278 | /storage/pipeline_results/SampleB/mapping 279 | /SampleB/mark_dup 280 | ..... 281 | ``` 282 | 283 | In case you are working with group of samples, specified by the **-g** option, the output folder will be changed to reflect the samples grouping. So for example if a **mapping** step is called on the **Group1** group of samples, all the outputs will be saved under the ```/Group1``` folder and results of mapping for SampleA, will be found under ```/Group1/SampleA/mapping``` . 284 | 285 | 286 | How steps are connected together 287 | -------------------------------- 288 | 289 | One step is connected to another by simply requiring that its input is the output of another preceding step. This is just achived by a combination of `````` and `````` placeholders in the pipeline command line definitions. 290 | 291 | For instance, if I have a resequencing pipeline that will first run BWA to map the reads and then a mark duplicate step, the mark_dup step will be dependent from the BWA output. 
292 | 293 | ```yaml 294 | mapping: 295 | run: 296 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 297 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 298 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz > .sorted.bam 299 | - rm -f R1.fastq.gz R2.fastq.gz 300 | cpu: 11 301 | 302 | mark_dup: 303 | run: java -Xmx4g -jar INPUT=.sorted.bam OUTPUT=.md.sort.bam 304 | ``` 305 | 306 | So in the **mark_dup** step the input placeholder (defined under the **run** key in the pipeline YAML) will be written as: 307 | 308 | ``` 309 | .sorted.bam 310 | ``` 311 | 312 | If the `````` tag is defined for instance as "/storage/results", this will be translated at run-time into: 313 | 314 | ``` 315 | /storage/results/SampleA/mapping/SampleA.sorted.bam 316 | ``` 317 | 318 | for SampleA outputs. Basically the `````` placeholder is a shortcut for ```//{step name, mapping in this case}/``` 319 | 320 | Following the same idea, using a `````` placeholder (note the / at the end) will be translated into ```//{step name, mapping in this case}/``` , to address the scenario when a user wants to point to the previous step output directory, but without having the `````` appended to the end of the path. 321 | 322 | More complex dependences can be defined by combinations of `````` and `````` placeholders, or using the `````` and `````` placeholders, without having to worry about the actual sample name and the complete input and output paths. 323 | 324 | Jobs dependencies 325 | ------------------------- 326 | Steps can also be defined with dependencies so the user can just call the final step and all the upper chain is called automatically. To achieve this task Pipengine requires that the user defines a 327 | ``` 328 | pre: 329 | ``` 330 | tag in the step definition: 331 | 332 | ``` 333 | root_step: 334 | desc: root step to test dependencies 335 | run: 336 | - echo "root" 337 | 338 | child_step: 339 | desc: child step to test dependencies 340 | pre: root_step 341 | run: 342 | - echo "I am the child" 343 | ``` 344 | 345 | 346 | :: Multi-Samples and complex steps :: 347 | ===================================== 348 | 349 | The pipeline steps can be defined to run on a single sample or to take as input more than one sample data, depending on the command line used. 350 | 351 | A typical example is running a differential expression step for example with CuffDiff. This requires to take all the output generated from the previous Cufflinks step (i.e. the gtf files) and process them to generate a unique transcripts reference (CuffCompare) and then perform the differential expression across the samples using the BAM files generated by, let's say, TopHat in a **mapping** step. 352 | 353 | This is an extract of the step definition in the pipeline YAML to describe these two steps: 354 | 355 | ```yaml 356 | diffexp: 357 | multi: 358 | - //cufflinks/transcripts.gtf 359 | - _tophat/accepted_hits.bam 360 | run: 361 | - echo '' | sed -e 's/,/ /g' | xargs ls >> gtf_list.txt 362 | - -s -r -i gtf_list.txt 363 | - -p 12 -N -u -b ./*combined.gtf 364 | cpu: 12 365 | ``` 366 | 367 | In this case we need to combine the outputs of all the samples from the cufflinks step and pass that information to cuffcompare and combine the outputs of the mapping steps and pass them to the cuffdiff command line. 368 | 369 | This is achived in two ways. First, the step definition must include a **multi** key, that simply defines what, for each sample, will be substituted where the `````` placeholder is found. 
370 | 371 | In the example above, the step includes two command lines, one for cuffcompare and the other for cuffdiff. Cuffcompare requires the transcripts.gtf of each sample, while Cuffdiff requires the BAM file of each sample, plus the output of Cuffcompare. 372 | 373 | So the two command lines need two different kinds of files as input from the same set of samples; therefore two **multi** keywords are defined, along with two corresponding placeholders in the command lines (the first **multi** entry fills the placeholder in the Cuffcompare input list, the second fills the one in the Cuffdiff line). 374 | 375 | Once the step has been defined in the pipeline YAML, PipEngine must be invoked using the **-m** parameter, to specify the samples that should be grouped together by this step: 376 | 377 | ```shell 378 | pipengine run -p pipeline.yml -m SampleA,SampleB SampleC,SampleD 379 | ``` 380 | 381 | Note that the use of commas is not arbitrary, since the **-m** parameter specifies not only which samples should be used for this step, but also how they should be organized on the corresponding command line. The **-m** parameter takes the sample names, combines them with the 'multi' definitions and then substitutes the result back into the command line, keeping the samples in the same order as provided with **-m**. 382 | 383 | The above command line will be translated, for the **cuffdiff** command line, into the following: 384 | 385 | ```shell 386 | /software/cuffdiff -p 12 -N -u -b /storage/genome.fa combined.gtf /storage/results/SampleA/cufflinks/transcripts.gtf,/storage/results/SampleB/cufflinks/transcripts.gtf /storage/results/SampleC/cufflinks/transcripts.gtf,/storage/results/SampleD/cufflinks/transcripts.gtf 387 | ``` 388 | 389 | and this corresponds to the way CuffDiff wants biological replicates for each condition to be described on the command line. 390 | 391 | **Note** 392 | 393 | Multi-samples step management is complex and it is a task that cannot be easily generalized, since every software has its own way of requiring and organizing the inputs on the command line. This approach is probably not the most elegant solution, but it works quite well even if there are some drawbacks. For instance, as stated above, the sample grouping is processed and passed to the command lines exactly as it is given with the **-m** parameter. 394 | 395 | So for Cuffdiff the presence of commas is critical to divide biological replicates from different conditions, but for Cuffcompare the commas are not needed and would raise an error on the command line. That is the reason for the: 396 | 397 | ```shell 398 | echo '' | sed -e 's/,/ /g' | xargs ls >> gtf_list.txt 399 | ``` 400 | 401 | This line generates the input file for Cuffcompare with the list of the transcripts.gtf files for each sample, built using the 'multi' definition in the pipeline YAML and the list passed through the **-m** parameter, but getting rid of the commas that separate sample names. It is a workaround rather than a clean solution, but PipEngine aims to be a general tool not bound to specific corner cases, and it always lets the user define custom command lines to manage particular steps, as in this case. 402 | 403 | Composable & Modular steps definition 404 | ------------------------------------ 405 | 406 | Until now, steps have been defined inside a single YAML file. This approach is useful for keeping a stable and reproducible analysis pipeline. But what if multiple users want to collaborate on the same pipeline, improving it and, most importantly, re-using the same steps in different analyses?
What usually happens is a proliferation of highly similar pipelines that are complicated to compare and to maintain over time. 407 | In this scenario, the first thing a developer imagines is the ability to include external files; unfortunately YAML does not implement this feature. A possible workaround, remembering that we are in Ruby land, is to embed some Ruby code into the YAML file and include external steps. 408 | 409 | Create a file `mapping.yml` that describes the mapping step with BWA: 410 | 411 | ``` 412 | mapping: 413 | cpu: 8 414 | desc: Run BWA MEM and generates a sorted BAM file 415 | run: 416 | - mem -t -R '@RG\tID:\tLB:\tPL:ILLUMINA\tPU:\tCN:PTP\tSM:' .trim.fastq | view -bS - > .bam 417 | - sort -@ .bam .sort 418 | - rm -f .bam 419 | ``` 420 | 421 | It is then possible to include the `mapping.yml` file inside your pipeline with a snippet of Ruby code: `<%= include :name_of_the_step, "file_step.yml" %>`. 422 | At the moment it is very important that you place the tag at the very start of the line (no spaces at the beginning of the line): 423 | 424 | ``` 425 | steps: 426 | <%= include :mapping, "./mapping.yml" %> 427 | 428 | index: 429 | desc: Make BAM index 430 | run: index .sort.bam 431 | ``` 432 | 433 | and later run pipengine as usual. 434 | TODO: Dump the whole pipeline file for reproducibility purposes. 435 | 436 | 437 | :: What happens at run-time :: 438 | ============================== 439 | 440 | When invoking PipEngine, the tool will look for the specified pipeline YAML and for the samples YAML file. It will load the list of samples (names and paths of input data) and, for each sample, it will load the information of the steps specified on the command line (**-s** parameter). 441 | 442 | PipEngine will then combine the data from the two YAML files, generating the specific command lines of the selected steps and substituting all the placeholders to generate the final command lines. 443 | 444 | A shell script will finally be generated, for each sample, that will contain all the instructions to run a specific step of the pipeline plus the meta-data for the PBS scheduler. The shell scripts are written inside the directory specified by the ```output:``` key in the ```samples.yml``` file; the directory is created if it does not exist. 445 | 446 | If not invoked with the **-d** option (dry-run), PipEngine will directly submit the jobs to the PBS scheduler using the "qsub" command. 447 | 448 | Dry Run 449 | ------- 450 | 451 | The **-d** parameter lets you create the runnable shell scripts without submitting them to PBS. Use it often to check that the pipeline that will be executed is correct and is doing what you expect. The runnable scripts are saved by default in the `<output>` directory. 452 | 453 | Use it also to learn how the placeholders work, especially the dependency placeholders (i.e. the ones pointing to a previous step output), and to cross-check that all the placeholders in the pipeline command lines were substituted correctly before submitting the jobs. 454 | 455 | Temporary output folder 456 | ------------------- 457 | 458 | By using the '--tmp' option, PipEngine will generate a job script (for each sample) that will save all the output files or folders for a particular step in a directory (e.g. /tmp) that is different from the one provided with the `<output>` tag. 459 | 460 | By default PipEngine will generate output folders directly under the location defined by the `<output>` tag in the Samples YAML.
The --tmp solution instead can be useful when we don't want to save outputs directly to the final location (e.g. a slower network storage) or we don't want to keep all the intermediate files but just the final ones. 461 | 462 | With this option enabled, PipEngine will also generate instructions in the job script to copy, at the end of the job, all the outputs from the temporary directory to the final output folder (i.e. `<output>`) and then to remove the temporary copy. 463 | 464 | When '--tmp' is used, a UUID is generated for each job and prepended to the job name and to the temporary output folder, to avoid possible name collisions and data overwrites if several jobs with the same name (e.g. mapping) are running and writing to the same temporary location. 465 | 466 | One job with multiple steps 467 | --------------------------- 468 | 469 | It is of course possible to aggregate multiple steps of a pipeline and run them in one single job. For instance, let's say I want to run in the same job the steps mapping, mark_dup and realign_target (see the pipeline YAML example above). 470 | 471 | From the command line it's just: 472 | 473 | ```shell 474 | pipengine run -p pipeline.yml -s mapping mark_dup realign_target 475 | ``` 476 | 477 | A single job script, for each sample, will be generated with all the instructions for these steps. If more than one step declares a **cpu** key, the highest cpu value will be assigned to the whole job. 478 | 479 | Each step will save its outputs into a separate folder under the `<output>` directory, exactly as if the steps were run separately. This way, if the job fails for some reason, it is possible to check which steps were already completed and restart from there. 480 | 481 | When multiple steps are run in the same job, by default PipEngine will generate the job name as the concatenation of all the step names. Since this could be a problem when many steps are run together in the same job, a '--name' parameter is available to rename the job in a more convenient way. 482 | 483 | :: Examples :: 484 | ============== 485 | 486 | All these files can be found in the test/examples directory of the repository.
487 | 488 | Example 1: One step and multiple command lines 489 | ---------------------------------------------- 490 | 491 | This is an example on how to prepare the inputs for BWA and run it along with Samtools: 492 | 493 | **pipeline.yml** 494 | ```yaml 495 | pipeline: resequencing 496 | 497 | resources: 498 | bwa: /software/bwa 499 | samtools: /software/samtools 500 | pigz: /software/pigz 501 | 502 | steps: 503 | mapping: 504 | run: 505 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 506 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 507 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Sb - > .bam 508 | - rm -f R1.fastq.gz R2.fastq.gz 509 | cpu: 12 510 | ``` 511 | 512 | **samples.yml** 513 | ```yaml 514 | resources: 515 | index: /storage/genomes/bwa_index/genome 516 | genome: /storage/genomes/genome.fa 517 | output: ./working 518 | 519 | samples: 520 | sampleA: /ngs_reads/sampleA 521 | sampleB: /ngs_reads/sampleB 522 | sampleC: /ngs_reads/sampleC 523 | sampleD: /ngs_reads/sampleD 524 | ``` 525 | 526 | Running PipEngine with the following command line: 527 | 528 | ``` 529 | pipengine run -p pipeline.yml -f samples.yml -s mapping -d 530 | ``` 531 | 532 | will generate a runnable shell script for each sample (available in the ./working directory): 533 | 534 | ```shell 535 | #!/usr/bin/env bash 536 | #PBS -N 2c57c1a853-sampleA-mapping 537 | #PBS -d ./working 538 | #PBS -l nodes=1:ppn=12 539 | if [ ! -f ./working/sampleA/mapping/checkpoint ] 540 | then 541 | echo "mapping 2c57c1a853-sampleA-mapping start `whoami` `hostname` `pwd` `date`." 542 | 543 | mkdir -p ./working/sampleA/mapping 544 | cd ./working/sampleA/mapping 545 | ls /ngs_reads/sampleA/*_R1_*.gz | xargs zcat | /software/pigz -p 10 >> R1.fastq.gz || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 0 `whoami` `hostname` `pwd` `date`."; exit 1; } 546 | ls /ngs_reads/sampleA/*_R2_*.gz | xargs zcat | /software/pigz -p 10 >> R2.fastq.gz || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 1 `whoami` `hostname` `pwd` `date`."; exit 1; } 547 | /software/bwa sampe -P /storage/genomes/bwa_index/genome <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R1.fastq.gz) <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R2.fastq.gz) R1.fastq.gz R2.fastq.gz | /software/samtools view -Sb - > sampleA.bam || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 2 `whoami` `hostname` `pwd` `date`."; exit 1; } 548 | rm -f R1.fastq.gz R2.fastq.gz || { echo "mapping 2c57c1a853-sampleA-mapping FAILED 3 `whoami` `hostname` `pwd` `date`."; exit 1; } 549 | echo "mapping 2c57c1a853-sampleA-mapping finished `whoami` `hostname` `pwd` `date`." 550 | touch ./working/sampleA/mapping/checkpoint 551 | else 552 | echo "mapping 2c57c1a853-sampleA-mapping already executed, skipping this step `whoami` `hostname` `pwd` `date`." 553 | fi 554 | ``` 555 | As you can see the command line described in the pipeline YAML are translated into normal Unix command lines, therefore every solution that works on a standard Unix shell (pipes, bash substitutions) is perfectly acceptable. Pipengine addes extra lines in the script for steps checkpoint controls to avoid re-running already executed steps, and error controls with logging. 556 | 557 | In this case also, the **run** key defines three different command lines, that are described using YAML array (a line prepended with a -). 
This command lines are all part of the same step, since the first two are required to prepare the input for the third command line (BWA), using standard bash commands. 558 | 559 | As a rule of thumb you should put more command line into an array under the same step if these are all logically correlated and required to manipulate intermidiate files. Otherwise if command lines executes conceptually different actions they should go into different steps. 560 | 561 | Example 2: Multiple steps in one job 562 | ------------------------------------ 563 | 564 | Now I want to execute more steps in a single job for each sample. The pipeline YAML is defined in this way: 565 | 566 | ```yaml 567 | 568 | pipeline: resequencing 569 | 570 | resources: 571 | bwa: /software/bwa 572 | samtools: /software/samtools 573 | mark_dup: /software/picard-tools-1.77/MarkDuplicates.jar 574 | gatk: /software/GenomeAnalysisTK/GenomeAnalysisTK.jar 575 | 576 | steps: 577 | mapping: 578 | run: 579 | - ls /*_R1_*.gz | xargs zcat | pigz -p 10 >> R1.fastq.gz 580 | - ls /*_R2_*.gz | xargs zcat | pigz -p 10 >> R2.fastq.gz 581 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=.sorted.bam SO=coordinate LB= PL=illumina PU=PU SM= TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 582 | - rm -f R1.fastq.gz R2.fastq.gz 583 | cpu: 12 584 | 585 | mark_dup: 586 | pre: mapping 587 | run: java -Xmx4g -jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=.sorted.bam OUTPUT=.md.sort.bam METRICS_FILE=.metrics REMOVE_DUPLICATES=false 588 | 589 | realign_target: 590 | pre: mark_dup 591 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 592 | cpu: 8 593 | ``` 594 | 595 | The sample YAML file is the same as the example above. Now to execute together the 3 steps defined in the pipeline, PipEngine can be invoked with this command line: 596 | 597 | ``` 598 | pipengine run -p pipeline_multi.yml -f samples.yml -s realign_target -d 599 | ``` 600 | Since dependencies have been defined for the steps using the ```pre``` key, it is sufficient to invoke Pipengine with the last step and the other two are automatically included in the script. Messages will be prompted in this case since Pipengine just warns that the directories for certain steps, that are needed for other steps in the pipeline, are not yet available (and thus the corresponding steps will be executed to generate the necessary data). The command line will generate the following shell script (one for each sample, available in the ./working directory): 601 | 602 | ```shell 603 | #!/usr/bin/env bash 604 | #PBS -N 6f3c911c49-sampleC-realign_target 605 | #PBS -d ./working 606 | #PBS -l nodes=1:ppn=12 607 | if [ ! -f ./working/sampleC/mapping/checkpoint ] 608 | then 609 | echo "mapping 6f3c911c49-sampleC-realign_target start `whoami` `hostname` `pwd` `date`." 
610 | 611 | mkdir -p ./working/sampleC/mapping 612 | cd ./working/sampleC/mapping 613 | ls /ngs_reads/sampleC/*_R1_*.gz | xargs zcat | pigz -p 10 >> R1.fastq.gz || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 0 `whoami` `hostname` `pwd` `date`."; exit 1; } 614 | ls /ngs_reads/sampleC/*_R2_*.gz | xargs zcat | pigz -p 10 >> R2.fastq.gz || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 1 `whoami` `hostname` `pwd` `date`."; exit 1; } 615 | /software/bwa sampe -P /storage/genomes/bwa_index/genome <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R1.fastq.gz) <(/software/bwa aln -t 4 -q 20 /storage/genomes/bwa_index/genome R2.fastq.gz) R1.fastq.gz R2.fastq.gz | /software/samtools view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=sampleC.sorted.bam SO=coordinate LB=sampleC PL=illumina PU=PU SM=sampleC TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 2 `whoami` `hostname` `pwd` `date`."; exit 1; } 616 | rm -f R1.fastq.gz R2.fastq.gz || { echo "mapping 6f3c911c49-sampleC-realign_target FAILED 3 `whoami` `hostname` `pwd` `date`."; exit 1; } 617 | echo "mapping 6f3c911c49-sampleC-realign_target finished `whoami` `hostname` `pwd` `date`." 618 | touch ./working/sampleC/mapping/checkpoint 619 | else 620 | echo "mapping 6f3c911c49-sampleC-realign_target already executed, skipping this step `whoami` `hostname` `pwd` `date`." 621 | fi 622 | if [ ! -f ./working/sampleC/mark_dup/checkpoint ] 623 | then 624 | echo "mark_dup 6f3c911c49-sampleC-realign_target start `whoami` `hostname` `pwd` `date`." 625 | 626 | mkdir -p ./working/sampleC/mark_dup 627 | cd ./working/sampleC/mark_dup 628 | java -Xmx4g -jar /software/picard-tools-1.77/MarkDuplicates.jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=./working/sampleC/mapping/sampleC.sorted.bam OUTPUT=sampleC.md.sort.bam METRICS_FILE=sampleC.metrics REMOVE_DUPLICATES=false || { echo "mark_dup 6f3c911c49-sampleC-realign_target FAILED `whoami` `hostname` `pwd` `date`."; exit 1; } 629 | echo "mark_dup 6f3c911c49-sampleC-realign_target finished `whoami` `hostname` `pwd` `date`." 630 | touch ./working/sampleC/mark_dup/checkpoint 631 | else 632 | echo "mark_dup 6f3c911c49-sampleC-realign_target already executed, skipping this step `whoami` `hostname` `pwd` `date`." 633 | fi 634 | if [ ! -f ./working/sampleC/realign_target/checkpoint ] 635 | then 636 | echo "realign_target 6f3c911c49-sampleC-realign_target start `whoami` `hostname` `pwd` `date`." 637 | 638 | mkdir -p ./working/sampleC/realign_target 639 | cd ./working/sampleC/realign_target 640 | java -Xmx4g -jar /software/GenomeAnalysisTK/GenomeAnalysisTK.jar -T RealignerTargetCreator -I ./working/sampleC/mark_dup/sampleC.md.sort.bam -nt 8 -R /storage/genomes/genome.fa -o sampleC.indels.intervals || { echo "realign_target 6f3c911c49-sampleC-realign_target FAILED `whoami` `hostname` `pwd` `date`."; exit 1; } 641 | echo "realign_target 6f3c911c49-sampleC-realign_target finished `whoami` `hostname` `pwd` `date`." 642 | touch ./working/sampleC/realign_target/checkpoint 643 | else 644 | echo "realign_target 6f3c911c49-sampleC-realign_target already executed, skipping this step `whoami` `hostname` `pwd` `date`." 645 | fi 646 | ``` 647 | 648 | 649 | Logging 650 | --------------------------- 651 | 652 | It is always usefult to log activities and collect the output from your software. 
Pipengine can log to: 653 | 654 | * stdin, just print on the terminal 655 | * syslog send the log to the system log using logger 656 | * fluentd send the log to a collector/centralized logging system (http://fluentd.org) 657 | 658 | 659 | :: PBS Options :: 660 | ================= 661 | 662 | If there is the need to pass to PipEngine specific PBS options, the ```--pbs-opts``` parameter can be used. 663 | 664 | This parameter accepts a list of options and each one will be added to the PBS header in the shell script, along with the ```-l``` PBS parameter. 665 | 666 | So for example, the following options passed to ```--pbs-opts```: 667 | 668 | ```shell 669 | --pbs-opts nodes=2:ppn=8 host=node5 670 | ``` 671 | 672 | will become, in the shell script: 673 | 674 | ```shell 675 | #PBS -l nodes=2:ppn=8 676 | #PBS -l host=node5 677 | ``` 678 | 679 | Note also that from version 0.5.2, it is possible to specify common PBS options like "nodes" and "mem" (for memory) directly within a step defition in the Pipeline yaml, exactly as it's done with the "cpu" parameter. So in a step it is possible to write: 680 | 681 | ```yaml 682 | realign_target: 683 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 684 | cpu: 8 685 | nodes: 2 686 | mem: 8G 687 | ``` 688 | 689 | to have PipEngine translate this into: 690 | 691 | ```shell 692 | #PBS -l nodes=2:ppn=8,mem=8G 693 | ``` 694 | 695 | within the job script. 696 | 697 | If a specific queue needs to be selected for sending the jobs to PBS, the ```--pbs-queue``` (short version **-q**) parameter can be used. This will pass to the ```qsub``` command the ```-q ``` taken from the command line. 698 | 699 | :: Extending and contributing :: 700 | ================================ 701 | 702 | Pipengine code is organized around main methods allowing for YAML parsing and command line arguments substitutions that are available in lib/bio/pipengine.rb. Specific logic for jobs, pipeline steps and samples is described in dedicated classes called Bio::Pipengine::Job, Bio::Pipengine::Step and Bio::Pipengine::Sample. 703 | 704 | For instance, in case the support for different schedulers needs to be introduced, it will be sufficient to modify or extend the Job.to_script method, which is the one defining scheduler-specific options in the runnable bash script. 705 | 706 | Copyright 707 | ========= 708 | 709 | ©2017 Francesco Strozzi, Raoul Jean Pierre Bonnal 710 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | require 'rubygems' 4 | require 'bundler' 5 | begin 6 | Bundler.setup(:default, :development) 7 | rescue Bundler::BundlerError => e 8 | $stderr.puts e.message 9 | $stderr.puts "Run `bundle install` to install missing gems" 10 | exit e.status_code 11 | end 12 | require 'rake' 13 | 14 | require 'jeweler' 15 | Jeweler::Tasks.new do |gem| 16 | # gem is a Gem::Specification... 
see http://docs.rubygems.org/read/chapter/20 for more options 17 | gem.name = "bio-pipengine" 18 | gem.homepage = "http://github.com/fstrozzi/bioruby-pipengine" 19 | gem.license = "MIT" 20 | gem.summary = %Q{A pipeline manager} 21 | gem.description = %Q{A pipeline manager} 22 | gem.email = ["francesco.strozzi@gmail.com", "ilpuccio.febo@gmail.com"] 23 | gem.authors = ["Francesco Strozzi", "Raoul Jean Pierre Bonnal"] 24 | gem.files = Dir.glob("lib/**/*.rb") 25 | gem.files << "VERSION" 26 | # dependencies defined in Gemfile 27 | end 28 | Jeweler::RubygemsDotOrgTasks.new 29 | 30 | require 'rake/testtask' 31 | Rake::TestTask.new(:test) do |test| 32 | test.libs << 'lib' << 'test' 33 | test.pattern = 'test/**/test_*.rb' 34 | test.verbose = true 35 | end 36 | 37 | task :default => :test 38 | 39 | require 'rdoc/task' 40 | Rake::RDocTask.new do |rdoc| 41 | version = File.exist?('VERSION') ? File.read('VERSION') : "" 42 | 43 | rdoc.rdoc_dir = 'rdoc' 44 | rdoc.title = "bioruby-pipengine #{version}" 45 | rdoc.rdoc_files.include('README*') 46 | rdoc.rdoc_files.include('lib/**/*.rb') 47 | end 48 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.9.7 -------------------------------------------------------------------------------- /bin/pipengine: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | $:<< File.expand_path(File.join(File.dirname(File.dirname __FILE__),"lib")) 4 | require 'bio-pipengine' 5 | 6 | banner_text = "\nLauncher for Complex Biological Pipelines . Copyright(C) 2012 Francesco Strozzi, Raoul Jean Pierre Bonnal\n\n" 7 | version_text = File.read File.expand_path(File.join(File.dirname(File.dirname __FILE__),"VERSION")) 8 | SUB_COMMANDS = %w(run jobs) 9 | 10 | 11 | #Bio::Pipengine.check_config 12 | 13 | options = {} 14 | cmd = ARGV.first # get the subcommand 15 | opts = case cmd 16 | when "run" 17 | options[:run] = true 18 | ARGV.shift 19 | Trollop::options do 20 | opt :pipeline, "YAML file with pipeline and sample details", :short => "p", :type => :string, :default => "pipeline.yml" 21 | opt :samples_file, "YAML file with samples name and directory paths", :short => "f", :type => :string, :default => "samples.yml" 22 | opt :samples, "List of sample names to run the pipeline", :type => :strings, :short => "l" 23 | opt :steps, "List of steps to be executed", :type => :strings, :short => "s" 24 | opt :dry,"Dry run. 
Just create the job script without submitting it to the batch system", :short => "d" 25 | opt :tmp, "Temporary output folder", :type => :string, :short => "t" 26 | opt :create_samples, "Create samples.yml file from a Sample directory (only for CASAVA projects)", :short => "c", :type => :strings 27 | opt :multi, "List of samples to be processed by a given step (the order matters)", :short => "m", :type => :strings 28 | opt :group, "Specify the group of samples to run the pipeline steps on (do not specify --multi)", :short => "g", :type => :string 29 | opt :allgroups, "Apply the step(s) to all the groups defined into the samples file", :short => "a" 30 | opt :name, "Analysis name", :short => "n", :type => :string 31 | opt :output_dir, "Output directory (override standard output directory names)", :short => "o", :type => :string 32 | opt :pbs_opts, "PBS options", :type => :strings, :short => "b" 33 | opt :pbs_queue, "PBS queue", :type => :string, :short => "q" 34 | opt :inspect_pipeline, "Show steps", :short => "i", :type => :string 35 | opt :log, "Log script activities, by default stdin. Options are fluentd", :type => :string, :default => "stdin" 36 | opt :log_adapter, "(stdin|syslog|fluentd) In case of fluentd use http://destination.hostname:port/yourtag", :type => :string 37 | opt :tag, "Overwrite tags present in samples.yml and pipeline.yml files (e.g. tag1=value1 tag2=value2)", :type => :strings 38 | end 39 | when "-h" 40 | puts banner_text 41 | puts "List of available commands:\n\trun\tSubmit pipelines to the job scheduler\n" 42 | exit 43 | else 44 | global_opts = Trollop::options do 45 | banner banner_text 46 | version "PipEngine v#{version_text}" 47 | puts banner_text 48 | puts "PipEngine v#{version_text}" 49 | puts "List of available commands:\n\trun\tSubmit pipelines to the job scheduler\n\n" 50 | end 51 | end 52 | 53 | 54 | 55 | options = options.merge opts 56 | Trollop::die :multi, "Specifing both --group and --multi is not allowed" if options[:multi] and options[:group] 57 | 58 | if options[:create_samples] 59 | Bio::Pipengine.create_samples options[:create_samples] 60 | #elsif options[:jobs] 61 | # if options[:job_id] 62 | # Bio::Pipengine.show_stats(options[:job_id]) 63 | # elsif options[:job_name] 64 | # warn "Not yet implemented" 65 | # exit 66 | # elsif options[:delete] 67 | # if options[:delete].empty? 68 | # warn "Provide one or more Job IDs or write 'all' to delete all your running jobs".red 69 | # exit 70 | # end 71 | # puts "Warning: this will delete the following running jobs: ".light_blue + "#{options[:delete].join(",")}".green 72 | # print "Are you sure? (y|n):" 73 | # answer = gets.chomp 74 | # if answer == "y" 75 | # Bio::Pipengine.delete_jobs(options[:delete]) 76 | # else 77 | # puts "Aborting..." 78 | # exit 79 | # end 80 | # else 81 | # Bio::Pipengine.show_stats(["all"]) 82 | # end 83 | elsif options[:pipeline] && options[:samples_file] 84 | if options[:inspect_pipeline] 85 | Bio::Pipengine.inspect_steps(options[:inspect_pipeline]) 86 | exit 87 | else 88 | abort("File not found: #{options[:pipeline]}".red) unless File.exists? options[:pipeline] 89 | abort("File not found: #{options[:samples_file]}".red) unless File.exists? 
options[:samples_file] 90 | abort("Please provide a valid step name with the --step parameter".red) unless options[:steps] 91 | if options[:allgroups] 92 | Bio::Pipengine.load_samples_file(options[:samples_file])["samples"].keys.each do |group| 93 | options[:group] = group 94 | Bio::Pipengine.run(options) 95 | end 96 | else 97 | Bio::Pipengine.run(options) 98 | end 99 | end 100 | end 101 | 102 | 103 | -------------------------------------------------------------------------------- /bio-pipengine.gemspec: -------------------------------------------------------------------------------- 1 | # Generated by jeweler 2 | # DO NOT EDIT THIS FILE DIRECTLY 3 | # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec' 4 | # -*- encoding: utf-8 -*- 5 | # stub: bio-pipengine 0.9.7 ruby lib 6 | 7 | Gem::Specification.new do |s| 8 | s.name = "bio-pipengine".freeze 9 | s.version = "0.9.7" 10 | 11 | s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version= 12 | s.require_paths = ["lib".freeze] 13 | s.authors = ["Francesco Strozzi".freeze, "Raoul Jean Pierre Bonnal".freeze] 14 | s.date = "2017-08-28" 15 | s.description = "A pipeline manager".freeze 16 | s.email = ["francesco.strozzi@gmail.com".freeze, "ilpuccio.febo@gmail.com".freeze] 17 | s.executables = ["pipengine".freeze] 18 | s.extra_rdoc_files = [ 19 | "LICENSE.txt", 20 | "README.md" 21 | ] 22 | s.files = [ 23 | "VERSION", 24 | "lib/bio-pipengine.rb", 25 | "lib/bio/pipengine.rb", 26 | "lib/bio/pipengine/job.rb", 27 | "lib/bio/pipengine/sample.rb", 28 | "lib/bio/pipengine/step.rb" 29 | ] 30 | s.homepage = "http://github.com/fstrozzi/bioruby-pipengine".freeze 31 | s.licenses = ["MIT".freeze] 32 | s.rubygems_version = "2.6.11".freeze 33 | s.summary = "A pipeline manager".freeze 34 | 35 | if s.respond_to? 
:specification_version then 36 | s.specification_version = 4 37 | 38 | if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then 39 | s.add_runtime_dependency(%q.freeze, [">= 2.1.2"]) 40 | s.add_runtime_dependency(%q.freeze, [">= 0.8.1"]) 41 | s.add_development_dependency(%q.freeze, [">= 0"]) 42 | else 43 | s.add_dependency(%q.freeze, [">= 2.1.2"]) 44 | s.add_dependency(%q.freeze, [">= 0.8.1"]) 45 | s.add_dependency(%q.freeze, [">= 0"]) 46 | end 47 | else 48 | s.add_dependency(%q.freeze, [">= 2.1.2"]) 49 | s.add_dependency(%q.freeze, [">= 0.8.1"]) 50 | s.add_dependency(%q.freeze, [">= 0"]) 51 | end 52 | end 53 | 54 | -------------------------------------------------------------------------------- /joss/paper.bib: -------------------------------------------------------------------------------- 1 | @online{Torque2017, 2 | author = "Adaptive Computing Inc.", 3 | title = {TORQUE Resource Manager}, 4 | year = {2017}, 5 | url = {http://www.adaptivecomputing.com/products/open-source/torque/} 6 | } 7 | 8 | @online{CWL2017, 9 | author = "Common Workflow Language", 10 | title = {Common Workflow Language}, 11 | year = {2017}, 12 | url = {http://www.commonwl.org}, 13 | doi = {dx.doi.org/10.6084/m9.figshare.3115156.v2} 14 | } 15 | 16 | @article{Goto2010, 17 | title = {Bioinformatics software for the Ruby programming language}, 18 | author = {Goto N, Prins P, Nakao M, Bonnal RJP, Aerts J, Katayama T}, 19 | year = {2010}, 20 | doi = {doi.org/10.1093/bioinformatics/btq475}, 21 | journal = {Bioinformatics} 22 | } 23 | 24 | 25 | @article{Bonnal2012, 26 | title = {Biogem: An effective tool-based approach for scaling up open source software development in bioinformatics}, 27 | author = {Bonnal RJP, Aerts J, Githinji G, Goto N, Maclean D, Miller CA, Mishima H, Pagani M, Ramirez-gonzalez R, Smant G, Strozzi F, Syme R, Vos R, Wennblom TJ, Woodcroft BJ, Katayama T, Prins P}, 28 | year = {2012}, 29 | doi = {doi.org/10.1093/bioinformatics/bts080}, 30 | journal = {Bioinformatics} 31 | } 32 | -------------------------------------------------------------------------------- /joss/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Pipengine: an ultra light YAML-based pipeline execution engine' 3 | tags: 4 | - pipeline 5 | - workflows 6 | - reproducibility 7 | authors: 8 | - name: Francesco Strozzi 9 | orcid: 0000-0002-6845-6982 10 | affiliation: 1 11 | - name: Raoul Jean Pierre Bonnal 12 | orcid: 0000-0002-2123-6536 13 | affiliation: 2 14 | affiliations: 15 | - name: Enterome Bioscience, Paris - France 16 | index: 1 17 | - name: INGM - Isitituto Nazionale Genetica Molecolare "Romeo ed Enrica Invernizzi": Milan, Italy 18 | index: 2 19 | date: 25 July 2017 20 | bibliography: paper.bib 21 | --- 22 | 23 | # Summary 24 | 25 | This is an ultra light YAML-based pipeline execution engine. The tool allows defining a pipeline template in YAML, specifying command lines, resources and software to be used along with pipeline steps dependencies. Pipengine is a sample-centric tool, so the pipeline can then be applied over a single sample or multiple samples data, generating actual runnable bash scripts which can then be submitted automatically to a scheduling system or run locally. 
26 | 27 | The bash scripts generated by Pipengine includes a list of features such as: 28 | 29 | * error controls and logging for each step 30 | 31 | * the automated generation of directories based on sample and pipeline steps names 32 | 33 | * the moving of input and output data across original and temporary folders if needed 34 | 35 | * a simple checkpoint strategy to avoid re-running already completed steps in a pipeline. 36 | 37 | All these features prevent the users to write boiler plate code to perform all these necessary accessory tasks. 38 | 39 | Moreover, Pipengine creates a stable and reproducible working and output tree for each analysis, which transparently stores all the results of each step of a pipeline for each sample analyzed. In this way pipelines' intermediate or final results can be predictably accessed by the analysts and/or easily parsed with other tools. 40 | 41 | The software was developed back in 2012, when more generalized schemas such as for instance the Common Workflow Language [@CWL2017] were not yet defined, and thus was among the firsts utilities to introduce the concept of using simple YAML as a template format to define reusable bioinformatics pipelines. 42 | 43 | Pipengine has been used across several research groups and bioinformatics core facilities since its first appearance. It directly supports the PBS/Torque scheduler [@Torque2017] for submission of jobs, but given that the support for a scheduler is based on specific options written automatically inside the bash scripts generated by the tool, it can be easily adapted to work with other schedulers, if needed. 44 | 45 | Pipengine is written in Ruby and is available for download as a BioRuby Gem [@Goto2010; @Bonnal2012]. 46 | 47 | # References 48 | 49 | 50 | -------------------------------------------------------------------------------- /lib/bio-pipengine.rb: -------------------------------------------------------------------------------- 1 | require 'yaml' 2 | 3 | require 'securerandom' 4 | require 'trollop' 5 | require 'colorize' 6 | #require 'torque_rm' 7 | #require 'terminal-table' 8 | require 'fileutils' 9 | require 'logger' 10 | require 'erb' 11 | 12 | require 'bio/pipengine/sample' 13 | require 'bio/pipengine/step' 14 | require 'bio/pipengine/job' 15 | require 'bio/pipengine' 16 | -------------------------------------------------------------------------------- /lib/bio/pipengine.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | module Pipengine 3 | 4 | def self.include(name, filename) 5 | File.readlines(filename).map {|line| " "+line}.join("\n") 6 | end 7 | 8 | @@logger_error = Logger.new(STDERR) 9 | def self.run(options) 10 | 11 | # reading the yaml files 12 | pipeline = YAML.load ERB.new(File.read(options[:pipeline])).result(binding) 13 | samples_file = load_samples_file options[:samples_file] 14 | 15 | # make sure all sample names are always Strings 16 | converted_samples_list = {} 17 | samples_file["samples"].each_key do |sample| 18 | if samples_file["samples"][sample].kind_of? 
Hash # it's a group of samples 19 | converted_samples_list[sample.to_s] = Hash[samples_file["samples"][sample].map{ |k, v| [k.to_s, v] }] 20 | else 21 | converted_samples_list[sample.to_s] = samples_file["samples"][sample] 22 | end 23 | end 24 | samples_file["samples"] = converted_samples_list # replacing original samples hash with the converted one 25 | 26 | # pre-running checks 27 | check_steps(options[:steps],pipeline) 28 | check_samples(options[:samples],samples_file) if options[:samples] 29 | 30 | # list of samples the jobs will work on 31 | samples_list = nil 32 | # check if a group is specified 33 | if options[:group] 34 | samples_list = options[:samples] ? samples_file["samples"][options[:group]].select {|k,v| options[:samples].include? k} : samples_file["samples"][options[:group]] 35 | options[:multi] = samples_list.keys 36 | samples_file["resources"]["output"] << "/#{options[:group]}" 37 | else # if not, proceed normalizing the sample list to remove groups and get a list of all samples 38 | full_list_samples = {} 39 | samples_file["samples"].each_key do |k| 40 | if samples_file["samples"][k].kind_of? Hash 41 | full_list_samples.merge! samples_file["samples"][k] 42 | else 43 | full_list_samples[k] = samples_file["samples"][k] 44 | end 45 | end 46 | samples_list = options[:samples] ? full_list_samples.select {|k,v| options[:samples].include? k} : full_list_samples 47 | end 48 | 49 | ########### START ########### 50 | 51 | # create output directory (jobs scripts will be saved there) 52 | FileUtils.mkdir_p samples_file["resources"]["output"] #unless options[:dry] #&& options[:spooler]!="pbs" 53 | 54 | # check if the requested steps are multi-samples 55 | run_multi = check_and_run_multi(samples_file,pipeline,samples_list,options) 56 | 57 | unless run_multi # there are no multi-samples steps, so iterate on samples and create one job per sample 58 | samples_list.each_key do |sample_name| 59 | sample = Bio::Pipengine::Sample.new(sample_name.to_s,samples_list[sample_name],options[:group]) 60 | create_job(samples_file,pipeline,samples_list,options,sample) 61 | end 62 | end 63 | end 64 | 65 | def self.parse_tag_option(option_tag) 66 | if !option_tag 67 | return {} 68 | else 69 | tags = {} 70 | option_tag.each do |tag| 71 | values = tag.split("=") 72 | if values.empty? 73 | @@logger_error.error "\nAbort! Unrecognized values for tag option, please provide the tags as follows: tag1=value1 tag2=value2".red 74 | exit 75 | else 76 | tags.merge! Hash[*values.flatten] 77 | end 78 | end 79 | return tags 80 | end 81 | end 82 | 83 | # handle steps that run on multiple samples (i.e. sample groups job) 84 | def self.check_and_run_multi(samples_file,pipeline,samples_list,options) 85 | step_multi = options[:steps].map {|s| Bio::Pipengine::Step.new(s,pipeline["steps"][s]).is_multi?} 86 | 87 | if step_multi.include? false 88 | if step_multi.uniq.size > 1 89 | @@logger_error.error "\nAbort! You are trying to run both multi-samples and single sample steps in the same job".red 90 | exit 91 | else 92 | return false 93 | end 94 | else 95 | samples_obj = {} 96 | samples_list.each_key {|sample_name| samples_obj[sample_name] = Bio::Pipengine::Sample.new(sample_name,samples_list[sample_name],options[:group])} 97 | create_job(samples_file,pipeline,samples_list,options,samples_obj) 98 | return true 99 | end 100 | end 101 | 102 | def self.create_job(samples_file,pipeline,samples_list,options,sample) 103 | # getting the sample name (only if this is not a multi samples job) 104 | sample_name = (sample.kind_of? Hash) ? 
nil : sample.name+"-" 105 | # setting the job name 106 | job_name = nil 107 | if options[:name] 108 | job_name = options[:name] 109 | elsif options[:steps].size > 1 110 | job_name = "#{sample_name}#{options[:steps].join("-")}" 111 | else 112 | job_name = "#{sample_name}#{options[:steps].first}" 113 | end 114 | # creating the Job object 115 | job = Bio::Pipengine::Job.new(job_name) 116 | job.local = options[:tmp] 117 | job.custom_output = options[:output_dir] 118 | job.custom_name = (options[:name]) ? options[:name] : nil 119 | # Adding pipeline and samples resources 120 | job.add_resources pipeline["resources"] 121 | job.add_resources samples_file["resources"] 122 | # Adding resource tag from the command line which can overwrite resources defined in the pipeline and samples files 123 | job.add_resources parse_tag_option(options[:tag]) 124 | #setting the logging system 125 | job.log = options[:log] 126 | job.log_adapter = options[:log_adapter] 127 | # setting sample groups either by cli option (if present) or by taking all available samples 128 | job.multi_samples = (options[:multi]) ? options[:multi] : samples_list.keys 129 | job.samples_obj = sample if sample.kind_of? Hash 130 | # cycling through steps and add command lines to the job 131 | options[:steps].each do |step_name| 132 | # TODO WARNING this can add multiple times the same step if the are multi dependencies 133 | self.add_job(job, pipeline, step_name, sample) 134 | end 135 | 136 | if options[:dry] 137 | job.to_script(options) 138 | else 139 | job.to_script(options) 140 | job.submit 141 | end 142 | end 143 | 144 | # check if sample exists 145 | def self.check_samples(passed_samples,samples) 146 | passed_samples.each do |sample| 147 | samples_names = [] 148 | samples["samples"].each_key do |k| 149 | if samples["samples"][k].kind_of? Hash 150 | samples["samples"][k].each_key {|s| samples_names << s} 151 | else 152 | samples_names << k 153 | end 154 | end 155 | unless samples_names.include? sample 156 | @@logger_error.error "Sample \"#{sample}\" does not exist in sample file!".red 157 | exit 158 | end 159 | end 160 | end 161 | 162 | # check if step exists 163 | def self.check_steps(passed_steps,pipeline) 164 | passed_steps.each do |step| 165 | unless pipeline["steps"].keys.include? step 166 | @@logger_error.error "Step \"#{step}\" does not exist in pipeline file!".red 167 | exit 168 | end 169 | end 170 | end 171 | 172 | # load the pipeline file and show a list of available steps 173 | def self.inspect_steps(pipeline_file) 174 | pipeline = YAML.load_file pipeline_file 175 | print "\nPipeline: ".blue 176 | print "#{pipeline["pipeline"]}\n\n".green 177 | puts "List of available steps:".light_blue 178 | pipeline["steps"].each_key do |s| 179 | print "\s\s#{s}:\s\s".blue 180 | print "#{pipeline["steps"][s]["desc"]}\n".green 181 | end 182 | puts "\n" 183 | end 184 | 185 | # create the samples.yml file 186 | def self.create_samples(dir) 187 | File.open("samples.yml","w") do |file| 188 | file.write "resources:\n\soutput: #{`pwd -L`}\n\nsamples:\n" 189 | samples = Hash.new {|hash,key| hash[key] = []} 190 | dir.each do |path| 191 | projects = Dir.glob(path+"/*").sort.select {|folders| folders.split("/")[-1] =~/Project_/} 192 | unless projects.empty? 193 | projects.each do |project_folder| 194 | Dir.glob(project_folder+"/*").sort.each {|s| samples[s.split("/")[-1]] << s} 195 | end 196 | else 197 | Dir.glob(path+"/*").sort.each {|s| samples[s.split("/")[-1]] << s if Dir.exists? 
s} 198 | end 199 | end 200 | samples.each_key do |sample| 201 | file.write "\s"+sample+":\s"+samples[sample].join(",")+"\n" 202 | end 203 | end 204 | end 205 | 206 | def self.add_job(job, pipeline, step_name, sample) 207 | step = Bio::Pipengine::Step.new(step_name,pipeline["steps"][step_name]) # parsing step instructions 208 | self.add_job(job, pipeline, step.pre, sample) if step.has_prerequisite? 209 | job.add_step(step,sample) # adding step command lines to the job 210 | end #add_job 211 | 212 | def self.load_samples_file(file) 213 | samples_file = YAML.load_file file 214 | samples_file["samples"].each do |k,v| 215 | if v.kind_of? Hash 216 | samples_file["samples"][k] = Hash[samples_file["samples"][k].map{ |key, value| [key.to_s, value.to_s] }] 217 | else 218 | samples_file["samples"][k] = v.to_s 219 | end 220 | end 221 | # make sure everything in Samples and Resources is converted to string 222 | #samples_file["samples"] = Hash[samples_file["samples"].map{ |key, value| [key.to_s, value.to_s] }] 223 | samples_file["resources"] = Hash[samples_file["resources"].map {|k,v| [k.to_s, v.to_s]}] 224 | samples_file 225 | end 226 | 227 | 228 | end 229 | end 230 | -------------------------------------------------------------------------------- /lib/bio/pipengine/job.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | 3 | module Pipengine 4 | 5 | class Job 6 | 7 | @@logger = Logger.new(STDOUT) 8 | @@logger_error = Logger.new(STDERR) 9 | # a Job object holds information on a job to be submitted 10 | # samples_groups and samples_obj are used to store information in case of steps that require to combine info 11 | # from multiple samples 12 | attr_accessor :name, :cpus, :nodes, :mem, :resources, :command_line, :local, 13 | :multi_samples, :samples_obj, :custom_output, :custom_name, 14 | :log, :log_adapter 15 | def initialize(name) 16 | @name = generate_uuid + "-" + name 17 | @shortname = name 18 | @command_line = [] 19 | @resources = {} 20 | @cpus = 1 21 | @nodes = "1" 22 | @log = "stdin" 23 | @log_adapter = nil 24 | end 25 | 26 | def add_resources(resources) 27 | self.resources.merge! resources 28 | end 29 | 30 | def output 31 | self.resources["output"] 32 | end 33 | 34 | # add all the command lines for a given step 35 | def add_step(step,sample) 36 | 37 | # setting job working directory 38 | working_dir = "" 39 | if self.local 40 | working_dir = self.local+"/"+self.name 41 | else 42 | working_dir = self.output 43 | 44 | if step.is_multi? 45 | folder = (self.custom_output) ? self.custom_output : @shortname 46 | working_dir += "/#{folder}" 47 | else 48 | folder = 49 | if self.custom_output 50 | self.custom_output 51 | elsif self.custom_name 52 | self.custom_name 53 | else 54 | step.name 55 | end 56 | working_dir += "/#{sample.name}/#{folder}" 57 | end 58 | 59 | end 60 | 61 | # set job cpus number to the higher step cpus (this in case of multiple steps) 62 | self.cpus = step.cpus if step.cpus > self.cpus 63 | 64 | # set number of nodes for job 65 | self.nodes = (step.nodes) ? step.nodes : @nodes 66 | 67 | # set the memory used 68 | self.mem = step.mem 69 | 70 | # adding job working directory 71 | unless step.name.start_with? "_" 72 | self.command_line << "if [ ! 
-f #{working_dir}/checkpoint ]" 73 | self.command_line << "then" 74 | self.command_line << logger(step, "start") 75 | self.command_line << "\nmkdir -p #{working_dir}" 76 | self.command_line << "cd #{working_dir}" 77 | end 78 | 79 | # generate command lines for this step 80 | if step.run.kind_of? Array 81 | step.run.each_with_index do |cmd, i| 82 | command = generate_cmd_line(cmd,sample,step) 83 | # TODO verify that logger works in this case 84 | # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name}:#{i}\" ; exit 1; }" 85 | self.command_line << "#{command} || { #{logger(step, "FAILED #{i}" )}; exit 1; }" 86 | end 87 | else 88 | command = generate_cmd_line(step.run,sample,step) 89 | # TODO verify that logger works in this case 90 | # self.command_line << "#{command} || { echo \"FAILED `date`: #{step.name} \" ; exit 1; }" 91 | self.command_line << "#{command} || { #{logger(step, "FAILED" )}; exit 1; }" 92 | end 93 | self.command_line << logger(step, "finished") 94 | self.command_line << "touch #{working_dir}/checkpoint" 95 | self.command_line << "else" 96 | self.command_line << logger(step, "already executed, skipping this step") 97 | self.command_line << "fi" 98 | 99 | # check if a temporary (i.e. different from 'output') directory is set 100 | if self.local 101 | final_output = "" 102 | 103 | if step.is_multi? 104 | folder = (self.custom_output) ? self.custom_output : @shortname 105 | final_output = self.output+"/#{folder}" 106 | else 107 | folder = (self.custom_output) ? self.custom_output : step.name 108 | final_output = self.output+"/#{sample.name}/#{folder}" 109 | end 110 | 111 | self.command_line << "mkdir -p #{final_output}" 112 | self.command_line << "cp -r #{working_dir}/* #{final_output}" 113 | self.command_line << "rm -fr #{working_dir}" 114 | end 115 | 116 | end 117 | 118 | def to_script(options) 119 | File.open(self.output+"/"+self.name+'.pbs','w') do |file| 120 | file.puts "#!/usr/bin/env bash" 121 | file.puts "#PBS -N #{self.name}" 122 | file.puts "#PBS -d #{self.output}" 123 | file.puts "#PBS -q #{options[:pbs_queue]}" if options[:pbs_queue] 124 | if options[:pbs_opts] 125 | file.puts "#PBS -l #{options[:pbs_opts].join(",")}" 126 | else 127 | l_string = [] 128 | l_string << "nodes=#{self.nodes}:ppn=#{self.cpus}" 129 | l_string << "mem=#{self.mem}" if self.mem 130 | file.puts "#PBS -l #{l_string.join(",")}" 131 | end 132 | file.puts self.command_line.join("\n") 133 | end 134 | end 135 | 136 | def submit 137 | job_id = `qsub #{self.output}/#{self.name}.pbs` 138 | @@logger.info "#{job_id}".green 139 | end 140 | 141 | private 142 | 143 | # create a unique ID for each job 144 | def generate_uuid 145 | SecureRandom.hex(5) 146 | end 147 | 148 | # this method call other methods to perform the right substitutions into the command lines 149 | def generate_cmd_line(cmd,sample,step) 150 | if step.is_multi? # if is a multi samples step call a different method 151 | set_multi_cmd(step,self.multi_samples) 152 | cmd = sub_multi(cmd,step) 153 | else 154 | cmd = sub_placeholders(cmd,sample,step) # normal step, perform usual substitutions 155 | end 156 | 157 | # Check that all placeholders have been substituted, if not terminate with an error 158 | cmd.scan(/<\S+>/).each do |unsubstituted_tag| 159 | @@logger_error.error("Found an unsubstituted tag #{unsubstituted_tag} . 
Terminating the execution".red) 160 | exit 161 | end 162 | return cmd 163 | end 164 | 165 | # perform substitutions on all the placeholders 166 | def sub_placeholders(cmd,sample,step=nil) 167 | tmp_cmd = cmd.gsub(/<sample>/,sample.name) 168 | if tmp_cmd =~/<sample_path>/ 169 | sample_path_glob = (tmp_cmd.scan(/<sample_path>(\S+)/).map {|e| e.first}) 170 | if sample_path_glob.empty? 171 | tmp_cmd.gsub!(/<sample_path>/,sample.path.join("\s")) 172 | else 173 | sample_path_glob.each do |append| 174 | tmp_cmd.gsub!(/<sample_path>#{Regexp.quote(append)}/,(sample.path.map {|s| s+append}).join("\s")) 175 | end 176 | end 177 | end 178 | # for resources and cpus 179 | tmp_cmd = sub_resources_and_cpu(tmp_cmd,step) 180 | 181 | # for placeholders like <mapping/sample> 182 | tmp_cmd.scan(/<(\S+)\/sample>/).map {|e| e.first}.each do |input_folder| 183 | @@logger.info "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder 184 | tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/sample>/,self.output+"/"+sample.name+"/"+input_folder+"/"+sample.name) 185 | end 186 | 187 | # for placeholders like <mapping/> 188 | tmp_cmd.scan(/<(\S+)\/>/).map {|e| e.first}.each do |input_folder| 189 | @@logger.info "Directory #{self.output+"/"+sample.name+"/"+input_folder} not found".magenta unless Dir.exists? self.output+"/"+sample.name+"/"+input_folder 190 | tmp_cmd = tmp_cmd.gsub(/<#{input_folder}\/>/,self.output+"/"+sample.name+"/"+input_folder+"/") 191 | end 192 | return tmp_cmd 193 | end 194 | 195 | def sub_resources_and_cpu(cmd,step) 196 | # for all resource tags like <bwa> etc. 197 | self.resources.each_key do |r| 198 | cmd.gsub!(/<#{r}>/,self.resources[r]) 199 | end 200 | # set number of cpus for this command line 201 | cmd.gsub!(/<cpu>/,step.cpus.to_s) unless step.nil? 202 | return cmd 203 | end 204 | 205 | 206 | # creates the actual multi-samples command lines to be substituted where <multi> placeholders are found 207 | def set_multi_cmd(step,multi_samples) 208 | if step.multi_def.kind_of? Array # in case of many multi-samples command lines 209 | step.multi_cmd = [] 210 | step.multi_def.each do |m_def| 211 | step.multi_cmd << generate_multi_cmd(m_def,multi_samples) 212 | end 213 | else 214 | step.multi_cmd = generate_multi_cmd(step.multi_def,multi_samples) 215 | end 216 | end 217 | 218 | # take the multi_cmd and perform the substitutions into the step command lines 219 | def sub_multi(cmd,step) 220 | cmd = sub_resources_and_cpu(cmd,step) 221 | if step.multi_cmd.kind_of? Array 222 | step.multi_cmd.each_with_index do |m,index| 223 | cmd.gsub!(/<multi#{index+1}>/,m) 224 | end 225 | else 226 | cmd.gsub!(/<multi>/,step.multi_cmd) 227 | end 228 | return cmd 229 | end 230 | 231 | # this method handles the different multi-samples definitions (like comma separated list, space separated etc.) 232 | def generate_multi_cmd(multi_def,multi_samples) 233 | multi_cmd = [] 234 | multi_samples.each do |sample_name| 235 | if sample_name.include? "," 236 | multi_cmd << split_and_sub(",",multi_def,sample_name) 237 | elsif sample_name.include? ";" 238 | multi_cmd << split_and_sub(";",multi_def,sample_name) 239 | else 240 | multi_cmd << sub_placeholders(multi_def,self.samples_obj[sample_name]) 241 | end 242 | end 243 | return multi_cmd.join("\s") 244 | end 245 | 246 | # take a comma or semicolon separated list of samples and perform the substitution with the group definitions 247 | def split_and_sub(sep,multi_def,multi) 248 | cmd_line = [] 249 | multi.split(sep).each do |sample_name| 250 | cmd_line << sub_placeholders(multi_def,self.samples_obj[sample_name]) 251 | end 252 | cmd_line.join(sep) 253 | end 254 | 255 | # log a step according to the selected adapter 256 | def logger(step, message) 257 | case self.log 258 | when "stdin" 259 | "echo \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd` `date`.\"" 260 | when "syslog" 261 | "logger -t PIPENGINE \"#{step.name} #{name} #{message} `whoami` `hostname` `pwd`\"" 262 | when "fluentd" 263 | "curl -X POST -d 'json={\"source\":\"PIPENGINE\", \"step\":\"#{step.name}\", \"message\":\"#{message}\", \"job_id\":\"#{name}\", \"user\":\"\'\"`whoami`\"\'\", \"host\":\"\'\"`hostname`\"\'\", \"pwd\":\"\'\"`pwd`\"\'\"}' #{self.log_adapter}" 264 | end 265 | end #logger 266 | 267 | end 268 | end 269 | end 270 | 271 | -------------------------------------------------------------------------------- /lib/bio/pipengine/sample.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | module Pipengine 3 | class Sample 4 | # Sample holds all the information on a sample and its original input path (or multiple paths) 5 | attr_accessor :path 6 | def initialize(name,path_string,group) 7 | @path = path_string.split(",") 8 | @name = name 9 | @group = group 10 | end 11 | 12 | def name=(name) 13 | @name = name 14 | end 15 | 16 | def group=(group) 17 | @group = group 18 | end 19 | 20 | def group 21 | @group 22 | end 23 | 24 | def x_name 25 | "#{@group}/#{@name}" 26 | end 27 | 28 | def name 29 | @name 30 | end 31 | end 32 | end 33 | end 34 | 35 | -------------------------------------------------------------------------------- /lib/bio/pipengine/step.rb: -------------------------------------------------------------------------------- 1 | module Bio 2 | module Pipengine 3 | 4 | # Step holds information for a pipeline step 5 | # multi_def is used to store the multi-samples definition (i.e. a generic cmd line with placeholders) 6 | # multi_cmd is used to store the actual command lines for all the samples to be combined in a multi-samples step 7 | # these are generated by combining the multi_def information with the sample groups information and will be placed 8 | # where the <multi> placeholder is found in the step command lines. 9 | class Step 10 | attr_accessor :name, :run, :cpus, :mem, :nodes, :multi_def, :multi_cmd, :pre 11 | def initialize(name,step_instructions) 12 | @name = name 13 | parse_yaml(step_instructions) 14 | end 15 | 16 | def is_multi? 17 | return (self.multi_def.nil?) ? false : true 18 | end 19 | 20 | def has_prerequisite? 21 | return (self.pre.nil?) ? 
false : true 22 | end 23 | 24 | private 25 | 26 | def parse_yaml(step_instructions) 27 | self.cpus = step_instructions["cpu"].to_i 28 | self.nodes = step_instructions["nodes"] 29 | self.mem = step_instructions["mem"] 30 | self.run = step_instructions["run"] 31 | self.multi_def = step_instructions["multi"] 32 | self.pre = step_instructions["pre"] 33 | end 34 | 35 | end 36 | 37 | end 38 | end 39 | 40 | -------------------------------------------------------------------------------- /test/data/mapping.yml: -------------------------------------------------------------------------------- 1 | mapping: 2 | cpu: 8 3 | desc: Run BWA MEM and generates a sorted BAM file 4 | run: 5 | - mem -t -R '@RG\tID:\tLB:\tPL:ILLUMINA\tPU:\tCN:PTP\tSM:' .trim.fastq | view -bS - > .bam 6 | - sort -@ .bam .sort 7 | - rm -f .bam 8 | -------------------------------------------------------------------------------- /test/data/pipeline-enh.yml: -------------------------------------------------------------------------------- 1 | 2 | pipeline: gbs 3 | 4 | resources: 5 | fastqc: /storage/software/FastQC/fastqc 6 | bwa: /storage/software/bwa-0.7.5a/bwa 7 | samtools: /storage/software/samtools 8 | bgzip: /storage/software/tabix-0.2.6/bgzip 9 | tabix: /storage/software/tabix-0.2.6/tabix 10 | bcftools: /storage/software/bcftools 11 | vcfutils: /storage/software/vcfutils.pl 12 | adapters: /storage/software/Trimmomatic-0.30/adapters/TruSeq3-PE.fa 13 | trimmomatic: /storage/software/Trimmomatic-0.30/trimmomatic-0.30.jar 14 | gatk_jar: /storage/software/GenomeAnalysisTK-2.7-2-g6bda569/GenomeAnalysisTK.jar 15 | 16 | steps: 17 | <%= include :mapping, "./mapping.yml" %> 18 | 19 | index: 20 | desc: Make BAM index 21 | run: index .sort.bam 22 | 23 | gatk: 24 | desc: Run GATK to perform SNP and InDel calling, then compress and index the VCF file 25 | multi: -I .final.bam 26 | run: 27 | - java -Xmx4g -jar -T UnifiedGenotyper -R -mbq 30 -o all.gatk.vcf -nt -glm BOTH 28 | - all.gatk.vcf 29 | - -p vcf all.gatk.vcf.gz 30 | cpu: 20 31 | nodes: 2 32 | 33 | root_step: 34 | desc: root step to test dependencies 35 | run: 36 | - echo "root" 37 | 38 | child_step: 39 | desc: child step to test dependencies 40 | pre: root_step 41 | run: 42 | - echo "I am the child" 43 | -------------------------------------------------------------------------------- /test/data/pipeline.yml: -------------------------------------------------------------------------------- 1 | 2 | pipeline: simple mapping and variation calling 3 | 4 | resources: 5 | trimmomatic: /storage/software/trimmomatic 6 | adapters: /storage/software/trimmomatic/adapters.fa 7 | bwa: /storage/software/bwa 8 | samtools: /storage/software/samtools 9 | samblaster: /storage/software/samblaster 10 | freebayes: /storage/software/freebayes 11 | 12 | steps: 13 | 14 | trim: 15 | desc: Run Trimmomatic to remove adapters and low quality bases from reads 16 | run: 17 | - java -jar PE -threads -phred33 <(ls /*_R1_*.gz | xargs zcat) <(ls /*_R2_*.gz | xargs zcat) _R1_paired.fastq.gz _R1_unpaired.fastq.gz _R2_paired.fastq.gz _R2_unpaired.fastq.gz ILLUMINACLIP::2:30:10 LEADING:5 TRAILING:5 SLIDINGWINDOW:4:20 MINLEN:36 18 | - rm -f _R1_unpaired.fastq _R2_unpaired.fastq 19 | cpu: 8 20 | 21 | mapping: 22 | desc: Run BWA MEM and generates a sorted BAM file 23 | pre: trim 24 | run: 25 | - mem -t -R '@RG\tID:\tLB:\tPL:ILLUMINA\tPU:\tCN:PTP\tSM:' _R1_paired.fastq.gz _R2_paired.fastq.gz | -M | view -bS - > .bam 26 | - sort -@ .bam > .sort.bam 27 | - index .sort.bam 28 | - rm -f .bam 29 | cpu: 8 30 | 31 | freebayes: 32 | 
desc: Run FreeBayes to call variants on all samples 33 | multi: -b .sort.bam 34 | run: 35 | - --min-mapping-quality 30 --min-coverage 5 --min-alternate-fraction 0.5 -f -v all.fb.vcf 36 | cpu: 4 37 | 38 | root_step: 39 | desc: root step to test dependencies 40 | run: 41 | - echo "root" 42 | 43 | child_step: 44 | desc: child step to test dependencies 45 | pre: root_step 46 | run: 47 | - echo "I am the child" 48 | -------------------------------------------------------------------------------- /test/data/samples.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | index: /db/genome/human 3 | flowcell: ILLU_1234 4 | output: ./test 5 | genome: /db/genome/human.fa 6 | 7 | samples: 8 | Group1: 9 | sampleA: ./sampleA 10 | sampleB: ./sampleB 11 | Group2: 12 | sampleC: ./sampleC 13 | sampleD: ./sampleD 14 | -------------------------------------------------------------------------------- /test/examples/pipeline.yml: -------------------------------------------------------------------------------- 1 | pipeline: resequencing 2 | 3 | resources: 4 | bwa: /software/bwa 5 | samtools: /software/samtools 6 | pigz: /software/pigz 7 | 8 | steps: 9 | mapping: 10 | run: 11 | - ls /*_R1_*.gz | xargs zcat | -p 10 >> R1.fastq.gz 12 | - ls /*_R2_*.gz | xargs zcat | -p 10 >> R2.fastq.gz 13 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Sb - > .bam 14 | - rm -f R1.fastq.gz R2.fastq.gz 15 | cpu: 12 16 | -------------------------------------------------------------------------------- /test/examples/pipeline_multi.yml: -------------------------------------------------------------------------------- 1 | pipeline: resequencing 2 | 3 | resources: 4 | bwa: /software/bwa 5 | samtools: /software/samtools 6 | mark_dup: /software/picard-tools-1.77/MarkDuplicates.jar 7 | gatk: /software/GenomeAnalysisTK/GenomeAnalysisTK.jar 8 | 9 | steps: 10 | mapping: 11 | run: 12 | - ls /*_R1_*.gz | xargs zcat | pigz -p 10 >> R1.fastq.gz 13 | - ls /*_R2_*.gz | xargs zcat | pigz -p 10 >> R2.fastq.gz 14 | - sampe -P <( aln -t 4 -q 20 R1.fastq.gz) <( aln -t 4 -q 20 R2.fastq.gz) R1.fastq.gz R2.fastq.gz | view -Su - | java -Xmx4g -jar /storage/software/picard-tools-1.77/AddOrReplaceReadGroups.jar I=/dev/stdin O=.sorted.bam SO=coordinate LB= PL=illumina PU=PU SM= TMP_DIR=/data/tmp CREATE_INDEX=true MAX_RECORDS_IN_RAM=1000000 15 | - rm -f R1.fastq.gz R2.fastq.gz 16 | cpu: 12 17 | 18 | mark_dup: 19 | pre: mapping 20 | run: java -Xmx4g -jar VERBOSITY=INFO MAX_RECORDS_IN_RAM=500000 VALIDATION_STRINGENCY=SILENT INPUT=.sorted.bam OUTPUT=.md.sort.bam METRICS_FILE=.metrics REMOVE_DUPLICATES=false 21 | 22 | realign_target: 23 | pre: mark_dup 24 | run: java -Xmx4g -jar -T RealignerTargetCreator -I .md.sort.bam -nt 8 -R -o .indels.intervals 25 | cpu: 8 26 | -------------------------------------------------------------------------------- /test/examples/samples.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | index: /storage/genomes/bwa_index/genome 3 | genome: /storage/genomes/genome.fa 4 | output: ./working 5 | 6 | samples: 7 | sampleA: /ngs_reads/sampleA 8 | sampleB: /ngs_reads/sampleB 9 | sampleC: /ngs_reads/sampleC 10 | sampleD: /ngs_reads/sampleD 11 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require 'rubygems' 2 | require 'bundler' 3 | begin 4 | 
Bundler.setup(:default, :development) 5 | rescue Bundler::BundlerError => e 6 | $stderr.puts e.message 7 | $stderr.puts "Run `bundle install` to install missing gems" 8 | exit e.status_code 9 | end 10 | require 'test/unit' 11 | require 'shoulda' 12 | 13 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib')) 14 | $LOAD_PATH.unshift(File.dirname(__FILE__)) 15 | require 'bio-pipengine' 16 | 17 | class Test::Unit::TestCase 18 | end 19 | -------------------------------------------------------------------------------- /test/test_bioruby-pipengine.rb: -------------------------------------------------------------------------------- 1 | require 'helper' 2 | 3 | class TestBiorubyPipengine < Test::Unit::TestCase 4 | should "probably rename this file and start testing for real" do 5 | flunk "hey buddy, you should probably rename this file and start testing for real" 6 | end 7 | end 8 | --------------------------------------------------------------------------------