├── data-format
    ├── README.mkd
    ├── evaluation.mkd
    ├── sequence.mkd
    ├── binning.mkd
    └── profiling.mkd
├── container
    ├── profiling-evaluation
    │   ├── schema.yaml
    │   └── rfc.mkd
    ├── binning_evaluation
    │   ├── unsupervised_binning
    │   │   ├── input_schema.yaml
    │   │   └── rfc.mkd
    │   └── taxonomic_binning
    │   │   ├── input_schema.yaml
    │   │   └── rfc.mkd
    ├── assembly-evaluation
    │   ├── read-based-assembly-evaluation
    │   │   ├── input_schema.yaml
    │   │   └── rfc.mkd
    │   └── reference-based-assembly-evaluation
    │   │   ├── input_schema.yaml
    │   │   └── rfc.mkd
    ├── short-read-assembler
    │   ├── input_schema.yaml
    │   └── rfc.mkd
    ├── profiling
    │   ├── schema.yaml
    │   └── rfc.mkd
    ├── README.mkd
    └── binning
    │   ├── input_schema.yaml
    │   └── rfc.mkd
├── unsupervised_binning
    ├── input_schema.yaml
    └── rfc.mkd
├── LICENSE
├── databases
    ├── ncbi_taxonomy.txt
    └── blastdb.txt
├── README.mkd
└── rfc.mkd


/data-format/README.mkd:
--------------------------------------------------------------------------------
 1 | # Current and Previous Data Format Specifications
 2 | 
 3 | This is a list of permanent links to the latest major specification versions in the git history.
 4 | 
 5 | ## Profiling Output Format
 6 | [Version 0.9](https://github.com/bioboxes/rfc/blob/60263f34c57bc4137deeceec4c68a7f9f810f6a5/data-format/profiling.mkd)
 7 | 
 8 | ## Binning Output Format
 9 | [Version 0.9](https://github.com/bioboxes/rfc/blob/4bb19a633a6a969c2332f1f298852114c5f89b1b/data-format/binning.mkd)
10 | 
11 | ## Bioinformatics File Formats
12 | [Version 0.8](https://github.com/bioboxes/rfc/blob/b3b49b111704803e1427c82e2ecf87c5c8ffdfb9/data-format/sequence.mkd)
13 | 
14 | ## Evaluation Output Format
15 | [Version 0.1](https://github.com/bioboxes/rfc/blob/5f5305300f4609e5b4b477e6184a5d231455ebd0/data-format/evaluation.mkd)
16 | 


--------------------------------------------------------------------------------
/container/profiling-evaluation/schema.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 |   $schema: "http://json-schema.org/draft-04/schema#"
 3 |   title: "Bioboxes profiling benchmark input file validator"
 4 |   type: "object"
 5 |   additionalProperties: false
 6 |   required: 
 7 |     - "version"
 8 |     - "arguments"
 9 |   properties: 
10 |     version: 
11 |       type: "string"
12 |       pattern: "^0.1.\\d+$"
13 |     arguments: 
14 |       additionalProperties: false
15 |       type: "object"
16 |       required: 
17 |         - "ground_truth"
18 |         - "prediction"
19 |       properties: 
20 |         prediction: 
21 |           type: "object"
22 |           required: 
23 |             - "path"
24 |             - "format"
25 |           properties: 
26 |             format: 
27 |               enum: 
28 |                 - "bioboxes.org:/profiling:0.9"
29 |             path: {}
30 |         ground_truth: 
31 |           type: "object"
32 |           required: 
33 |             - "path"
34 |             - "format"
35 |           properties: 
36 |             format: 
37 |               enum: 
38 |                 - "bioboxes.org:/profiling:0.9"
39 |             path: {}
40 | 


--------------------------------------------------------------------------------
/unsupervised_binning/input_schema.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | "$schema": http://json-schema.org/draft-04/schema#
 3 | title: Bioboxes unsupervised binning benchmark input file validator
 4 | type: object
 5 | additionalProperties: false
 6 | required:
 7 | - version
 8 | - arguments
 9 | properties:
10 |   version:
11 |     type: string
12 |     pattern: "^0.11.\\d+$"
13 |   arguments:
14 |     additionalProperties: false
15 |     type: array
16 |     required:
17 |     - labels
18 |     - predictions
19 |     properties:
20 |       sequences:
21 |         type: object
22 |         required:
23 |         - value
24 |         - type
25 |         properties:
26 |           id: {}
27 |           type:
28 |             enum:
29 |             - contig
30 |           value: {}
31 |       labels:
32 |         type: object
33 |         required:
34 |         - type
35 |         - value
36 |         properties:
37 |           type:
38 |             enum:
39 |             - binning
40 |           value: {}
41 |       predictions:
42 |         type: object
43 |         required:
44 |         - value
45 |         properties:
46 |           type:
47 |             enum:
48 |             - binning
49 |           value: {}
50 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 bioinformatics-container-standards
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/container/binning_evaluation/unsupervised_binning/input_schema.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | "$schema": http://json-schema.org/draft-04/schema#
 3 | title: Bioboxes unsupervised binning benchmark input file validator
 4 | type: object
 5 | additionalProperties: false
 6 | required:
 7 | - version
 8 | - arguments
 9 | properties:
10 |   version:
11 |     type: string
12 |     pattern: "^0.11.\\d+$"
13 |   arguments:
14 |     additionalProperties: false
15 |     type: array
16 |     required:
17 |     - labels
18 |     - predictions
19 |     properties:
20 |       sequences:
21 |         type: object
22 |         required:
23 |         - value
24 |         - type
25 |         properties:
26 |           id: {}
27 |           type:
28 |             enum:
29 |             - contig
30 |           value: {}
31 |       labels:
32 |         type: object
33 |         required:
34 |         - type
35 |         - value
36 |         properties:
37 |           type:
38 |             enum:
39 |             - binning
40 |           value: {}
41 |       predictions:
42 |         type: object
43 |         required:
44 |         - value
45 |         properties:
46 |           type:
47 |             enum:
48 |             - binning
49 |           value: {}
50 | 


--------------------------------------------------------------------------------
/databases/ncbi_taxonomy.txt:
--------------------------------------------------------------------------------
 1 | pub/taxonomy
 2 | pub/taxonomy/.listing
 3 | pub/taxonomy/Ccode_dump.txt
 4 | pub/taxonomy/Cowner_dump.txt
 5 | pub/taxonomy/Icode_dump.txt
 6 | pub/taxonomy/coll_dump.txt
 7 | pub/taxonomy/gi_taxid.readme
 8 | pub/taxonomy/gi_taxid_nucl.dmp.gz
 9 | pub/taxonomy/gi_taxid_nucl.zip
10 | pub/taxonomy/gi_taxid_nucl_diff.dmp.gz
11 | pub/taxonomy/gi_taxid_nucl_diff.zip
12 | pub/taxonomy/gi_taxid_prot.dmp.gz
13 | pub/taxonomy/gi_taxid_prot.zip
14 | pub/taxonomy/gi_taxid_prot_diff.dmp.gz
15 | pub/taxonomy/gi_taxid_prot_diff.zip
16 | pub/taxonomy/taxcat.tar.Z
17 | pub/taxonomy/taxcat.tar.Z.md5
18 | pub/taxonomy/taxcat.tar.gz
19 | pub/taxonomy/taxcat.tar.gz.md5
20 | pub/taxonomy/taxcat.zip
21 | pub/taxonomy/taxcat.zip.md5
22 | pub/taxonomy/taxcat_readme.txt
23 | pub/taxonomy/taxdmp.zip
24 | pub/taxonomy/taxdmp.zip.md5
25 | pub/taxonomy/taxdump.tar.Z
26 | pub/taxonomy/taxdump.tar.Z.md5
27 | pub/taxonomy/taxdump.tar.gz
28 | pub/taxonomy/taxdump.tar.gz.md5
29 | pub/taxonomy/taxdump_readme.txt
30 | pub/taxonomy/citations.dmp
31 | pub/taxonomy/delnodes.dmp
32 | pub/taxonomy/division.dmp
33 | pub/taxonomy/gc.prt
34 | pub/taxonomy/gencode.dmp
35 | pub/taxonomy/merged.dmp
36 | pub/taxonomy/names.dmp
37 | pub/taxonomy/nodes.dmp
38 | pub/taxonomy/readme.txt
39 | 


--------------------------------------------------------------------------------
/container/assembly-evaluation/read-based-assembly-evaluation/input_schema.yaml:
--------------------------------------------------------------------------------
 1 | "$schema": "http://json-schema.org/draft-04/schema#"
 2 | title: "Bioboxes read based assembly validator"
 3 | type: object
 4 | properties: 
 5 |  version: 
 6 |   type: string
 7 |   pattern: "^0.2.\\d+$"
 8 |  arguments: 
 9 |   type: object
10 |   additionalProperties: false
11 |   properties: 
12 |    reads: 
13 |     type: array
14 |     uniqueItems: true
15 |     minItems: 1
16 |     items: 
17 |      type: object
18 |      additionalProperties: false
19 |      required: 
20 |       - path
21 |      properties: 
22 |       id: 
23 |       format: 
24 |        enum: 
25 |         - "bioboxes.org:/fastq"
26 |       type: 
27 |        enum: 
28 |         - paired
29 |         - single
30 |       path: 
31 |    assemblies: 
32 |     type: array
33 |     uniqueItems: true
34 |     minItems: 1
35 |     items: 
36 |      type: object
37 |      additionalProperties: false
38 |      required: 
39 |       - path
40 |      properties: 
41 |       id: 
42 |       format: 
43 |        enum: 
44 |         - "bioboxes.org:/fasta"
45 |       type: 
46 |        enum: 
47 |         - contig
48 |         - scaffold
49 |       path: 
50 | required: 
51 |  - version
52 |  - arguments
53 | additionalProperties: false
54 | 


--------------------------------------------------------------------------------
/container/short-read-assembler/input_schema.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | $schema: "http://json-schema.org/draft-04/schema#"
 3 | title: "Bioboxes short read assembler input file validator"
 4 | type: "object"
 5 | properties: 
 6 |     version: 
 7 |       type: "string"
 8 |       pattern: "^0.9.\\d+$"
 9 |     arguments: 
10 |       type: "array"
11 |       minItems: 1
12 |       maxItems: 2
13 |       items: 
14 |         oneOf: 
15 |           - 
16 |             $ref: "#/definitions/fastq"
17 |           - 
18 |             $ref: "#/definitions/fragment"
19 | required: 
20 |     - "version"
21 |     - "arguments"
22 | additionalProperties: false
23 | definitions: 
24 |     fastq: 
25 |       type: "object"
26 |       additionalProperties: false
27 |       required: 
28 |         - "fastq"
29 |       properties: 
30 |         fastq: 
31 |           $ref: "#/definitions/values"
32 |     fragment: 
33 |       type: "object"
34 |       additionalProperties: false
35 |       properties: 
36 |         fragment_size: 
37 |           $ref: "#/definitions/values"
38 |     values: 
39 |       type: "array"
40 |       uniqueItems: true
41 |       minItems: 1
42 |       items: 
43 |         type: "object"
44 |         additionalProperties: false
45 |         required: 
46 |           - "id"
47 |           - "value"
48 |         properties: 
49 |           id: {}
50 |           type: {}
51 |           value: {}
52 | 


--------------------------------------------------------------------------------
/container/profiling/schema.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | "$schema": http://json-schema.org/draft-04/schema#
 3 | title: Bioboxes Profiling
 4 | type: object
 5 | properties:
 6 |   version:
 7 |     type: string
 8 |     pattern: "^1.0.\\d+$"
 9 |   arguments:
10 |     type: array
11 |     required:
12 |     - fastq
13 |     - database
14 |     additionalItems: false
15 |     minItems: 1
16 |     items:
17 |       oneOf:
18 |       - "$ref": "#/definitions/fastq"
19 |       - "$ref": "#/definitions/database"
20 |       - "$ref": "#/definitions/cache"
21 | required:
22 | - version
23 | - arguments
24 | additionalProperties: false
25 | definitions:
26 |   fastq:
27 |     type: object
28 |     minItems: 1
29 |     required:
30 |     - fastq
31 |     properties:
32 |       fastq:
33 |         "$ref": "#/definitions/values"
34 |   values:
35 |     type: array
36 |     uniqueItems: true
37 |     minItems: 1
38 |     items:
39 |       type: object
40 |       additionalProperties: false
41 |       required:
42 |       - type
43 |       - value
44 |       properties:
45 |         type: {}
46 |         value: {}
47 |   cache:
48 |     type: object
49 |     required:
50 |     - cache
51 |     properties:
52 |       cache:
53 |         required:
54 |         - type
55 |         - value
56 |         properties:
57 |           type: {}
58 |           value: {}
59 |   database:
60 |     required:
61 |     - database
62 |     properties:
63 |       database:
64 |         required:
65 |         - type
66 |         - value
67 |         properties:
68 |           type: {}
69 |           value: {}
70 | 


--------------------------------------------------------------------------------
/container/assembly-evaluation/reference-based-assembly-evaluation/input_schema.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 |   $schema: "http://json-schema.org/draft-04/schema#"
 3 |   title: "Bioboxes reference based assembly evaluation input file validator"
 4 |   type: "object"
 5 |   properties: 
 6 |     version: 
 7 |       type: "string"
 8 |       pattern: "^0.9.\\d+$"
 9 |     arguments: 
10 |       type: "array"
11 |       minItems: 1
12 |       maxItems: 3
13 |       items: 
14 |         oneOf: 
15 |           - 
16 |             $ref: "#/definitions/fasta"
17 |           - 
18 |             $ref: "#/definitions/fasta_dir"
19 |           - 
20 |             $ref: "#/definitions/cache"
21 |   required: 
22 |     - "version"
23 |     - "arguments"
24 |   additionalProperties: false
25 |   definitions: 
26 |     fasta: 
27 |       type: "object"
28 |       additionalProperties: false
29 |       required: 
30 |         - "fasta"
31 |       properties: 
32 |         fasta: 
33 |           $ref: "#/definitions/values"
34 |     cache: 
35 |       type: "object"
36 |       additionalProperties: false
37 |       required: 
38 |         - "cache"
39 |       properties: 
40 |         cache: {}
41 |     fasta_dir: 
42 |       type: "object"
43 |       additionalProperties: false
44 |       required: 
45 |         - "fasta_dir"
46 |       properties: 
47 |         fasta_dir: {}
48 |     values: 
49 |       type: "array"
50 |       uniqueItems: true
51 |       minItems: 1
52 |       items: 
53 |         type: "object"
54 |         additionalProperties: false
55 |         required: 
56 |           - "id"
57 |           - "value"
58 |         properties: 
59 |           id: {}
60 |           type: {}
61 |           value: {}
62 | 


--------------------------------------------------------------------------------
/container/README.mkd:
--------------------------------------------------------------------------------
 1 | # Current and Previous Container Specifications
 2 | 
 3 | This is a list of permanent links to the latest major specification versions in the git history.
 4 | 
 5 | ## Assembly
 6 | 
 7 | [Assembly container version 0.9](https://github.com/bioboxes/rfc/blob/f551c515b5f7f1db4e18282207dd89f9bcf3ea25/container/short-read-assembler/rfc.mkd)
 8 | 
 9 | [Assembly container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/short-read-assembler.mkd)
10 | 
11 | [Assembly benchmarking container version 0.9](https://github.com/bioboxes/rfc/blob/1a3e2f14188dcd841cdc82e5b442798eb7d795f2/container/assembly-evaluation/rfc.mkd)
12 | 
13 | [Assembly reference-based benchmarking container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/genome-assembly-reference-benchmarking.mkd)
14 | 
15 | ## Binning
16 | 
17 | [Binning container version 0.9](https://github.com/bioboxes/rfc/blob/3835b5721dc03f2fc10d8c9139f7f201ced7ccfe/container/binning/rfc.mkd)
18 | 
19 | [Binning container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/binning.mkd)
20 | 
21 | [Binning evaluation container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/binning-evaluation.mkd)
22 | 
23 | ## Profiling
24 | 
25 | [Profiling container version 0.8.2](https://github.com/bioboxes/rfc/blob/f03f05e70972aa2eb3716f57f017d1ec704a84f2/container/profiling.mkd)
26 | 
27 | 
28 | [Profiling container version 0.8](https://github.com/bioboxes/rfc/blob/5a23b8a40ab67541a9a851c765872aea5c0336f9/container/profiling.mkd)
29 | 


--------------------------------------------------------------------------------
/data-format/evaluation.mkd:
--------------------------------------------------------------------------------
 1 | # Evaluation Output Specification
 2 | 
 3 | *  Version: 0.1.1
 4 | *  Maintainer: Peter Belmann pbelmann@cebitec.uni-bielefeld.de
 5 | *  Authors: Ivan Gregor Ivan.Gregor@uni-duesseldorf.de, Johannes Dröge johannes.droege@uni-duesseldorf.de, Michael Barton mail@michaelbarton.me.uk, Peter Belmann pbelmann@cebitec.uni-bielefeld.de
 6 | 
 7 | This document defines the YAML file that is produced by an evaluation container.  
 8 | 
 9 | ### General Definition
10 | 
11 | This YAML has the following structure: 
12 | 
13 | ```YAML
14 | ---
15 | version: NUMBER.NUMBER.NUMBER
16 | results: ARRAY
17 | ```
18 | 
19 | * version: Version number must match the regular expression `[0-9\.]` 
20 | 
21 | * results: The results property must have an array as value. Each results item consists of the 
22 | following properties:
23 | 
24 | ### Results Item
25 | 
26 | * name: 
27 | 
28 |   The name is an arbitrary string.
29 | 
30 | * type:
31 | 
32 |   Type attribute has the following structure:
33 | 
34 | ```YAML
35 |   type: txt|png|html|tsv|csv
36 | ```
37 |   
38 | * inline: 
39 | 
40 |   Indicates whether the metric is represented inline in the **value** field or in an external file. 
41 |   
42 | * value:
43 | 
44 |   If the **inline** property is false then the value is the absolute path to a file.
45 |   If the **inline** property is true then the value contains the metric. 
46 | 
47 | * description:
48 | 
49 |   Description for the evaluation method.
50 | 
51 | ## Example
52 | 
53 | ```YAML
54 | version: 0.1.1
55 | results:
56 |   - name: N50
57 |     description: N50 is the length for which the collection of all contigs of that length or longer covers at least half an assembly.
58 |     value: 42
59 |     type: txt
60 |     inline: true
61 |   - name: my metric
62 |     type: csv
63 |     inline: false
64 |     description: Method that produces confusion matrices
65 |     value: /path/to/file 
66 | ```
67 | 


--------------------------------------------------------------------------------
/unsupervised_binning/rfc.mkd:
--------------------------------------------------------------------------------
 1 | ## Binning evaluation container
 2 | 
 3 |  * Version:    0.11.0
 4 |  * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
 5 | 
 6 | ### Contents
 7 | 
 8 | 
 9 | ### Outline
10 | 
11 | This specification describes the interface for containerised binning evaluation applications. 
12 | An unsupervised binning evaluation application should validate the
13 | [BINID](https://github.com/bioboxes/rfc/blob/master/data-format/binning.mkd#the-binning-output-format)
14 | values assigned by a binning container.
15 | 
16 | ### Input
17 | 
18 | #### General Definition
19 | 
20 | A biobox requires an input YAML with the following definition 
21 | 
22 | ```YAML
23 | ---
24 | version: NUMBER.NUMBER.NUMBER
25 | arguments:
26 |   - fasta:
27 |       value: STRING
28 |       type: contig
29 |   - labels:
30 |       value: STRING
31 |       type: binning
32 |   - predictions:
33 |       value: STRING
34 |       type: binning
35 | ```
36 | 
37 | ##### Description:
38 | 
39 | * **version**: The current version is specified directly under the heading.
40 | * **arguments**: The arguments field consists of the following fields 
41 |        * **fasta**: The input FASTA file for the binned sequences.
42 |        * **labels**: The correct binning in bioboxes.org binning format.
43 |        * **predictions**: The predicted binning in bioboxes.org binning format.
44 |        
45 | ##### Mounts:
46 |  * Your output directory MUST be mounted to /bbx/mnt/output
47 |  * Your input files MUST be mounted to /bbx/mnt/input
48 |  * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml
49 | 
50 | ##### File/folder object entries
51 | 
52 | ```YAML
53 |   value: STRING
54 |   type: String
55 | ```
56 | 
57 | * `value` means absolute path to file in container
58 | * `type` specifies the semantic type
59 | 
60 | ### Output
61 | 
62 | On a successful run, the biobox produces a `biobox.yaml` file, which can be found in the `bbx` directory in your mounted output directory. The output `biobox.yaml` must be in the evaluation-specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd).
63 | 


--------------------------------------------------------------------------------
/container/binning_evaluation/unsupervised_binning/rfc.mkd:
--------------------------------------------------------------------------------
 1 | ## Binning evaluation container
 2 | 
 3 |  * Version:    0.11.0
 4 |  * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
 5 | 
 6 | ### Contents
 7 | 
 8 | 
 9 | ### Outline
10 | 
11 | This specification describes the interface for containerised binning evaluation applications. 
12 | An unsupervised binning evaluation application should validate the
13 | [BINID](https://github.com/bioboxes/rfc/blob/master/data-format/binning.mkd#the-binning-output-format)
14 | values assigned by a binning container.
15 | 
16 | ### Input
17 | 
18 | #### General Definition
19 | 
20 | A biobox requires an input YAML with the following definition 
21 | 
22 | ```YAML
23 | ---
24 | version: NUMBER.NUMBER.NUMBER
25 | arguments:
26 |   - fasta:
27 |       value: STRING
28 |       type: contig
29 |   - labels:
30 |       value: STRING
31 |       type: binning
32 |   - predictions:
33 |       value: STRING
34 |       type: binning
35 | ```
36 | 
37 | ##### Description:
38 | 
39 | * **version**: The current version is specified directly under the heading.
40 | * **arguments**: The arguments field consists of the following fields 
41 |        * **fasta**: The input FASTA file for the binned sequences.
42 |        * **labels**: The correct binning in bioboxes.org binning format.
43 |        * **predictions**: The predicted binning in bioboxes.org binning format.
44 |        
45 | ##### Mounts:
46 |  * Your output directory MUST be mounted to /bbx/mnt/output
47 |  * Your input files MUST be mounted to /bbx/mnt/input
48 |  * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml
49 | 
50 | ##### File/folder object entries
51 | 
52 | ```YAML
53 |   value: STRING
54 |   type: String
55 | ```
56 | 
57 | * `value` means absolute path to file in container
58 | * `type` specifies the semantic type
59 | 
60 | ### Output
61 | 
62 | On a successful run, the biobox produces a `biobox.yaml` file, which can be found in the `bbx` directory in your mounted output directory. The output `biobox.yaml` must be in the evaluation-specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd).
63 | 


--------------------------------------------------------------------------------
/container/binning_evaluation/taxonomic_binning/input_schema.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | $schema: "http://json-schema.org/draft-04/schema#"
 3 | title: "Bioboxes taxonomic binning benchmark input file validator"
 4 | type: "object"
 5 | additionalProperties: false
 6 | required: 
 7 |     - "version"
 8 |     - "arguments"
 9 | properties: 
10 |   version: 
11 |     type: "string"
12 |     pattern: "^0.10.\\d+$"
13 |   arguments: 
14 |     additionalProperties: false
15 |     type: object
16 |     required:
17 |        - labels
18 |        - predictions
19 |     properties:
20 |       sequences: 
21 |         type: "object"
22 |         required: 
23 |           - "path"
24 |           - "format"
25 |         properties: 
26 |           format: 
27 |             enum:
28 |                - "bioboxes.org:/fasta"
29 |           id: {}
30 |           type:
31 |             enum:
32 |                - "contig"
33 |           path: {}
34 |       labels: 
35 |         type: "object"
36 |         required: 
37 |           - "format"
38 |           - "path"
39 |         properties: 
40 |           format: 
41 |             enum:
42 |                - "bioboxes.org:/binning/binning:0.9/taxbinning"
43 |           id: {}
44 |           type:
45 |             enum:
46 |                - "binning"
47 |           path: {}
48 |       predictions:
49 |         type: "object"
50 |         required: 
51 |           - "path"
52 |           - "format"
53 |         properties: 
54 |           format: 
55 |             enum:
56 |                - "bioboxes.org:/binning/binning:0.9/taxbinning"
57 |           id: {}
58 |           type:
59 |             enum:
60 |                - "binning"
61 |           path: {}
62 |       databases:
63 |         type: "object"
64 |         properties: 
65 |           taxonomy:
66 |             type: object
67 |             required: 
68 |               - "path"
69 |               - "format"
70 |             properties: 
71 |               format: 
72 |                 enum:
73 |                  - "bioboxes.org:/taxonomy_ncbi_dumps"
74 |               id: {}
75 |               type: 
76 |                 enum: 
77 |                   - "ncbi"
78 |               path: {}
79 | 


--------------------------------------------------------------------------------
/README.mkd:
--------------------------------------------------------------------------------
 1 | <p align="center"> <img width="800px" src="https://raw.githubusercontent.com/bioboxes/logo/0c6bfccd2440ccf06356243ba71b32588ad40617/logo_light.png" /></p>
 2 | 
 3 | ## Outline
 4 | 
 5 | Software containers have the potential to solve the common problem in
 6 | bioinformatics where complex dependencies can make installing and using a tool
 7 | difficult. Containerisation allows any developer to include all the required
 8 | dependencies along with their tool to provide the end-user with everything they
 9 | need to start using it.
10 | 
11 | Two existing projects have taken advantage of this concept to benchmark
12 | bioinformatics software inside containers: [CAMI](http://cami-challenge.org/)
13 | and [nucleotid.es](http://nucleotid.es). We, the developers from these two
14 | projects, met to agree a standard so that containers created by one project
15 | would be usable by another through the same interface.
16 | 
17 | The aim of this RFC is to create a standard for well-defined bioinformatics
18 | applications. This standard will put the users of bioinformatics software
19 | first, so that a community-agreed interface allows the use of different tools
20 | regardless of where or by whom they were developed. We welcome contributions and
21 | suggestions from other developers with the aim of creating a standard that everyone
22 | can follow and agree on.
23 | 
24 | ## Development process
25 | 
26 | The development process for bioboxes is outlined on bioboxes.org, which has
27 | information for [beginners to get started with bioboxes][started] and [how to
28 | make contributions to bioboxes][contribute].
29 | 
30 | [started]: http://bioboxes.org/guide/user/
31 | [contribute]: http://bioboxes.org/contribute/getting-started/
32 | 
33 | ### Core team
34 | 
35 | There is a core team who work on developing the bioboxes RFCs. The core team's
36 | goal is to develop the RFC and resolve issues. The members should generally be
37 | selected from those who are already actively involved in bioboxes. The size of
38 | the core team should be small to enable decisions to be made quickly. A core
39 | team member should be willing to:
40 | 
41 |   * Actively follow, resolve issues and answer questions on the bioboxes github
42 |     issue tracker.
43 |   * Meet for a 30 minute teleconference on a biweekly basis to discuss the
44 |     progress of bioboxes.
45 | 


--------------------------------------------------------------------------------
/container/profiling-evaluation/rfc.mkd:
--------------------------------------------------------------------------------
 1 | ## Profiling evaluation container
 2 | 
 3 |  * Version:    0.1.0
 4 |  * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
 5 | 
 6 | ### Contents
 7 | 
 8 | ### Outline
 9 | 
10 | This specification describes the interface for containerised profiling evaluation applications. 
11 | In addition to the specifications described below, this container MUST implement the
12 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 
13 | 
14 | ### Input
15 | 
16 | #### General Definition
17 | 
18 | A biobox requires an input YAML with the following definition 
19 | 
20 | ```YAML
21 | ---
22 | version: NUMBER.NUMBER.NUMBER
23 | arguments:
24 |   prediction:
25 |     path: STRING
26 |     format: bioboxes.org:/profiling:0.9
27 |   ground_truth:
28 |     path: STRING
29 |     format: bioboxes.org:/profiling:0.9
30 | ```
31 | 
32 | ##### Description:
33 | 
34 | * **version**: The current version is specified directly under the heading.
35 | * **arguments**: The arguments field consists of the following fields 
36 |        * **prediction**: Profiling prediction in bioboxes.org profiling format.
37 |        * **ground_truth**: Profiling ground truth/gold standard in bioboxes.org profiling format.
38 | * **path**: Path MUST begin with a slash ('/'), which points to a profiling file. This file has to be mounted to a path that is prefixed by `/bbx/mnt/input`.
39 | 
40 | ##### Mounts:
41 |  * Your output directory MUST be mounted to /bbx/mnt/output
42 |  * Your input files MUST be mounted to /bbx/mnt/input
43 |  * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml
44 |  * If the directory /bbx/mnt/metadata is mounted, then the following files should be placed inside the directory:
45 |    * `log.txt`: Logging information that is generated by the application inside the container.
46 | 
47 | ##### Formats
48 | * `bioboxes.org:/profiling:0.9`: bioboxes.org profiling file in version 0.9
49 | 
50 | ### Output
51 | On a successful run, the biobox produces a `biobox.yaml` file, which can be found in your mounted output directory. The output `biobox.yaml` must be in the evaluation-specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd).
52 | 
53 | ### Example
54 | 
55 | ```YAML
56 | ---
57 | version: 0.1.0
58 | arguments:
59 |   prediction:
60 |     path: /bbx/mnt/input/prediction.txt
61 |     format: bioboxes.org:/profiling:0.9
62 |   ground_truth:
63 |     path: /bbx/mnt/input/ground_truth.txt
64 |     format: bioboxes.org:/profiling:0.9
65 | ```
66 | 


--------------------------------------------------------------------------------
/container/binning/input_schema.yaml:
--------------------------------------------------------------------------------
  1 | ---
  2 | $schema: "http://json-schema.org/draft-04/schema#"
  3 | title: "Bioboxes binning input file validator"
  4 | type: "object"
  5 | properties: 
  6 |   version: 
  7 |     type: "string"
  8 |     pattern: "^0.9.\\d+$"
  9 |   arguments: 
 10 |     type: "array"
 11 |     minItems: 1
 12 |     maxItems: 4
 13 |     uniqueItems: true
 14 |     items: 
 15 |       - 
 16 |         $ref: "#/definitions/fasta"
 17 |     additionalItems: 
 18 |       anyOf: 
 19 |         - 
 20 |           $ref: "#/definitions/fastq"
 21 |         - 
 22 |           $ref: "#/definitions/databases"
 23 |         - 
 24 |           $ref: "#/definitions/cache"
 25 | required: 
 26 |   - "version"
 27 |   - "arguments"
 28 | additionalProperties: false
 29 | definitions: 
 30 |   fasta: 
 31 |     type: "object"
 32 |     additionalProperties: false
 33 |     required: 
 34 |       - "fasta"
 35 |     properties: 
 36 |       fasta: 
 37 |         type: "object"
 38 |         additionalProperties: false
 39 |         required: 
 40 |           - "id"
 41 |           - "value"
 42 |         properties: 
 43 |           id: {}
 44 |           type: {}
 45 |           value: {}
 46 |   fastq: 
 47 |     type: "object"
 48 |     additionalProperties: false
 49 |     required: 
 50 |       - "fastq"
 51 |     properties: 
 52 |       fastq: 
 53 |         $ref: "#/definitions/values"
 54 |   cache: 
 55 |     type: "object"
 56 |     additionalProperties: false
 57 |     required: 
 58 |       - "cache"
 59 |     properties: 
 60 |       cache: {}
 61 |   databases: 
 62 |     type: "object"
 63 |     additionalProperties: false
 64 |     required: 
 65 |       - "databases"
 66 |     properties: 
 67 |       databases: 
 68 |         $ref: "#/definitions/database_values"
 69 |   values: 
 70 |     type: "array"
 71 |     uniqueItems: true
 72 |     minItems: 1
 73 |     items: 
 74 |       type: "object"
 75 |       additionalProperties: false
 76 |       required: 
 77 |         - "id"
 78 |         - "value"
 79 |       properties: 
 80 |         id: {}
 81 |         type: {}
 82 |         value: {}
 83 |   database_values: 
 84 |     type: "array"
 85 |     uniqueItems: true
 86 |     minItems: 1
 87 |     items: 
 88 |       type: "object"
 89 |       additionalProperties: false
 90 |       required: 
 91 |         - "id"
 92 |         - "value"
 93 |       properties: 
 94 |         id: 
 95 |           enum: 
 96 |             - "ncbi_taxonomy"
 97 |             - "refseq"
 98 |             - "blastdb"
 99 |             - "cog"
100 |             - "ncbi_genomes"
101 |         value: {}
102 | 


--------------------------------------------------------------------------------
/container/binning_evaluation/taxonomic_binning/rfc.mkd:
--------------------------------------------------------------------------------
 1 | ## Taxonomic binning evaluation container
 2 | 
 3 |  * Version:    0.10.0
 4 |  * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
 5 | 
 6 | ### Contents
 7 | 
 8 | * Outline
 9 | * Input
10 |   * General Definition
11 |     * Description
12 |     * File/folder object entries
13 |     * Formats
14 | * Output
15 | 
16 | ### Outline
17 | 
18 | This specification describes the interface for containerised binning evaluation applications. 
19 | A binning evaluation application should validate the assigned TAXID
20 | (https://github.com/bioboxes/rfc/blob/master/data-format/binning.mkd#the-binning-output-format) 
21 | of a binning container.
22 | 
23 | In addition to the specifications described below, this container MUST implement the
24 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 
25 | 
26 | 
27 | ### Input
28 | 
29 | #### General Definition
30 | 
31 | A biobox requires an input YAML with the following definition 
32 | 
33 | ```YAML
34 | ---
35 | version: NUMBER.NUMBER.NUMBER
36 | arguments:
37 |   sequences:
38 |     path: STRING
39 |     id: STRING
40 |     type: contig
41 |     format: bioboxes.org:/fasta
42 |   labels:
43 |     path: STRING
44 |     id: STRING
45 |     type: binning
46 |     format: bioboxes.org:/binning/binning:0.9/taxbinning
47 |   predictions:
48 |     path: STRING
49 |     id: STRING
50 |     type: binning
51 |     format: bioboxes.org:/binning/binning:0.9/taxbinning
52 |   databases:
53 |     taxonomy:
54 |       path: STRING
55 |       id: STRING
56 |       type: ncbi
57 |       format: bioboxes.org:/taxonomy_ncbi_dumps
58 | ```
59 | 
60 | ##### Description:
61 | 
62 | * **version**: The current version is specified directly under the heading.
63 | * **arguments**: The arguments field consists of the following fields 
64 |        * **sequences**: The input FASTA file for the binned sequences.
65 |        * **labels**: The correct binning in bioboxes.org (taxonomic) binning format.
66 |        * **predictions**: The predicted binning in bioboxes.org (taxonomic) binning format.
67 |        * databases
68 |          * **taxonomy**: The corresponding taxonomy in NCBI format.
69 |        
70 | ##### Mounts:
71 |  * Your output directory MUST be mounted to /bbx/mnt/output
72 |  * Your input files MUST be mounted to /bbx/mnt/input
73 |  * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml
74 | 
75 | ##### File/folder object entries
76 | 
77 | ```YAML
78 |   path: STRING
79 |   id: STRING
80 |   type: STRING
81 |   format: STRING
82 | ```
83 | 
84 | * `path` means absolute path to file in container
85 | * `id` is a unique id for the file (optional)
86 | * `type` specifies the semantic type (optional)
87 | * `format` gives a machine-checkable type definition (will be transformed to YAML tag in future)
88 | 
89 | ##### Formats
90 | * `bioboxes.org:/fasta`: FASTA file
91 | * `bioboxes.org:/binning/binning:0.9/taxbinning`: bioboxes.org binning file in version 0.9 with column TAXID
92 | * `bioboxes.org:/taxonomy_ncbi_dumps`: A folder containing at least the NCBI taxonomy dump files names.dmp and nodes.dmp
93 | 
94 | ### Output
95 | 
96 | On a successful run the biobox produces a `biobox.yaml`, which can be found in the `bbx` directory in your mounted output directory. The output `biobox.yaml` must be in the evaluation-specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd).
97 | 
98 | 


--------------------------------------------------------------------------------
/container/profiling/rfc.mkd:
--------------------------------------------------------------------------------
  1 | ## Profiling container
  2 | 
  3 |  * Version: 1.0.0
  4 |  * Maintainer: Johannes Dröge <johannes.droege@uni-duesseldorf.de>
  5 | 
  6 | ### Contents
  7 | 
  8 | * Outline
  9 | * Input
 10 |   * General Definition
 11 |     * Description
 12 |     * Mounts
 13 |     * File/Folder Object Definition
 14 | * Output
 15 |   * General Definition
 16 |   * Description
 17 |   * Mounts
 18 | * Signature
 19 | * Example
 20 |   
 21 | ### Outline
 22 | 
 23 | This specification describes the interface for containerised profiling applications. 
 24 | A profiling application gives an insight into the composition of the microbial community by assigning percentage values to taxonomic identifiers.
 25 | 
 26 | In addition to the specifications described below, this container MUST implement the
 27 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 
 28 | 
 29 | ### Input
 30 | 
 31 | #### General Definition
 32 | 
 33 | A biobox requires an input YAML with the following definition 
 34 | 
 35 | ~~~YAML
 36 | version: 1.0.0
 37 | arguments:
 38 |   - fastq:
 39 |     - type: fastq
 40 |       value: STRING
 41 |   - database:
 42 |       type: bioboxes.org:/taxonomy_ncbi_dumps 
 43 |       value: STRING
 44 |   - cache:
 45 |       type: directory
 46 |       value: STRING
 47 | ~~~
 48 | 
 49 | ##### Description:
 50 | 
 51 | * **version**: The current version is specified directly under the heading.
 52 | * **arguments**: The arguments field consists of the following fields 
 53 |        * **fastq**: An array of gzipped fastq sequence libraries.
 54 |        * **database**: The taxonomy database. A directory containing nodes.dmp and names.dmp. 
 55 |        * **cache**: Path to a cache directory.
 56 | 
 57 | ##### Mounts:
 58 |  * Your output directory MUST be mounted to /bbx/mnt/output
 59 |  * Your input files MUST be mounted to /bbx/mnt/input
 60 |  * The input biobox.yaml MUST be placed as /bbx/mnt/input/biobox.yaml
 61 | 
 62 | ##### File/Folder object entries
 63 | 
 64 | ```YAML
 65 |   type: STRING
 66 |   value: STRING
 67 | ```
 68 | 
 69 | * `value` means absolute path to file in container
 70 | * `type` gives a machine-checkable type definition
 71 | 
 72 | ### Outputs
 73 | 
 74 | #### General Definition
 75 | 
 76 | ~~~YAML
 77 | ---
 78 | version: NUMBER.NUMBER.NUMBER
 79 | arguments: 
 80 |     profiling:
 81 |        - value: STRING
 82 |          type: bioboxes.org:/profiling:0.9
 83 | ~~~
 84 | 
 85 | #### Description:
 86 | 
 87 | This yaml with the name biobox.yaml will be available on a successful run and can be found in your mounted output directory.
 88 | 
 89 | * version: The current version is specified directly under the heading.
 90 | * arguments: The arguments field consists of the profiling field.
 91 | 
 92 | #### Mounts:
 93 | 
 94 | If the directory /bbx/mnt/metadata is mounted then the following files should be placed inside the directory:
 95 | log.txt Logging information that is generated by the application inside the container.
 96 | 
 97 | ### Signature
 98 | 
 99 | Any biobox based profiling tool accepts at least one of the following signatures:
100 | 
101 |     fastq A, database B, Maybe cache -> profiling C
102 | 
103 | ### Example
104 | 
105 | This is an example biobox.yaml file:
106 | 
107 | ~~~YAML
108 | version: 1.0.0
109 | arguments:
110 |   - fastq:
111 |     - type: fastq
112 |       value: /path/to/fastq
113 |   - database:
114 |       type: bioboxes.org:/taxonomy_ncbi_dumps
115 |       value: /path/to/ncbi_dump
116 |   - cache:
117 |       type: directory
118 |       value: /path/to/cache/directory
119 | ~~~
120 | 


--------------------------------------------------------------------------------
/container/assembly-evaluation/reference-based-assembly-evaluation/rfc.mkd:
--------------------------------------------------------------------------------
  1 | ## Genome assembly benchmarking container
  2 | 
  3 |   * Version:    0.9.0
  4 |   * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
  5 | 
  6 | ### Contents
  7 | * Outline
  8 | * Inputs
  9 |    * General Definition
 10 |       * Description
 11 |       * Mounts
 12 |    * fasta
 13 |    * fasta_dir
 14 |    * cache
 15 | * Outputs
 16 |    * evaluation
 17 | * Signature
 18 | * Example
 19 | 
 20 | ### Outline
 21 | 
 22 | This specification describes the interface for containerised software to
 23 | evaluate a genome assembly in FASTA format using optional multiple reference genome sequences in
 24 | FASTA format. Genome assemblers vary in efficiency and the quality of an assembly
 25 | may be evaluated by comparing it to a higher quality reference of the same
 26 | genome. In addition to the specifications described below, this container MUST
 27 | implement all specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container).
 28 | 
 29 | ### Inputs
 30 | 
 31 | #### General Definition
 32 | 
 33 | A biobox requires an input YAML that follows the below definition and is valid according to [this](https://github.com/bioboxes/rfc/blob/master/container/assembly-evaluation/reference-based-assembly-evaluation/input_schema.yaml) schema.
 34 | 
 35 | ```YAML
 36 | ---
 37 | version: NUMBER.NUMBER.NUMBER
 38 | arguments:
 39 |   - fasta: LIST
 40 |   - fasta_dir: STRING  
 41 |   - cache: STRING
 42 | ```
 43 | 
 44 | ##### Description:
 45 | * **version**: The current version is specified directly under the heading.
 46 | * **arguments**: The arguments field consists of the following fields
 47 |        * fasta
 48 |        * fasta_dir
 49 |        
 50 |        You can find a definition for every field below this section.
 51 | 
 52 | ##### Mounts:
 53 |  * The .yaml MUST be mounted to /bbx/input/biobox.yaml.
 54 |  * Your output directory MUST be mounted to /bbx/output.
 55 |  * Your input files MUST be mounted to /bbx/input. 
 56 | 
 57 | #### cache definition (optional):
 58 | 
 59 | ```YAML
 60 |   value: STRING
 61 | ```
 62 | 
 63 | ##### Description:
 64 | * **value**: Path to a writeable mounted directory. If mounted the tool will place intermediate results in this directory and reuse them on a second run.
 65 | 
 66 | #### fasta definition :
 67 | 
 68 | ```YAML
 69 | - value: STRING
 70 |   id: STRING or NUMBER
 71 |   type: contig or scaffold
 72 | ```
 73 | 
 74 | ##### Description:
 75 | * **value**: Path MUST begin with a slash ('/'), which points to gzipped FASTA file. This file has to be mounted to a path that is prefixed by `/bbx/input`.
 76 | * **id**: A unique id for every entry in the fasta list.
 77 | * **type**: Two options:
 78 |   * **contig**
 79 |   * **scaffold**
 80 | 
 81 | #### fasta_dir definition (optional):
 82 | 
 83 | ```YAML
 84 |   value: STRING
 85 | ```
 86 | 
 87 | ##### Description:
 88 | * **value**: This variable specifies the absolute path to a directory containing FASTA formatted reference sequence files from the same origin as the fastas specified in the fasta entry.
 89 | 
 90 | ### Outputs
 91 | 
 92 | On a successful run the biobox produces a `biobox.yaml`, which can be found in the `bbx` directory in your mounted output directory. The output `biobox.yaml` must be in the evaluation-specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd).
 93 | 
 94 | ##### Mounts:
 95 |  * If the directory `/bbx/metadata` is mounted then the following files should be placed inside the directory:
 96 |    * `log.txt` Logging information that is generated by the application inside the container.   
 97 | 
 98 | ### Signature
 99 | 
100 | Any biobox based assembly evaluator accepts at least one of the following signatures:
101 | 
102 | 1. `[fasta A], [Maybe fasta_dir B] -> evaluation C`
103 | 
104 | where
105 |    * `Maybe` indicates an optional value
106 | 
107 | ### Example
108 | This is an example biobox.yaml file:
109 | 
110 | ```YAML
111 | ---
112 | version: 0.9.0
113 | arguments:
114 |    - fasta:
115 |        - value: "/path/to/lib1"
116 |          id: "pe_1"
117 |          type: "contig"
118 |    - fasta_dir: "/path/to/dir/with/references"
119 | ```
120 | 


--------------------------------------------------------------------------------
/container/assembly-evaluation/read-based-assembly-evaluation/rfc.mkd:
--------------------------------------------------------------------------------
  1 | ## Read based assembly benchmarking container
  2 | 
  3 |   * Version:    0.2.0
  4 |   * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
  5 | 
  6 | ### Contents
  7 | * Outline
  8 | * Inputs
  9 |    * General Definition
 10 |       * Description
 11 |       * Mounts
 12 |    * assemblies
 13 |    * reads
 14 |    * cache
 15 | * Outputs
 16 |    * evaluation
 17 | * Signature
 18 | * Example
 19 | 
 20 | ### Outline
 21 | 
 22 | This specification describes the interface for containerised software to
 23 | evaluate a genome assembly in FASTA format using read data in FASTQ format. 
 24 | In addition to the specifications described below, this container MUST
 25 | implement all specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container).
 26 | 
 27 | ### Input
 28 | 
 29 | #### General Definition
 30 | 
 31 | A biobox requires an input YAML with the following definition 
 32 | 
 33 | ```YAML
 34 | ---
 35 | version: NUMBER.NUMBER.NUMBER
 36 | arguments:
 37 |   assemblies: LIST
 38 |   reads: LIST
 39 |   cache: STRING
 40 | ```
 41 | 
 42 | ##### Description:
 43 | * **version**: The current version is specified directly under the heading.
 44 | * **arguments**: The arguments field consists of the following fields
 45 |        * **assemblies**: A list of assembly files in FASTA format.
 46 |        * **reads**: A list of read files in FASTQ format.
 47 |        * **cache**: An optional cache directory.
 48 |        You can find a definition for every field below this section.
 49 | 
 50 | ##### Mounts:
 51 |  * The .yaml MUST be mounted to /bbx/mnt/input/biobox.yaml.
 52 |  * Your output directory MUST be mounted to /bbx/mnt/output.
 53 |  * Your input files MUST be mounted to /bbx/mnt/input. 
 54 | 
 55 | #### assemblies definition: 
 56 | ```YAML
 57 | - path: STRING
 58 |   id: STRING or NUMBER
 59 |   type: contig or scaffold
 60 |   format: bioboxes.org:/fasta
 61 | ```
 62 | 
 63 | ##### Description:
 64 | * **path**: Path MUST begin with a slash ('/'), which points to FASTA file. This file has to be mounted to a path that is prefixed by `/bbx/mnt/input`.
 65 | * **id**: A unique id for every entry in the fasta list (optional).
 66 | * **type**: Two options:
 67 |   * **contig**
 68 |   * **scaffold**
 69 | 
 70 | #### reads definition: 
 71 | ```YAML
 72 |  - path: STRING
 73 |    id: STRING or NUMBER
 74 |    type: paired or single
 75 |    format: bioboxes.org:/fastq
 76 | ```
 77 | 
 78 | ##### Description:
 79 | * **path**: Path MUST begin with a slash ('/'), which points to gzipped FASTQ file. This file has to be mounted to a path that is prefixed by `/bbx/mnt/input`.
 80 | * **id**: A unique id for every entry in the fastq list (optional).
 81 | * **type**: Two options:
 82 |       * paired: Paired end fastq reads. By choosing this type, the **path** field has to be interleaved gzipped fastq.
 83 |       * single: Single end gzipped fastq reads. 
 84 | 
 85 | #### cache definition (optional):
 86 | 
 87 | ```YAML
 88 |   cache: STRING
 89 | ```
 90 | 
 91 | ##### Description:
 92 | * **cache**: Path to a writeable mounted directory. If mounted the tool will place intermediate results in this directory and reuse them on a second run.
 93 | 
 94 | ### Outputs
 95 | 
 96 | On a successful run the biobox produces a `biobox.yaml`, which can be found in the `bbx` directory in your mounted output directory. The output `biobox.yaml` must be in the evaluation-specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd).
 97 | 
 98 | ##### Mounts:
 99 | 
100 |  * If the directory `/bbx/mnt/metadata` is mounted then the following files should be placed inside the directory:
101 |    * `log.txt` Logging information that is generated by the application inside the container.   
102 | 
103 | ##### Formats
104 | * `fasta`: FASTA file
105 | * `fastq`: FASTQ file
106 | 
107 | ### Signature
108 | 
109 | Any biobox based assembly evaluator accepts at least one of the following signatures:
110 | 
111 | 1. `[fasta A], [fastq B] -> evaluation C`
112 | 
113 | ### Example
114 | This is an example biobox.yaml file:
115 | 
116 | ```YAML
117 | ---
118 | version: 0.2.0
119 | arguments:
120 |    assemblies:
121 |      - path: /path/to/assembly1/fasta
122 |        id: ray
123 |        type: contig
124 |        format: bioboxes.org:/fasta
125 |    reads:
126 |      - path: /path/to/short/read.fastq.gz
127 |        id: lib1
128 |        type: paired
129 |        format: bioboxes.org:/fastq
130 | ```
131 | 


--------------------------------------------------------------------------------
/rfc.mkd:
--------------------------------------------------------------------------------
 1 | ## bioboxes - Standards for Interoperable Bioinformatics Containers
 2 | 
 3 |   * Version: 0.8.1
 4 |   * Maintainer: Michael Barton <mail@michaelbarton.me.uk>
 5 | 
 6 | ## Introduction
 7 | 
 8 | The purpose of this and subsequent documents is to provide a detailed specification
 9 | for developers to write standardised bioinformatics containers. The goal of
10 | this document is to define a standard whereby bioinformatics software
11 | containers of the same type are interoperable and therefore can be used
12 | interchangeably. The audience of this document are bioinformaticians and
13 | developers writing bioinformatics software shared using Linux containers. This
14 | document will describe the interface that MUST be provided to a running
15 | container and that a developer of the bioinformatics container MUST write their
16 | software against.
17 | 
18 | The scope of this standard is bioinformatics software packaged using Linux
19 | containers. Bioinformatics software in a Linux container can be shared and
20 | provided to third parties because software dependencies are included within the
21 | container. Examples of bioinformatics software are genome assemblers, read
22 | binners and read aligners. Examples of container software are Docker, Rocket
23 | and LXC/LXD. Standardising bioinformatics software in containers allows
24 | interchangeable use between different research groups and institutions.
25 | 
26 | Applications of this standardisation are:
27 | 
28 |   * A developer uploads his short read aligner as a container to an online
29 |     repository for others to use. A biologist downloads this aligner and is
30 |     able to use it immediately as it follows a standardised interface that the
31 |     biologist is already familiar with.
32 |   * A genome assembly benchmarking service downloads many genome assembler
33 |     containers. These containers are evaluated using assembly performance
34 |     metrics. The standardised interface allows all containers to be benchmarked
35 |     the same way.
36 |   * A large sequencing centre invests time to develop an improved genome
37 |     assembly pipeline for single cell data. The pipeline is packaged inside a
38 |     Linux container and shared with the bioinformatics community. Another large
39 |     sequencing centre is able to immediately compare this new pipeline with
40 |     their in-house pipeline using the same container interface.
41 | 
42 | ### Notational Conventions
43 | 
44 | * The key words “MUST”, “MUST NOT”, “REQUIRED”, “SHALL”, “SHALL NOT”, “SHOULD”,
45 |   “SHOULD NOT”, “RECOMMENDED”, “MAY”, and “OPTIONAL” in this document are to be
46 |   interpreted as described in [RFC2119].
47 | 
48 | * PAIRED: Paired reads are defined as the organisation of a FASTA or FASTQ file
49 |   where the Nth and (N+1)th reads originate from opposite ends of the same DNA
50 |   fragment, where N % 2 == 0 using 0-based indexing.
51 | 
52 | ## Generic bioinformatics container
53 | 
54 | This specification describes the required inputs for all containerised
55 | bioinformatics software, independent of the application type.
56 | 
57 | ### Inputs
58 | 
59 | * **TASK**: The argument given to start a container MUST be a single string
60 |   containing only the characters A-Z, a-z, 0-9, '_' and '-'. This argument is
61 |   used to differentiate between the different combinations of settings the containerised
62 |   software can be run as. Every container SHOULD support a 'default' task. This
63 |   runs the container in a mode that is applicable to the most common situation
64 |   in which the software is used.
65 | 
66 | ### Outputs
67 | 
68 | The containerised software MUST return a zero exit code when completing
69 | successfully, and return a non-zero exit code when an error occurs.
70 | 
71 | ## Databases
72 | 
73 | This section describes the variables containing the paths to various databases.
74 | 
75 | ### Variables
76 | 
77 | * **CONT_DATABASES_DIR**: This variable specifies the absolute path to a [directory](databases_structure.txt) that contains the following databases:
78 |    * COG
79 |    * NCBI Genomes
80 |    * Refseq
81 |    * BLAST DBs
82 | 
83 | ## Normative References
84 | 
85 | * [RFC2119]	Bradner, S., “Key words for use in RFCs to Indicate Requirement
86 |   Levels”, BCP 14, RFC 2119, March 1997.
87 | 
88 | # Authors' Addresses
89 | 
90 | * Michael Barton <mail@michaelbarton.me.uk>
91 | * Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
92 | * Andreas Bremges <andreas.bremges@helmholtz-hzi.de>
93 | * Johannes Dröge <johannes.droege@uni-duesseldorf.de>
94 | * Alexander Sczyrba <asczyrba@cebitec.uni-bielefeld.de>
95 | 


--------------------------------------------------------------------------------
/container/short-read-assembler/rfc.mkd:
--------------------------------------------------------------------------------
  1 | ## Short-read genome assembler container
  2 | 
  3 |   * Version:    0.9.3
  4 |   * Maintainer: Michael Barton <mail@michaelbarton.me.uk>
  5 | 
  6 | ### Contents
  7 | * Outline
  8 | * Inputs
  9 |    * General Definition
 10 |       * Description
 11 |       * Mounts
 12 |    * fastq
 13 |    * fragment_size
 14 | * Outputs
 15 |    * fasta
 16 | * Signature
 17 | * Example
 18 | 
 19 | ### Outline
 20 | 
 21 | This specification describes the interface for containerised short-read genome
 22 | assemblers. A genome assembler converts one or more FASTQ files of DNA short
 23 | reads into larger contiguous ('contigs') regions of DNA. In addition to the
 24 | specifications described below, this container MUST implement the
 25 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container).
 26 | 
 27 | ### Inputs
 28 | 
 29 | #### General Definition
 30 | 
 31 | A biobox requires an input YAML that follows the below definition and is valid according to [this](https://github.com/bioboxes/rfc/blob/master/container/short-read-assembler/input_schema.yaml) schema. 
 32 | 
 33 | ```YAML
 34 | ---
 35 | version: NUMBER.NUMBER.NUMBER
 36 | arguments:
 37 |   - fastq: LIST
 38 |   - fragment_size: LIST
 39 | ```
 40 | 
 41 | ##### Description:
 42 | * **version**: The current version is specified directly under the heading.
 43 | * **arguments**: The arguments field consists of the following fields
 44 |        * fastq
 45 |        * fragment_size
 46 |        
 47 |        You can find a definition for every field below this section.
 48 | 
 49 | ##### Mounts:
 50 |  * The .yaml MUST be mounted to /bbx/input/biobox.yaml.
 51 |  * Your output directory MUST be mounted to /bbx/output.
 52 |  * Your input files MUST be mounted to /bbx/input. 
 53 | 
 54 | #### fastq definition: 
 55 | ```YAML
 56 | - value: STRING
 57 |   id: STRING or NUMBER
 58 |   type: paired or single
 59 | ```
 60 | 
 61 | ##### Description:
 62 | * **value**: Path MUST begin with a slash ('/'), which points to gzipped FASTQ file. This file has to be mounted to a path that is prefixed by `/bbx/input`.
 63 | * **id**: A unique id for every entry in the fastq list.
 64 | * **type**: Two options:
 65 |       * paired: Paired end fastq reads. By choosing this type the **value** field has to be interleaved gzipped fastq.
 66 |       * single: Single end fastq reads. 
 67 |       
 68 | #### fragment_size definition:
 69 | ```YAML
 70 | - id: STRING
 71 |   value: NUMBER
 72 | ```
 73 | 
 74 | ##### Description:
 75 | * **id**: The specified id MUST match exactly one entry in the fastq entry list.
 76 | * **value**: Number for the fragment size.
 77 | 
 78 | ### Outputs
 79 | 
 80 | #### General Definition
 81 | 
 82 | ```YAML
 83 | ---
 84 | version: NUMBER.NUMBER.NUMBER
 85 | arguments: 
 86 |     - fasta: LIST
 87 | ```
 88 | 
 89 | ##### Description:
 90 | This yaml with the name `biobox.yaml` will be available on a successful run in your mounted output directory.
 91 | 
 92 | * **version**: The current version is specified directly under the heading.
 93 | * **arguments**: The arguments field consists of the **fasta** field
 94 | 
 95 | ##### Mounts:
 96 |  * If the directory `/bbx/metadata` is mounted then the following files should be placed inside the directory:
 97 |    * `log.txt` Logging information that is generated by the application inside the container.   
 98 | 
 99 | #### fasta definition:
100 | 
101 | ```YAML
102 | - value: STRING
103 |   id: STRING or NUMBER
104 |   type: contig or scaffold
105 | ```
106 | 
107 | ##### Description:
108 | * **value**: This is the path to a fasta file containing the contigs relative to your mounted output directory.
109 | * **id**: A unique id for every entry in the fasta list.
110 | * **type**: Two options:
111 |   * **contig**
112 |   * **scaffold**
113 | 
114 | ### Signature
115 | 
116 | Any biobox based assembler accepts at least one of the following signatures:
117 | 
118 | 1. `[fastq A], [Maybe fragment_size A] -> contigs B, scaffolds C`
119 | 2. `[fastq A], [fragment_size A] -> contigs B, scaffolds C`
120 | 
121 | where
122 |    * `Maybe` indicates an optional value
123 | 
124 | ### Example
125 | This is an example biobox.yaml file:
126 | 
127 | ```YAML
128 | ---
129 | version: 0.9.0
130 | arguments:
131 |    - fastq:
132 |       - value: "/path/to/lib1"
133 |         id: "pe_1"
134 |         type: paired
135 |       - value: "/path/to/lib2"
136 |         id: "pe_2"
137 |         type: paired
138 |       - value: "/path/to/lib2"
139 |         id: "lmp_1"
140 |         type: paired
141 |    - fragment_size:
142 |       - value: 240
143 |         id: pe_1
144 |       - value: 5000
145 |         id: lmp_1
146 | ```
147 | 


--------------------------------------------------------------------------------
/container/binning/rfc.mkd:
--------------------------------------------------------------------------------
  1 | ## Binning container
  2 | 
  3 |   * Version:    0.9.2
  4 |   * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
  5 | 
  6 | ### Contents
  7 | * Outline
  8 | * Inputs
  9 |    * General Definition
 10 |       * Description
 11 |       * Mounts
 12 |    * fasta
 13 |    * fastq
 14 |    * database
 15 |    * cache
 16 | * Outputs
 17 |    * binning
 18 | * Signature
 19 | * Example
 20 | 
 21 | ### Outline
 22 | 
 23 | This specification describes the interface for containerised binning applications. A binning application groups reads or contigs and/or assigns them to operational taxonomic units. In addition to the specifications described below, this container MUST implement the specifications defined in 'Generic bioinformatics container'.
 24 | ### Inputs
 25 | 
 26 | #### General Definition
 27 | 
 28 | A biobox requires an input YAML that follows the below definition and is valid according to [this](https://github.com/bioboxes/rfc/blob/master/container/binning/input_schema.yaml) schema. 
 29 | 
 30 | ```YAML
 31 | ---
 32 | version: NUMBER.NUMBER.NUMBER
 33 | arguments:
 34 |   - fasta:
 35 |       value: STRING
 36 |       id: STRING
 37 |       type: STRING
 38 |   - fastq: LIST
 39 |   - databases: LIST
 40 |   - cache: STRING
 41 | ```
 42 | 
 43 | ##### Description:
 44 | * **version**: The current version is specified directly under the heading.
 45 | * **arguments**: The arguments field consists of the following fields
 46 |        * fasta
 47 |        * fastq
 48 |        * databases
 49 |        * cache
 50 |        You can find a definition for every field below this section.
 51 | 
 52 | ##### Mounts:
 53 |  * The .yaml MUST be mounted to /bbx/input/biobox.yaml.
 54 |  * Your output directory MUST be mounted to /bbx/output.
 55 |  * Your input files MUST be mounted to /bbx/input. 
 56 | 
 57 | #### fastq definition: 
 58 | ```YAML
 59 | - value: STRING
 60 |   id: STRING or NUMBER
 61 |   type: paired or single
 62 | ```
 63 | 
 64 | ##### Description:
 65 | * **value**: Path to a gzipped FASTQ file. The path MUST begin with a slash ('/'), and the file has to be mounted to a path that is prefixed by `/bbx/input`.
 66 | * **id**: A unique id for every entry in the fastq list.
 67 | * **type**: Two options:
 68 |       * paired: Paired end fastq reads. By choosing this type the **value** field has to be an interleaved gzipped fastq file.
 69 |       * single: Single end fastq reads. 
 70 |  
 71 | 
 72 | #### fasta definition:
 73 | 
 74 | ```YAML
 75 |   value: STRING
 76 |   id: STRING or NUMBER
 77 |   type: contig or scaffold
 78 | ```
 79 | 
 80 | ##### Description:
 81 | * **value**: Path to a gzipped FASTA file. The path MUST begin with a slash ('/'), and the file has to be mounted to a path that is prefixed by `/bbx/input`.
 82 | * **id**: A unique id for every entry in the fasta list.
 83 | * **type**: Two options:
 84 |   * **contig**
 85 |   * **scaffold**
 86 | 
 87 | #### databases definition:
 88 | 
 89 | ```YAML
 90 | - value: STRING
 91 |   id: STRING
 92 | ```
 93 | 
 94 | ##### Description:
 95 | * **value**: Path to a database directory.
 96 | * **id**: Database identifier. Each database identifier is a link to the directory structure. You can find the compressed version of the database with the corresponding structure on this ftp site: ftp://cami.psc.edu/ftp.ncbi.nlm.nih.gov/
 97 |   * **[refseq](https://github.com/bioboxes/rfc/blob/master/databases/refseq.txt)**  
 98 |   * **[blastdb](https://github.com/bioboxes/rfc/blob/master/databases/blastdb.txt)**
 99 |   * **[cog](https://github.com/bioboxes/rfc/blob/master/databases/cog.txt)**
100 |   * **[ncbi_genomes](https://github.com/bioboxes/rfc/blob/master/databases/ncbi_genomes.txt)**
101 |   * **[ncbi_taxonomy](https://github.com/bioboxes/rfc/blob/master/databases/ncbi_taxonomy.txt)**
102 | 
103 | #### cache definition:
104 | 
105 | ```YAML
106 |   value: STRING
107 | ```
108 | 
109 | ##### Description:
110 | * **value**: Path to a writeable mounted directory. If mounted the tool will place intermediate results in this directory and reuse them on a second run.
111 | 
112 | ### Outputs
113 | 
114 | #### General Definition
115 | 
116 | ```YAML
117 | ---
118 | version: NUMBER.NUMBER.NUMBER
119 | arguments: 
120 |     - binning:
121 |        value: STRING
122 |        type: Boolean
123 | ```
124 | 
125 | ##### Description:
126 | This yaml with the name `biobox.yaml` will be available on a successful run and can be found in the `bbx` directory in your mounted output directory.
127 | 
128 | * **version**: The current version is specified directly under the heading.
129 | * **arguments**: The arguments field consists of the **binning** field.
130 | 
131 | ##### Mounts:
132 |  * If the directory `/bbx/metadata` is mounted then the following files should be placed inside the directory:
133 |    * `log.txt` Logging information that is generated by the application inside the container.   
134 | 
135 | #### binning definition:
136 | 
137 | ```YAML
138 |   value: STRING
139 |   type: assignments
140 | ```
141 | 
142 | ##### Description:
143 | * **value**: This is the path to a binning file relative to your mounted output directory.
144 | * **type**
145 |   * **assignments** : Binning file contains the computed binning and/or taxonomic groups.
146 | 
147 | ### Signature
148 | 
149 | Any biobox-based binning application accepts at least one of the following signatures:
150 | 
151 | 1. `fasta A -> binning B`
152 | 2. `fasta A, [fastq B] -> binning C`
153 | 
154 | ### Example
155 | This is an example biobox.yaml file:
156 | 
157 | ```YAML
158 | ---
159 | version: 0.9.0
160 | arguments:
161 |    - fasta:
162 |         value: "/path/to/lib1"
163 |         id: "pe_1"
164 |         type: contig
165 | ```
166 | 


--------------------------------------------------------------------------------
/data-format/sequence.mkd:
--------------------------------------------------------------------------------
  1 | ## Specifications of bioinformatics file formats
  2 | 
  3 |   * Version:    0.8.1
  4 |   * Maintainer: Michael Barton <mail@michaelbarton.me.uk>
  5 | 
  6 | ### Outline
  7 | 
  8 | The purpose of this document is to describe the sequence FASTA/Q file formats
  9 | used in the bioboxes RFC. There is no formal definition for FASTA or FASTQ in the same way
 10 | there are RFC definitions for [JSON][json] and [CSV][csv]. Furthermore it is
 11 | beyond the scope of this document to create a strict definition of either. Instead
 12 | the community definitions for these formats are included to clarify the terms
 13 | FASTQ or FASTA when used in other RFCs.
 14 | 
 15 | [json]: https://tools.ietf.org/html/rfc7159
 16 | [csv]: https://tools.ietf.org/html/rfc4180
 17 | 
 18 | ### FASTA
 19 | 
 20 | #### Introduction
 21 | 
 22 | The FASTA format is described in the [BLAST input description][blast]. This is
 23 | quoted directly.
 24 | 
 25 | [blast]: http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml
 26 | 
 27 | > A sequence in FASTA format begins with a single-line description, followed by
 28 | > lines of sequence data. The description line (defline) is distinguished from
 29 | > the sequence data by a greater-than (">") symbol at the beginning. It is
 30 | > recommended that all lines of text be shorter than 80 characters in length.
 31 | >
 32 | > Blank lines are not allowed in the middle of FASTA input. Sequences are
 33 | > expected to be represented in the standard IUB/IUPAC amino acid and nucleic
 34 | > acid codes, with these exceptions: lower-case letters are accepted and are
 35 | > mapped into upper-case; a single hyphen or dash can be used to represent a gap
 36 | > of indeterminate length; and in amino acid sequences, U and * are acceptable
 37 | > letters.
 38 | 
 39 | #### Formal Grammar
 40 | 
 41 | A formal grammar using [Backus–Naur Form][bnf] for FASTA is described on [BioStar][biostar]. This is quoted
 42 | below as a specification of the FASTA format.
 43 | 
 44 | [biostar]: https://www.biostars.org/p/11254/#11255
 45 | [bnf]: http://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form
 46 | 
 47 |     <file>     ::= <token> | <token> <file>
 48 |     <token>    ::= <ignore> | <seq>
 49 |     <ignore>   ::= <whitespace> | <comment> <newline>
 50 |     <seq>      ::= <header> <molecule> <newline>
 51 |     <header>   ::= ">" <arbitrary text> <newline>
 52 |     <molecule> ::= <mol-line> | <mol-line> <molecule>
 53 |     <mol-line> ::= <nucl-line> | <prot-line>
 54 |     <nucl-line>::= "^[ACGTURYKMSWBDHVNX-]+$"
 55 |     <prot-line>::= "^[ABCDEFGHIKLMNOPQRSTUVWYZX*-]+$"
 56 | 
 57 | #### Example
 58 | 
 59 |     >gi|129295|sp|P01013|OVAX_CHICK GENE X PROTEIN (OVALBUMIN-RELATED)
 60 |     QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNNSFNVATLPAE
 61 |     KMKILELPFASGDLSMLVLLPDEVSDLERIEKTINFEKLTEWTNPNTMEKRRVKVYLPQMKIEEKYNLTS
 62 |     VLMALGMTDLFIPSANLTGISSAESLKISQAVHGAFMELSEDGIEMAGSTGVIEDIKHSPESEQFRADHP
 63 |     FLFLIKHNPTNTIVYFGRYWSP
 64 | 
 65 | ### FASTQ
 66 | 
 67 | #### Description
 68 | 
 69 | The FASTQ format is described in detail in the article ["The Sanger FASTQ file
 70 | format for sequences with quality scores, and the Solexa/Illumina FASTQ
 71 | variants"][1]. This definition will be used and quoted below. Importantly all
 72 | FASTQ MUST use the Phred+33 quality offset described in the final paragraph.
 73 | 
 74 | [1]: http://nar.oxfordjournals.org/content/38/6/1767
 75 | 
 76 | > There are four line types in the FASTQ format. First a ‘@’ title line which
 77 | > often holds just a record identifier. This is a free format field with no
 78 | > length limit—allowing arbitrary annotation or comments to be included, as in
 79 | > the example above where the NCBI have included an alternative ID and the
 80 | > sequence length. Some sequencing centers encode paired end read information
 81 | > here (alternatively two matched FASTQ files are often used).
 82 | >
 83 | > Second comes the sequence line(s), which as in the FASTA format can be line
 84 | > wrapped. Also like FASTA format, there is no explicit limitation on the
 85 | > characters expected, but restriction to the IUPAC single letter codes for
 86 | > (ambiguous) DNA or RNA is wise, and upper case is conventional. In some
 87 | > contexts, the use of lower or mixed case or the inclusion of a gap character
 88 | > may make sense. White space such as tabs or spaces is not permitted.
 89 | >
 90 | > Third, to signal the end of the sequence lines and the start of the quality
 91 | > string, comes the ‘+’ line. Originally this also included a full repeat of the
 92 | > title line text (as shown in the NCBI example above); however, by common usage
 93 | > and the MAQ tool convention, this is optional and the ‘+’ line can contain just
 94 | > this one character, reducing the file size significantly. The OBF tools follow
 95 | > this MAQ convention on output, and omit the optional repeated title text.
 96 | >
 97 | > Finally, comes quality line(s) which again can be wrapped. As discussed above,
 98 | > these use a subset of the ASCII printable characters (at most ASCII 33–126
 99 | > inclusive) with a simple offset mapping. Crucially, after concatenation
100 | > (removing line breaks), the quality string must be equal in length to the
101 | > sequence string.
102 | 
103 | #### Formal Grammar
104 | 
105 | [MAQ provides a formal grammar][2] using [Backus–Naur Form][bnf] to describe FASTQ. This definition is
106 | provided below as a specification for FASTQ format.
107 | 
108 |     <fastq>   :=  <block>+
109 |     <block>   :=  @<seqname>\n<seq>\n+[<seqname>]\n<qual>\n
110 |     <seqname> :=  [A-Za-z0-9_.:-]+
111 |     <seq>     :=  [A-Za-z\n\.~]+
112 |     <qual>    :=  [!-~\n]+
113 | 
114 | [2]: http://maq.sourceforge.net/fastq.shtml
115 | 
116 | #### Example
117 | 
118 |     @EAS54_6_R1_2_1_413_324
119 |     CCCTTCTTGTCTTCAGCGTTTCTCC
120 |     +
121 |     ;;3;;;;;;;;;;;;7;;;;;;;88
122 | 


--------------------------------------------------------------------------------
/data-format/binning.mkd:
--------------------------------------------------------------------------------
  1 | ## Binning Output Format
  2 | 
  3 |   * Version:    0.10.0
  4 |   * Maintainer: Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>
  5 |   * Authors: CAMI challenge organizers
  6 | 
  7 | ### 1. Outline
  8 | 
  9 | The binning format was originally specified for the CAMI contest and is intended to serve as a standard format for (taxonomic) binning methods. 
 10 | 
 11 | It is a TAB (`\t`) delimited text format consisting of a header section and an
 12 | output section. The header section MUST be above the output section and header
 13 | lines MUST start with `@` whereas output lines MUST NOT. Comment lines MUST
 14 | start with `#` and MAY occur both in the header and output section. Empty lines
 15 | MAY occur anywhere in the output for better readability. Only the UNIX newline
 16 | character `\n` MUST be used to define the end of a line and the text MUST be
 17 | valid UTF-8 encoding.
 18 | 
 19 | Files containing this data format should be named with the filename suffix `.binning`.
 20 | 
 21 | Regular expressions, when provided, are given as specified in IEEE Std 1003.1™ ERE.
 22 | 
 23 | ### 2. Header section
 24 | 
 25 | Each header line MUST begin with the character `@`. A single `@` defines a
 26 | key-value pair in the format **TAG:VALUE** where **TAG** MUST be an
 27 | alphanumeric string. Tags are case insensitive but MAY be specified using upper
 28 | and lower case letters for better readability. All tags MUST be unique per file.
 29 |  **VALUE** MUST NOT contain characters other than alphanumerical and `,.;_-|`.
 30 | More precisely, each non-empty and non-comment header line except for the last
 31 | header line MUST match the regular expression `^\@(_[A-Za-z]*_)?[A-Za-z]+[A-Za-z0-9]*\:[A-Za-z0-9,\.;_\|]*$`
 32 | 
 33 | The specification requires that the following header tags MUST be present:
 34 | 
 35 |   * **VERSION**: **VALUE** MUST specify the binning format version in the heading
 36 |   of this specification and MUST match the regular expression `[0-9\.]`.
 37 | 
 38 |   * **SAMPLEID**: **VALUE** is the sample identifier, not the generating user or program name. It MUST match the regular  expression `[A-Za-z0-9\._]+`.
 39 | 
 40 | The following tags MAY be given:
 41 | 
 42 |   * **TAXONOMYID**: **VALUE** specifies an identifier of the external taxonomy
 43 |   which was used in the output section. **TAXID** values MUST be valid
 44 |   taxon identifiers in this taxonomy.
 45 | 
 46 | Additional tags and values MAY be specified but each additional tag MUST be
 47 | prefixed by a case-insensitive string with an underscore before and after the string,
 48 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future.
 49 | Empty prefixes MAY be used and mean that the tag starts with `__`.
 50 | 
 51 | The last header line MUST begin with `@@` and defines TAB-separated column tags,
 52 | where each **TAG** MUST be a string matching the regular expression
 53 | `[A-Za-z]+[A-Za-z0-9]*` and defines the content and format of values in the
 54 | corresponding column of the output section. The following lists all defined tags:
 55 | 
 56 |   * **SEQUENCEID**
 57 |   * **BINID**
 58 |   * **TAXID**
 59 | 
 60 | The format requires that **SEQUENCEID** and at least one of **BINID** and **TAXID** MUST be given.
 61 | Further optional columns can be appended to the right but MUST be 
 62 | prefixed by a case-insensitive string with an underscore before and after the string,
 63 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future.
 64 | Empty prefixes MAY be used and mean that the tag starts with `__`. This means that each
 65 | custom field MUST match the regular expression `_[A-Za-z]*_[A-Za-z]+[A-Za-z0-9]*`.
 66 | 
 67 | For instance:
 68 | 
 69 |     @@SEQUENCEID	TAXID
 70 | 
 71 | or
 72 | 
 73 |     @@SEQUENCEID	BINID
 74 | or
 75 | 
 76 |     @@SEQUENCEID	TAXID	BINID
 77 | 
 78 | or
 79 | 
 80 |     @@SEQUENCEID	TAXID	BINID	__SCORE	_MY_COLUMN2	_MY_COLUMN3
 81 | 
 82 | ### 3. Output section
 83 | 
 84 | An output line MUST consist of TAB-separated fields and MUST correspond to
 85 | the last header line definition. Each field MUST match the regular expression
 86 | `[A-Za-z0-9,\.;,\(\)_\-\ ]*`. This specification defines the following field types:
 87 | 
 88 |   * **SEQUENCEID** specifies the ID of either a read or a contig sequence (depending on
 89 |     the sample).
 90 | 
 91 |   * The **TAXID** field contains a taxonomic assignment for binned sequences corresponding to a taxonomy which should be referred to by the header tag **TAXONOMYID**. Each individual field MUST match the
 92 | regular expression `[A-Za-z0-9\.;,\(\)_\-\ ]+`.
 93 | 
 94 |   * The **BINID** fields MUST be arbitrary alphanumeric identifiers for each bin.
 95 | 
 96 | ### 4. Multi-sample format
 97 | 
 98 | Starting with version `0.10.0`, multiple samples MAY be represented in a single file by concatenation.
 99 | Sample sections MUST be separated by at least one empty line after the last content line of a section
100 | and preceding the next header line. Additionally, a multi-sample file MUST specify the exact same
101 | **VERSION** tag value in every section. It MUST also specify the exact same **TAXONOMYID** tag value,
102 | if this tag is specified in at least one of the sections. The type and order of column tags MUST be
103 | identical for all sections. The **SAMPLEID** tag values must be unique for all concatenated sections.
104 | The meaning of the **BINID** tag values is local for each section, for instance, `BIN1` would have a
105 | different meaning in each section so that sequences from different samples can only be grouped into
106 | the same bin if they are pooled and reported in a joint section.
107 | 
108 | ### 5. EXAMPLES
109 | 
110 | There are three different scenarios for binning tools.
111 | 
112 | The first case, example A below: If you create taxonomic bins as output without
113 | further resolution, you do not need to include the **BINID** column, but only the
114 | **TAXID** column, in your output.
115 | 
116 | The second case, example B below: If you create bins that do not include
117 | taxonomic assignments you do not need to include the **TAXID** column, but only the
118 | **BINID** column, in your output.
119 | 
120 | The third case, example C below, is if you perform taxonomic binning and
121 | additionally resolve bins below existing taxonomic IDs, e.g. to define bins
122 | representing novel strains. In this case, you add both the **TAXID** and **BINID**.
123 | 
124 | A
125 | ```
126 | # This is the bioboxes.org binning output format at
127 | # https://github.com/bioboxes/rfc/tree/master/data-format
128 | 
129 | @Version:0.10.0
130 | @SampleID:mysample1
131 | @@SEQUENCEID	TAXID
132 | read1201	123
133 | read1202	123
134 | read1203	131564
135 | read1204	562
136 | read1205	562
137 | ```
138 | B
139 | ```
140 | # This is the bioboxes.org binning output format at
141 | # https://github.com/bioboxes/rfc/tree/master/data-format
142 | 
143 | @Version:0.10.0
144 | @SampleID:mysample1
145 | @@SEQUENCEID	BINID
146 | contig01	12346BIN
147 | contig02	ANOTHERBIN
148 | contig03	BIN6
149 | contig04	BIN5
150 | contig05	BIN5
151 | ```
152 | C
153 | ```
154 | # This is the bioboxes.org binning output format at
155 | # https://github.com/bioboxes/rfc/tree/master/data-format
156 | 
157 | @Version:0.10.0
158 | @SampleID:mysample1
159 | @@SEQUENCEID	TAXID	BINID
160 | contig01	123	123
161 | contig02	123	123
162 | contig03	131564	131564
163 | contig04	562	562.1
164 | contig05	562	562.2
165 | ```
166 | D
167 | ```
168 | # This is the bioboxes.org binning output format at
169 | # https://github.com/bioboxes/rfc/tree/master/data-format
170 | 
171 | @Version:0.10.0
172 | @SampleID:mysample_A
173 | @@SEQUENCEID	BINID
174 | contig_A_01	BIN_A_1
175 | contig_A_02	BIN_A_2
176 | contig_A_03	BIN_A_1
177 | 
178 | @Version:0.10.0
179 | @SampleID:mysample_B
180 | @@SEQUENCEID	BINID
181 | contig_B_01	BIN_B_1
182 | contig_B_02	BIN_B_1
183 | contig_B_03	BIN_B_2
184 | ```
185 | 
186 | 


--------------------------------------------------------------------------------
/databases/blastdb.txt:
--------------------------------------------------------------------------------
  1 | blast
  2 | blast/db
  3 | blast/db/nr.00.tar.gz
  4 | blast/db/nr.01.tar.gz
  5 | blast/db/nr.02.tar.gz
  6 | blast/db/nr.03.tar.gz
  7 | blast/db/nr.06.tar.gz
  8 | blast/db/nr.07.tar.gz
  9 | blast/db/nr.12.tar.gz
 10 | blast/db/nr.14.tar.gz
 11 | blast/db/nr.15.tar.gz
 12 | blast/db/nr.16.tar.gz
 13 | blast/db/nr.18.tar.gz
 14 | blast/db/nr.21.tar.gz
 15 | blast/db/nr.22.tar.gz
 16 | blast/db/nr.24.tar.gz
 17 | blast/db/nr.25.tar.gz
 18 | blast/db/nt.04.tar.gz
 19 | blast/db/nt.08.tar.gz
 20 | blast/db/nt.09.tar.gz
 21 | blast/db/nt.10.tar.gz
 22 | blast/db/nt.12.tar.gz
 23 | blast/db/nt.14.tar.gz
 24 | blast/db/nt.16.tar.gz
 25 | blast/db/nt.18.tar.gz
 26 | blast/db/nt.19.tar.gz
 27 | blast/db/nt.21.tar.gz
 28 | blast/db/nt.22.tar.gz
 29 | blast/db/env_nr.00.tar.gz
 30 | blast/db/env_nr.01.tar.gz
 31 | blast/db/env_nt.01.tar.gz
 32 | blast/db/env_nt.03.tar.gz
 33 | blast/db/refseq_genomic.00.tar.gz
 34 | blast/db/refseq_genomic.03.tar.gz
 35 | blast/db/refseq_genomic.08.tar.gz
 36 | blast/db/refseq_genomic.100.tar.gz
 37 | blast/db/refseq_genomic.104.tar.gz
 38 | blast/db/refseq_genomic.106.tar.gz
 39 | blast/db/refseq_genomic.107.tar.gz
 40 | blast/db/refseq_genomic.108.tar.gz
 41 | blast/db/refseq_genomic.109.tar.gz
 42 | blast/db/refseq_genomic.11.tar.gz
 43 | blast/db/refseq_genomic.112.tar.gz
 44 | blast/db/refseq_genomic.114.tar.gz
 45 | blast/db/refseq_genomic.115.tar.gz
 46 | blast/db/refseq_genomic.116.tar.gz
 47 | blast/db/refseq_genomic.119.tar.gz
 48 | blast/db/refseq_genomic.121.tar.gz
 49 | blast/db/refseq_genomic.122.tar.gz
 50 | blast/db/refseq_genomic.123.tar.gz
 51 | blast/db/refseq_genomic.124.tar.gz
 52 | blast/db/refseq_genomic.126.tar.gz
 53 | blast/db/refseq_genomic.127.tar.gz
 54 | blast/db/refseq_genomic.130.tar.gz
 55 | blast/db/refseq_genomic.132.tar.gz
 56 | blast/db/refseq_genomic.133.tar.gz
 57 | blast/db/refseq_genomic.135.tar.gz
 58 | blast/db/refseq_genomic.136.tar.gz
 59 | blast/db/refseq_genomic.138.tar.gz
 60 | blast/db/refseq_genomic.14.tar.gz
 61 | blast/db/refseq_genomic.140.tar.gz
 62 | blast/db/refseq_genomic.141.tar.gz
 63 | blast/db/refseq_genomic.142.tar.gz
 64 | blast/db/refseq_genomic.15.tar.gz
 65 | blast/db/refseq_genomic.20.tar.gz
 66 | blast/db/refseq_genomic.22.tar.gz
 67 | blast/db/refseq_genomic.24.tar.gz
 68 | blast/db/refseq_genomic.28.tar.gz
 69 | blast/db/refseq_genomic.29.tar.gz
 70 | blast/db/refseq_genomic.33.tar.gz
 71 | blast/db/refseq_genomic.34.tar.gz
 72 | blast/db/refseq_genomic.35.tar.gz
 73 | blast/db/refseq_genomic.37.tar.gz
 74 | blast/db/refseq_genomic.40.tar.gz
 75 | blast/db/refseq_genomic.41.tar.gz
 76 | blast/db/refseq_genomic.42.tar.gz
 77 | blast/db/refseq_genomic.45.tar.gz
 78 | blast/db/refseq_genomic.46.tar.gz
 79 | blast/db/refseq_genomic.47.tar.gz
 80 | blast/db/refseq_genomic.48.tar.gz
 81 | blast/db/refseq_genomic.49.tar.gz
 82 | blast/db/refseq_genomic.50.tar.gz
 83 | blast/db/refseq_genomic.51.tar.gz
 84 | blast/db/refseq_genomic.57.tar.gz
 85 | blast/db/refseq_genomic.58.tar.gz
 86 | blast/db/refseq_genomic.60.tar.gz
 87 | blast/db/refseq_genomic.61.tar.gz
 88 | blast/db/refseq_genomic.62.tar.gz
 89 | blast/db/refseq_genomic.65.tar.gz
 90 | blast/db/refseq_genomic.67.tar.gz
 91 | blast/db/refseq_genomic.68.tar.gz
 92 | blast/db/refseq_genomic.73.tar.gz
 93 | blast/db/refseq_genomic.74.tar.gz
 94 | blast/db/refseq_genomic.76.tar.gz
 95 | blast/db/refseq_genomic.77.tar.gz
 96 | blast/db/refseq_genomic.79.tar.gz
 97 | blast/db/refseq_genomic.80.tar.gz
 98 | blast/db/refseq_genomic.81.tar.gz
 99 | blast/db/refseq_genomic.82.tar.gz
100 | blast/db/refseq_genomic.83.tar.gz
101 | blast/db/refseq_genomic.84.tar.gz
102 | blast/db/refseq_genomic.85.tar.gz
103 | blast/db/refseq_genomic.87.tar.gz
104 | blast/db/refseq_genomic.88.tar.gz
105 | blast/db/refseq_genomic.89.tar.gz
106 | blast/db/refseq_genomic.90.tar.gz
107 | blast/db/refseq_genomic.91.tar.gz
108 | blast/db/refseq_genomic.93.tar.gz
109 | blast/db/refseq_genomic.94.tar.gz
110 | blast/db/refseq_genomic.96.tar.gz
111 | blast/db/refseq_protein.00.tar.gz
112 | blast/db/refseq_rna.00.tar.gz
113 | blast/db/refseq_rna.02.tar.gz
114 | blast/db/refseq_protein.05.tar.gz
115 | blast/db/refseq_protein.06.tar.gz
116 | blast/db/refseq_rna.04.tar.gz
117 | blast/db/refseq_protein.08.tar.gz
118 | blast/db/refseq_protein.14.tar.gz
119 | blast/db/refseq_genomic.137.tar.gz
120 | blast/db/nr.20.tar.gz
121 | blast/db/nt.03.tar.gz
122 | blast/db/refseq_genomic.13.tar.gz
123 | blast/db/refseq_genomic.118.tar.gz
124 | blast/db/nr.04.tar.gz
125 | blast/db/refseq_genomic.18.tar.gz
126 | blast/db/nt.07.tar.gz
127 | blast/db/nr.10.tar.gz
128 | blast/db/refseq_genomic.27.tar.gz
129 | blast/db/refseq_genomic.19.tar.gz
130 | blast/db/env_nt.00.tar.gz
131 | blast/db/nr.17.tar.gz
132 | blast/db/refseq_genomic.31.tar.gz
133 | blast/db/nr.05.tar.gz
134 | blast/db/nr.27.tar.gz
135 | blast/db/refseq_genomic.12.tar.gz
136 | blast/db/nt.11.tar.gz
137 | blast/db/refseq_genomic.101.tar.gz
138 | blast/db/nt.13.tar.gz
139 | blast/db/refseq_genomic.32.tar.gz
140 | blast/db/refseq_genomic.110.tar.gz
141 | blast/db/nt.23.tar.gz
142 | blast/db/refseq_genomic.10.tar.gz
143 | blast/db/refseq_genomic.23.tar.gz
144 | blast/db/refseq_genomic.30.tar.gz
145 | blast/db/refseq_genomic.17.tar.gz
146 | blast/db/refseq_genomic.129.tar.gz
147 | blast/db/nt.05.tar.gz
148 | blast/db/refseq_genomic.16.tar.gz
149 | blast/db/refseq_genomic.02.tar.gz
150 | blast/db/refseq_genomic.25.tar.gz
151 | blast/db/nt.00.tar.gz
152 | blast/db/nt.01.tar.gz
153 | blast/db/refseq_genomic.105.tar.gz
154 | blast/db/refseq_genomic.120.tar.gz
155 | blast/db/nt.20.tar.gz
156 | blast/db/env_nt.02.tar.gz
157 | blast/db/nr.26.tar.gz
158 | blast/db/refseq_genomic.131.tar.gz
159 | blast/db/refseq_genomic.117.tar.gz
160 | blast/db/refseq_genomic.128.tar.gz
161 | blast/db/nr.23.tar.gz
162 | blast/db/refseq_genomic.07.tar.gz
163 | blast/db/.listing
164 | blast/db/nr.08.tar.gz
165 | blast/db/refseq_genomic.36.tar.gz
166 | blast/db/nt.17.tar.gz
167 | blast/db/nt.06.tar.gz
168 | blast/db/refseq_genomic.134.tar.gz
169 | blast/db/refseq_genomic.21.tar.gz
170 | blast/db/nr.09.tar.gz
171 | blast/db/nr.19.tar.gz
172 | blast/db/refseq_genomic.06.tar.gz
173 | blast/db/refseq_genomic.102.tar.gz
174 | blast/db/nt.24.tar.gz
175 | blast/db/nr.11.tar.gz
176 | blast/db/refseq_genomic.09.tar.gz
177 | blast/db/nt.02.tar.gz
178 | blast/db/refseq_genomic.26.tar.gz
179 | blast/db/refseq_genomic.103.tar.gz
180 | blast/db/refseq_genomic.05.tar.gz
181 | blast/db/refseq_genomic.111.tar.gz
182 | blast/db/refseq_genomic.38.tar.gz
183 | blast/db/refseq_genomic.04.tar.gz
184 | blast/db/nr.13.tar.gz
185 | blast/db/nt.25.tar.gz
186 | blast/db/refseq_genomic.01.tar.gz
187 | blast/db/refseq_genomic.39.tar.gz
188 | blast/db/refseq_genomic.113.tar.gz
189 | blast/db/refseq_genomic.139.tar.gz
190 | blast/db/refseq_genomic.125.tar.gz
191 | blast/db/nt.15.tar.gz
192 | blast/db/refseq_genomic.43.tar.gz
193 | blast/db/refseq_genomic.44.tar.gz
194 | blast/db/refseq_genomic.52.tar.gz
195 | blast/db/refseq_genomic.53.tar.gz
196 | blast/db/refseq_genomic.54.tar.gz
197 | blast/db/refseq_genomic.55.tar.gz
198 | blast/db/refseq_genomic.56.tar.gz
199 | blast/db/refseq_genomic.59.tar.gz
200 | blast/db/refseq_genomic.63.tar.gz
201 | blast/db/refseq_genomic.64.tar.gz
202 | blast/db/refseq_genomic.66.tar.gz
203 | blast/db/refseq_genomic.69.tar.gz
204 | blast/db/refseq_genomic.70.tar.gz
205 | blast/db/refseq_genomic.71.tar.gz
206 | blast/db/refseq_genomic.72.tar.gz
207 | blast/db/refseq_genomic.75.tar.gz
208 | blast/db/refseq_genomic.78.tar.gz
209 | blast/db/refseq_genomic.86.tar.gz
210 | blast/db/refseq_genomic.92.tar.gz
211 | blast/db/refseq_genomic.95.tar.gz
212 | blast/db/refseq_genomic.97.tar.gz
213 | blast/db/refseq_genomic.98.tar.gz
214 | blast/db/refseq_genomic.99.tar.gz
215 | blast/db/refseq_protein.01.tar.gz
216 | blast/db/refseq_protein.02.tar.gz
217 | blast/db/refseq_protein.03.tar.gz
218 | blast/db/refseq_rna.01.tar.gz
219 | blast/db/refseq_protein.04.tar.gz
220 | blast/db/refseq_rna.03.tar.gz
221 | blast/db/refseq_protein.07.tar.gz
222 | blast/db/refseq_protein.09.tar.gz
223 | blast/db/refseq_protein.10.tar.gz
224 | blast/db/refseq_protein.11.tar.gz
225 | blast/db/refseq_protein.12.tar.gz
226 | blast/db/refseqgene.tar.gz
227 | blast/db/refseq_protein.13.tar.gz
228 | 


--------------------------------------------------------------------------------
/data-format/profiling.mkd:
--------------------------------------------------------------------------------
  1 | ## Profiling Output Format 
  2 | 
  3 |   * Version:    0.10.0
  4 |   * Maintainer: Johannes Dröge <code@fungs.de>
  5 |   * Authors:    Alice C. McHardy <alice.mchardy@helmholtz-hzi.de>,  David Koslicki <david.koslicki@math.oregonstate.edu>, Johannes Dröge <code@fungs.de>, Peter Belmann <pbelmann@cebitec.uni-bielefeld.de>, Stephan Majda <stephan.majda@uni-duesseldorf.de>
  6 | 
  7 | ### 1. Outline
  8 | 
  9 | The taxonomic profiling format was originally specified for the CAMI contest
 10 | and is intended to serve as a standard format for the output of
 11 | taxonomic profiling methods.
 12 | 
 13 | It is a TAB (`\t`) delimited text format consisting of a header section and an
 14 | output section. The header section MUST be above the output section and header
 15 | lines MUST start with `@` whereas output lines MUST NOT. Comment lines MUST
 16 | start with `#` and MAY occur both in the header and output section. Empty lines
 17 | MAY occur anywhere in the output for better readability. Only the UNIX newline
 18 | character `\n` MUST be used to define the end of a line and the text MUST be
 19 | valid UTF-8 encoding.
 20 | 
 21 | Files containing this data format should be named with the filename suffix `.profile`.
 22 | 
 23 | Regular expressions, when provided, are given as specified in IEEE Std 1003.1™ ERE.
 24 | 
 25 | ### 2. Header section
 26 | 
 27 | Each header line MUST begin with the character `@`. A single `@` defines a
 28 | key-value pair in the format **TAG:VALUE** where **TAG** MUST be an
 29 | alphanumeric string. Tags are case insensitive but MAY be specified using upper
 30 | and lower case letters for better readability. All tags MUST be unique per file.
 31 |  **VALUE** MUST NOT contain characters other than alphanumerical and `,.;_-|`.
 32 | More precisely, each non-empty and non-comment header line except for the last
 33 | header line MUST match the regular expression `^\@(_[A-Za-z]*_)?[A-Za-z]+[A-Za-z0-9]*\:[A-Za-z0-9,\.;_\|]*$`
 34 | 
 35 | The specification requires that the following header tags MUST be present:
 36 | 
 37 |   * **SAMPLEID**: **VALUE** is the sample identifier, not the generating user or
 38 |   program name. It MUST match the regular expression `[A-Za-z0-9\._]+` and should
 39 |   be unique for the set of relevant samples.
 40 |   * **VERSION**: **VALUE** MUST specify the profiling format version in the heading
 41 |   of this specification and MUST match the regular expression `[0-9\.]`
 42 |   * **RANKS**: **VALUE** MUST specify a list of allowed ranks for the
 43 |   taxa in the output section and ranks MUST be given in increasing order of their
 44 |   distance from the taxonomy root. Each rank MUST be a case-insensitive alphanumerical
 45 |   string and each such entry MUST be separated from the previous entry by the '|'
 46 |   character. Therefore, **VALUE** MUST match the regular expression `[A-Za-z]+(\|[A-Za-z]+)*`.
 47 |   For example, considering the major ranks in the NCBI taxonomy, VALUE could be
 48 |   specified as `superkingdom|phylum|class|order|family|genus|species`.
 49 | 
 50 | The following tags MAY be given:
 51 | 
 52 |   * **TAXONOMYID**: **VALUE** specifies an identifier of the external taxonomy
 53 |   which was used in the output section. **TAXID** values should be valid
 54 |   taxon identifiers in this taxonomy.
 55 |   
 56 | Additional tags and values MAY be specified but each additional tag MUST be
 57 | prefixed by a case-insensitive string with an underscore before and after the string,
 58 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future.
 59 | Empty prefixes MAY be used and mean that the tag starts with `__`.
 60 | 
 61 | The last header line MUST begin with `@@` and defines TAB-separated column tags,
 62 | where each **TAG** MUST be a string matching the regular expression
 63 | `[A-Za-z]+[A-Za-z0-9]*` and defines the content and format of values in the
 64 | corresponding column of the output section. Tags are considered case-insensitive
 65 | but MAY be specified using upper and lower case letters for better readability.
 66 | The tags MUST be unique in this line. The leading tags and their corresponding
 67 | order MUST be
 68 | 
 69 |   * **TAXID**
 70 |   * **RANK**
 71 |   * **TAXPATH**
 72 |   * **PERCENTAGE**
 73 | 
 74 | except that **TAXPATH** MAY be followed by the optional tag **TAXPATHSN**.
 75 | 
 76 | Additional columns MAY be appended to the right after **PERCENTAGE** but MUST be
 77 | prefixed by a case-insensitive string with an underscore before and after the string,
 78 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future.
 79 | Empty prefixes MAY be used and mean that the tag starts with `__`. This means that each
 80 | custom field MUST match the regular expression `_[A-Za-z]*_[A-Za-z]+[A-Za-z0-9]*`
 81 | 
 82 | For instance:
 83 | 
 84 |     @@TAXID	RANK	TAXPATH	PERCENTAGE
 85 | 
 86 | or
 87 | 
 88 |     @@TAXID	RANK	TAXPATH	TAXPATHSN	PERCENTAGE
 89 | 
 90 | ### 3. Output section
 91 | 
 92 | An output line MUST consist of TAB-separated fields and MUST correspond to
 93 | the last header line definition. Each field MUST match the regular expression
 94 | `[A-Za-z0-9\.;,\(\)_\-\ \|]*`. This specification defines the following field types:
 95 | 
 96 | **TAXID**: Fields MUST correspond to unique alphanumeric taxon identifiers,
 97 | for instance in the NCBI taxonomy. Each individual field MUST match the
 98 | regular expression `[A-Za-z0-9\.;,\(\)_\-\ ]+`.
 99 | 
100 | **RANK**: Fields are case-insensitive and MUST match one of the rank identifiers
101 | which MUST be given in the header TAG **RANKS** except when an empty rank field
102 | is specified for a leaf taxon which is below the ranks specified by **RANKS**.
103 | The **RANK** field specifies the rank at which the respective taxon given in **TAXID**,
104 | **TAXPATH** or **TAXPATHSN** is located.
105 | 
106 | **PERCENTAGE**: Fields specify the relative genome abundance in terms of the
107 | genome copy number for the respective TAXID in the overall sample. Note that this
108 | is not identical to the relative abundance in terms of assigned base pairs.
109 | The PERCENTAGE can be a real number between 0 and 100 but MUST NOT exceed 6 digits
110 | after the decimal point, so it MUST match the regular expression
111 | `[0-9]+(\.[0-9]{0,6})?`. The sum of percentages given for all taxa from the same
112 | rank MUST NOT exceed 100, that is, if something is unassigned, this will be
113 | reflected in a percentage of less than 100% being assigned. Also, the value
114 | MUST be greater than or equal to the sum of values for contained taxa at subordinate ranks.
115 | 
116 | **TAXPATH** and **TAXPATHSN**: Fields specify the path from the root of the
117 | taxonomy to the respective taxon and MUST include the taxon which is given
118 | by **TAXID**. The path entries MUST be alphanumeric, **TAXPATH** entries MUST
119 | be taxon identifiers and **TAXPATHSN** entries should give the corresponding
120 | plain taxonomic names. All entries MUST be separated by a single `|` character
121 | and MUST be specified at the ranks and using their respective order as specified
122 | by the **RANKS** header tag. In particular, if the taxonomic path lacks a specified
123 | rank, this field MUST be left empty and would show as `||`. Empty trailing taxon entries
124 | MUST be omitted and the path MUST NOT end with `|`. Taxon entries which are not
125 | specified by the **RANKS** tag MAY only be appended to the right of a full path and
126 | MUST be referred to by an empty **RANK** field. Each **TAXPATH** and each **TAXPATHSN**
127 | field MUST match the regular expression `[A-Za-z0-9\.;,\(\)_\-\ ]+(\|[A-Za-z0-9\.;,\(\)_\-\ ]*)*`.
128 | If both **TAXPATH** and **TAXPATHSN** are given, then they MUST have the same number
129 | of taxon entries.
130 | 
131 | For instance:
132 | 
133 |     # Example for TAXPATHSN:
134 |     Archaea|Thaumarchaeota|||Aigarchaeota archaeon JGI 0000001-A7
135 | 
136 | or
137 | 
138 |     # Example for TAXPATH:
139 |     2157|651137|651142|1104572|1052838
140 | 
141 | ### 4. Multi-sample format
142 | 
143 | Starting with version `0.10.0`, multiple samples MAY be represented in a single file by concatenation.
144 | Sample sections MUST be separated by at least one empty line after the last content line of a section
145 | and preceding the next header line. Additionally, a multi-sample file MUST specify the exact same
146 | **VERSION** and **RANKS** tag values in every section and the exact same **TAXONOMYID** tag value, if
147 | this tag is specified in at least one of the sections. The type and order of column tags MUST be
148 | identical for all sections. The **SAMPLEID** tag values MUST be unique across all concatenated sections.
149 | 
150 | ### 5. Example
151 | 
152 |     # This is the bioboxes.org profiling output format at
153 |     # https://github.com/bioboxes/rfc/tree/master/data-format
154 |     
155 |     @SampleID:mysample1
156 |     @Version:0.10.0
157 |     @Ranks:superkingdom|phylum|class|order|family|genus|species
158 |     @TaxonomyID:ncbi-taxonomy_20171004
159 |     @@TAXID	RANK	TAXPATH	TAXPATHSN	PERCENTAGE
160 |     2	superkingdom	2	Bacteria	98.81211
161 |     2157	superkingdom	2157	Archaea	1.18789
162 |     1239	phylum	2|1239	Bacteria|Firmicutes	59.75801
163 |     1224	phylum	2|1224	Bacteria|Proteobacteria	18.94674
164 |     28890	phylum	2157|28890	Archaea|Euryarchaeotes	1.18789
165 |     91061	class	2|1239|91061	Bacteria|Firmicutes|Bacilli	59.75801
166 |     28211	class	2|1224|28211	Bacteria|Proteobacteria|Alphaproteobacteria	18.94674
167 |     183925	class	2157|28890|183925	Archaea|Euryarchaeotes|Methanobacteria	1.18789
168 |     1385	order	2|1239|91061|1385	Bacteria|Firmicutes|Bacilli|Bacillales	59.75801
169 |     356	order	2|1224|28211|356	Bacteria|Proteobacteria|Alphaproteobacteria|Rhizobacteria	10.52311
170 |     204455	order	2|1224|28211|204455	Bacteria|Proteobacteria|Alphaproteobacteria|Rhodobacterales	8.42263
171 |     2158	order	2157|28890|183925|2158	Archaea|Euryarchaeotes|Methanobacteria|Methanobacteriales	1.18789
172 | 


--------------------------------------------------------------------------------