├── data-format ├── README.mkd ├── evaluation.mkd ├── sequence.mkd ├── binning.mkd └── profiling.mkd ├── container ├── profiling-evaluation │ ├── schema.yaml │ └── rfc.mkd ├── binning_evaluation │ ├── unsupervised_binning │ │ ├── input_schema.yaml │ │ └── rfc.mkd │ └── taxonomic_binning │ │ ├── input_schema.yaml │ │ └── rfc.mkd ├── assembly-evaluation │ ├── read-based-assembly-evaluation │ │ ├── input_schema.yaml │ │ └── rfc.mkd │ └── reference-based-assembly-evaluation │ │ ├── input_schema.yaml │ │ └── rfc.mkd ├── short-read-assembler │ ├── input_schema.yaml │ └── rfc.mkd ├── profiling │ ├── schema.yaml │ └── rfc.mkd ├── README.mkd └── binning │ ├── input_schema.yaml │ └── rfc.mkd ├── unsupervised_binning ├── input_schema.yaml └── rfc.mkd ├── LICENSE ├── databases ├── ncbi_taxonomy.txt └── blastdb.txt ├── README.mkd └── rfc.mkd /data-format/README.mkd: -------------------------------------------------------------------------------- 1 | # Current and Previous Data Format Specifications 2 | 3 | This is a list of permanent links to the latest major specification versions in the git history. 
4 | 5 | ## Profiling Output Format 6 | [Version 0.9](https://github.com/bioboxes/rfc/blob/60263f34c57bc4137deeceec4c68a7f9f810f6a5/data-format/profiling.mkd) 7 | 8 | ## Binning Output Format 9 | [Version 0.9](https://github.com/bioboxes/rfc/blob/4bb19a633a6a969c2332f1f298852114c5f89b1b/data-format/binning.mkd) 10 | 11 | ## Bioinformatics File Formats 12 | [Version 0.8](https://github.com/bioboxes/rfc/blob/b3b49b111704803e1427c82e2ecf87c5c8ffdfb9/data-format/sequence.mkd) 13 | 14 | ## Evaluation Output Format 15 | [Version 0.1](https://github.com/bioboxes/rfc/blob/5f5305300f4609e5b4b477e6184a5d231455ebd0/data-format/evaluation.mkd) 16 | -------------------------------------------------------------------------------- /container/profiling-evaluation/schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: "http://json-schema.org/draft-04/schema#" 3 | title: "Bioboxes profiling benchmark input file validator" 4 | type: "object" 5 | additionalProperties: false 6 | required: 7 | - "version" 8 | - "arguments" 9 | properties: 10 | version: 11 | type: "string" 12 | pattern: "^0.1.\\d+$" 13 | arguments: 14 | additionalProperties: false 15 | type: "object" 16 | required: 17 | - "ground_truth" 18 | - "prediction" 19 | properties: 20 | prediction: 21 | type: "object" 22 | required: 23 | - "path" 24 | - "format" 25 | properties: 26 | format: 27 | enum: 28 | - "bioboxes.org:/profiling:0.9" 29 | path: {} 30 | ground_truth: 31 | type: "object" 32 | required: 33 | - "path" 34 | - "format" 35 | properties: 36 | format: 37 | enum: 38 | - "bioboxes.org:/profiling:0.9" 39 | path: {} 40 | -------------------------------------------------------------------------------- /unsupervised_binning/input_schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | "$schema": http://json-schema.org/draft-04/schema# 3 | title: Bioboxes unsupervised binning benchmark input file validator 4 | type: 
object 5 | additionalProperties: false 6 | required: 7 | - version 8 | - arguments 9 | properties: 10 | version: 11 | type: string 12 | pattern: "^0.11.\\d+$" 13 | arguments: 14 | additionalProperties: false 15 | type: array 16 | required: 17 | - labels 18 | - predictions 19 | properties: 20 | sequences: 21 | type: object 22 | required: 23 | - value 24 | - type 25 | properties: 26 | id: {} 27 | type: 28 | enum: 29 | - contig 30 | value: {} 31 | labels: 32 | type: object 33 | required: 34 | - type 35 | - value 36 | properties: 37 | type: 38 | enum: 39 | - binning 40 | value: {} 41 | predictions: 42 | type: object 43 | required: 44 | - value 45 | properties: 46 | type: 47 | enum: 48 | - binning 49 | value: {} 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 bioinformatics-container-standards 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /container/binning_evaluation/unsupervised_binning/input_schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | "$schema": http://json-schema.org/draft-04/schema# 3 | title: Bioboxes unsupervised binning benchmark input file validator 4 | type: object 5 | additionalProperties: false 6 | required: 7 | - version 8 | - arguments 9 | properties: 10 | version: 11 | type: string 12 | pattern: "^0.11.\\d+$" 13 | arguments: 14 | additionalProperties: false 15 | type: array 16 | required: 17 | - labels 18 | - predictions 19 | properties: 20 | sequences: 21 | type: object 22 | required: 23 | - value 24 | - type 25 | properties: 26 | id: {} 27 | type: 28 | enum: 29 | - contig 30 | value: {} 31 | labels: 32 | type: object 33 | required: 34 | - type 35 | - value 36 | properties: 37 | type: 38 | enum: 39 | - binning 40 | value: {} 41 | predictions: 42 | type: object 43 | required: 44 | - value 45 | properties: 46 | type: 47 | enum: 48 | - binning 49 | value: {} 50 | -------------------------------------------------------------------------------- /databases/ncbi_taxonomy.txt: -------------------------------------------------------------------------------- 1 | pub/taxonomy 2 | pub/taxonomy/.listing 3 | pub/taxonomy/Ccode_dump.txt 4 | pub/taxonomy/Cowner_dump.txt 5 | pub/taxonomy/Icode_dump.txt 6 | pub/taxonomy/coll_dump.txt 7 | pub/taxonomy/gi_taxid.readme 8 | pub/taxonomy/gi_taxid_nucl.dmp.gz 9 | pub/taxonomy/gi_taxid_nucl.zip 10 | pub/taxonomy/gi_taxid_nucl_diff.dmp.gz 11 | pub/taxonomy/gi_taxid_nucl_diff.zip 12 | pub/taxonomy/gi_taxid_prot.dmp.gz 13 | 
pub/taxonomy/gi_taxid_prot.zip 14 | pub/taxonomy/gi_taxid_prot_diff.dmp.gz 15 | pub/taxonomy/gi_taxid_prot_diff.zip 16 | pub/taxonomy/taxcat.tar.Z 17 | pub/taxonomy/taxcat.tar.Z.md5 18 | pub/taxonomy/taxcat.tar.gz 19 | pub/taxonomy/taxcat.tar.gz.md5 20 | pub/taxonomy/taxcat.zip 21 | pub/taxonomy/taxcat.zip.md5 22 | pub/taxonomy/taxcat_readme.txt 23 | pub/taxonomy/taxdmp.zip 24 | pub/taxonomy/taxdmp.zip.md5 25 | pub/taxonomy/taxdump.tar.Z 26 | pub/taxonomy/taxdump.tar.Z.md5 27 | pub/taxonomy/taxdump.tar.gz 28 | pub/taxonomy/taxdump.tar.gz.md5 29 | pub/taxonomy/taxdump_readme.txt 30 | pub/taxonomy/citations.dmp 31 | pub/taxonomy/delnodes.dmp 32 | pub/taxonomy/division.dmp 33 | pub/taxonomy/gc.prt 34 | pub/taxonomy/gencode.dmp 35 | pub/taxonomy/merged.dmp 36 | pub/taxonomy/names.dmp 37 | pub/taxonomy/nodes.dmp 38 | pub/taxonomy/readme.txt 39 | -------------------------------------------------------------------------------- /container/assembly-evaluation/read-based-assembly-evaluation/input_schema.yaml: -------------------------------------------------------------------------------- 1 | "$schema": "http://json-schema.org/draft-04/schema#" 2 | title: "Bioboxes read based assembly validator" 3 | type: object 4 | properties: 5 | version: 6 | type: string 7 | pattern: "^0.2.\\d+$" 8 | arguments: 9 | type: object 10 | additionalProperties: false 11 | properties: 12 | reads: 13 | type: array 14 | uniqueItems: true 15 | minItems: 1 16 | items: 17 | type: object 18 | additionalProperties: false 19 | required: 20 | - path 21 | properties: 22 | id: 23 | format: 24 | enum: 25 | - "bioboxes.org:/fastq" 26 | type: 27 | enum: 28 | - paired 29 | - single 30 | path: 31 | assemblies: 32 | type: array 33 | uniqueItems: true 34 | minItems: 1 35 | items: 36 | type: object 37 | additionalProperties: false 38 | required: 39 | - path 40 | properties: 41 | id: 42 | format: 43 | enum: 44 | - "bioboxes.org:/fasta" 45 | type: 46 | enum: 47 | - contig 48 | - scaffold 49 | path: 50 | required: 51 
| - version 52 | - arguments 53 | additionalProperties: false 54 | -------------------------------------------------------------------------------- /container/short-read-assembler/input_schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: "http://json-schema.org/draft-04/schema#" 3 | title: "Bioboxes short read assembler input file validator" 4 | type: "object" 5 | properties: 6 | version: 7 | type: "string" 8 | pattern: "^0.9.\\d+$" 9 | arguments: 10 | type: "array" 11 | minItems: 1 12 | maxItems: 2 13 | items: 14 | oneOf: 15 | - 16 | $ref: "#/definitions/fastq" 17 | - 18 | $ref: "#/definitions/fragment" 19 | required: 20 | - "version" 21 | - "arguments" 22 | additionalProperties: false 23 | definitions: 24 | fastq: 25 | type: "object" 26 | additionalProperties: false 27 | required: 28 | - "fastq" 29 | properties: 30 | fastq: 31 | $ref: "#/definitions/values" 32 | fragment: 33 | type: "object" 34 | additionalProperties: false 35 | properties: 36 | fragment_size: 37 | $ref: "#/definitions/values" 38 | values: 39 | type: "array" 40 | uniqueItems: true 41 | minItems: 1 42 | items: 43 | type: "object" 44 | additionalProperties: false 45 | required: 46 | - "id" 47 | - "value" 48 | properties: 49 | id: {} 50 | type: {} 51 | value: {} 52 | -------------------------------------------------------------------------------- /container/profiling/schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | "$schema": http://json-schema.org/draft-04/schema# 3 | title: Bioboxes Profiling 4 | type: object 5 | properties: 6 | version: 7 | type: string 8 | pattern: "^1.0.\\d+$" 9 | arguments: 10 | type: array 11 | required: 12 | - fastq 13 | - database 14 | additionalItems: false 15 | minItems: 1 16 | items: 17 | oneOf: 18 | - "$ref": "#/definitions/fastq" 19 | - "$ref": "#/definitions/database" 20 | - "$ref": "#/definitions/cache" 21 | required: 22 | - version 23 | - arguments 
24 | additionalProperties: false 25 | definitions: 26 | fastq: 27 | type: object 28 | minItems: 1 29 | required: 30 | - fastq 31 | properties: 32 | fastq: 33 | "$ref": "#/definitions/values" 34 | values: 35 | type: array 36 | uniqueItems: true 37 | minItems: 1 38 | items: 39 | type: object 40 | additionalProperties: false 41 | required: 42 | - type 43 | - value 44 | properties: 45 | type: {} 46 | value: {} 47 | cache: 48 | type: object 49 | required: 50 | - cache 51 | properties: 52 | cache: 53 | required: 54 | - type 55 | - value 56 | properties: 57 | type: {} 58 | value: {} 59 | database: 60 | required: 61 | - database 62 | properties: 63 | database: 64 | required: 65 | - type 66 | - value 67 | properties: 68 | type: {} 69 | value: {} 70 | -------------------------------------------------------------------------------- /container/assembly-evaluation/reference-based-assembly-evaluation/input_schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: "http://json-schema.org/draft-04/schema#" 3 | title: "Bioboxes short read assembler input file validator" 4 | type: "object" 5 | properties: 6 | version: 7 | type: "string" 8 | pattern: "^0.9.\\d+$" 9 | arguments: 10 | type: "array" 11 | minItems: 1 12 | maxItems: 3 13 | items: 14 | oneOf: 15 | - 16 | $ref: "#/definitions/fasta" 17 | - 18 | $ref: "#/definitions/fasta_dir" 19 | - 20 | $ref: "#/definitions/cache" 21 | required: 22 | - "version" 23 | - "arguments" 24 | additionalProperties: false 25 | definitions: 26 | fasta: 27 | type: "object" 28 | additionalProperties: false 29 | required: 30 | - "fasta" 31 | properties: 32 | fasta: 33 | $ref: "#/definitions/values" 34 | cache: 35 | type: "object" 36 | additionalProperties: false 37 | required: 38 | - "cache" 39 | properties: 40 | cache: {} 41 | fasta_dir: 42 | type: "object" 43 | additionalProperties: false 44 | required: 45 | - "fasta_dir" 46 | properties: 47 | fasta_dir: {} 48 | values: 49 | type: "array" 50 | 
uniqueItems: true 51 | minItems: 1 52 | items: 53 | type: "object" 54 | additionalProperties: false 55 | required: 56 | - "id" 57 | - "value" 58 | properties: 59 | id: {} 60 | type: {} 61 | value: {} 62 | -------------------------------------------------------------------------------- /container/README.mkd: -------------------------------------------------------------------------------- 1 | # Current and Previous Container Specifications 2 | 3 | This is a list of permanent links to the latest major specification versions in the git history. 4 | 5 | ## Assembly 6 | 7 | [Assembly container version 0.9](https://github.com/bioboxes/rfc/blob/f551c515b5f7f1db4e18282207dd89f9bcf3ea25/container/short-read-assembler/rfc.mkd) 8 | 9 | [Assembly container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/short-read-assembler.mkd) 10 | 11 | [Assembly benchmarking container version 0.9](https://github.com/bioboxes/rfc/blob/1a3e2f14188dcd841cdc82e5b442798eb7d795f2/container/assembly-evaluation/rfc.mkd) 12 | 13 | [Assembly reference-based benchmarking container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/genome-assembly-reference-benchmarking.mkd) 14 | 15 | ## Binning 16 | 17 | [Binning container version 0.9](https://github.com/bioboxes/rfc/blob/3835b5721dc03f2fc10d8c9139f7f201ced7ccfe/container/binning/rfc.mkd) 18 | 19 | [Binning container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/binning.mkd) 20 | 21 | [Binning evaluation container version 0.8](https://github.com/bioboxes/rfc/blob/1aaeffc1f97dbea1ec05016a4a45e739dd47e20e/container/binning-evaluation.mkd) 22 | 23 | ## Profiling 24 | 25 | [Profiling container version 0.8.2] 26 | (https://github.com/bioboxes/rfc/blob/f03f05e70972aa2eb3716f57f017d1ec704a84f2/container/profiling.mkd) 27 | 28 | [Profiling container version 
0.8](https://github.com/bioboxes/rfc/blob/5a23b8a40ab67541a9a851c765872aea5c0336f9/container/profiling.mkd) 29 | -------------------------------------------------------------------------------- /data-format/evaluation.mkd: -------------------------------------------------------------------------------- 1 | # Evaluation Output Specification 2 | 3 | * Version: 0.1.1 4 | * Maintainer: Peter Belmann pbelmann@cebitec.uni-bielefeld.de 5 | * Authors: Ivan Gregor Ivan.Gregor@uni-duesseldorf.de, Johannes Dröge johannes.droege@uni-duesseldorf.de, Michael Barton mail@michaelbarton.me.uk, Peter Belmann pbelmann@cebitec.uni-bielefeld.de 6 | 7 | This document contains a definition for a yaml that is produced by an evaluation container. 8 | 9 | ### General Definition 10 | 11 | This YAML has the following structure: 12 | 13 | ```YAML 14 | --- 15 | version: NUMBER.NUMBER.NUMBER 16 | results: ARRAY 17 | ``` 18 | 19 | * version: Version number must match the regular expression `[0-9\.]` 20 | 21 | * results: The results property must have an array as value. Each results item consists of the 22 | following properties: 23 | 24 | ### Results Item 25 | 26 | * name: 27 | 28 | Title is an arbitrary String. 29 | 30 | * type: 31 | 32 | Type attribute has the following structure: 33 | 34 | ```YAML 35 | type: txt|png|html|tsv|csv 36 | ``` 37 | 38 | * inline: 39 | 40 | Indicates whether the metric is represented inline in the **value** field or in an external file. 41 | 42 | * value: 43 | 44 | If the **inline** property is false then the value is the absolute path to a file. 45 | If the **inline** property is true then the value contains the metric. 46 | 47 | * description 48 | 49 | Description for the evaluation method. 50 | 51 | ## Example 52 | 53 | ```YAML 54 | version: 0.1.1 55 | results: 56 | - name: N50 57 | description: N50 is the length for which the collection of all contigs of that length or longer covers at least half an assembly. 
58 | value: 42 59 | type: txt 60 | inline: true 61 | - name: my metric 62 | type: csv 63 | inline: false 64 | description: Method that produces confusion matrices 65 | value: /path/to/file 66 | ``` 67 | -------------------------------------------------------------------------------- /unsupervised_binning/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Binning evaluation container 2 | 3 | * Version: 0.11.0 4 | * Maintainer: Peter Belmann 5 | 6 | ### Contents 7 | 8 | 9 | ### Outline 10 | 11 | This specification describes the interface for containerised binning evaluation applications. 12 | An unsupervised binning evaluation application should validate the assigned BINID 13 | (https://github.com/bioboxes/rfc/blob/master/data-format/binning.mkd#the-binning-output-format) 14 | of a binning container. 15 | 16 | ### Input 17 | 18 | #### General Definition 19 | 20 | A biobox requires an input YAML with the following definition 21 | 22 | ```YAML 23 | --- 24 | version: NUMBER.NUMBER.NUMBER 25 | arguments: 26 | - fasta: 27 | value: STRING 28 | type: contig 29 | - labels: 30 | value: STRING 31 | type: binning 32 | - predictions: 33 | value: STRING 34 | type: binning 35 | ``` 36 | 37 | ##### Description: 38 | 39 | * **version**: The current version is specified directly under the heading. 40 | * **arguments**: The arguments field consists of the following fields 41 | * **fasta**: The input FASTA file for the binned sequences. 42 | * **labels**: The correct binning in bioboxes.org binning format. 43 | * **predictions**: The predicted binning in bioboxes.org binning format. 
44 | 45 | ##### Mounts: 46 | * Your output directory MUST be mounted to /bbx/mnt/output 47 | * Your input files MUST be mounted to /bbx/mnt/input 48 | * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml 49 | 50 | ##### File/folder object entries 51 | 52 | ```YAML 53 | value: STRING 54 | type: String 55 | ``` 56 | 57 | * `value` means absolute path to file in container 58 | * `type` specifies the semantic type 59 | 60 | ### Output 61 | 62 | The biobox produces on a successful run a `biobox.yaml` and can be found in the `bbx` directory in your mounted output directory. The output biobox.yaml must be in an evaluation specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd). 63 | -------------------------------------------------------------------------------- /container/binning_evaluation/unsupervised_binning/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Binning evaluation container 2 | 3 | * Version: 0.11.0 4 | * Maintainer: Peter Belmann 5 | 6 | ### Contents 7 | 8 | 9 | ### Outline 10 | 11 | This specification describes the interface for containerised binning evaluation applications. 12 | An unsupervised binning evaluation application should validate the assigned BINID 13 | (https://github.com/bioboxes/rfc/blob/master/data-format/binning.mkd#the-binning-output-format) 14 | of a binning container. 15 | 16 | ### Input 17 | 18 | #### General Definition 19 | 20 | A biobox requires an input YAML with the following definition 21 | 22 | ```YAML 23 | --- 24 | version: NUMBER.NUMBER.NUMBER 25 | arguments: 26 | - fasta: 27 | value: STRING 28 | type: contig 29 | - labels: 30 | value: STRING 31 | type: binning 32 | - predictions: 33 | value: STRING 34 | type: binning 35 | ``` 36 | 37 | ##### Description: 38 | 39 | * **version**: The current version is specified directly under the heading. 
40 | * **arguments**: The arguments field consists of the following fields 41 | * **fasta**: The input FASTA file for the binned sequences. 42 | * **labels**: The correct binning in bioboxes.org binning format. 43 | * **predictions**: The predicted binning in bioboxes.org binning format. 44 | 45 | ##### Mounts: 46 | * Your output directory MUST be mounted to /bbx/mnt/output 47 | * Your input files MUST be mounted to /bbx/mnt/input 48 | * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml 49 | 50 | ##### File/folder object entries 51 | 52 | ```YAML 53 | value: STRING 54 | type: String 55 | ``` 56 | 57 | * `value` means absolute path to file in container 58 | * `type` specifies the semantic type 59 | 60 | ### Output 61 | 62 | The biobox produces on a successful run a `biobox.yaml` and can be found in the `bbx` directory in your mounted output directory. The output biobox.yaml must be in an evaluation specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd). 
63 | -------------------------------------------------------------------------------- /container/binning_evaluation/taxonomic_binning/input_schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: "http://json-schema.org/draft-04/schema#" 3 | title: "Bioboxes unsupervised binning benchmark input file validator" 4 | type: "object" 5 | additionalProperties: false 6 | required: 7 | - "version" 8 | - "arguments" 9 | properties: 10 | version: 11 | type: "string" 12 | pattern: "^0.10.\\d+$" 13 | arguments: 14 | additionalProperties: false 15 | type: object 16 | required: 17 | - labels 18 | - predictions 19 | properties: 20 | sequences: 21 | type: "object" 22 | required: 23 | - "path" 24 | - "format" 25 | properties: 26 | format: 27 | enum: 28 | - "bioboxes.org:/fasta" 29 | id: {} 30 | type: 31 | enum: 32 | - "contig" 33 | path: {} 34 | labels: 35 | type: "object" 36 | required: 37 | - "format" 38 | - "path" 39 | properties: 40 | format: 41 | enum: 42 | - "bioboxes.org:/binning/binning:0.9/taxbinning" 43 | id: {} 44 | type: 45 | enum: 46 | - "binning" 47 | path: {} 48 | predictions: 49 | type: "object" 50 | required: 51 | - "path" 52 | - "format" 53 | properties: 54 | format: 55 | enum: 56 | - "bioboxes.org:/binning/binning:0.9/taxbinning" 57 | id: {} 58 | type: 59 | enum: 60 | - "binning" 61 | path: {} 62 | databases: 63 | type: "object" 64 | properties: 65 | taxonomy: 66 | type: object 67 | required: 68 | - "path" 69 | - "format" 70 | properties: 71 | format: 72 | enum: 73 | - "bioboxes.org:/taxonomy_ncbi_dumps" 74 | id: {} 75 | type: 76 | enum: 77 | - "ncbi" 78 | path: {} 79 | -------------------------------------------------------------------------------- /README.mkd: -------------------------------------------------------------------------------- 1 |

2 | 3 | ## Outline 4 | 5 | Software containers have the potential to solve the common problem in 6 | bioinformatics where complex dependencies can make installing and using a tool 7 | difficult. Containerisation allows any developer to include all the required 8 | dependencies along with their tool to provide the end-user with everything they 9 | need to start using it. 10 | 11 | Two existing projects have taken advantage of this concept to benchmark 12 | bioinformatics software inside containers: [CAMI](http://cami-challenge.org/) 13 | and [nucleotid.es](http://nucleotid.es). We, the developers from these two 14 | projects, met to agree a standard so that containers created by one project 15 | would be usable by another through the same interface. 16 | 17 | The aim of this RFC is to create a standard for well-defined bioinformatics 18 | applications. This standard will put the users of bioinformatics software 19 | first, so that a community-agreed interface allows the use of different tools 20 | regardless of where or by whom it was developed. We welcome contributions and 21 | suggestions from other developers with aim of creating a standard that everyone 22 | can follow and agree on. 23 | 24 | ## Development process 25 | 26 | The development process for bioboxes is outlined on the bioboxes.org and has 27 | information for [beginners to get started with bioboxes][started] and [how to 28 | make contributions to bioboxes][contribute]. 29 | 30 | [started]: http://bioboxes.org/guide/user/ 31 | [contribute]: http://bioboxes.org/contribute/getting-started/ 32 | 33 | ### Core team 34 | 35 | There is a core team who work on developing the bioboxes RFCs. The core team's 36 | goal is to develop the RFC and resolve issues. The members should generally be 37 | selected from those who are already actively involved in bioboxes. The size of 38 | the core team should be small to enable decisions to be made quickly. 
A core 39 | team member should be willing to: 40 | 41 | * Actively follow, resolve issues and answer questions on the bioboxes github 42 | issue tracker. 43 | * Meet for a 30 minute teleconference on a biweekly basis to discuss the 44 | progress of bioboxes. 45 | -------------------------------------------------------------------------------- /container/profiling-evaluation/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Profiling evaluation container 2 | 3 | * Version: 0.1.0 4 | * Maintainer: Peter Belmann 5 | 6 | ### Contents 7 | 8 | ### Outline 9 | 10 | This specification describes the interface for containerised profiling evaluation applications. 11 | In addition to the specifications described below, this container MUST implement the 12 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 13 | 14 | ### Input 15 | 16 | #### General Definition 17 | 18 | A biobox requires an input YAML with the following definition 19 | 20 | ```YAML 21 | --- 22 | version: NUMBER.NUMBER.NUMBER 23 | arguments: 24 | prediction: 25 | path: STRING 26 | format: bioboxes.org:/profiling:0.9 27 | ground_truth: 28 | path: STRING 29 | format: bioboxes.org:/profiling:0.9 30 | ``` 31 | 32 | ##### Description: 33 | 34 | * **version**: The current version is specified directly under the heading. 35 | * **arguments**: The arguments field consists of the following fields 36 | * **prediction**: Profiling prediction in bioboxes.org profiling format. 37 | * **ground_truth**: Profiling ground truth/gold standard in bioboxes.org profiling format. 38 | * **path**: Path MUST begin with a slash ('/'), which points to a profiling file. This file has to be mounted to a path that is prefixed by `/bbx/mnt/input`. 
39 | 40 | ##### Mounts: 41 | * Your output directory MUST be mounted to /bbx/mnt/output 42 | * Your input files MUST be mounted to /bbx/mnt/input 43 | * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml 44 | * If the directory /bbx/mnt/metadata is mounted then the following files should be placed inside the directory: 45 | log.txt Logging information that is generated by the application inside the container. 46 | 47 | ##### Formats 48 | * `bioboxes.org:/profiling:0.9`: bioboxes.org profiling file in version 0.9 49 | 50 | ### Output 51 | The biobox produces on a successful run a `biobox.yaml` and can be found in your mounted output directory. The output biobox.yaml must be in an evaluation specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd). 52 | 53 | ### Example 54 | 55 | ```YAML 56 | --- 57 | version: 0.1.0 58 | arguments: 59 | prediction: 60 | path: /bbx/mnt/input/prediction.txt 61 | format: bioboxes.org:/profiling:0.9 62 | ground_truth: 63 | path: /bbx/mnt/input/ground_truth.txt 64 | format: bioboxes.org:/profiling:0.9 65 | ``` 66 | -------------------------------------------------------------------------------- /container/binning/input_schema.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: "http://json-schema.org/draft-04/schema#" 3 | title: "Bioboxes binning input file validator" 4 | type: "object" 5 | properties: 6 | version: 7 | type: "string" 8 | pattern: "^0.9.\\d+$" 9 | arguments: 10 | type: "array" 11 | minItems: 1 12 | maxItems: 4 13 | uniqueItems: true 14 | items: 15 | - 16 | $ref: "#/definitions/fasta" 17 | additionalItems: 18 | anyOf: 19 | - 20 | $ref: "#/definitions/fastq" 21 | - 22 | $ref: "#/definitions/databases" 23 | - 24 | $ref: "#/definitions/cache" 25 | required: 26 | - "version" 27 | - "arguments" 28 | additionalProperties: false 29 | definitions: 30 | fasta: 31 | type: "object" 32 | additionalProperties: false 33 | required: 34 | - "fasta" 
35 | properties: 36 | fasta: 37 | type: "object" 38 | additionalProperties: false 39 | required: 40 | - "id" 41 | - "value" 42 | properties: 43 | id: {} 44 | type: {} 45 | value: {} 46 | fastq: 47 | type: "object" 48 | additionalProperties: false 49 | required: 50 | - "fastq" 51 | properties: 52 | fastq: 53 | $ref: "#/definitions/values" 54 | cache: 55 | type: "object" 56 | additionalProperties: false 57 | required: 58 | - "cache" 59 | properties: 60 | cache: {} 61 | databases: 62 | type: "object" 63 | additionalProperties: false 64 | required: 65 | - "databases" 66 | properties: 67 | databases: 68 | $ref: "#/definitions/database_values" 69 | values: 70 | type: "array" 71 | uniqueItems: true 72 | minItems: 1 73 | items: 74 | type: "object" 75 | additionalProperties: false 76 | required: 77 | - "id" 78 | - "value" 79 | properties: 80 | id: {} 81 | type: {} 82 | value: {} 83 | database_values: 84 | type: "array" 85 | uniqueItems: true 86 | minItems: 1 87 | items: 88 | type: "object" 89 | additionalProperties: false 90 | required: 91 | - "id" 92 | - "value" 93 | properties: 94 | id: 95 | enum: 96 | - "ncbi_taxonomy" 97 | - "refseq" 98 | - "blastdb" 99 | - "cog" 100 | - "ncbi_genomes" 101 | value: {} 102 | -------------------------------------------------------------------------------- /container/binning_evaluation/taxonomic_binning/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Taxonomic binning evaluation container 2 | 3 | * Version: 0.10.0 4 | * Maintainer: Peter Belmann 5 | 6 | ### Contents 7 | 8 | * Outline 9 | * Input 10 | * General Definition 11 | * Description 12 | * File/folder object entries 13 | * Formats 14 | * Output 15 | 16 | ### Outline 17 | 18 | This specification describes the interface for containerised binning evaluation applications. 
19 | A binning evaluation application should validate the assigned TAXID 20 | (https://github.com/bioboxes/rfc/blob/master/data-format/binning.mkd#the-binning-output-format) 21 | of a binning container. 22 | 23 | In addition to the specifications described below, this container MUST implement the 24 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 25 | 26 | 27 | ### Input 28 | 29 | #### General Definition 30 | 31 | A biobox requires an input YAML with the following definition 32 | 33 | ```YAML 34 | --- 35 | version: NUMBER.NUMBER.NUMBER 36 | arguments: 37 | sequences: 38 | path: STRING 39 | id: STRING 40 | type: contig 41 | format: bioboxes.org:/fasta 42 | labels: 43 | path: STRING 44 | id: STRING 45 | type: binning 46 | format: bioboxes.org:/binning/binning:0.9/taxbinning 47 | predictions: 48 | path: STRING 49 | id: STRING 50 | type: binning 51 | format: bioboxes.org:/binning/binning:0.9/taxbinning 52 | databases: 53 | taxonomy: 54 | path: STRING 55 | id: STRING 56 | type: ncbi 57 | format: bioboxes.org:/taxonomy_ncbi_dumps 58 | ``` 59 | 60 | ##### Description: 61 | 62 | * **version**: The current version is specified directly under the heading. 63 | * **arguments**: The arguments field consists of the following fields 64 | * **sequences**: The input FASTA file for the binned sequences. 65 | * **labels**: The correct binning in bioboxes.org (taxonomic) binning format. 66 | * **predictions**: The predicted binning in bioboxes.org (taxonomic) binning format. 67 | * databases 68 | * **taxonomy**: The corresponding taxonomy in NCBI format. 
69 | 70 | ##### Mounts: 71 | * Your output directory MUST be mounted to /bbx/mnt/output 72 | * Your input files MUST be mounted to /bbx/mnt/input 73 | * The .yaml MUST be placed as /bbx/mnt/input/biobox.yaml 74 | 75 | ##### File/folder object entries 76 | 77 | ```YAML 78 | path: STRING 79 | id: STRING 80 | type: String 81 | format: STRING 82 | ``` 83 | 84 | * `path` means absolute path to file in container 85 | * `id` is a unique id for the file (optional) 86 | * `type` specifies the semantic type (optional) 87 | * `format` gives a machine-checkable type definition (will be transformed to YAML tag in future) 88 | 89 | ##### Formats 90 | * `fasta`: FASTA file 91 | * `bioboxes.org:/binning:0.9/taxbinning`: bioboxes.org binning file in version 0.9 with column TAXID 92 | * `bioboxes.org:/taxonomy_ncbi_dumps`: A folder containing at least the NCBI taxonomy dump files names.dmp and nodes.dmp 93 | 94 | ### Output 95 | 96 | The biobox produces on a successful run a `biobox.yaml` and can be found in the `bbx` directory in your mounted output directory. The output biobox.yaml must be in an evaluation specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd). 97 | 98 | -------------------------------------------------------------------------------- /container/profiling/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Profiling container 2 | 3 | * Version: 1.0.0 4 | * Maintainer: Johannes Dröge 5 | 6 | ### Contents 7 | 8 | * Outline 9 | * Input 10 | * General Definition 11 | * Description 12 | * Mounts 13 | * File/Folder Object Definition 14 | * Output 15 | * General Definition 16 | * Description 17 | * Mounts 18 | * Signature 19 | * Example 20 | 21 | ### Outline 22 | 23 | This specification describes the interface for containerised profiling applications. 
24 | A profiling application gives an insight into the composition of the microbial community by assigning percentage values to taxonomic identifiers. 25 | 26 | In addition to the specifications described below, this container MUST implement the 27 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 28 | 29 | ### Input 30 | 31 | #### General Definition 32 | 33 | A biobox requires an input YAML with the following definition 34 | 35 | ~~~YAML 36 | version: 1.0.0 37 | arguments: 38 | - fastq: 39 | - type: fastq 40 | value: STRING 41 | - database: 42 | type: bioboxes.org:/taxonomy_ncbi_dumps 43 | value: STRING 44 | - cache: 45 | type: directory 46 | value: STRING 47 | ~~~ 48 | 49 | ##### Description: 50 | 51 | * **version**: The current version is specified directly under the heading. 52 | * **arguments**: The arguments field consists of the following fields 53 | * **fastq**: An array of gzipped fastq sequence libraries. 54 | * **database**: The taxonomy database. A directory containing nodes.dmp and names.dmp. 55 | * **cache**: Path to a cache directory. 56 | 57 | ##### Mounts: 58 | * Your output directory MUST be mounted to /bbx/mnt/output 59 | * Your input files MUST be mounted to /bbx/mnt/input 60 | * The input biobox.yaml MUST be placed as /bbx/mnt/input/biobox.yaml 61 | 62 | ##### File/Folder object entries 63 | 64 | ```YAML 65 | type: STRING 66 | value: STRING 67 | ``` 68 | 69 | * `value` means absolute path to file in container 70 | * `type` gives a machine-checkable type definition 71 | 72 | ### Outputs 73 | 74 | #### General Definition 75 | 76 | ~~~YAML 77 | --- 78 | version: NUMBER.NUMBER.NUMBER 79 | arguments: 80 | profiling: 81 | - value: STRING 82 | type: bioboxes.org:/profiling:0.9 83 | ~~~ 84 | 85 | #### Description: 86 | 87 | This yaml with the name biobox.yaml will be available on a successful run and can be found in your mounted output directory.
88 | 89 | * version: The current version is specified directly under the heading. 90 | * arguments: The arguments field consists out of the profiling field 91 | 92 | #### Mounts: 93 | 94 | If the directory /bbx/metadata is mounted then the following files should be placed inside the directory: 95 | log.txt Logging information that is generated by the application inside the container. 96 | 97 | ### Signature 98 | 99 | Any biobox based profiling tool accepts at least one of the following signatures: 100 | 101 | fastq A, database B, Maybe cache -> profiling C 102 | 103 | ### Example 104 | 105 | This is an example biobox.yaml file: 106 | 107 | ~~~YAML 108 | version: 1.0.0 109 | arguments: 110 | - fastq: 111 | - type: fastq 112 | value: /path/to/fastq 113 | - database: 114 | type: bioboxes.org:/taxonomy_ncbi_dumps 115 | value: /path/to/ncbi_dump 116 | - cache: 117 | type: directory 118 | value: /path/to/cache/directory 119 | ~~~ 120 | -------------------------------------------------------------------------------- /container/assembly-evaluation/reference-based-assembly-evaluation/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Genome assembly benchmarking container 2 | 3 | * Version: 0.9.0 4 | * Maintainer: Peter Belmann 5 | 6 | ### Contents 7 | * Outline 8 | * Inputs 9 | * General Definition 10 | * Description 11 | * Mounts 12 | * fasta 13 | * fasta_dir 14 | * cache 15 | * Outputs 16 | * evaluation 17 | * Signature 18 | * Example 19 | 20 | ### Outline 21 | 22 | This specification describes the interface for containerised software to 23 | evaluate a genome assembly in FASTA format using optional multiple reference genome sequences in 24 | FASTA format. Genome assemblers vary in efficiency and the quality of an assembly 25 | may be evaluated by comparing it to a higher quality reference of the same 26 | genome. 
In addition to the specifications described below, this container MUST 27 | implement all specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 28 | 29 | ### Inputs 30 | 31 | #### General Definition 32 | 33 | A biobox requires an input YAML that follows the below definition and is valid according to [this](https://github.com/bioboxes/rfc/blob/master/container/short-read-assembler/input_schema.yaml) schema. 34 | 35 | ```YAML 36 | --- 37 | version: NUMBER.NUMBER.NUMBER 38 | arguments: 39 | - fasta: LIST 40 | - fasta_dir: STRING 41 | - cache: STRING 42 | ``` 43 | 44 | ##### Description: 45 | * **version**: The current version is specified directly under the heading. 46 | * **arguments**: The arguments field consists out of the following fields 47 | * fasta 48 | * fasta_dir 49 | 50 | You can find a definition for every field below this section. 51 | 52 | ##### Mounts: 53 | * The .yaml MUST be mounted to /bbx/input/biobox.yaml. 54 | * Your output directory MUST be mounted to /bbx/output. 55 | * Your input files MUST be mounted to /bbx/input. 56 | 57 | #### cache definition (optional): 58 | 59 | ```YAML 60 | value: STRING 61 | ``` 62 | 63 | ##### Description: 64 | * **value**: Path to a writeable mounted directory. If mounted the tool will place intermediate results in this directory and reuse them on a second run. 65 | 66 | #### fasta definition : 67 | 68 | ```YAML 69 | - value: STRING 70 | id: STRING or NUMBER 71 | type: contig or scaffold 72 | ``` 73 | 74 | ##### Description: 75 | * **value**: Path MUST begin with a slash ('/'), which points to gzipped FASTA file. This file has to be mounted to a path that is prefixed by `/bbx/input`. 76 | * **id**: A unique id for every entry in the fasta list. 
77 | * **type**: Two options: 78 | * **contig** 79 | * **scaffold** 80 | 81 | #### fasta_dir definition (optional): 82 | 83 | ```YAML 84 | value: STRING 85 | ``` 86 | 87 | ##### Description: 88 | * **value**: This variable specifies the absolute path to a directory containing FASTA formatted reference sequence files from the same origin as the fastas specified in the fasta entry. 89 | 90 | ### Outputs 91 | 92 | On a successful run the biobox produces a `biobox.yaml`, which can be found in the `bbx` directory in your mounted output directory. The output biobox.yaml must be in an evaluation specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd). 93 | 94 | ##### Mounts: 95 | * If the directory `/bbx/metadata` is mounted then the following files should be placed inside the directory: 96 | * `log.txt` Logging information that is generated by the application inside the container. 97 | 98 | ### Signature 99 | 100 | Any biobox based assembler accepts at least one of the following signatures: 101 | 102 | 1.
`[fasta A], [Maybe fasta_dir B] -> evaluation C` 103 | 104 | where 105 | * `Maybe` indicates an optional value 106 | 107 | ### Example 108 | This is an example biobox.yaml file: 109 | 110 | ```YAML 111 | --- 112 | version: 0.9.0 113 | arguments: 114 | - fasta: 115 | - value: "/path/to/lib1" 116 | id: "pe_1" 117 | type: "contig" 118 | - fasta_dir: "/path/to/dir/with/references" 119 | ``` 120 | -------------------------------------------------------------------------------- /container/assembly-evaluation/read-based-assembly-evaluation/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Read based assembly benchmarking container 2 | 3 | * Version: 0.2.0 4 | * Maintainer: Peter Belmann 5 | 6 | ### Contents 7 | * Outline 8 | * Inputs 9 | * General Definition 10 | * Description 11 | * Mounts 12 | * assemblies 13 | * reads 14 | * cache 15 | * Outputs 16 | * evaluation 17 | * Signature 18 | * Example 19 | 20 | ### Outline 21 | 22 | This specification describes the interface for containerised software to 23 | evaluate a genome assembly in FASTA format using read data in FASTQ format. 24 | In addition to the specifications described below, this container MUST 25 | implement all specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 26 | 27 | ### Input 28 | 29 | #### General Definition 30 | 31 | A biobox requires an input YAML with the following definition 32 | 33 | ```YAML 34 | --- 35 | version: NUMBER.NUMBER.NUMBER 36 | arguments: 37 | assemblies: LIST 38 | reads: LIST 39 | cache: STRING 40 | ``` 41 | 42 | ##### Description: 43 | * **version**: The current version is specified directly under the heading. 44 | * **arguments**: The arguments field consists out of the following fields 45 | * assemblies A list of assembly files in fasta format. 
46 | * reads Fastqs 47 | * cache 48 | You can find a definition for every field below this section. 49 | 50 | ##### Mounts: 51 | * The .yaml MUST be mounted to /bbx/mnt/input/biobox.yaml. 52 | * Your output directory MUST be mounted to /bbx/mnt/output. 53 | * Your input files MUST be mounted to /bbx/mnt/input. 54 | 55 | #### assemblies definition: 56 | ```YAML 57 | - path: STRING 58 | id: STRING or NUMBER 59 | type: contig or scaffold 60 | format: bioboxes.org:/fasta 61 | ``` 62 | 63 | ##### Description: 64 | * **path**: Path MUST begin with a slash ('/'), which points to FASTA file. This file has to be mounted to a path that is prefixed by `/bbx/mnt/input`. 65 | * **id**: A unique id for every entry in the fasta list (optional). 66 | * **type**: Two options: 67 | * **contig** 68 | * **scaffold** 69 | 70 | #### reads definition: 71 | ```YAML 72 | - path: STRING 73 | id: STRING or NUMBER 74 | type: paired or single 75 | format: bioboxes.org:/fastq 76 | ``` 77 | 78 | ##### Description: 79 | * **path**: Path MUST begin with a slash ('/'), which points to gzipped FASTQ file. This file has to be mounted to a path that is prefixed by `/bbx/mnt/input`. 80 | * **id**: A unique id for every entry in the fastq list (optional). 81 | * **type**: Two options: 82 | * paired: Paired end fastq reads. By choosing this type, the **path** field has to be interleaved gzipped fastq. 83 | * single: Single end gzipped fastq reads. 84 | 85 | #### cache definition (optional): 86 | 87 | ```YAML 88 | cache: STRING 89 | ``` 90 | 91 | ##### Description: 92 | * **cache**: Path to a writeable mounted directory. If mounted the tool will place intermediate results in this directory and reuse them on a second run. 93 | 94 | ### Outputs 95 | 96 | On a successful run the biobox produces a `biobox.yaml`, which can be found in the `bbx` directory in your mounted output directory.
The output biobox.yaml must be in an evaluation specific [format](https://github.com/bioboxes/rfc/blob/master/data-format/evaluation.mkd). 97 | 98 | ##### Mounts: 99 | 100 | * If the directory `/bbx/mnt/metadata` is mounted then the following files should be placed inside the directory: 101 | * `log.txt` Logging information that is generated by the application inside the container. 102 | 103 | ##### Formats 104 | * `fasta`: FASTA file 105 | * `fastq`: FASTQ file 106 | 107 | ### Signature 108 | 109 | Any biobox based assembler accepts at least one of the following signatures: 110 | 111 | 1. `[fasta A], [fastq B] -> evaluation C` 112 | 113 | ### Example 114 | This is an example biobox.yaml file: 115 | 116 | ```YAML 117 | --- 118 | version: 0.2.0 119 | arguments: 120 | assemblies: 121 | - path: /path/to/assembly1/fasta 122 | id: ray 123 | type: contig 124 | format: bioboxes.org:/fasta 125 | reads: 126 | - path: /path/to/short/read.fastq.gz 127 | id: lib1 128 | type: paired 129 | format: bioboxes.org:/fastq 130 | ``` 131 | -------------------------------------------------------------------------------- /rfc.mkd: -------------------------------------------------------------------------------- 1 | ## bioboxes - Standards for Interoperable Bioinformatics Containers 2 | 3 | * Version: 0.8.1 4 | * Maintainer: Michael Barton 5 | 6 | ## Introduction 7 | 8 | The purpose of this and subsequent documents is to provide a detailed specification 9 | for developers to write standardised bioinformatics containers. The goal of 10 | this document is to define a standard whereby bioinformatics software 11 | containers of the same type are interoperable and therefore can be used 12 | interchangeably. The audience of this document are bioinformaticians and 13 | developers writing bioinformatics software shared using Linux containers.
This 14 | document will describe the interface that MUST be provided to a running 15 | container and that a developer of the bioinformatics container MUST write their 16 | software against. 17 | 18 | The scope of this standard is bioinformatics software packaged using Linux 19 | containers. Bioinformatics software in a Linux container can be shared and 20 | provided to third parties because software dependencies are included within the 21 | container. Examples of bioinformatics software are genome assemblers, read 22 | binners and read aligners. Examples of container software are Docker, Rocket 23 | and LXC/LXD. Standardising bioinformatics software in containers allows 24 | interchangeable use between different research groups and institutions. 25 | 26 | Applications of this standardisation are: 27 | 28 | * A developer uploads his short read aligner as a container to an online 29 | repository for others to use. A biologist downloads this aligner and is 30 | able to use it immediately as it follows a standardised interface that the 31 | biologist is already familiar with. 32 | * A genome assembly benchmarking service downloads many genome assembler 33 | containers. These containers are evaluated using assembly performance 34 | metrics. The standardised interface allows all containers to be benchmarked 35 | the same way. 36 | * A large sequencing centre invests time to develop an improved genome 37 | assembly pipeline for single cell data. The pipeline is packaged inside a 38 | Linux container and shared with the bioinformatics community. Another large 39 | sequencing centre is able to immediately compare this new pipeline with 40 | their in-house pipeline using the same container interface. 41 | 42 | ### Notational Conventions 43 | 44 | * The key words “MUST”, “MUST NOT”, “REQUIRED”, “SHALL”, “SHALL NOT”, “SHOULD”, 45 | “SHOULD NOT”, “RECOMMENDED”, “MAY”, and “OPTIONAL” in this document are to be 46 | interpreted as described in [RFC2119].
47 | 48 | * PAIRED: Paired reads are defined as the organisation of a FASTA or FASTQ file 49 | where the Nth and Nth+1 reads originate from opposite ends of the same DNA 50 | fragment, where N % 2 == 0 using 0-based indexing. 51 | 52 | ## Generic bioinformatics container 53 | 54 | This specification describes the required inputs for all containerised 55 | bioinformatics software, independent of the application type. 56 | 57 | ### Inputs 58 | 59 | * **TASK**: The argument given to start a container MUST be a single string 60 | containing only the characters A-Z, a-z, 0-9, '_' and '-'. This argument is 61 | used to differentiate different combination of settings the containerised 62 | software can be run as. Every container SHOULD support a 'default' task. This 63 | runs the container in a mode that is applicable to the most common situation 64 | in which the software is used. 65 | 66 | ### Outputs 67 | 68 | The containerised software MUST return a zero exit code when completing 69 | successfully, and return a non-zero exit code when an error occurs. 70 | 71 | ## Databases 72 | 73 | This section describes the variables containing the paths to various databases. 74 | 75 | ### Variables 76 | 77 | * **CONT_DATABASES_DIR**: This variable specifies the absolute path to a [directory](databases_structure.txt) that contains the following databases: 78 | * COG 79 | * NCBI Genomes 80 | * Refseq 81 | * BLAST DBs 82 | 83 | ## Normative References 84 | 85 | * [RFC2119] Bradner, S., “Key words for use in RFCs to Indicate Requirement 86 | Levels”, BCP 14, RFC 2119, March 1997. 
87 | 88 | # Authors's Addresses 89 | 90 | * Michael Barton 91 | * Peter Belmann 92 | * Andreas Bremges 93 | * Johannes Dröge 94 | * Alexander Sczyrba 95 | -------------------------------------------------------------------------------- /container/short-read-assembler/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Short-read genome assembler container 2 | 3 | * Version: 0.9.3 4 | * Maintainer: Michael Barton 5 | 6 | ### Contents 7 | * Outline 8 | * Inputs 9 | * General Definition 10 | * Description 11 | * Mounts 12 | * fastq 13 | * fragment_size 14 | * Outputs 15 | * fasta 16 | * Signature 17 | * Example 18 | 19 | ### Outline 20 | 21 | This specification describes the interface for containerised short-read genome 22 | assemblers. A genome assembler converts one or more FASTQ files of DNA short 23 | reads into larger contiguous ('contigs') regions of DNA. In addition to the 24 | specifications described below, this container MUST implement the 25 | specifications defined in ['Generic bioinformatics container'](https://github.com/bioboxes/rfc/blob/master/rfc.mkd#generic-bioinformatics-container). 26 | 27 | ### Inputs 28 | 29 | #### General Definition 30 | 31 | A biobox requires an input YAML that follows the below definition and is valid according to [this](https://github.com/bioboxes/rfc/blob/master/container/short-read-assembler/input_schema.yaml) schema. 32 | 33 | ```YAML 34 | --- 35 | version: NUMBER.NUMBER.NUMBER 36 | arguments: 37 | - fastq: LIST 38 | - fragment_size: LIST 39 | ``` 40 | 41 | ##### Description: 42 | * **version**: The current version is specified directly under the heading. 43 | * **arguments**: The arguments field consists out of the following fields 44 | * fastq 45 | * fragment_size 46 | 47 | You can find a definition for every field below this section. 48 | 49 | ##### Mounts: 50 | * The .yaml MUST be mounted to /bbx/input/biobox.yaml. 51 | * Your output directory MUST be mounted to /bbx/output. 
52 | * Your input files MUST be mounted to /bbx/input. 53 | 54 | #### fastq definition: 55 | ```YAML 56 | - value: STRING 57 | id: STRING or NUMBER 58 | type: paired or single 59 | ``` 60 | 61 | ##### Description: 62 | * **value**: Path MUST begin with a slash ('/'), which points to gzipped FASTQ file. This file has to be mounted to a path that is prefixed by `/bbx/input`. 63 | * **id**: A unique id for every entry in the fastq list. 64 | * **type**: Two options: 65 | * paired: Paired end fastq reads. By choosing this type the **value** field has to be interleaved gzipped fastq. 66 | * single: Single end fastq reads. 67 | 68 | #### fragment_size definition: 69 | ```YAML 70 | - id: STRING 71 | value: NUMBER 72 | ``` 73 | 74 | ##### Description: 75 | * **id**: The specified id MUST match exactly one entry in the fastq entry list. 76 | * **value**: Number for the fragment size. 77 | 78 | ### Outputs 79 | 80 | #### General Definition 81 | 82 | ```YAML 83 | --- 84 | version: NUMBER.NUMBER.NUMBER 85 | arguments: 86 | - fasta: LIST 87 | ``` 88 | 89 | ##### Description: 90 | This yaml with the name `biobox.yaml` will be available on a successful run in your mounted output directory. 91 | 92 | * **version**: The current version is specified directly under the heading. 93 | * **arguments**: The arguments field consists of the **fasta** field 94 | 95 | ##### Mounts: 96 | * If the directory `/bbx/metadata` is mounted then the following files should be placed inside the directory: 97 | * `log.txt` Logging information that is generated by the application inside the container. 98 | 99 | #### fasta definition: 100 | 101 | ```YAML 102 | - value: STRING 103 | id: STRING or NUMBER 104 | type: contig or scaffold 105 | ``` 106 | 107 | ##### Description: 108 | * **value**: This is the path to a fasta file containing the contigs relative to your mounted output directory. 109 | * **id**: A unique id for every entry in the fasta list.
110 | * **type**: Two options: 111 | * **contig** 112 | * **scaffold** 113 | 114 | ### Signature 115 | 116 | Any biobox based assembler accepts at least one of the following signatures: 117 | 118 | 1. `[fastq A], [Maybe fragment_size A] -> contigs B, scaffolds C` 119 | 2. `[fastq A], [fragment_size A] -> contigs B, scaffolds C` 120 | 121 | where 122 | * `Maybe` indicates an optional value 123 | 124 | ### Example 125 | This is an example biobox.yaml file: 126 | 127 | ```YAML 128 | --- 129 | version: 0.9.0 130 | arguments: 131 | - fastq: 132 | - value: "/path/to/lib1" 133 | id: "pe_1" 134 | type: paired 135 | - value: "/path/to/lib2" 136 | id: "pe_2" 137 | type: paired 138 | - value: "/path/to/lib2" 139 | id: "lmp_1" 140 | type: paired 141 | - fragment_size: 142 | - value: 240 143 | id: pe_1 144 | - value: 5000 145 | id: lmp_1 146 | ``` 147 | -------------------------------------------------------------------------------- /container/binning/rfc.mkd: -------------------------------------------------------------------------------- 1 | ## Binning container 2 | 3 | * Version: 0.9.2 4 | * Maintainer: Peter Belmann 5 | 6 | ### Contents 7 | * Outline 8 | * Inputs 9 | * General Definition 10 | * Description 11 | * Mounts 12 | * fasta 13 | * fastq 14 | * database 15 | * cache 16 | * Outputs 17 | * binning 18 | * Signature 19 | * Example 20 | 21 | ### Outline 22 | 23 | This specification describes the interface for containerised binning applications. A binning application groups reads or contigs and/or assigns them to operational taxonomic units. In addition to the specifications described below, this container MUST implement the specifications defined in 'Generic bioinformatics container'. 24 | ### Inputs 25 | 26 | #### General Definition 27 | 28 | A biobox requires an input YAML that follows the below definition and is valid according to [this](https://github.com/bioboxes/rfc/blob/master/container/binning/input_schema.yaml) schema. 
29 | 30 | ```YAML 31 | --- 32 | version: NUMBER.NUMBER.NUMBER 33 | arguments: 34 | - fasta: 35 | value: STRING 36 | id: STRING 37 | type: STRING 38 | - fastq: LIST 39 | - databases: LIST 40 | - cache: STRING 41 | ``` 42 | 43 | ##### Description: 44 | * **version**: The current version is specified directly under the heading. 45 | * **arguments**: The arguments field consists out of the following fields 46 | * fasta 47 | * fastq 48 | * databases 49 | * cache 50 | You can find a definition for every field below this section. 51 | 52 | ##### Mounts: 53 | * The .yaml MUST be mounted to /bbx/input/biobox.yaml. 54 | * Your output directory MUST be mounted to /bbx/output. 55 | * Your input files MUST be mounted to /bbx/input. 56 | 57 | #### fastq definition: 58 | ```YAML 59 | - value: STRING 60 | id: STRING or NUMBER 61 | type: paired or single 62 | ``` 63 | 64 | ##### Description: 65 | * **value**: Path MUST begin with a slash ('/'), which points to gzipped FASTQ file. This file has to be mounted to a path that is prefixed by `/bbx/input`. 66 | * **id**: A unique id for every entry in the fastq list. 67 | * **type**: Two options: 68 | * paired: Paired end fastq reads. By choosing this type the **value** field hast to be interleaved gzipped fastq. 69 | * single: Single end fastq reads. 70 | 71 | 72 | #### fasta definition: 73 | 74 | ```YAML 75 | value: STRING 76 | id: STRING or NUMBER 77 | type: contig or scaffold 78 | ``` 79 | 80 | ##### Description: 81 | * **value**: Path MUST begin with a slash ('/'), which points to gzipped FASTA file. This file has to be mounted to a path that is prefixed by `/bbx/input`. 82 | * **id**: A unique id for every entry in the fasta list. 83 | * **type**: Two options: 84 | * **contig** 85 | * **scaffold** 86 | 87 | #### databases definition: 88 | 89 | ```YAML 90 | - value: STRING 91 | id: STRING 92 | ``` 93 | 94 | ##### Description: 95 | * **value**: Path to a database directory. 96 | * **id**: Database identifier. 
Each database identifier is a link to the directory structure. You can find the compressed version of the database with the corresponding structure on this ftp site: ftp://cami.psc.edu/ftp.ncbi.nlm.nih.gov/ 97 | * **[refseq](https://github.com/bioboxes/rfc/blob/master/databases/refseq.txt)** 98 | * **[blastdb](https://github.com/bioboxes/rfc/blob/master/databases/blastdb.txt)** 99 | * **[cog](https://github.com/bioboxes/rfc/blob/master/databases/cog.txt)** 100 | * **[ncbi_genomes](https://github.com/bioboxes/rfc/blob/master/databases/ncbi_genomes.txt)** 101 | * **[ncbi_taxonomy](https://github.com/bioboxes/rfc/blob/master/databases/ncbi_taxonomy.txt)** 102 | 103 | #### cache definition: 104 | 105 | ```YAML 106 | value: STRING 107 | ``` 108 | 109 | ##### Description: 110 | * **value**: Path to a writeable mounted directory. If mounted the tool will place intermediate results in this directory and reuse them on a second run. 111 | 112 | ### Outputs 113 | 114 | #### General Definition 115 | 116 | ```YAML 117 | --- 118 | version: NUMBER.NUMBER.NUMBER 119 | arguments: 120 | - binning: 121 | value: STRING 122 | type: Boolean 123 | ``` 124 | 125 | ##### Description: 126 | This yaml with the name `biobox.yaml` will be available on a successful run and can be found in the `bbx` directory in your mounted output directory. 127 | 128 | * **version**: The current version is specified directly under the heading. 129 | * **arguments**: The arguments field consists out of the **binning** field 130 | 131 | ##### Mounts: 132 | * If the directory `/bbx/metadata` is mounted then the following files should be placed inside the directory: 133 | * `log.txt` Logging information that is generated by the application inside the container. 134 | 135 | #### binning definition: 136 | 137 | ```YAML 138 | value: STRING 139 | type: assignments 140 | ``` 141 | 142 | ##### Description: 143 | * **value**: This is the path to a binning file relative to your mounted output directory. 
144 | * **type** 145 | * **assignments** : Binning file contains the computed binning and/or taxonomic groups. 146 | 147 | ### Signature 148 | 149 | Any biobox based binning tool accepts at least one of the following signatures: 150 | 151 | 1. `fasta A -> binning B` 152 | 2. `fasta A, [fastq B] -> binning C` 153 | 154 | ### Example 155 | This is an example biobox.yaml file: 156 | 157 | ```YAML 158 | --- 159 | version: 0.9.0 160 | arguments: 161 | - fasta: 162 | value: "/path/to/lib1" 163 | id: "pe_1" 164 | type: contig 165 | ``` 166 | -------------------------------------------------------------------------------- /data-format/sequence.mkd: -------------------------------------------------------------------------------- 1 | ## Specifications of bioinformatics file formats 2 | 3 | * Version: 0.8.1 4 | * Maintainer: Michael Barton 5 | 6 | ### Outline 7 | 8 | The purpose of this document is to describe the sequence FASTA/Q file formats 9 | used in the bioboxes RFC. There is no formal definition for FASTA or FASTQ in the same way 10 | there are RFC definitions for [JSON][json] and [CSV][csv]. Furthermore it is 11 | beyond the scope of this document to create a strict definition of either. Instead 12 | the community definitions for these formats are included to clarify the terms 13 | FASTQ or FASTA when used in other RFCs. 14 | 15 | [json]: https://tools.ietf.org/html/rfc7159 16 | [csv]: https://tools.ietf.org/html/rfc4180 17 | 18 | ### FASTA 19 | 20 | #### Introduction 21 | 22 | The FASTA format is described in the [BLAST input description][blast]. This is 23 | quoted directly. 24 | 25 | [blast]: http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml 26 | 27 | > A sequence in FASTA format begins with a single-line description, followed by 28 | > lines of sequence data. The description line (defline) is distinguished from 29 | > the sequence data by a greater-than (">") symbol at the beginning.
It is 30 | > recommended that all lines of text be shorter than 80 characters in length. 31 | > 32 | > Blank lines are not allowed in the middle of FASTA input. Sequences are 33 | > expected to be represented in the standard IUB/IUPAC amino acid and nucleic 34 | > acid codes, with these exceptions: lower-case letters are accepted and are 35 | > mapped into upper-case; a single hyphen or dash can be used to represent a gap 36 | > of indeterminate length; and in amino acid sequences, U and * are acceptable 37 | > letters. 38 | 39 | #### Formal Grammar 40 | 41 | A formal grammar using [Backus–Naur Form][bnf] for FASTA is described on [BioStar][biostar]. This is quoted 42 | below as a specification of the FASTA format. 43 | 44 | [biostar]: https://www.biostars.org/p/11254/#11255 45 | [bnf]: http://en.wikipedia.org/wiki/Backus%E2%80%93Naur_Form 46 | 47 | <file> ::= <token> | <token> <file> 48 | <token> ::= <ignore> | <seq> 49 | <ignore> ::= <whitespace> | <comment> <newline> 50 | <seq> ::= <header> <molecule> <newline>
51 | <header> ::= ">" <arbitrary text> <newline> 52 | <molecule> ::= <mol-line> | <mol-line> <molecule> 53 | <mol-line> ::= <nucl-line> | <prot-line> 54 | <nucl-line> ::= "^[ACGTURYKMSWBDHVNX-]+$" 55 | <prot-line> ::= "^[ABCDEFGHIKLMNOPQRSTUVWYZX*-]+$" 56 | 57 | #### Example 58 | 59 | >gi|129295|sp|P01013|OVAX_CHICK GENE X PROTEIN (OVALBUMIN-RELATED) 60 | QIKDLLVSSSTDLDTTLVLVNAIYFKGMWKTAFNAEDTREMPFHVTKQESKPVQMMCMNNSFNVATLPAE 61 | KMKILELPFASGDLSMLVLLPDEVSDLERIEKTINFEKLTEWTNPNTMEKRRVKVYLPQMKIEEKYNLTS 62 | VLMALGMTDLFIPSANLTGISSAESLKISQAVHGAFMELSEDGIEMAGSTGVIEDIKHSPESEQFRADHP 63 | FLFLIKHNPTNTIVYFGRYWSP 64 | 65 | ### FASTQ 66 | 67 | #### Description 68 | 69 | The FASTQ format is described in detail in the article ["The Sanger FASTQ file 70 | format for sequences with quality scores, and the Solexa/Illumina FASTQ 71 | variants"][1]. This definition will be used and quoted below. Importantly all 72 | FASTQ MUST use the Phred+33 quality offset described in the final paragraph. 73 | 74 | [1]: http://nar.oxfordjournals.org/content/38/6/1767 75 | 76 | > There are four line types in the FASTQ format. First a ‘@’ title line which 77 | > often holds just a record identifier. This is a free format field with no 78 | > length limit—allowing arbitrary annotation or comments to be included, as in 79 | > the example above where the NCBI have included an alternative ID and the 80 | > sequence length. Some sequencing centers encode paired end read information 81 | > here (alternatively two matched FASTQ files are often used). 82 | > 83 | > Second comes the sequence line(s), which as in the FASTA format can be line 84 | > wrapped. Also like FASTA format, there is no explicit limitation on the 85 | > characters expected, but restriction to the IUPAC single letter codes for 86 | > (ambiguous) DNA or RNA is wise, and upper case is conventional. In some 87 | > contexts, the use of lower or mixed case or the inclusion of a gap character 88 | > may make sense. White space such as tabs or spaces is not permitted.
89 | > 90 | > Third, to signal the end of the sequence lines and the start of the quality 91 | > string, comes the ‘+’ line. Originally this also included a full repeat of the 92 | > title line text (as shown in the NCBI example above); however, by common usage 93 | > and the MAQ tool convention, this is optional and the ‘+’ line can contain just 94 | > this one character, reducing the file size significantly. The OBF tools follow 95 | > this MAQ convention on output, and omit the optional repeated title text. 96 | > 97 | > Finally, comes quality line(s) which again can be wrapped. As discussed above, 98 | > these use a subset of the ASCII printable characters (at most ASCII 33–126 99 | > inclusive) with a simple offset mapping. Crucially, after concatenation 100 | > (removing line breaks), the quality string must be equal in length to the 101 | > sequence string. 102 | 103 | #### Formal Grammar 104 | 105 | [MAQ provides a formal grammar][2] using [Backus–Naur Form][bnf] to describe FASTQ. This definition is 106 | provided below as a specification for FASTQ format. 107 | 108 | <fastq> := <block>+ 109 | <block> := @<seqname>\n<seq>\n+[<seqname>]\n<qual>\n 110 | <seqname> := [A-Za-z0-9_.:-]+ 111 | <seq> := [A-Za-z\n\.~]+ 112 | <qual> := [!-~\n]+ 113 | 114 | [2]: http://maq.sourceforge.net/fastq.shtml 115 | 116 | #### Example 117 | 118 | @EAS54_6_R1_2_1_413_324 119 | CCCTTCTTGTCTTCAGCGTTTCTCC 120 | + 121 | ;;3;;;;;;;;;;;;7;;;;;;;88 122 | -------------------------------------------------------------------------------- /data-format/binning.mkd: -------------------------------------------------------------------------------- 1 | ## Binning Output Format 2 | 3 | * Version: 0.10.0 4 | * Maintainer: Peter Belmann 5 | * Authors: CAMI challenge organizers 6 | 7 | ### 1. Outline 8 | 9 | The binning format was originally specified for the CAMI contest and is intended to serve as a standard format for (taxonomic) binning methods. 10 | 11 | It is a TAB (`\t`) delimited text format consisting of a header section and an 12 | output section.
The header section MUST be above the output section and header 13 | lines MUST start with `@` whereas output lines MUST NOT. Comment lines MUST 14 | start with `#` and MAY occur both in the header and output section. Empty lines 15 | MAY occur anywhere in the output for better readability. Only the UNIX newline 16 | character `\n` MUST be used to define the end of a line and the text MUST be 17 | valid UTF-8 encoding. 18 | 19 | Files containing this data format should be named with the filename suffix `.binning`. 20 | 21 | Regular expressions, when provided, are given as specified in IEEE Std 1003.1™ ERE. 22 | 23 | ### 2. Header section 24 | 25 | Each header line MUST begin with the character `@`. A single `@` defines a 26 | key-value pair in the format **TAG:VALUE** where **TAG** MUST be an 27 | alphanumeric string. Tags are case insensitive but MAY be specified using upper 28 | and lower case letters for better readability. All tags MUST be unique per file. 29 | **VALUE** MUST NOT contain characters other than alphanumerical and `,.;_-|`. 30 | More precisely, each non-empty and non-comment header line except for the last 31 | header line MUST match the regular expression `^\@(_[A-Za-z]*_)?[A-Za-z]+[A-Za-z0-9]*\:[A-Za-z0-9,\.;_\-\|]*$` 32 | 33 | The specification requires that the following header tags MUST be present: 34 | 35 | * **VERSION**: **VALUE** MUST specify the binning format version in the heading 36 | of this specification and MUST match the regular expression `[0-9\.]+`. 37 | 38 | * **SAMPLEID**: **VALUE** is the sample identifier, not the generating user or program name. It MUST match the regular expression `[A-Za-z0-9\._]+`. 39 | 40 | The following tags MAY be given: 41 | 42 | * **TAXONOMYID**: **VALUE** specifies an identifier of the external taxonomy 43 | which was used in the output section. **TAXID** values MUST be valid 44 | taxon identifiers in this taxonomy.
45 | 46 | Additional tags and values MAY be specified but each additional tag MUST be 47 | prefixed by a case-insensitive string with an underscore before and after the string, 48 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future. 49 | Empty prefixes MAY be used and mean that the tag starts with `__`. 50 | 51 | The last header line MUST begin with `@@` and defines TAB-separated column tags, 52 | where each **TAG** MUST be a string matching the regular expression 53 | `[A-Za-z]+[A-Za-z0-9]*` and defines the content and format of values in the 54 | corresponding column of the output section. The following lists all defined tags: 55 | 56 | * **SEQUENCEID** 57 | * **BINID** 58 | * **TAXID** 59 | 60 | The format requires that **SEQUENCEID** and at least one of **BINID** and **TAXID** MUST be given. 61 | Further optional columns can be appended to the right but MUST be 62 | prefixed by a case-insensitive string with an underscore before and after the string, 63 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future. 64 | Empty prefixes MAY be used and mean that the tag starts with `__`. This means that each 65 | custom field MUST match the regular expression `_[A-Za-z]*_[A-Za-z]+[A-Za-z0-9]*`. 66 | 67 | For instance: 68 | 69 | @@SEQUENCEID TAXID 70 | 71 | or 72 | 73 | @@SEQUENCEID BINID 74 | or 75 | 76 | @@SEQUENCEID TAXID BINID 77 | 78 | or 79 | 80 | @@SEQUENCEID TAXID BINID __SCORE _MY_COLUMN2 _MY_COLUMN3 81 | 82 | ### 3. Output section 83 | 84 | An output line MUST consist of TAB-separated fields and MUST correspond to 85 | the last header line definition. Each field MUST match the regular expression 86 | `[A-Za-z0-9,\.;,\(\)_\-\ ]*`. This specification defines the following field types: 87 | 88 | * **SEQUENCEID** specifies the ID of either a read or a contig sequence (depending on 89 | the sample). 
90 | 91 | * The **TAXID** field contains a taxonomic assignment for binned sequences corresponding to a taxonomy which should be referred to by the header tag **TAXONOMYID**. Each individual field MUST match the 92 | regular expression `[A-Za-z0-9\.;,\(\)_\-\ ]+`. 93 | 94 | * The **BINID** fields MUST be arbitrary alphanumeric identifiers for each bin. 95 | 96 | ### 4. Multi-sample format 97 | 98 | Starting with version `0.10.0`, multiple samples MAY be represented in a single file by concatenation. 99 | Sample sections MUST be separated by at least one empty line after the last content line of a section 100 | and preceding the next header line. Additionally, a multi-sample file MUST specify the exact same 101 | **VERSION** tag value in every section. It MUST also specify the exact same **TAXONOMYID** tag value, 102 | if this tag is specified in at least one of the sections. The type and order of column tags MUST be 103 | identical for all sections. The **SAMPLEID** tag values must be unique for all concatenated sections. 104 | The meaning of the **BINID** tag values is local for each section, for instance, `BIN1` would have a 105 | different meaning in each section so that sequences from different samples can only be grouped into 106 | the same bin if they are pooled and reported in a joint section. 107 | 108 | ### 5. EXAMPLES 109 | 110 | There are three different scenarios for binning tools. 111 | 112 | The first case, example A below: If you create taxonomic bins as output without 113 | further resolution, you do not need to include the **BINID** column, but only the 114 | **TAXID** column, in your output. 115 | 116 | The second case, example B below: If you create bins that do not include 117 | taxonomic assignments you do not need to include the **TAXID** column, but only the 118 | **BINID** column, in your output.
119 | 120 | The third case, example C below, is if you perform taxonomic binning and 121 | additionally resolve bins below existing taxonomic IDs, e.g. to define bins 122 | representing novel strains. In this case, you add both the **TAXID** and **BINID**. 123 | 124 | A 125 | ``` 126 | # This is the bioboxes.org binning output format at 127 | # https://github.com/bioboxes/rfc/tree/master/data-format 128 | 129 | @Version:0.10.0 130 | @SampleID:mysample1 131 | @@SEQUENCEID TAXID 132 | read1201 123 133 | read1202 123 134 | read1203 131564 135 | read1204 562 136 | read1205 562 137 | ``` 138 | B 139 | ``` 140 | # This is the bioboxes.org binning output format at 141 | # https://github.com/bioboxes/rfc/tree/master/data-format 142 | 143 | @Version:0.10.0 144 | @SampleID:mysample1 145 | @@SEQUENCEID BINID 146 | contig01 12346BIN 147 | contig02 ANOTHERBIN 148 | contig03 BIN6 149 | contig04 BIN5 150 | contig05 BIN5 151 | ``` 152 | C 153 | ``` 154 | # This is the bioboxes.org binning output format at 155 | # https://github.com/bioboxes/rfc/tree/master/data-format 156 | 157 | @Version:0.10.0 158 | @SampleID:mysample1 159 | @@SEQUENCEID TAXID BINID 160 | contig01 123 123 161 | contig02 123 123 162 | contig03 131564 131564 163 | contig04 562 562.1 164 | contig05 562 562.2 165 | ``` 166 | D 167 | ``` 168 | # This is the bioboxes.org binning output format at 169 | # https://github.com/bioboxes/rfc/tree/master/data-format 170 | 171 | @Version:0.10.0 172 | @SampleID:mysample_A 173 | @@SEQUENCEID BINID 174 | contig_A_01 BIN_A_1 175 | contig_A_02 BIN_A_2 176 | contig_A_03 BIN_A_1 177 | 178 | @Version:0.10.0 179 | @SampleID:mysample_B 180 | @@SEQUENCEID BINID 181 | contig_B_01 BIN_B_1 182 | contig_B_02 BIN_B_1 183 | contig_B_03 BIN_B_2 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /databases/blastdb.txt: -------------------------------------------------------------------------------- 1 | blast 2 | blast/db 3 | 
blast/db/nr.00.tar.gz 4 | blast/db/nr.01.tar.gz 5 | blast/db/nr.02.tar.gz 6 | blast/db/nr.03.tar.gz 7 | blast/db/nr.06.tar.gz 8 | blast/db/nr.07.tar.gz 9 | blast/db/nr.12.tar.gz 10 | blast/db/nr.14.tar.gz 11 | blast/db/nr.15.tar.gz 12 | blast/db/nr.16.tar.gz 13 | blast/db/nr.18.tar.gz 14 | blast/db/nr.21.tar.gz 15 | blast/db/nr.22.tar.gz 16 | blast/db/nr.24.tar.gz 17 | blast/db/nr.25.tar.gz 18 | blast/db/nt.04.tar.gz 19 | blast/db/nt.08.tar.gz 20 | blast/db/nt.09.tar.gz 21 | blast/db/nt.10.tar.gz 22 | blast/db/nt.12.tar.gz 23 | blast/db/nt.14.tar.gz 24 | blast/db/nt.16.tar.gz 25 | blast/db/nt.18.tar.gz 26 | blast/db/nt.19.tar.gz 27 | blast/db/nt.21.tar.gz 28 | blast/db/nt.22.tar.gz 29 | blast/db/env_nr.00.tar.gz 30 | blast/db/env_nr.01.tar.gz 31 | blast/db/env_nt.01.tar.gz 32 | blast/db/env_nt.03.tar.gz 33 | blast/db/refseq_genomic.00.tar.gz 34 | blast/db/refseq_genomic.03.tar.gz 35 | blast/db/refseq_genomic.08.tar.gz 36 | blast/db/refseq_genomic.100.tar.gz 37 | blast/db/refseq_genomic.104.tar.gz 38 | blast/db/refseq_genomic.106.tar.gz 39 | blast/db/refseq_genomic.107.tar.gz 40 | blast/db/refseq_genomic.108.tar.gz 41 | blast/db/refseq_genomic.109.tar.gz 42 | blast/db/refseq_genomic.11.tar.gz 43 | blast/db/refseq_genomic.112.tar.gz 44 | blast/db/refseq_genomic.114.tar.gz 45 | blast/db/refseq_genomic.115.tar.gz 46 | blast/db/refseq_genomic.116.tar.gz 47 | blast/db/refseq_genomic.119.tar.gz 48 | blast/db/refseq_genomic.121.tar.gz 49 | blast/db/refseq_genomic.122.tar.gz 50 | blast/db/refseq_genomic.123.tar.gz 51 | blast/db/refseq_genomic.124.tar.gz 52 | blast/db/refseq_genomic.126.tar.gz 53 | blast/db/refseq_genomic.127.tar.gz 54 | blast/db/refseq_genomic.130.tar.gz 55 | blast/db/refseq_genomic.132.tar.gz 56 | blast/db/refseq_genomic.133.tar.gz 57 | blast/db/refseq_genomic.135.tar.gz 58 | blast/db/refseq_genomic.136.tar.gz 59 | blast/db/refseq_genomic.138.tar.gz 60 | blast/db/refseq_genomic.14.tar.gz 61 | blast/db/refseq_genomic.140.tar.gz 62 | 
blast/db/refseq_genomic.141.tar.gz 63 | blast/db/refseq_genomic.142.tar.gz 64 | blast/db/refseq_genomic.15.tar.gz 65 | blast/db/refseq_genomic.20.tar.gz 66 | blast/db/refseq_genomic.22.tar.gz 67 | blast/db/refseq_genomic.24.tar.gz 68 | blast/db/refseq_genomic.28.tar.gz 69 | blast/db/refseq_genomic.29.tar.gz 70 | blast/db/refseq_genomic.33.tar.gz 71 | blast/db/refseq_genomic.34.tar.gz 72 | blast/db/refseq_genomic.35.tar.gz 73 | blast/db/refseq_genomic.37.tar.gz 74 | blast/db/refseq_genomic.40.tar.gz 75 | blast/db/refseq_genomic.41.tar.gz 76 | blast/db/refseq_genomic.42.tar.gz 77 | blast/db/refseq_genomic.45.tar.gz 78 | blast/db/refseq_genomic.46.tar.gz 79 | blast/db/refseq_genomic.47.tar.gz 80 | blast/db/refseq_genomic.48.tar.gz 81 | blast/db/refseq_genomic.49.tar.gz 82 | blast/db/refseq_genomic.50.tar.gz 83 | blast/db/refseq_genomic.51.tar.gz 84 | blast/db/refseq_genomic.57.tar.gz 85 | blast/db/refseq_genomic.58.tar.gz 86 | blast/db/refseq_genomic.60.tar.gz 87 | blast/db/refseq_genomic.61.tar.gz 88 | blast/db/refseq_genomic.62.tar.gz 89 | blast/db/refseq_genomic.65.tar.gz 90 | blast/db/refseq_genomic.67.tar.gz 91 | blast/db/refseq_genomic.68.tar.gz 92 | blast/db/refseq_genomic.73.tar.gz 93 | blast/db/refseq_genomic.74.tar.gz 94 | blast/db/refseq_genomic.76.tar.gz 95 | blast/db/refseq_genomic.77.tar.gz 96 | blast/db/refseq_genomic.79.tar.gz 97 | blast/db/refseq_genomic.80.tar.gz 98 | blast/db/refseq_genomic.81.tar.gz 99 | blast/db/refseq_genomic.82.tar.gz 100 | blast/db/refseq_genomic.83.tar.gz 101 | blast/db/refseq_genomic.84.tar.gz 102 | blast/db/refseq_genomic.85.tar.gz 103 | blast/db/refseq_genomic.87.tar.gz 104 | blast/db/refseq_genomic.88.tar.gz 105 | blast/db/refseq_genomic.89.tar.gz 106 | blast/db/refseq_genomic.90.tar.gz 107 | blast/db/refseq_genomic.91.tar.gz 108 | blast/db/refseq_genomic.93.tar.gz 109 | blast/db/refseq_genomic.94.tar.gz 110 | blast/db/refseq_genomic.96.tar.gz 111 | blast/db/refseq_protein.00.tar.gz 112 | blast/db/refseq_rna.00.tar.gz 113 
| blast/db/refseq_rna.02.tar.gz 114 | blast/db/refseq_protein.05.tar.gz 115 | blast/db/refseq_protein.06.tar.gz 116 | blast/db/refseq_rna.04.tar.gz 117 | blast/db/refseq_protein.08.tar.gz 118 | blast/db/refseq_protein.14.tar.gz 119 | blast/db/refseq_genomic.137.tar.gz 120 | blast/db/nr.20.tar.gz 121 | blast/db/nt.03.tar.gz 122 | blast/db/refseq_genomic.13.tar.gz 123 | blast/db/refseq_genomic.118.tar.gz 124 | blast/db/nr.04.tar.gz 125 | blast/db/refseq_genomic.18.tar.gz 126 | blast/db/nt.07.tar.gz 127 | blast/db/nr.10.tar.gz 128 | blast/db/refseq_genomic.27.tar.gz 129 | blast/db/refseq_genomic.19.tar.gz 130 | blast/db/env_nt.00.tar.gz 131 | blast/db/nr.17.tar.gz 132 | blast/db/refseq_genomic.31.tar.gz 133 | blast/db/nr.05.tar.gz 134 | blast/db/nr.27.tar.gz 135 | blast/db/refseq_genomic.12.tar.gz 136 | blast/db/nt.11.tar.gz 137 | blast/db/refseq_genomic.101.tar.gz 138 | blast/db/nt.13.tar.gz 139 | blast/db/refseq_genomic.32.tar.gz 140 | blast/db/refseq_genomic.110.tar.gz 141 | blast/db/nt.23.tar.gz 142 | blast/db/refseq_genomic.10.tar.gz 143 | blast/db/refseq_genomic.23.tar.gz 144 | blast/db/refseq_genomic.30.tar.gz 145 | blast/db/refseq_genomic.17.tar.gz 146 | blast/db/refseq_genomic.129.tar.gz 147 | blast/db/nt.05.tar.gz 148 | blast/db/refseq_genomic.16.tar.gz 149 | blast/db/refseq_genomic.02.tar.gz 150 | blast/db/refseq_genomic.25.tar.gz 151 | blast/db/nt.00.tar.gz 152 | blast/db/nt.01.tar.gz 153 | blast/db/refseq_genomic.105.tar.gz 154 | blast/db/refseq_genomic.120.tar.gz 155 | blast/db/nt.20.tar.gz 156 | blast/db/env_nt.02.tar.gz 157 | blast/db/nr.26.tar.gz 158 | blast/db/refseq_genomic.131.tar.gz 159 | blast/db/refseq_genomic.117.tar.gz 160 | blast/db/refseq_genomic.128.tar.gz 161 | blast/db/nr.23.tar.gz 162 | blast/db/refseq_genomic.07.tar.gz 163 | blast/db/.listing 164 | blast/db/nr.08.tar.gz 165 | blast/db/refseq_genomic.36.tar.gz 166 | blast/db/nt.17.tar.gz 167 | blast/db/nt.06.tar.gz 168 | blast/db/refseq_genomic.134.tar.gz 169 | 
blast/db/refseq_genomic.21.tar.gz 170 | blast/db/nr.09.tar.gz 171 | blast/db/nr.19.tar.gz 172 | blast/db/refseq_genomic.06.tar.gz 173 | blast/db/refseq_genomic.102.tar.gz 174 | blast/db/nt.24.tar.gz 175 | blast/db/nr.11.tar.gz 176 | blast/db/refseq_genomic.09.tar.gz 177 | blast/db/nt.02.tar.gz 178 | blast/db/refseq_genomic.26.tar.gz 179 | blast/db/refseq_genomic.103.tar.gz 180 | blast/db/refseq_genomic.05.tar.gz 181 | blast/db/refseq_genomic.111.tar.gz 182 | blast/db/refseq_genomic.38.tar.gz 183 | blast/db/refseq_genomic.04.tar.gz 184 | blast/db/nr.13.tar.gz 185 | blast/db/nt.25.tar.gz 186 | blast/db/refseq_genomic.01.tar.gz 187 | blast/db/refseq_genomic.39.tar.gz 188 | blast/db/refseq_genomic.113.tar.gz 189 | blast/db/refseq_genomic.139.tar.gz 190 | blast/db/refseq_genomic.125.tar.gz 191 | blast/db/nt.15.tar.gz 192 | blast/db/refseq_genomic.43.tar.gz 193 | blast/db/refseq_genomic.44.tar.gz 194 | blast/db/refseq_genomic.52.tar.gz 195 | blast/db/refseq_genomic.53.tar.gz 196 | blast/db/refseq_genomic.54.tar.gz 197 | blast/db/refseq_genomic.55.tar.gz 198 | blast/db/refseq_genomic.56.tar.gz 199 | blast/db/refseq_genomic.59.tar.gz 200 | blast/db/refseq_genomic.63.tar.gz 201 | blast/db/refseq_genomic.64.tar.gz 202 | blast/db/refseq_genomic.66.tar.gz 203 | blast/db/refseq_genomic.69.tar.gz 204 | blast/db/refseq_genomic.70.tar.gz 205 | blast/db/refseq_genomic.71.tar.gz 206 | blast/db/refseq_genomic.72.tar.gz 207 | blast/db/refseq_genomic.75.tar.gz 208 | blast/db/refseq_genomic.78.tar.gz 209 | blast/db/refseq_genomic.86.tar.gz 210 | blast/db/refseq_genomic.92.tar.gz 211 | blast/db/refseq_genomic.95.tar.gz 212 | blast/db/refseq_genomic.97.tar.gz 213 | blast/db/refseq_genomic.98.tar.gz 214 | blast/db/refseq_genomic.99.tar.gz 215 | blast/db/refseq_protein.01.tar.gz 216 | blast/db/refseq_protein.02.tar.gz 217 | blast/db/refseq_protein.03.tar.gz 218 | blast/db/refseq_rna.01.tar.gz 219 | blast/db/refseq_protein.04.tar.gz 220 | blast/db/refseq_rna.03.tar.gz 221 | 
blast/db/refseq_protein.07.tar.gz 222 | blast/db/refseq_protein.09.tar.gz 223 | blast/db/refseq_protein.10.tar.gz 224 | blast/db/refseq_protein.11.tar.gz 225 | blast/db/refseq_protein.12.tar.gz 226 | blast/db/refseqgene.tar.gz 227 | blast/db/refseq_protein.13.tar.gz 228 | -------------------------------------------------------------------------------- /data-format/profiling.mkd: -------------------------------------------------------------------------------- 1 | ## Profiling Output Format 2 | 3 | * Version: 0.10.0 4 | * Maintainer: Johannes Dröge 5 | * Authors: Alice C. McHardy , David Koslicki , Johannes Dröge , Peter Belmann , Stephan Majda 6 | 7 | ### 1. Outline 8 | 9 | The taxonomic profiling format was originally specified for the CAMI contest 10 | and is intended to serve as a standard format for the output of 11 | taxonomic profiling methods. 12 | 13 | It is a TAB (`\t`) delimited text format consisting of a header section and an 14 | output section. The header section MUST be above the output section and header 15 | lines MUST start with `@` whereas output lines MUST NOT. Comment lines MUST 16 | start with `#` and MAY occur both in the header and output section. Empty lines 17 | MAY occur anywhere in the output for better readability. Only the UNIX newline 18 | character `\n` MUST be used to define the end of a line and the text MUST be 19 | valid UTF-8 encoding. 20 | 21 | Files containing this data format should be named with the filename suffix `.profile`. 22 | 23 | Regular expressions, when provided, are given as specified in IEEE Std 1003.1™ ERE. 24 | 25 | ### 2. Header section 26 | 27 | Each header line MUST begin with the character `@`. A single `@` defines a 28 | key-value pair in the format **TAG:VALUE** where **TAG** MUST be an 29 | alphanumeric string. Tags are case insensitive but MAY be specified using upper 30 | and lower case letters for better readability. All tags MUST be unique per file. 
31 | **VALUE** MUST NOT contain characters other than alphanumerical and `,.;_-|`. 32 | More precisely, each non-empty and non-comment header line except for the last 33 | header line MUST match the regular expression `^\@(_[A-Za-z]*_)?[A-Za-z]+[A-Za-z0-9]*\:[A-Za-z0-9,\.;_\-\|]*$` 34 | 35 | The specification requires that the following header tags MUST be present: 36 | 37 | * **SAMPLEID**: **VALUE** is the sample identifier, not the generating user or 38 | program name. It MUST match the regular expression `[A-Za-z0-9\._]+` and should 39 | be unique for the set of relevant samples. 40 | * **VERSION**: **VALUE** MUST specify the profiling format version in the heading 41 | of this specification and MUST match the regular expression `[0-9\.]+` 42 | * **RANKS**: **VALUE** MUST specify a list of allowed ranks for the 43 | taxa in the output section and ranks MUST be given in increasing order of their 44 | distance from the taxonomy root. Each rank MUST be a case-insensitive alphanumerical 45 | string and each such entry MUST be separated from the previous entry by the '|' 46 | character. Therefore, **VALUE** MUST match the regular expression `[A-Za-z]+(\|[A-Za-z]+)*` 47 | For example, considering the major ranks in the NCBI taxonomy, VALUE could be 48 | specified as `superkingdom|phylum|class|order|family|genus|species`. 49 | 50 | The following tags MAY be given: 51 | 52 | * **TAXONOMYID**: **VALUE** specifies an identifier of the external taxonomy 53 | which was used in the output section. **TAXID** values should be valid 54 | taxon identifiers in this taxonomy. 55 | 56 | Additional tags and values MAY be specified but each additional tag MUST be 57 | prefixed by a case-insensitive string with an underscore before and after the string, 58 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future. 59 | Empty prefixes MAY be used and mean that the tag starts with `__`.
60 | 61 | The last header line MUST begin with `@@` and defines TAB-separated column tags, 62 | where each **TAG** MUST be a string matching the regular expression 63 | `[A-Za-z]+[A-Za-z0-9]*` and defines the content and format of values in the 64 | corresponding column of the output section. Tags are considered case-insensitive 65 | but MAY be specified using upper and lower case letters for better readability. 66 | The tags MUST be unique in this line. The leading tags and their corresponding 67 | order MUST be 68 | 69 | * **TAXID** 70 | * **RANK** 71 | * **TAXPATH** 72 | * **PERCENTAGE** 73 | 74 | except that **TAXPATH** MAY be followed by the optional tag **TAXPATHSN**. 75 | 76 | Additional columns MAY be appended to the right after **PERCENTAGE** but MUST be 77 | prefixed by a case-insensitive string with an underscore before and after the string, 78 | e.g. `_CUSTOM_`, to avoid collisions when this specification is extended in the future. 79 | Empty prefixes MAY be used and mean that the tag starts with `__`. This means that each 80 | custom field MUST match the regular expression `_[A-Za-z]*_[A-Za-z]+[A-Za-z0-9]*` 81 | 82 | For instance: 83 | 84 | @@TAXID RANK TAXPATH PERCENTAGE 85 | 86 | or 87 | 88 | @@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE 89 | 90 | ### 3. Output section 91 | 92 | An output line MUST consist of TAB-separated fields and MUST correspond to 93 | the last header line definition. Each field MUST match the regular expression 94 | `[A-Za-z0-9,\.;,\(\)_\-\ ]*`. This specification defines the following field types: 95 | 96 | **TAXID**: Fields MUST correspond to unique alphanumeric taxon identifiers, 97 | for instance in the NCBI taxonomy.
Each individual field MUST match the 98 | regular expression `[A-Za-z0-9\.;,\(\)_\-\ ]+` 99 | 100 | **RANK**: Fields are case-insensitive and MUST match one of the rank identifiers 101 | which MUST be given in the header TAG **RANKS** except when an empty rank field 102 | is specified for a leaf taxon which is below the ranks specified by **RANKS**. 103 | The **RANK** field specifies where the respective taxa given in **TAXID**, 104 | **TAXPATH** or **TAXPATHSN** are located. 105 | 106 | **PERCENTAGE**: Fields specify the relative genome abundance in terms of the 107 | genome copy number for the respective TAXID in the overall sample. Note that this 108 | is not identical to the relative abundance in terms of assigned base pairs. 109 | The PERCENTAGE can be a real number between 0 and 100 but MUST NOT exceed 6 digits 110 | after the decimal point, so it MUST match the regular expression 111 | `[0-9]+(\.[0-9]{0,6})?`. The sum of percentages given for all taxa from the same 112 | rank MUST NOT exceed 100, that is, if something is unassigned, this will be 113 | reflected in a percentage of less than 100% being assigned. Also, the value 114 | MUST be greater than or equal to the sum of values for contained taxa at subordinate ranks. 115 | 116 | **TAXPATH** and **TAXPATHSN**: Fields specify the path from the root of the 117 | taxonomy to the respective taxon and MUST include the taxon which is given 118 | by **TAXID**. The path entries MUST be alphanumeric, **TAXPATH** entries MUST 119 | be taxon identifiers and **TAXPATHSN** entries should give the corresponding 120 | plain taxonomic names. All entries MUST be separated by a single `|` character 121 | and MUST be specified at the ranks and using their respective order as specified 122 | by the **RANKS** header tag. In particular, if the taxonomic path lacks a specified 123 | rank, this field MUST be left empty and would show as `||`. Empty trailing taxon entries 124 | MUST be omitted and the path MUST NOT end with `|`.
Taxon entries which are not 125 | specified by the **RANKS** tag MAY only be appended to the right of a full path and 126 | MUST be referred to by an empty **RANK** field. Each **TAXPATH** and each **TAXPATHSN** 127 | field MUST match the regular expression `[A-Za-z0-9\.;,\(\)_\-\ ]+(\|[A-Za-z0-9\.;,\(\)_\-\ ]*)*`. 128 | If both **TAXPATH** and **TAXPATHSN** are given, then they MUST have the same number 129 | of taxon entries. 130 | 131 | For instance: 132 | 133 | # Example for TAXPATHSN: 134 | Archaea|Thaumarchaeota|||Aigarchaeota archaeon JGI 0000001-A7 135 | 136 | or 137 | 138 | # Example for TAXPATH: 139 | 2157|651137|651142|1104572|1052838 140 | 141 | ### 4. Multi-sample format 142 | 143 | Starting with version `0.10.0`, multiple samples MAY be represented in a single file by concatenation. 144 | Sample sections MUST be separated by at least one empty line after the last content line of a section 145 | and preceding the next header line. Additionally, a multi-sample file MUST specify the exact same 146 | **VERSION** and **RANKS** tag values in every section and the exact same **TAXONOMYID** tag value, if 147 | this tag is specified in at least one of the sections. The type and order of column tags MUST be 148 | identical for all sections. The **SAMPLEID** tag values must be unique for all concatenated sections. 149 | 150 | ### 5.
Example 151 | 152 | # This is the bioboxes.org profiling output format at 153 | # https://github.com/bioboxes/rfc/tree/master/data-format 154 | 155 | @SampleID:mysample1 156 | @Version:0.10.0 157 | @Ranks:superkingdom|phylum|class|order|family|genus|species 158 | @TaxonomyID:ncbi-taxonomy_20171004 159 | @@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE 160 | 2 superkingdom 2 Bacteria 98.81211 161 | 2157 superkingdom 2157 Archaea 1.18789 162 | 1239 phylum 2|1239 Bacteria|Firmicutes 59.75801 163 | 1224 phylum 2|1224 Bacteria|Proteobacteria 18.94674 164 | 28890 phylum 2157|28890 Archaea|Euryarchaeotes 1.18789 165 | 91061 class 2|1239|91061 Bacteria|Firmicutes|Bacilli 59.75801 166 | 28211 class 2|1224|28211 Bacteria|Proteobacteria|Alphaproteobacteria 18.94674 167 | 183925 class 2157|28890|183925 Archaea|Euryarchaeotes|Methanobacteria 1.18789 168 | 1385 order 2|1239|91061|1385 Bacteria|Firmicutes|Bacilli|Bacillales 59.75801 169 | 356 order 2|1224|28211|356 Bacteria|Proteobacteria|Alphaproteobacteria|Rhizobacteria 10.52311 170 | 204455 order 2|1224|28211|204455 Bacteria|Proteobacteria|Alphaproteobacteria|Rhodobacterales 8.42263 171 | 2158 order 2157|28890|183925|2158 Archaea|Euryarchaeotes|Methanobacteria|Methanobacteriales 1.18789 172 | --------------------------------------------------------------------------------