├── .gitignore ├── LICENSE ├── README.md ├── config └── default.yaml ├── docs ├── config.md ├── examples.md ├── img │ ├── correlation.png │ ├── correlation_genus.png │ ├── heatmap.png │ ├── heatmap_cluster.png │ ├── heatmap_group.png │ ├── help.png │ ├── logo_panels.png │ ├── overview.png │ ├── overview_bars.png │ ├── overview_decontam.png │ ├── overview_info.png │ ├── overview_mgnify.png │ ├── overview_mgnify_2.png │ ├── overview_references.png │ ├── overview_table.png │ ├── overview_table_strep.png │ ├── samples.png │ ├── samples_bars.png │ ├── samples_table.png │ └── tools.png ├── importing.md ├── index.md └── manual.md ├── env.yaml ├── files ├── README.md ├── contaminants.yml ├── human-related.yml └── mgnify5989.tsv ├── grimer-mgnify.py ├── grimer.py ├── grimer ├── __init__.py ├── callbacks.py ├── cds.py ├── config.py ├── css │ ├── __init__.py │ └── popup.css ├── decontam.py ├── func.py ├── grimer.py ├── img │ ├── __init__.py │ └── logo.png ├── js │ ├── __init__.py │ ├── func.js │ └── popup.js ├── layout.py ├── metadata.py ├── plots.py ├── reference.py ├── scripts │ ├── __init__.py │ └── run_decontam.R └── table.py ├── mkdocs.yml ├── scripts ├── bacdive_download.py ├── ehomd_download.py ├── env.yaml ├── mgnify_download.py └── mgnify_extract.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | lib 16 | lib64 17 | __pycache__ 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 pirovc.github.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![GRIMER](grimer/img/logo.png) 2 | 3 | GRIMER performs analysis of microbiome studies and generates a portable and interactive dashboard integrating annotation, taxonomy and metadata with focus on contamination detection. 
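A minimal invocation sketch (illustrative only; `counts.tsv` and `metadata.tsv` are placeholder file names, and all flags are described in the user manual):

```bash
# Generate an interactive HTML report from a tab-separated counts table
grimer --input-file counts.tsv --metadata-file metadata.tsv --output-html report.html
```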
4 | 5 | - [Installation, user manual](https://pirovc.github.io/grimer/) 6 | - [Live examples](https://pirovc.github.io/grimer/examples/) 7 | - [Pre-print](https://doi.org/10.1101/2021.06.22.449360) 8 | 9 | 10 | ![recording-(2)5](https://user-images.githubusercontent.com/4673375/211857099-c9492232-c5f8-444e-aa68-70d6db8c82b4.gif) 11 | 12 | ## Powered by 13 | 14 | 15 | [](https://bokeh.org) 16 | [](https://pandas.org) 17 | [](https://scipy.org) 18 | [](https://scikit-bio.org) 19 | -------------------------------------------------------------------------------- /config/default.yaml: -------------------------------------------------------------------------------- 1 | references: 2 | "Contaminants": "files/contaminants.yml" 3 | "Human-related": "files/human-related.yml" 4 | 5 | # controls: 6 | # "Negative Controls": "path/file1.tsv" 7 | # "Positive Controls": 8 | # "Metadata_Field": 9 | # - "Metadata_Value1" 10 | # - "Metadata_Value2" 11 | 12 | external: 13 | mgnify: "files/mgnify5989.tsv" 14 | decontam: 15 | threshold: 0.1 # [0-1] P* hyperparameter 16 | method: "frequency" # frequency, prevalence, combined 17 | # # frequency (default: use sum of counts) 18 | # frequency_file: "path/file1.txt" 19 | # frequency_metadata: "Field1" 20 | # # prevalence (default: use all controls) 21 | # prevalence_file: 22 | # - "path/file1.txt" 23 | # - "path/file2.txt" 24 | # prevalence_metadata: 25 | # "Field1": 26 | # - "ValueA" 27 | # - "ValueB" 28 | # "Field2": 29 | # - "ValueC" 30 | 31 | -------------------------------------------------------------------------------- /docs/config.md: -------------------------------------------------------------------------------- 1 | # Configuration file 2 | 3 | GRIMER uses a configuration file to set reference sources of annotation (e.g. contaminants), controls and external tools (decontam, mgnify). The configuration can be provided with the argument `-c/--config` and should be in the [YAML](https://yaml.org/){ target="_blank" } format. 4 | 5 | A basic example of a configuration file: 6 | 7 | ```yaml 8 | references: 9 | "Contaminants": "files/contaminants.yml" 10 | "Human-related": "files/human-related.yml" 11 | 12 | controls: 13 | "Negative Controls": "path/file1.tsv" 14 | "Positive Controls": 15 | "Metadata_Field": 16 | - "Metadata_Value1" 17 | - "Metadata_Value2" 18 | 19 | external: 20 | mgnify: "files/mgnify5989.tsv" 21 | decontam: 22 | threshold: 0.1 23 | method: "frequency" 24 | ``` 25 | 26 | ## references 27 | 28 | References can be provided as an external `.yml/.yaml` file in a specific format (see below) or as a text file with one taxonomic identifier or taxonomic name per line. 29 | 30 | ```yaml 31 | "General Description": 32 | "Specific description": 33 | url: "www.website.com?id={}" 34 | ids: [1,2,3] 35 | ``` 36 | 37 | A real example of saliva organisms extracted from BacDive (NCBI taxonomic ids): 38 | 39 | ```yaml 40 | "Human-related bacterial isolates from BacDive": 41 | "Saliva": 42 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 43 | ids: [152331, 113107, 157688, 979627, 45634, 60133, 157687, 1624, 1583331, 1632, 249188] 44 | ``` 45 | 46 | Common contaminants compiled from the literature and human-related possible sources of contamination are available in the [GRIMER repository](https://github.com/pirovc/grimer/tree/main/files){ target="_blank" }. For more information, please refer to the [pre-print](https://doi.org/10.1101/2021.06.22.449360){ target="_blank" }. If the target study overlaps with some of those annotations (e.g.
study of human skin), related entries can easily be removed from the provided files to avoid redundant annotations. 47 | 48 | ## controls 49 | 50 | Several control groups can be provided to annotate samples. They can be provided as a file with one sample identifier per line: 51 | 52 | ```yaml 53 | controls: 54 | "Controls": "controls.txt" 55 | ``` 56 | 57 | or directly from the metadata (`-m/--metadata-file`) as a field and value(s): 58 | 59 | ```yaml 60 | controls: 61 | "Other Controls": 62 | "sample_type": # field 63 | - "blank" # value 64 | - "control" # value 65 | ``` 66 | 67 | Both methods can be combined in one configuration file. 68 | 69 | ## external 70 | 71 | Set the configuration and functionality of external tools executed by GRIMER. 72 | 73 | ### mgnify 74 | 75 | GRIMER uses a parsed MGnify database to annotate observations and link them to the respective MGnify repository, reporting the most common biome occurrences. Instructions on how to re-generate the parsed database from MGnify can be found [here](https://github.com/pirovc/grimer/tree/main/files#mgnify){ target="_blank" }. 76 | 77 | A [pre-parsed database](https://raw.githubusercontent.com/pirovc/grimer/main/files/mgnify5989.tsv){ target="_blank" } is available in the GRIMER repository (generated on 2022-03-09). To use it, set the file in the configuration as follows and activate it with `-g/--mgnify` when running GRIMER. 78 | 79 | ```yaml 80 | external: 81 | mgnify: "files/mgnify5989.tsv" 82 | ``` 83 | 84 | ### decontam 85 | 86 | GRIMER can run [DECONTAM](https://benjjneb.github.io/decontam/){ target="_blank" } with `-d/--decontam`, but some configuration is necessary. It is possible to set the threshold (P* hyperparameter) and the method (frequency, prevalence, combined). 87 | 88 | For the frequency/combined method, DNA frequencies for each sample have to be provided either in a `.tsv` file (sample identifier and frequency, tab-separated) or as a metadata field. If none is provided, the sum of all counts in the input table is used for the frequency calculation. 89 | 90 | For the prevalence/combined method, file(s) with a list of sample identifiers or a metadata field/value can be provided. If none is provided, all samples defined in the "controls" are considered for the prevalence calculation.
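For reference, a sketch of what such a frequency file might look like (hypothetical sample identifiers and DNA concentration values; one tab-separated pair per line, matching the `frequency_file` path used in the example below):

```bash
# Create a minimal DECONTAM frequency file: sample identifier <tab> frequency
printf 'sample1\t12.5\nsample2\t0.8\nsample3\t3.1\n' > path/file1.txt
```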
91 | 92 | Below is an example of how to set up the configuration file for DECONTAM: 93 | 94 | ```yaml 95 | external: 96 | decontam: 97 | threshold: 0.1 # P* hyperparameter threshold, values between 0 and 1 98 | method: "frequency" # Options: frequency, prevalence, combined 99 | frequency_file: "path/file1.txt" 100 | # frequency_metadata: "Field1" 101 | # prevalence_file: 102 | # - "path/file1.txt" 103 | # - "path/file2.txt" 104 | prevalence_metadata: 105 | "Field1": 106 | - "ValueA" 107 | - "ValueB" 108 | "Field2": 109 | - "ValueC" 110 | ``` 111 | 112 | ## Using the configuration file 113 | 114 | Example [UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom){ target="_blank" } file from [microbiomedb.org](https://microbiomedb.org){ target="_blank" } 115 | 116 | config.yml (external .yml files are available in the [GRIMER repository](https://github.com/pirovc/grimer/tree/main/files){ target="_blank" }) 117 | 118 | ```yml 119 | references: 120 | "Contaminants": "files/contaminants.yml" 121 | "Human-related": "files/human-related.yml" 122 | 123 | external: 124 | mgnify: "files/mgnify5989.tsv" 125 | decontam: 126 | threshold: 0.1 # [0-1] P* hyperparameter 127 | method: "frequency" # frequency, prevalence, combined 128 | ``` 129 | 130 | Running GRIMER with DECONTAM and MGnify integration: 131 | 132 | ```bash 133 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom \ 134 | --config config.yml \ 135 | --decontam --mgnify \ 136 | --taxonomy ncbi \ 137 | --ranks superkingdom phylum class order family genus species 138 | ``` -------------------------------------------------------------------------------- /docs/examples.md: -------------------------------------------------------------------------------- 1 | ![GRIMER](https://raw.githubusercontent.com/pirovc/grimer/main/grimer/img/logo.png) 2 | 3 | Examples of reports generated with [GRIMER](https://github.com/pirovc/grimer) 4 | 5 | --- 6 | 7 | ### Data analysis from Leiby et al. "Lack of detection of a human placenta microbiome in samples from preterm and term deliveries" 8 | 9 | ***original publication: [10.1186/s40168-018-0575-4](https://doi.org/10.1186/s40168-018-0575-4){ target="_blank" }*** 10 | 11 | **[GRIMER report MGS](https://pirovc.github.io/grimer-reports/placenta/placenta_mgs.html){ target="_blank" }** 12 | 13 | **[GRIMER report AMPLICON](https://pirovc.github.io/grimer-reports/placenta/placenta_amplicon.html){ target="_blank" }** 14 | 15 | <details>
16 | commands used to create report 17 | 18 | ```bash 19 | # Download files (table, metadata and config) 20 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/placenta/placenta_files.tar.gz 21 | tar xf placenta_files.tar.gz 22 | 23 | # Run GRIMER 24 | # AMPLICON 25 | grimer --config placenta_amplicon_config.yaml \ 26 | --input-file placenta_amplicon_table.tsv \ 27 | --metadata-file placenta_metadata.tsv \ 28 | --taxonomy ncbi \ 29 | --ranks superkingdom phylum class order family genus species \ 30 | --level-separator ";" \ 31 | --obs-replace "^.+__" "" "_" " " \ 32 | --unassigned-header "Unassigned" \ 33 | --decontam --mgnify --transpose \ 34 | --title "Placenta study AMPLICON - Leiby, J.S. et al 2018" \ 35 | --output-html placenta_amplicon.html 36 | 37 | # MGS 38 | grimer --config placenta_mgs_config.yaml \ 39 | --input-file placenta_mgs_table.tsv \ 40 | --metadata-file placenta_metadata.tsv \ 41 | --taxonomy ncbi \ 42 | --ranks superkingdom phylum class order family genus species \ 43 | --level-separator "|" \ 44 | --unassigned-header "unassigned" \ 45 | --decontam --mgnify \ 46 | --title "Placenta study MGS - Leiby, J.S. et al 2018" \ 47 | --output-html placenta_mgs.html 48 | ``` 49 |
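A note on the `--obs-replace` arguments used above: as described in the parameter documentation, values are given in pattern/replacement pairs applied to observation labels. A sketch of what the two pairs do:

```bash
# Pair 1: "^.+__" ""  -> strips rank prefixes such as "g__" (regex match, removed)
# Pair 2: "_" " "     -> replaces underscores with spaces
grimer --input-file placenta_amplicon_table.tsv \
       --level-separator ";" \
       --obs-replace "^.+__" "" "_" " "
```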
50 | 51 | --- 52 | 53 | ### KatharoSeq analysis from Minich et al. "KatharoSeq Enables High-Throughput Microbiome Analysis from Low-Biomass Samples" 54 | 55 | ***original publication: [10.1128/mSystems.00218-17](https://doi.org/10.1128/mSystems.00218-17){ target="_blank" }*** 56 | 57 | **[GRIMER report](https://pirovc.github.io/grimer-reports/katharoseq/katharoseq.html){ target="_blank" }** 58 | 59 |
60 | commands used to create report 61 | 62 | ```bash 63 | # Download files (table, metadata and config) 64 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/katharoseq/katharoseq_files.tar.gz 65 | tar xf katharoseq_files.tar.gz 66 | 67 | # Run GRIMER 68 | grimer --config katharoseq_config.yaml \ 69 | --input-file katharoseq_table.tsv \ 70 | --metadata-file katharoseq_metadata.tsv \ 71 | --transformation clr \ 72 | --obs-replace "^.+__" "" "_" " " \ 73 | --taxonomy ncbi \ 74 | --ranks superkingdom phylum class order family genus species \ 75 | --level-separator ";" \ 76 | --decontam --mgnify \ 77 | --title "KatharoSeq - Minich et al. 2018" \ 78 | --output-html katharoseq.html 79 | ``` 80 | 81 |
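Since the `clr` transformation used above takes logarithms, zero counts have to be treated first; GRIMER's `--replace-zeros` option controls this (default 1000, i.e. the smallest count divided by 1000 is added to every value). A sketch making it explicit:

```bash
# INT: add 'smallest count'/INT to every value; FLOAT: add FLOAT to every value
grimer --input-file katharoseq_table.tsv \
       --transformation clr \
       --replace-zeros 1000
```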
82 | 83 | --- 84 | 85 | ### Preterm Infant Resistome downloaded from [MicrobiomeDB](https://microbiomedb.org/mbio/app/record/dataset/DS_82fe0308e2){ target="_blank" } 86 | 87 | ***original publication: [10.1038/nmicrobiol.2016.24](https://doi.org/10.1038/nmicrobiol.2016.24){ target="_blank" }*** 88 | 89 | **[GRIMER report](https://pirovc.github.io/grimer-reports/microbiomedb/ResistomeAmplicon.html){ target="_blank" }** 90 | 91 |
92 | commands used to create report 93 | 94 | ```bash 95 | # Download files (table, metadata and config) - Original source: https://microbiomedb.org/common/downloads/release-22/82fe0308e2032de2041694df6592ba542ea84b86/ResistomeAmplicon.16s_DADA2.taxon_abundance.biom 96 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/microbiomedb/microbiomedb_files.tar.gz 97 | tar xf microbiomedb_files.tar.gz 98 | 99 | # Run GRIMER 100 | grimer --config ResistomeAmplicon.16s_DADA2_config.yaml \ 101 | --input-file ResistomeAmplicon.16s_DADA2.taxon_abundance.biom \ 102 | --taxonomy ncbi \ 103 | --ranks superkingdom phylum class order family genus species \ 104 | --decontam --mgnify \ 105 | --title "MicrobiomeDB Preterm Infant Resistome (V4)" \ 106 | --output-html ResistomeAmplicon.html 107 | ``` 108 | 109 |
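To sanity-check a `.biom` input before building the report, the `biom` command-line tool (shipped with the biom-format dependency) can summarize it; a sketch:

```bash
# Summarize per-sample counts and observation totals of the input .biom file
biom summarize-table -i ResistomeAmplicon.16s_DADA2.taxon_abundance.biom -o biom_summary.txt
head biom_summary.txt
```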
110 | 111 | --- 112 | 113 | ### Antibiotic induced changes in the microbiota disrupt redox dynamics in the gut, downloaded from [MGnify](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005180){ target="_blank" } 114 | 115 | ***original publication: [10.7554/elife.35987](https://doi.org/10.7554/elife.35987){ target="_blank" }*** 116 | 117 | **[GRIMER report](https://pirovc.github.io/grimer-reports/mgnify/MGYS00005180.html){ target="_blank" }** 118 | 119 | <details>
120 | commands used to create report 121 | 122 | ```bash 123 | # Script to download files and generate GRIMER report from any MGnify study accession 124 | # Requires "jsonapi-client>=0.9.7" (conda install "jsonapi-client>=0.9.7") 125 | ./grimer-mgnify.py -i MGYS00005180 -o MGYS00005180 -g "--decontam --mgnify" 126 | 127 | # Or directly from files 128 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/mgnify/mgnify_files.tar.gz 129 | tar xf mgnify_files.tar.gz 130 | # Run GRIMER 131 | grimer --config MGYS00005180_config.yaml \ 132 | --input-file MGYS00005180_ERP108433_taxonomy_abundances_SSU_v4.1.tsv \ 133 | --metadata-file MGYS00005180_metadata.tsv \ 134 | --obs-replace "^.+__" "" "_" " " \ 135 | --taxonomy ncbi \ 136 | --ranks superkingdom kingdom phylum class order family genus species \ 137 | --level-separator ";" \ 138 | --decontam --mgnify \ 139 | --title "MGnify study accession MGYS00005180" \ 140 | --output-html MGYS00005180.html 141 | ``` 142 | 143 |
144 | 145 | --- -------------------------------------------------------------------------------- /docs/img/correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/correlation.png -------------------------------------------------------------------------------- /docs/img/correlation_genus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/correlation_genus.png -------------------------------------------------------------------------------- /docs/img/heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/heatmap.png -------------------------------------------------------------------------------- /docs/img/heatmap_cluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/heatmap_cluster.png -------------------------------------------------------------------------------- /docs/img/heatmap_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/heatmap_group.png -------------------------------------------------------------------------------- /docs/img/help.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/help.png -------------------------------------------------------------------------------- /docs/img/logo_panels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/logo_panels.png -------------------------------------------------------------------------------- /docs/img/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview.png -------------------------------------------------------------------------------- /docs/img/overview_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_bars.png -------------------------------------------------------------------------------- /docs/img/overview_decontam.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_decontam.png -------------------------------------------------------------------------------- /docs/img/overview_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_info.png -------------------------------------------------------------------------------- /docs/img/overview_mgnify.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_mgnify.png -------------------------------------------------------------------------------- /docs/img/overview_mgnify_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_mgnify_2.png -------------------------------------------------------------------------------- /docs/img/overview_references.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_references.png -------------------------------------------------------------------------------- /docs/img/overview_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_table.png -------------------------------------------------------------------------------- /docs/img/overview_table_strep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_table_strep.png -------------------------------------------------------------------------------- /docs/img/samples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/samples.png -------------------------------------------------------------------------------- /docs/img/samples_bars.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/samples_bars.png -------------------------------------------------------------------------------- /docs/img/samples_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/samples_table.png -------------------------------------------------------------------------------- /docs/img/tools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/tools.png -------------------------------------------------------------------------------- /docs/importing.md: -------------------------------------------------------------------------------- 1 | # Importing files 2 | 3 | GRIMER is independent of any quantification method and requires a contingency table with raw counts of observations/components for each sample/composition in the study. Observations are usually, but not limited to, taxonomic entries (e.g. genus, species, strains), operational taxonomic units (OTUs), amplicon sequence variants (ASVs), metagenome-assembled genomes (MAGs) or sequence features. 4 | 5 | GRIMER `--input-file` accepts a file with tab-separated values (.tsv) containing a table of counts (Observation table, Count table, Contingency Tables, ...) or a [.biom](https://biom-format.org/){ target="_blank" } file.
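As an illustration of the expected layout (hypothetical observation and sample names; tab-separated, observations in rows and samples in columns):

```bash
# Minimal counts table sketch: first row = sample headers, first column = observations
printf 'observations\tsample1\tsample2\tsample3\n' > counts.tsv
printf 'Escherichia coli\t10\t0\t5\n' >> counts.tsv
printf 'Streptococcus\t3\t12\t0\n' >> counts.tsv
grimer --input-file counts.tsv
```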
6 | 7 | ## The Biological Observation Matrix file (.biom) 8 | 9 | GRIMER parses [BIOM](https://biom-format.org/){ target="_blank" } files and associated metadata, if available. Alternatively, an external metadata file can be provided with `-m/--metadata-file` and will take precedence over the .biom metadata. 10 | 11 | Example [UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom){ target="_blank" } file from [microbiomedb.org](https://microbiomedb.org){ target="_blank" } 12 | 13 | - Default report (no taxonomy) 14 | 15 | ```bash 16 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom 17 | ``` 18 | 19 | - Integrated NCBI taxonomy (will translate names to taxonomy ids) 20 | 21 | ```bash 22 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom \ 23 | --taxonomy ncbi \ 24 | --ranks superkingdom phylum class order family genus species 25 | ``` 26 | 27 | - Using an external metadata file ([UgandaMaternalV3V4.16s_DADA2.sample_details.tsv](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.sample_details.tsv){ target="_blank" }) 28 | 29 | ```bash 30 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom \ 31 | --metadata-file UgandaMaternalV3V4.16s_DADA2.sample_details.tsv \ 32 | --taxonomy ncbi \ 33 | --ranks superkingdom phylum class order family genus species 34 | ``` 35 | 36 | ## tab-separated file (.tsv) 37 | 38 | GRIMER parses .tsv files with single taxonomic identifier/name annotations or with multi-level (e.g. lineage) taxonomically annotated observations. 39 | 40 | - Rows contain observations and columns contain samples (use `--transpose` if your file is reversed) 41 | - First column and first row are used as headers 42 | - Taxonomy integration: files can have either taxonomic identifiers (NCBI, e.g.: 562) or taxonomic names (NCBI, e.g.: Escherichia coli or GTDB, e.g.: s__Escherichia coli) 43 | 44 | ### Multi-level annotations (e.g. Bacteria;Proteobacteria;Gammaproteobacteria...) 45 | 46 | - Example [UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv){ target="_blank" } file from [microbiomedb.org](https://microbiomedb.org){ target="_blank" } 47 | 48 | 49 | ```bash 50 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv \ 51 | --level-separator ";" 52 | ``` 53 | 54 | - With metadata ([UgandaMaternalV3V4.16s_DADA2.sample_details.tsv](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.sample_details.tsv)) 55 | 56 | ```bash 57 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv \ 58 | --level-separator ";" \ 59 | --metadata-file UgandaMaternalV3V4.16s_DADA2.sample_details.tsv 60 | ``` 61 | 62 | - With integrated NCBI taxonomy (will translate names to taxids) 63 | 64 | ```bash 65 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv \ 66 | --level-separator ";" \ 67 | --metadata-file UgandaMaternalV3V4.16s_DADA2.sample_details.tsv \ 68 | --taxonomy ncbi \ 69 | --ranks superkingdom phylum class order family genus species 70 | ``` 71 | 72 | ### Single level annotations (e.g.
Neisseria animalis) 73 | 74 | - Example [ERP108433_phylum_taxonomy_abundances_SSU_v4.1.tsv](https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005180/pipelines/4.1/file/ERP108433_phylum_taxonomy_abundances_SSU_v4.1.tsv) from [MGnify](https://www.ebi.ac.uk/metagenomics), phylum level only 75 | 76 | ```bash 77 | # Removing first column with kingdom 78 | cut -f 2- ERP108433_phylum_taxonomy_abundances_SSU_v4.1.tsv > ERP108433_phylum_taxonomy_abundances_SSU_v4.1_parsed.tsv 79 | # Set the identifier for unassigned observations as "Unassigned" (many occurrences, will be summed) 80 | grimer --input-file ERP108433_phylum_taxonomy_abundances_SSU_v4.1_parsed.tsv \ 81 | --unassigned-header "Unassigned" 82 | ``` 83 | 84 | - Re-generating taxonomic lineage from single annotations (in this case only superkingdom) 85 | 86 | ```bash 87 | grimer --input-file ERP108433_phylum_taxonomy_abundances_SSU_v4.1_parsed.tsv \ 88 | --unassigned-header "Unassigned" \ 89 | --taxonomy ncbi \ 90 | --ranks superkingdom phylum 91 | ``` 92 | 93 | ## From commonly used tools/sources 94 | 95 | ### ganon 96 | 97 | ```bash 98 | ganon table --input *.tre \ 99 | --output-file ganon_table.tsv \ 100 | --header taxid \ 101 | --rank species 102 | 103 | grimer --input-file ganon_table.tsv \ 104 | --taxonomy ncbi \ 105 | --ranks superkingdom phylum class order family genus species 106 | ``` 107 | 108 | ### MetaPhlAn 109 | 110 | ```bash 111 | # merge_metaphlan_tables.py is available with the metaphlan package 112 | merge_metaphlan_tables.py *.tsv | head -n+2 > metaphlan_table.tsv 113 | 114 | grimer --input-file metaphlan_table.tsv \ 115 | --level-separator "|" \ 116 | --obs-replace '^.+__' '' '_' ' ' \ 117 | --taxonomy ncbi \ 118 | --ranks superkingdom phylum class order family genus species 119 | ``` 120 | 121 | ### QIIME2 feature table (.qza) 122 | 123 | - Example [feature-table.qza](https://docs.qiime2.org/2022.8/data/tutorials/exporting/feature-table.qza) from [QIIME2 docs](https://docs.qiime2.org/2022.8/tutorials/exporting/#exporting-a-feature-table) 124 | 125 | ```bash 126 | qiime tools export --input-path feature-table.qza --output-path exported-feature-table 127 | grimer --input-file exported-feature-table/feature-table.biom 128 | ``` 129 | 130 | ### phyloseq 131 | 132 | 133 | ```R 134 | #source("http://bioconductor.org/biocLite.R") 135 | #biocLite("biomformat") 136 | #biocLite('phyloseq') 137 | library("biomformat") 138 | library('phyloseq') 139 | data(soilrep) 140 | b <- make_biom(data = otu_table(soilrep)) 141 | write_biom(b, 'out.biom') 142 | ``` 143 | 144 | ```bash 145 | grimer --input-file out.biom 146 | ``` 147 | 148 | ### MGnify 149 | 150 | - `grimer-mgnify.py` will download and generate a GRIMER report for any MGnify study accession (e.g. MGYS00006024) 151 | 152 | ```bash 153 | # Install API dependency 154 | conda install "jsonapi-client>=0.9.7" 155 | ./grimer-mgnify.py -i MGYS00006024 -o out_folder_mgnify/ 156 | ``` 157 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # GRIMER 2 | 3 | 4 | 5 | ## About 6 | 7 | GRIMER is a tool that performs automated analyses and generates a portable and interactive dashboard integrating annotation, taxonomy and metadata. It unifies several sources of evidence to help detect contamination. GRIMER is independent of quantification methods and directly analyses contingency tables to create an interactive and offline report.
Reports can be created in seconds and are accessible to non-specialists, providing an intuitive set of charts to explore data distribution among observations and samples and their connections with external sources. 8 | 9 | - More information about the method: [pre-print](https://doi.org/10.1101/2021.06.22.449360){ target="_blank" } 10 | - Source-code: [GitHub repository](https://github.com/pirovc/grimer){ target="_blank" } 11 | 12 | ## Installation 13 | 14 | Via conda 15 | 16 | ```bash 17 | conda install -c bioconda -c conda-forge grimer 18 | ``` 19 | 20 | or locally, installing only the dependencies via conda: 21 | 22 | ```bash 23 | git clone https://github.com/pirovc/grimer.git 24 | cd grimer 25 | conda env create -f env.yaml # or mamba env create -f env.yaml 26 | conda activate grimer # or source activate grimer 27 | python setup.py install --record files.txt # Uninstall: xargs rm -rf < files.txt 28 | grimer -h 29 | ``` 30 | 31 | ## Basic Usage 32 | 33 | - In-depth examples of input files: [Importing files](importing) 34 | - Complete examples of usage with real files: [Examples](examples) 35 | 36 | 37 | Tab-separated input table 38 | 39 | ```bash 40 | grimer -i input_table.tsv 41 | ``` 42 | 43 | BIOM file 44 | ```bash 45 | grimer -i myfile.biom 46 | ``` 47 | 48 | Tab-separated input table with taxonomically annotated observations (e.g. sk__Bacteria;k__;p__Actinobacteria;c__Actinobacteria...) 49 | ```bash 50 | grimer -i input_table.tsv -f ";" 51 | ``` 52 | 53 | Tab-separated input table with metadata 54 | ```bash 55 | grimer -i input_table.tsv -m metadata.tsv 56 | ``` 57 | 58 | With taxonomy integration (ncbi) 59 | ```bash 60 | grimer -i input_table.tsv -m metadata.tsv -t ncbi #optional -b taxdump.tar.gz 61 | ``` 62 | 63 | With a configuration file to set up external tools, references and annotations 64 | ```bash 65 | grimer -i input_table.tsv -m metadata.tsv -t ncbi -c config/default.yaml -d -g 66 | ``` 67 | 68 | ## Parameters 69 | 70 | 71 | ▄████ ██▀███ ██▓ ███▄ ▄███▓▓█████ ██▀███ 72 | ██▒ ▀█▒▓██ ▒ ██▒▓██▒▓██▒▀█▀ ██▒▓█ ▀ ▓██ ▒ ██▒ 73 | ▒██░▄▄▄░▓██ ░▄█ ▒▒██▒▓██ ▓██░▒███ ▓██ ░▄█ ▒ 74 | ░▓█ ██▓▒██▀▀█▄ ░██░▒██ ▒██ ▒▓█ ▄ ▒██▀▀█▄ 75 | ░▒▓███▀▒░██▓ ▒██▒░██░▒██▒ ░██▒░▒████▒░██▓ ▒██▒ 76 | ░▒ ▒ ░ ▒▓ ░▒▓░░▓ ░ ▒░ ░ ░░░ ▒░ ░░ ▒▓ ░▒▓░ 77 | ░ ░ ░▒ ░ ▒░ ▒ ░░ ░ ░ ░ ░ ░ ░▒ ░ ▒░ 78 | ░ ░ ░ ░░ ░ ▒ ░░ ░ ░ ░░ ░ 79 | ░ ░ ░ ░ ░ ░ ░ 80 | version 1.1.0 81 | 82 | 83 | usage: grimer [-h] -i INPUT_FILE [-m METADATA_FILE] [-c CONFIG] 84 | [-t {ncbi,gtdb,silva,greengenes,ott}] [-b [TAXONOMY_FILES ...]] [-r [RANKS ...]] 85 | [-l TITLE] [-p [{overview,samples,heatmap,correlation} ...]] [-o OUTPUT_HTML] 86 | [--full-offline] [-g] [-d] [-f LEVEL_SEPARATOR] [-y VALUES] [-w] [-s] 87 | [-u [UNASSIGNED_HEADER ...]] [-z REPLACE_ZEROS] [--obs-replace [OBS_REPLACE ...]] 88 | [--sample-replace [SAMPLE_REPLACE ...]] [--min-frequency MIN_FREQUENCY] 89 | [--max-frequency MAX_FREQUENCY] [--min-count MIN_COUNT] [--max-count MAX_COUNT] 90 | [-j TOP_OBS_BARS] [-a {none,norm,log,clr}] [-e METADATA_COLS] [--optimal-ordering] 91 | [--show-zeros] 92 | [--linkage-methods [{single,complete,average,centroid,median,ward,weighted} ...]] 93 | [--linkage-metrics [{braycurtis,canberra,chebyshev,cityblock,correlation,cosine,dice,euclidean,hamming,jaccard,jensenshannon,kulsinski,kulczynski1,mahalanobis,minkowski,rogerstanimoto,russellrao,seuclidean,sokalmichener,sokalsneath,sqeuclidean,yule} ...]] 94 | [--skip-dendrogram] [-x TOP_OBS_CORR] [-v] 95 | 96 | optional arguments: 97 | -h, --help show this help message and exit 98 | -v, --version show program's
version number and exit 99 | 100 | required arguments: 101 | -i INPUT_FILE, --input-file INPUT_FILE 102 | Tab-separated file with a table of counts (Observation table, Count table, 103 | Contingency Tables, ...) or .biom file. By default rows contain observations 104 | and columns contain samples (use --transpose if your file is reversed). The 105 | first column and first row are used as headers. (default: None) 106 | 107 | main arguments: 108 | -m METADATA_FILE, --metadata-file METADATA_FILE 109 | Tab-separated file with metadata. Rows should contain samples and columns 110 | the metadata fields. QIIME2 metadata format is accepted, with an extra row 111 | to define categorical and numerical fields. If --input-file is a .biom file, 112 | metadata will be extracted from it if available. (default: None) 113 | -c CONFIG, --config CONFIG 114 | Configuration file with definitions of references, controls and external 115 | tools. (default: None) 116 | -t {ncbi,gtdb,silva,greengenes,ott}, --taxonomy {ncbi,gtdb,silva,greengenes,ott} 117 | Enable taxonomic analysis, convert entries and annotate samples. Files will 118 | be automatically downloaded and parsed. Optionally, stored files can be 119 | provided with --taxonomy-files. (default: None) 120 | -b [TAXONOMY_FILES ...], --taxonomy-files [TAXONOMY_FILES ...] 121 | Specific taxonomy files to use with --taxonomy. (default: []) 122 | -r [RANKS ...], --ranks [RANKS ...] 123 | Taxonomic ranks to generate visualizations. Use 'default' to use entries 124 | from the table directly. (default: ['default']) 125 | 126 | output arguments: 127 | -l TITLE, --title TITLE 128 | Title to display on the top of the report. (default: ) 129 | -p [{overview,samples,heatmap,correlation} ...], --output-plots [{overview,samples,heatmap,correlation} ...] 130 | Plots to generate. (default: ['overview', 'samples', 'heatmap', 131 | 'correlation']) 132 | -o OUTPUT_HTML, --output-html OUTPUT_HTML 133 | Filename of the HTML report output. (default: output.html) 134 | --full-offline Embed Bokeh javascript library in the output file. Output will be around 135 | 1.5MB bigger but it will work without internet connection. ~your report will 136 | live forever~ (default: False) 137 | 138 | general data options: 139 | -g, --mgnify Plot MGnify, requires --config file with parsed MGnify database. (default: 140 | False) 141 | -d, --decontam Run DECONTAM and generate plots. Requires --config file with DECONTAM 142 | configuration. (default: False) 143 | -f LEVEL_SEPARATOR, --level-separator LEVEL_SEPARATOR 144 | If provided, consider --input-table to be a hierarchical multi-level table 145 | where the observation headers are separated by the indicated separator char 146 | (usually ';' or '|') (default: None) 147 | -y VALUES, --values VALUES 148 | Force 'count' or 'normalized' data parsing. Empty to auto-detect. (default: 149 | None) 150 | -w, --cumm-levels Activate if the input table already has cumulative values on parent 151 | taxonomic levels. (default: False) 152 | -s, --transpose Transpose --input-table before parsing (if samples are listed on columns and 153 | observations on rows) (default: False) 154 | -u [UNASSIGNED_HEADER ...], --unassigned-header [UNASSIGNED_HEADER ...] 155 | Define one or more header names containing unassigned/unclassified counts. 156 | (default: None) 157 | -z REPLACE_ZEROS, --replace-zeros REPLACE_ZEROS 158 | Treat zeros in the input table. INT (add 'smallest count' divided by INT to 159 | every value), FLOAT (add FLOAT to every value).
Default: 1000 (default: 160 | 1000) 161 | --obs-replace [OBS_REPLACE ...] 162 | Replace values on observations labels/headers (supports regex). Example: '_' 163 | ' ' will replace underscore with spaces, '^.+__' '' will remove the matching 164 | regex. Several pairs of instructions are supported. (default: []) 165 | --sample-replace [SAMPLE_REPLACE ...] 166 | Replace values on sample labels/headers (supports regex). Example: '_' ' ' 167 | will replace underscore with spaces, '^.+__' '' will remove the matching 168 | regex. Several pairs of instructions are supported. (default: []) 169 | --min-frequency MIN_FREQUENCY 170 | Define minimum number/percentage of samples containing an observation to 171 | keep the observation [values between 0-1 for percentage, >1 specific 172 | number]. (default: None) 173 | --max-frequency MAX_FREQUENCY 174 | Define maximum number/percentage of samples containing an observation to 175 | keep the observation [values between 0-1 for percentage, >1 specific 176 | number]. (default: None) 177 | --min-count MIN_COUNT 178 | Define minimum number/percentage of counts to keep an observation [values 179 | between 0-1 for percentage, >1 specific number]. (default: None) 180 | --max-count MAX_COUNT 181 | Define maximum number/percentage of counts to keep an observation [values 182 | between 0-1 for percentage, >1 specific number]. (default: None) 183 | 184 | Samples options: 185 | -j TOP_OBS_BARS, --top-obs-bars TOP_OBS_BARS 186 | Number of top abundant observations to show in the Samples panel, based on 187 | the avg. percentage counts/sample. (default: 20) 188 | 189 | Heatmap and clustering options: 190 | -a {none,norm,log,clr}, --transformation {none,norm,log,clr} 191 | Transformation of counts for Heatmap. none (counts), norm (percentage), log 192 | (log10), clr (centre log ratio). (default: log) 193 | -e METADATA_COLS, --metadata-cols METADATA_COLS 194 | Available metadata cols to be selected on the Heatmap panel. Higher values 195 | will slow down the report navigation. (default: 3) 196 | --optimal-ordering Activate optimal_ordering on scipy linkage method, takes longer for large 197 | number of samples. (default: False) 198 | --show-zeros Do not skip zeros on heatmap plot. File will be bigger and interaction with 199 | heatmap slower. By default, zeros will be omitted. (default: False) 200 | --linkage-methods [{single,complete,average,centroid,median,ward,weighted} ...] 201 | --linkage-metrics [{braycurtis,canberra,chebyshev,cityblock,correlation,cosine,dice,euclidean,hamming,jaccard,jensenshannon,kulsinski,kulczynski1,mahalanobis,minkowski,rogerstanimoto,russellrao,seuclidean,sokalmichener,sokalsneath,sqeuclidean,yule} ...] 202 | --skip-dendrogram Disable dendrogram plots for clustering. (default: False) 203 | 204 | Correlation options: 205 | -x TOP_OBS_CORR, --top-obs-corr TOP_OBS_CORR 206 | Number of top abundant observations to build the correlation matrix, based 207 | on the avg. percentage counts/sample. 0 for all (default: 50) 208 | 209 | 210 | ## Powered by 211 | 212 | [](https://bokeh.org) 213 | [](https://pandas.org) 214 | [](https://scipy.org) 215 | [](https://scikit-bio.org) 216 | -------------------------------------------------------------------------------- /docs/manual.md: -------------------------------------------------------------------------------- 1 | # GRIMER Reports - User Manual 2 | 3 | --- 4 | 5 | *For this manual, the metagenomics analysis is based on data from Leiby et al.
"Lack of detection of a human placenta microbiome in samples from preterm and term deliveries"* 6 | 7 | - **[GRIMER report MGS Leiby et al.](https://pirovc.github.io/grimer-reports/placenta/placenta_mgs.html){ target="_blank" }** 8 | 9 | --- 10 | 11 | GRIMER report contains 4 main panels: [Overview](#overview), [Samples](#samples), [Heatmap](#heatmap), and [Correlation](#correlation). Every panel has one or more visualization and widgets to select, filter, group, and modify its contents. 12 | 13 | 14 | 15 | - Panels can be reported independently with `-p/--output-plots` 16 | - Help buttons provide details and information about the plot/analysis 17 | - All plots have built-in tools to export a png, show/hide tooltips, zoom in/out, select entries, among other features 18 | 19 | ## Overview 20 | 21 | The Overview panel shows an individual summary for each observation, related annotations and their distribution among samples. 22 | 23 | 24 | 25 | ### Table 26 | 27 | On the top, a table will list the observations of the study (e.g. OTUS, species). If taxons are the observations, entries can be divided into taxonomic ranks. 28 | 29 | 30 | 31 | - It is possible to filter the items listed on the table using the widgets on the rigth 32 | - Each entry will contain some details about the observations (e.g. Frequency among samples, total counts, ...) 33 | - Selecting an item on the table will activate further details of the observation in the other plots of the panel 34 | 35 | For example, the genus *Streptococcus*: 36 | 37 | 38 | 39 | - Appears on 61% of the samples of this study. 40 | - Has an average of 2.5% relative abundance among all samples. 41 | - Was reported in 5 studies as a "common contaminant". 42 | - It is highly present in water and negative control samples. 43 | - It was detected as a possible contaminant by the DECONTAM method. 44 | 45 | ### Side plots 46 | 47 | On the top right, additional plots and information are display once an observation is selected on the Table. In this example, the *Streptococcus* genus is selected. 48 | 49 | #### Info 50 | 51 | 52 | 53 | - Further information about the observation and related references is displayed. In this case, common contaminants sources. 54 | 55 | #### References 56 | 57 | 58 | 59 | - This plot shows the number of counts of the observation in the provided references (and the counts on the taxonomic lineage). 60 | - In the example above, the genus *Streptococcus* was reported 5 times directly in one of the reference sets (common contaminants), and 3 times as parent (some species of *Streptococcus* were reported as contaminants). 61 | 62 | #### MGnify 63 | 64 | 65 | 66 | - This plot shows the number of studies in the MGnify database for the selected observation. 67 | - *Streptococcus* was reported in 316 studies for the biome "Host Associated:Human". 68 | - In-detail biome levels can be selected to define more specific groups. In the biome level 5 (see below), *Streptococcus* was reported to be mostly found in Fecal samples among all MGnify studies. 69 | 70 | 71 | 72 | #### DECONTAM 73 | 74 | 75 | 76 | - This plot can be used to verify the DECONTAM output. 77 | - It shows the proportion of counts of the selected observation (y-axis) against DNA Concentration (if provided) or Total number of counts (x-axis) of each sample, both in log10 scale. 78 | - Controls samples are displayed in a different color. 79 | - An indication of contamination can be defined when counts are inversely proportional to DNA concentration. 
The red and black dotted lines are the expected models for contamination and non-contamination, respectively, based on the data of the study. A good indication of contamination is when the counts (excluding the control samples) "fit" the red line model. 80 | - The P-score statistic is not a P-value and it is not associated with any guarantees on the type 1 error rate. Small scores indicate the contaminant model is a better fit, and high scores indicate that the non-contaminant model is a better fit. 81 | - More details about the DECONTAM method and output can be found [here](https://benjjneb.github.io/decontam/vignettes/decontam_intro.html){ target="_blank" }. 82 | 83 | ### Sample bars 84 | 85 | This plot summarizes sample content, annotated with general classification metrics (left y-axis). Annotations can be selected on the bottom dropdown lists. Once an observation is selected on the top table, this plot will also show the count of that observation for each sample (right y-axis). 86 | 87 | 88 | 89 | - Bars show the total number of counts for each sample and are annotated with the percentage of "human-related" taxa, provided as a reference. 90 | - The x-axis is grouped by two metadata variables: Type and Case/control. Each sub-group is sorted based on the number of counts (in this case reads). 91 | - Yellow circles (right y-axis) show the amount of the selected observation (*Streptococcus*) for each of the samples in a log scale. 92 | - Parent taxonomic ranks can be activated on the top-right legend. 93 | 94 | ## Samples 95 | 96 | In-depth evaluation of individual samples can be performed in this panel. 97 | 98 | 99 | 100 | ### Table 101 | 102 | 103 | 104 | - The top table lists all samples in the study, with information about assigned and unassigned counts. 105 | - Further information on the abundance of each taxonomic rank is displayed if enabled for the report. 106 | - Rows of the table can be selected using the widgets on the right (or manually with the checkboxes). Selected items will be displayed in the bar plot. In the example above, only samples belonging to the "Maternal Saliva" category for the metadata field "Type" are selected. 107 | 108 | ### Bars 109 | 110 | 111 | 112 | - Observation bars show proportions of the top most abundant taxa. The number of items to be displayed can be defined with the parameter `-j/--top-obs-bars` 113 | - In the example above, genus-level proportions are displayed only for the items selected in the table. 114 | - The bars are grouped by Case/Control and antibiotic usage. Samples are sorted by the *Streptococcus* (1) abundances within each group. 115 | 116 | ## Heatmap 117 | 118 | Several transformations (`-a/--transformation`) can be applied to the data (normalization, log, center log ratio) to be further visualized in the Heatmap panel. Hierarchical clustering, grouping and sorting options can be independently selected for samples and observations to enable pattern detection (e.g. batch effects, treatment effects, etc.). 119 | 120 | 121 | 122 | - The heatmap shows values for samples (y-axis) and observations (x-axis). 123 | - Side panels link metadata and annotation information to the heatmap axes. 124 | - By default, all external references are displayed. Metadata field(s) can be manually selected in the bottom-right list. 125 | 126 | ### Clustering 127 | 128 | 129 | 130 | - Heatmap data can be sorted by hierarchical/agglomerative clustering.
131 | - More clustering methods and metrics can be generated using the parameters `--linkage-methods` and `--linkage-metrics`. 132 | - Additional dendrograms are displayed on the clustered axis. 133 | - Here it is possible to explore the effects of data clusters in the external panels (annotations and metadata). 134 | 135 | ### Grouping 136 | 137 | 138 | 139 | - Grouping of the heatmap data can be done by taxonomic ranks for observations and by metadata for samples. 140 | - Data can be further sorted among groups. 141 | 142 | ## Correlation 143 | 144 | 145 | 146 | Correlations between all observations in the study are plotted as a heatmap matrix. Positive or negative correlations among observations point to concurrent signals in the microbiome analysis (e.g. certain species with similar abundances in the study). 147 | 148 | 149 | 150 | - In the example above, only top observations are displayed. This can be changed with the parameter `-x/--top-obs-corr`. 151 | - Only highly negatively or positively correlated organisms are displayed (-1 to -0.8 and 0.8 to 1, respectively) 152 | - Highly correlated organisms can be further investigated in the Overview and Heatmap panels. -------------------------------------------------------------------------------- /env.yaml: -------------------------------------------------------------------------------- 1 | name: grimer 2 | channels: 3 | - defaults 4 | - bioconda 5 | - conda-forge 6 | dependencies: 7 | - bokeh==2.2.3 8 | - pandas 9 | - numpy 10 | - scipy>=1.6.0 11 | - scikit-bio>=0.5.6 12 | - multitax>=1.2.1 13 | - markdown 14 | - biom-format>=2.1.10 15 | - r-base>=4.0.0 #DECONTAM 16 | - bioconductor-decontam==1.10.0 #DECONTAM 17 | - r-optparse==1.6.6 #DECONTAM 18 | - jinja2==3.0.3 # newer versions do not work with bokeh==2.2.3 -------------------------------------------------------------------------------- /files/README.md: -------------------------------------------------------------------------------- 1 | # GRIMER References and other files 2 | 3 | ## Reference file format 4 | 5 | 1) File with a list (one per line) of taxonomic identifiers or taxonomic names 6 | 7 | 2) or formatted `.yml` file: 8 | 9 | ```yaml 10 | "General Description": 11 | "Specific description": 12 | url: "www.website.com?id={}" 13 | ids: [1,2,3] 14 | ``` 15 | 16 | The url should link to the entries listed in ids, with `{}` as a placeholder for the id. Example: `https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={}` 17 | 18 | The files should be provided in the main configuration file for grimer as follows: 19 | 20 | ```yaml 21 | references: 22 | "Contaminants": "files/contaminants.yml" 23 | "Human-related": "files/human-related.yml" 24 | "CUSTOM CONTAMINANTS": "file.txt" 25 | "LAB RELATED BACTERIA": "another_file.yml" 26 | ``` 27 | 28 | ### contaminants.yml 29 | 30 | Last update: 2022-03-09 31 | 32 | Manually curated from diverse publications: 33 | 34 | | Organism group | Genus | Species | Reference | 35 | |----------------|-------|---------|-----------| 36 | | Bacteria | 6 | 0 | 1998 Tanner, M.A. et al. | 37 | | Bacteria | 0 | 10 | 2002 Kulakov, L.A. et al. | 38 | | Bacteria | 4 | 0 | 2003 Grahn, N. et al. | 39 | | Bacteria | 16 | 0 | 2006 Barton, H.A. et al. | 40 | | Bacteria | 11 | 1 | 2014 Laurence, M. et al. | 41 | | Bacteria | 92 | 0 | 2014 Salter, S.J. et al. | 42 | | Bacteria | 7 | 0 | 2015 Jervis-Bardy, J. et al. | 43 | | Bacteria | 28 | 0 | 2015 Jousselin, E. et al. | 44 | | Bacteria | 77 | 127 | 2016 Glassing, A. et al. | 45 | | Bacteria | 23 | 0 | 2016 Lauder, A.P.
et al. | 46 | | Bacteria | 6 | 0 | 2016 Lazarevic, V. et al. | 47 | | Bacteria | 62 | 0 | 2017 Salter, S.J. et al. | 48 | | Bacteria | 0 | 122 | 2018 Kirstahler, P. et al. | 49 | | Bacteria | 34 | 0 | 2018 Stinson, L.F. et al. | 50 | | Bacteria | 18 | 0 | 2019 Stinson, L.F. et al. | 51 | | Bacteria | 52 | 2 | 2019 Weyrich, L.S. et al. | 52 | | Bacteria | 8 | 26 | 2019 de Goffau, M.C. et al. | 53 | | Bacteria | 15 | 93 | 2020 Nejman D. et al. | 54 | | Viruses | 0 | 1 | 2015 Kjartansdóttir, K.R. et al. | 55 | | Viruses | 0 | 1 | 2015 Mukherjee, S. et al. | 56 | | Viruses | 0 | 291 | 2019 Asplund, M. et al. | 57 | | Eukaryota | 0 | 3 | 2016 Czurda, S. et al. | 58 | | Eukaryota | 0 | 1 | PRJNA168 | 59 | | Total (unique) | 210 | 627 | | 60 | 61 | ### human-related.yml 62 | 63 | Last update: 2022-03-09 64 | 65 | Manually curated from: Byrd, A., Belkaid, Y. & Segre, J. The human skin microbiome. Nat Rev Microbiol 16, 143–155 (2018). https://doi.org/10.1038/nrmicro.2017.157 66 | 67 | ```yaml 68 | "Top organisms from the human skin microbiome": 69 | "Bacteria": 70 | url: "https://doi.org/10.1038/nrmicro.2017.157" 71 | ids: [257758, 225324, 169292, 161879, 146827, 43765, 38304, 38287, 38286, 29466, 29388, 28037, 1747, 1305, 1303, 1290, 1282, 1270] 72 | "Eukarya": 73 | url: "https://doi.org/10.1038/nrmicro.2017.157" 74 | ids: [2510778, 1047171, 379413, 119676, 117179, 76777, 76775, 76773, 44058, 41880, 36894, 34391, 31312, 5480, 5068, 3074, 2762] 75 | "Viruses": 76 | url: "https://doi.org/10.1038/nrmicro.2017.157" 77 | ids: [185639, 746832, 10566, 493803, 10279, 746830, 746831, 46771] 78 | ``` 79 | 80 | BacDive and eHOMD specific subsets. Dump date: 2022-03-09 81 | 82 | ```bash 83 | scripts/bacdive_download.py 84 | scripts/ehomd_download.py 85 | ``` 86 | 87 | ## MGnify 88 | 89 | The downloaded MGnify database file should be provided in the main configuration file for grimer as follows: 90 | 91 | ```yaml 92 | external: 93 | mgnify: "files/mgnify5989.tsv" 94 | ``` 95 | ### mgnify.tsv 96 | 97 | MGnify dump date: 2022-03-09 (latest study accession MGYS00005989) 98 | 99 | ```bash 100 | seq -f "MGYS%08g" 256 5989 | xargs -P 24 -I {} scripts/mgnify_download.py -i {} -v -g -o mgnify_dump_5989/ > mgnify_dump_5989.log 2>&1 101 | scripts/mgnify_extract.py -f mgnify_dump_5989 -t 10 -o files/mgnify.tsv 102 | ``` -------------------------------------------------------------------------------- /files/contaminants.yml: -------------------------------------------------------------------------------- 1 | "Common Bacterial contaminants": 2 | "2019 de Goffau, M.C. et al.": 3 | url: "http://doi.org/10.1038/s41586-019-1451-5" 4 | ids: [407, 335058, 504481, 1747, 40324, 1033, 38304, 28037, 470, 29448, 1828, 92793, 75, 375, 180282, 851, 301302, 853, 816, 33870, 85698, 87883, 147207, 68909, 1043493, 293256, 1134405, 410, 321895, 432308, 1416628, 1314, 1343, 69359] 5 | "2018 Kirstahler, P.
et al.": 6 | url: "http://doi.org/10.1038/s41598-018-22416-4" 7 | ids: [1747, 40324, 470, 29448, 294, 1828, 39491, 40214, 28090, 134533, 108981, 202956, 239935, 28117, 28116, 818, 820, 161879, 33011, 80866, 853, 823, 821, 296, 303, 34073, 2559073, 1055192, 106648, 1075768, 1076, 1112209, 1131812, 1150298, 1159870, 1160721, 1198452, 1217692, 1276755, 1320556, 134534, 1353941, 136273, 1435036, 147645, 1492737, 1492738, 1497615, 1504823, 1509403, 1519439, 1538644, 1619232, 162426, 1646498, 165179, 1654716, 1665556, 1678129, 169292, 1706231, 1714344, 172088, 1736272, 1736280, 1736296, 1736316, 1736528, 1736532, 1740090, 1833, 1835254, 192843, 202952, 202954, 211589, 216465, 245, 246602, 246787, 247, 266749, 2702, 2736, 285, 28901, 29536, 310297, 310298, 33010, 34062, 346179, 362413, 362418, 370974, 38303, 387661, 40520, 418240, 46506, 47920, 503361, 50340, 52133, 529884, 53412, 55197, 55508, 5665, 64974, 70863, 75659, 756892, 76773, 80878, 80882, 86182, 96345, 986, 989370, 991, 99158] 8 | "2015 Jervis-Bardy, J. et al.": 9 | url: "http://doi.org/10.1186/s40168-015-0083-8" 10 | ids: [286, 48736, 59732, 335058, 41275, 28100, 34072] 11 | "2003 Grahn, N. et al.": 12 | url: "http://doi.org/10.1016/S0378-1097(02)01190-4" 13 | ids: [286, 48736, 40323, 338] 14 | "2020 Nejman D. et al.": 15 | url: "http://doi.org/10.1126/science.aay9189" 16 | ids: [561, 59732, 504481, 1747, 40324, 501783, 68287, 34072, 38304, 28037, 470, 29448, 294, 1828, 56946, 375, 180282, 225324, 729, 31998, 1282, 1290, 40214, 28090, 134533, 108981, 202956, 161879, 80866, 296, 303, 34073, 222991, 958, 232523, 81, 987053, 35812, 74030, 469322, 213484, 86669, 471, 106649, 40215, 358, 475299, 1036779, 151416, 993502, 328552, 43992, 1698, 74316, 293, 41276, 155892, 225991, 106592, 428988, 135517, 77097, 115555, 29570, 115553, 71999, 72000, 1358, 333297, 560405, 381630, 334852, 944322, 269069, 84292, 82380, 1270, 205844, 94625, 94626, 34004, 47496, 431058, 431059, 98513, 129817, 316, 76761, 256325, 228654, 190721, 329, 38313, 623, 323621, 352475, 382, 258, 59803, 185950, 93064, 68569, 370959, 172044, 117207, 33050, 1304, 1305] 17 | "2019 Weyrich, L.S. et al.": 18 | url: "http://doi.org/10.1111/1755-0998.13011" 19 | ids: [561, 286, 407, 48736, 40323, 469, 59732, 237, 335058, 1743, 13687, 374, 32008, 283, 1716, 44249, 504481, 1279, 1301, 1654, 1386, 501783, 106589, 212791, 1350, 129337, 724, 1578, 1357, 68287, 838, 222, 1485, 46913, 2745, 1269, 165696, 84567, 165695, 343873, 358705, 239935, 59753, 29330, 174708, 57495, 332102, 1243, 665874, 68, 2742, 1847, 114248, 94008] 20 | "2014 Laurence, M. et al.": 21 | url: "http://doi.org/10.1371/journal.pone.0097876" 22 | ids: [561, 286, 48736, 40323, 237, 13687, 374, 32008, 379, 222, 357, 1230476] 23 | "1998 Tanner, M.A. et al.": 24 | url: "http://doi.org/10.1128/AEM.64.8.3110-3113.1998" 25 | ids: [561, 40323, 469, 963, 75654, 88] 26 | "2014 Salter, S.J. 
et al.": 27 | url: "http://doi.org/10.1186/s12915-014-0087-z" 28 | ids: [561, 286, 407, 48736, 40323, 469, 59732, 237, 335058, 1743, 13687, 374, 32008, 283, 1716, 963, 44249, 1301, 1033, 1663, 1386, 55080, 41275, 106589, 212791, 149698, 68287, 28100, 379, 1827, 34072, 46123, 12916, 92793, 182269, 75, 281915, 46913, 75654, 29580, 32257, 57493, 88, 1269, 165696, 846, 84567, 125216, 165695, 165697, 343873, 338, 2040, 76890, 12960, 146937, 532, 84756, 85413, 1696, 2755, 77583, 2034, 1298, 80865, 37914, 120831, 547, 66831, 1860, 274591, 1004300, 53457, 131079, 16, 378210, 33882, 29404, 64001, 354354, 528, 376469, 265, 361607, 47494, 52972, 83618, 497, 215579, 1054211, 2060, 401469] 29 | "2016 Lauder, A.P. et al.": 30 | url: "http://doi.org/10.1186/s40168-016-0172-3" 31 | ids: [407, 469, 237, 1743, 504481, 1301, 1654, 165779, 1350, 724, 1578, 838, 1380, 1016, 33042, 39948, 117563, 32067, 906, 482, 836, 32207, 29465] 32 | "2015 Jousselin, E. et al.": 33 | url: "http://doi.org/10.1111/1755-0998.12478" 34 | ids: [286, 407, 48736, 469, 1743, 13687, 1716, 44249, 1279, 165779, 1663, 55080, 129337, 1357, 1827, 2745, 165697, 150247, 32199, 568987, 435913, 70774, 89966, 171, 20, 529883, 613, 39643] 35 | "2016 Lazarevic, V. et al.": 36 | url: "http://doi.org/10.1186/s12866-016-0689-4" 37 | ids: [561, 286, 1279, 1357, 1827, 57493] 38 | "2006 Barton, H.A. et al.": 39 | url: "http://doi.org/10.1016/j.mimet.2005.10.005" 40 | ids: [561, 407, 48736, 40323, 469, 13687, 283, 963, 41275, 106589, 149698, 379, 12916, 357, 2282523, 1257] 41 | "2016 Glassing, A. et al.": 42 | url: "http://doi.org/10.1186/s13099-016-0103-7" 43 | ids: [561, 286, 407, 40323, 59732, 237, 335058, 1743, 374, 32008, 283, 1716, 963, 44249, 1279, 1301, 1747, 40324, 1654, 1033, 165779, 1663, 1386, 55080, 501783, 212791, 1350, 129337, 724, 1578, 149698, 28100, 838, 38304, 28037, 294, 46123, 1380, 182269, 1016, 1485, 33042, 281915, 39948, 117563, 29580, 32257, 32067, 906, 482, 846, 836, 125216, 32207, 29465, 56946, 358705, 225324, 851, 729, 31998, 301302, 1282, 1290, 39491, 28117, 28116, 818, 820, 33011, 823, 821, 201096, 244127, 572511, 356778, 135858, 990721, 216851, 236752, 946234, 46466, 43994, 1506553, 28050, 437755, 119852, 100175, 577310, 53370, 46205, 255204, 747294, 1567, 295418, 97050, 84108, 28453, 432330, 1522, 39492, 33038, 46125, 114702, 1655, 1656, 46353, 172371, 33029, 54007, 160404, 92442, 1402, 115979, 35841, 216816, 138336, 136996, 51101, 54914, 60550, 198252, 337315, 1017, 2718, 1492, 29363, 74426, 117506, 39791, 43768, 401472, 38305, 218538, 309120, 39486, 84112, 208479, 1351, 1352, 1547, 562, 564, 1379, 84135, 362076, 249058, 46124, 137732, 272239, 726, 863372, 154046, 261299, 29581, 505, 573, 467210, 1624, 1382, 40542, 1613, 172827, 374425, 61654, 144191, 483, 484, 669464, 33033, 2741, 204516, 28101, 28124, 28127, 840, 28132, 28135, 106588, 47883, 76731, 204525, 172042, 1660, 69823, 615, 102148, 1283, 28035, 29389, 45634, 230120, 230123, 1302, 68892, 1338, 1309, 1303, 257758, 1308, 157076, 154288, 39778, 29466] 44 | "2017 Salter, S.J. 
et al.": 45 | url: "http://doi.org/10.1371/journal.pntd.0005975" 46 | ids: [50709, 299566, 1375, 2040, 507, 31988, 165779, 161492, 150247, 92793, 374, 55080, 1696, 41275, 369926, 32008, 194, 2717, 75, 10, 59732, 1716, 37914, 231454, 423604, 212791, 117563, 963, 1004300, 682522, 1357, 149698, 906, 68287, 407, 33882, 1839, 528, 376469, 84567, 335058, 28100, 838, 286, 83618, 48736, 379, 1835, 45669, 22, 28453, 13687, 40323, 1054211, 13275, 33057, 157, 213484, 29465, 1827, 265, 1386] 47 | "2018 Stinson, L.F. et al.": 48 | url: "http://doi.org/10.3389/fmicb.2018.00270" 49 | ids: [1696, 1716, 43668, 37914, 1269, 32207, 1743, 836, 838, 1016, 308865, 1386, 2755, 1279, 66831, 1350, 1578, 1301, 29465, 374, 407, 434, 165696, 13687, 283, 80865, 93681, 48736, 570, 713, 469, 212791, 286, 40323] 50 | "2019 Stinson, L.F. et al.": 51 | url: "http://doi.org/10.1111/lam.13091" 52 | ids: [561, 335058, 407, 13687, 407, 374, 165696, 222, 1716, 547, 48736, 1004302, 1827, 1743, 1269, 204456, 106589, 1678] 53 | "2002 Kulakov, L.A. et al.": 54 | url: "http://doi.org/10.1128/AEM.68.4.1548-1555.2002" 55 | ids: [329, 376, 239, 36773, 69392, 1785, 1409, 304, 28214, 294] 56 | "Common Viral contaminants": 57 | "2019 Asplund, M. et al.": 58 | url: "http://doi.org/10.1016/j.cmi.2019.04.028" 59 | ids: [12071, 742919, 11103, 31647, 1678143, 10298, 10376, 10359, 11676, 129951, 10583, 31552, 10798, 11908, 585044, 518981, 1225745, 11620, 1891767, 493803, 11033, 159150, 35306, 68887, 11870, 11958, 11861, 11946, 11864, 363745, 363020, 242521, 11866, 11960, 31668, 31669, 31670, 11867, 11955, 11874, 11876, 11878, 11885, 36381, 11886, 11888, 269447, 269448, 11950, 11948, 1332312, 354090, 11884, 1352534, 1395610, 1395611, 1395612, 1395613, 1395614, 1395615, 1395616, 1395617, 1395618, 1395619, 1395620, 1341019, 11801, 11809, 1511763, 1394983, 697906, 1072204, 1148801, 1574422, 12104, 763552, 10264, 85708, 759804, 28344, 85506, 33747, 10345, 285986, 220638, 1154691, 185638, 1169627, 1045778, 185636, 72201, 345198, 176652, 1301280, 68347, 1618248, 1618254, 10288, 198112, 1454023, 1454024, 1454025, 1278278, 1278246, 1278252, 1278247, 1278248, 1278249, 1278250, 1278251, 399781, 1278255, 346932, 1278261, 1278263, 1278265, 1474867, 1379694, 1521385, 1521387, 1521389, 938081, 938082, 880162, 251749, 455370, 169864, 1379788, 1608440, 642253, 642255, 1224510, 1592207, 1592212, 1592083, 1592085, 1592086, 1592088, 1592093, 1592095, 1592096, 1592081, 1843761, 1519405, 1557033, 1608451, 664785, 1435438, 1170653, 40979, 12235, 12138, 11987, 51680, 12056, 146500, 554168, 212035, 1269028, 693272, 1420594, 1094892, 1128140, 1235314, 1128143, 1128151, 1128131, 1450746, 1461100, 181522, 1424633, 1010698, 1299317, 1450749, 1416631, 1128422, 1034806, 1592112, 1592113, 1592127, 938080, 1074214, 1519385, 1519387, 1519389, 1519390, 1519395, 1519396, 1519397, 186617, 1262072, 1407671, 743583, 340016, 745107, 745102, 745100, 1416009, 1187128, 889876, 760732, 1243183, 1229760, 1481186, 1505225, 1560342, 233894, 115987, 260149, 227470, 926067, 1127514, 1296654, 294382, 1486657, 1084719, 10756, 1486662, 1285382, 1497851, 1127515, 145579, 263375, 764562, 1133292, 1133022, 242527, 260373, 279280, 644524, 242861, 1132026, 1357714, 1197951, 1327981, 1327976, 1327979, 1327992, 1328030, 1327990, 1327980, 1327972, 1327982, 1327995, 1327983, 1327970, 1327971, 756279, 1327977, 1327993, 1328029, 1327975, 1327974, 1327985, 756280, 756282, 1527524, 1540094, 1042123, 541865, 1567016, 765765, 1176422, 1327037, 1162295, 1141135, 1141136, 335924, 536444, 929832, 682650, 1137745, 
536473, 749413, 1477406, 1048515, 1048516, 1048517, 1048520, 1048521, 1537091, 1264700, 1609634, 1455074, 414970, 10863, 10864, 1222338, 1147148, 1237364, 1414766, 1977402, 948870, 1524881, 10665, 10760, 1147094, 1429767, 925983, 925984, 1527519, 1527506, 1229753, 1540097, 1540098, 1054461, 1391223, 294631, 1325731, 908819, 1458858, 1458842, 90963, 1536592, 1527515, 551895, 1129191, 139872, 201847, 287412, 1262517, 754044, 1385658, 1176423, 889949, 446529, 1034128, 1056830, 1089119, 1486472, 1034111, 205879, 1340709, 1567475, 1472912, 1204539, 1399915, 1283076, 1283077, 1168479, 1168478, 440250, 400567, 994601, 1465639, 889956, 445700, 444862, 536454, 445688, 444861, 1229794, 1229793, 1229792, 1229791, 1229790, 1229789, 1229786, 1229787, 1229788, 1229784, 1229782, 376758, 1498188, 504501, 504553, 1235647, 1235648, 1235649, 1235650, 1235653, 1235654, 1235655, 1235656, 1235657, 877240, 754052, 1316739, 347326, 1235689, 31535, 757342, 582345, 1462581, 386793, 1204517, 347327, 1335230, 743813, 1348912, 1327964, 270673, 188350, 1541891, 169683, 998086, 1500757, 1458843, 1129146, 1279082, 1114179, 1548900, 1231048, 1548901, 1449437, 1548918, 1476390, 462590, 754048, 948071, 1481785, 1417599, 1131316, 691965, 136084, 754067, 1161935, 1173749, 1173761, 1173759, 1173762, 590739, 1406795, 1141134, 1204529, 1540099, 1168549, 866889, 1458859, 1458860, 1458861, 10761, 754060, 1524882, 1357423, 373126, 1150991, 1195080, 320843, 55510, 1434319, 320850, 369581, 537874, 1208587, 1566990, 10732, 490913, 1526550, 1340810, 756277, 753084, 753085, 756275, 1026955, 1340812, 238854, 555387, 754042, 444860, 981335, 469660, 215796, 1478972, 1385659, 926697, 336724, 278008, 1211417, 271647, 754075, 573173, 573174, 979525, 979534, 1529058, 1283071, 573176, 1589298, 1076759, 1461743, 1150989, 754058, 754051, 929835, 1414739, 754072, 1524880, 194802, 1168281, 1204514, 1188795, 331278] 60 | "2015 Mukherjee, S. et al.": 61 | url: "http://doi.org/10.1186/1944-3277-10-18" 62 | ids: [10847] 63 | "2015 Kjartansdóttir, K.R. et al.": 64 | url: "https://doi.org/10.1073/pnas.1423756112" 65 | ids: [322019] 66 | "Common Eukaryotic contaminants": 67 | "PRJNA168": 68 | url: "https://www.ncbi.nlm.nih.gov/genome/guide/human/" 69 | ids: [9606] 70 | "2016 Czurda, S. 
et al.": 71 | url: "https://doi.org/10.1128/JCM.02112-15" 72 | ids: [1895944, 76775, 5308] 73 | -------------------------------------------------------------------------------- /files/human-related.yml: -------------------------------------------------------------------------------- 1 | "Top organisms form the human skin microbiome": 2 | "Bacteria": 3 | url: "https://doi.org/10.1038/nrmicro.2017.157" 4 | ids: [257758, 225324, 169292, 161879, 146827, 43765, 38304, 38287, 38286, 29466, 29388, 28037, 1747, 1305, 1303, 1290, 1282, 1270] 5 | "Eukarya": 6 | url: "https://doi.org/10.1038/nrmicro.2017.157" 7 | ids: [2510778, 1047171, 379413, 119676, 117179, 76777, 76775, 76773, 44058, 41880, 36894, 34391, 31312, 5480, 5068, 3074, 2762] 8 | "Viruses": 9 | url: "https://doi.org/10.1038/nrmicro.2017.157" 10 | ids: [185639, 746832, 10566, 493803, 10279, 746830, 746831, 46771] 11 | "Human Oral Microbiome Database (eHOMD)": 12 | "Oral": 13 | url: "http://www.ehomd.org/?name=HOMD" 14 | ids: [712116, 469621, 888056, 767100, 1194526, 2081962, 1547448, 1225197, 936596, 1074118, 1321781, 947828, 1403335, 1046629, 39950, 1242967, 1287474, 1074106, 999424, 319701, 999429, 546268, 927666, 1401072, 857100, 1035189, 638301, 857154, 997347, 1125718, 525375, 1403338, 942513, 1227262, 1411915, 1074166, 575614, 888062, 1125712, 1236516, 936561, 486408, 546269, 1225205, 1095741, 1041521, 712710, 596330, 210007, 655813, 553178, 562981, 1321822, 907491, 553199, 1125701, 857291, 546274, 1227269, 1074104, 749551, 1236508, 1074116, 712362, 553198, 1028803, 1889813, 1125724, 857111, 1225192, 608534, 575612, 1423814, 370554, 1028802, 1244083, 904338, 1167010, 1051006, 999432, 1035196, 592010, 546266, 553220, 864567, 1302863, 862966, 1002365, 1407647, 1051972, 525376, 1440770, 471876, 321967, 487215, 1074175, 546270, 866778, 857131, 1242969, 1225193, 888809, 242619, 712368, 596323, 282402, 122586, 1048332, 680646, 712528, 592028, 1236497, 862969, 626523, 1395125, 888727, 1125725, 1074183, 1423782, 888815, 1366052, 1123249, 1032505, 1316596, 767031, 888743, 158, 1074122, 1266997, 1266996, 1321779, 679199, 754507, 1316593, 1122984, 370551, 904317, 563032, 1190621, 1074119, 1046624, 706439, 203275, 713051, 553201, 999430, 1095731, 857133, 767029, 907492, 686659, 1211023, 1321782, 742820, 1347790, 997353, 879310, 629741, 888728, 28129, 1225202, 1122987, 1321815, 857099, 888808, 1051985, 1321818, 1411148, 1257041, 944564, 1035197, 1127692, 1434264, 1074095, 1074173, 1225204, 1785995, 857129, 944565, 186103, 1125719, 1401068, 630588, 1448849, 634176, 556263, 888814, 712938, 712623, 1074176, 1225187, 1423799, 1114967, 28137, 1440768, 857135, 28112, 1074121, 999438, 393480, 641149, 888061, 1309, 1073367, 1030843, 888721, 553184, 909952, 888825, 1236517, 2081702, 1227276, 1297564, 1074178, 1074124, 1292047, 160491, 1035195, 1114965, 702439, 1307428, 209882, 411466, 1122993, 679198, 696216, 187101, 888811, 712361, 857105, 1243032, 652722, 1257040, 679196, 857149, 1311575, 562982, 521097, 760570, 732, 1122989, 470565, 935589, 491076, 888054, 469607, 1304, 575611, 457405, 1661745, 596315, 909420, 1035185, 1074138, 626522, 1203258, 712122, 883167, 1236518, 889204, 1203602, 374833, 684066, 1074105, 575615, 679201, 649743, 1167007, 1448850, 1155071, 1225188, 944560, 1074165, 999436, 1074156, 575590, 523794, 1739435, 562973, 521095, 857113, 883109, 907488, 888833, 489653, 712466, 122587, 596319, 1127690, 885272, 1125722, 888057, 706436, 1440771, 469602, 1234601, 857125, 1095748, 1283280, 1074179, 1225191, 1000588, 525378, 1035190, 857147, 
748671, 888812, 546265, 997830, 871541, 684738, 907490, 936589, 1074066, 1120979, 272556, 1225186, 712357, 568704, 649764, 634994, 1009852, 764544, 1108963, 857140, 1401077, 871237, 591365, 1307427, 999414, 880592, 334390, 2093824, 768728, 1074144, 324831, 857138, 553171, 999422, 888052, 649742, 1161424, 45634, 1167628, 864568, 999428, 1074101, 712538, 1125723, 553175, 1225196, 1334627, 857148, 999423, 861450, 28132, 546271, 1203259, 544580, 712411, 626369, 1074167, 1122982, 679192, 857137, 862513, 746361, 2748316, 585503, 873513, 1161421, 997352, 1321775, 1739279, 76859, 999435, 1321823, 1177574, 546273, 888813, 1128111, 1122986, 1401073, 1307443, 997356, 546262, 888049, 1074109, 471872, 857102, 1074190, 935599, 889201, 754505, 1739543, 1225200, 592026, 857123, 837, 272622, 1257037, 1120943, 712624, 1125717, 857151, 796943, 857290, 1074160, 868129, 907487, 2572089, 1074128, 857108, 712435, 888810, 679193, 767453, 1074148, 857134, 1095752, 1125702, 712363, 1074137, 1321821, 1095750, 861454, 553174, 1710, 1028806, 762965, 1292048, 1074143, 1401079, 52773, 1074151, 864570, 1073372, 28131, 1074159, 1095729, 370552, 272831, 435830, 1115809, 1225194, 360104, 596324, 1074108, 706433, 1318634, 447456, 1005704, 857152, 1074184, 653386, 1074092, 1074112, 857153, 596322, 1114969, 469599, 857110, 2748317, 619693, 1028804, 585501, 1125700, 1383, 999431, 1256219, 431947, 668336, 768726, 76123, 566549, 1227268, 1321772, 1434258, 904306, 1256230, 1095733, 857146, 641147, 1248420, 641143, 521393, 1122174, 768727, 999437, 1074115, 525326, 1074134, 1403949, 1859694, 1074120, 1074157, 1122980, 1074162, 1120957, 1316254, 114527, 1257038, 1074149, 1321816, 1203603, 1074186, 1125699, 638300, 754506, 76857, 1104322, 221027, 712961, 1321774, 714, 1123317, 1074161, 40543, 1074123, 1000590, 537973, 1035194, 1321784, 861455, 1316933, 1225201, 1000570, 889206, 713059, 435838, 873517, 1074155, 1307442, 399795, 706437, 999426, 712310, 1095747, 714315, 1122949, 1074136, 1346615, 1095730, 888060, 1434263, 857120, 1297567, 1434260, 1074126, 985008, 679195, 1115803, 999427, 511691, 164, 671214, 857112, 706434, 1161902, 857104, 857132, 693991, 888050, 1074146, 796942, 999434, 553207, 469604, 630527, 1111678, 176279, 1029822, 457403, 1073353, 671211, 1437447, 35519, 679188, 1705617, 1167009, 1227264, 1120944, 857106, 1403829, 873533, 887325, 702437, 888746, 944557, 857142, 857103, 797473, 1074130, 1161422, 1074171, 1074153, 857130, 929793, 546263, 1884263, 1273133, 699187, 1127691, 862968, 908937, 190304, 857136, 713030, 1074129, 198466, 1198676, 857144, 1321786, 1125720, 176090, 469601, 1074182, 888019, 907486, 604162, 864563, 888832, 1216362, 869214, 857126, 160490, 1095738, 1122172, 76856, 1095742, 679194, 936563, 694569, 866776, 1750, 887901, 272623, 1073366, 1411021, 243275, 1236504, 857109, 888051, 1122985, 1234877, 1074170, 1225203, 596329, 479436, 1321820, 1297566, 1074154, 547045, 1120942, 857143, 742814, 1307444, 1321817, 1227272, 568703, 575593, 638302, 887929, 435832, 1200793, 1123310, 1074168, 363952, 712982, 907493, 1095739, 712471, 1095740, 1256223, 1031709, 1035193, 1122171, 862515, 1089447, 176280, 411465, 1074113, 862967, 1074107, 649760, 857155, 857119, 857115, 883094, 1257042, 553219, 1434265, 554406, 347253, 1226633, 857116, 1157946, 1660, 1074111, 1159208, 1235815, 1074127, 686660, 1410950, 1225190, 1434262, 887898, 929102, 1074494, 1257039, 633147, 1353243, 702438, 1123263, 267747, 1035184, 546264, 905067, 1081904, 1227266, 1122991, 1074140, 563033, 2572088, 2572087, 712150, 1074135, 883092, 645512, 
360105, 1088720, 651822, 879309, 857121, 861452, 596320, 416870, 1434261, 45243, 662598, 66851, 888059, 525374, 857101, 1042402, 592031, 28133, 712711, 1074102, 1127699, 999415, 1195243, 888741, 1321814, 1434259, 370553, 857139, 388919, 1091045, 447455, 679200, 286636, 999439, 1440769, 546275, 596317, 857122, 525361, 649761, 888742, 888816, 1227270, 712633, 57171, 883158, 762948, 1203550, 1074093, 857150, 857128, 525337, 904294, 712365, 1074100, 857124, 1074125, 857117, 904296, 1225199, 1287476, 1005705, 1074185, 857292, 768724, 798300, 1227271, 562983, 1095744, 553177, 857141, 857114, 907489, 1138874, 1127694, 1411022, 857118, 857107, 864565, 1120941, 1074164, 77917, 246198, 1225189, 193567, 1297565, 1415626, 1074114, 999440, 1227261, 1127693, 857127, 1127696, 1028805, 469378, 888055, 935598, 1035188, 1095743, 1078483, 1225195, 1074180, 762963, 1074177, 467705, 857145, 999425, 862970, 1125721, 1127695, 1121268, 1404260, 525283, 525325, 352165, 712991, 620833, 553218, 1074169, 1074181, 1114966, 862971, 1293577, 888048, 1167008, 999433, 1403336, 1122994, 1185324, 1073362, 293653, 1225198, 2748177, 1074163, 362948] 15 | "Nasal": 16 | url: "http://www.ehomd.org/?name=HOMD" 17 | ids: [406556, 516950, 1203561, 418127, 282458, 883103, 512767, 497962, 1715217, 1236608, 282459, 1069628, 857571, 553567, 1739317, 857577, 451515, 656912, 760791, 487214, 374927, 1069626, 452948, 553573, 1203622, 656913, 450394, 869216, 553594, 374933, 512566, 374932, 760746, 488222, 548474, 456482, 521005, 869309, 525381, 857575, 1203559, 1069625, 359786, 1203627, 760809, 453362, 406557, 71421, 553574, 574093, 869215, 196620, 553583, 760861, 488221, 512769, 1203566, 1203632, 189423, 406563, 1834153, 1203619, 553580, 406558, 553590, 561276, 869269, 455227, 453361, 760810, 478, 553592, 281310, 548473, 374928, 548470, 1069623, 548475, 553581, 374931, 158879, 553577, 488223, 553571, 553588, 857578, 480, 553601, 857574, 262728, 886289, 585161, 453366, 171101, 760834, 1203625, 497980, 857579, 453365, 521004, 406561, 262727, 375177, 359787, 375063, 374930, 1203557, 158878, 935897, 760787, 453363, 406560, 487213, 857576, 595501, 553596, 497963, 273036, 93061, 512768, 681288, 1121367, 553565, 90241, 1203562, 406562, 727, 170187, 1130804, 93062, 426430, 1715123, 866630, 553568, 857581, 406559, 857573, 451516, 857572, 375432, 1203624, 862964, 373153, 546342, 703339, 453364] 18 | "Human-related bacterial isolates from BacDive": 19 | "Limbs": 20 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 21 | ids: [178214, 52132, 386414, 306, 146827, 38303, 137732, 326522, 760, 755171, 82380, 38304, 1504, 478, 47920, 33010, 37326, 354351, 488, 1717, 33935, 1747, 33007, 1660, 1667, 614, 31973, 358, 29466, 69968, 1141657, 754, 479117, 43770, 1286, 652, 411577, 90245, 487, 1713, 43767, 47312, 59561, 630, 479, 156979, 1648, 1513, 732, 1292, 287, 539, 38313, 150055, 1890675, 291112, 13076, 2014, 1529, 420404, 1785, 196, 1245, 220685, 620903, 53437, 1977869, 217204, 180332, 38875, 400946, 495, 84698, 316, 1766, 28189, 161902, 192066, 714, 2054, 1282, 749, 74706, 38301, 753, 1352, 587, 490, 670, 283734, 29354, 303, 326523, 36740, 470, 28125, 485, 47917, 521520, 120957, 131111, 739, 511, 38289, 550, 200476, 1379, 158822, 220687, 53462, 123899, 650, 84112, 1280, 192, 1536, 1509, 131110, 1351, 46124, 239, 71254, 29380, 78355, 37329, 1506, 1697053, 1303, 158877, 1348, 502790, 28264, 66228, 24, 29317, 1402, 676, 1314, 29391, 1409, 488730, 82347, 193461, 501496, 53972, 43765, 411570, 1365628, 147645, 29388, 28035, 33968, 51671, 33028, 37637, 
361500, 65058, 646, 730, 105219, 70348, 752, 1328, 1015, 292, 28450, 28091, 747, 28132, 1273, 755172, 28038, 28188, 33889, 672, 40091, 1296, 53363, 1710, 1547, 180588, 729, 370622, 1430326, 135487, 1305, 644, 90239, 206506, 472, 169292, 39791, 669, 38284, 108980, 1239307, 68892, 28090, 44737, 504, 1891233, 58172, 48296, 29432, 28449, 1311, 41276, 1781, 36809, 1720, 322095, 1034, 565, 1701, 391, 82633, 40542, 310300, 1290, 34105] 22 | "Ear": 23 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 24 | ids: [38313, 1280, 306, 35703, 1776741, 760, 28037, 72557, 480, 319939, 68766, 2702, 1661, 1872515, 44750, 1639, 1014, 32002, 545, 28264, 199591, 1353, 267212, 43263, 316, 1869190, 1747, 1314, 52769, 33007, 134375, 285091, 89093, 29379, 29321, 678932, 184870, 674, 47770, 29388, 1313, 663, 1725, 51671, 753, 217203, 727, 85698, 585, 53364, 670, 666, 105219, 678, 90245, 1311, 1898, 292, 93220, 36809, 59561, 87883, 156979, 131111, 739, 511, 419475, 1895474, 293, 287, 1343, 1421, 38287, 123899, 1652] 25 | "Eye": 26 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 27 | ids: [760, 253, 38304, 154288, 478, 29394, 37330, 37326, 247, 488, 2047, 1671023, 759851, 197575, 945844, 47312, 1401, 59561, 479, 2035, 46125, 34062, 732, 1578165, 207340, 161879, 539, 1931, 187491, 28037, 480, 420404, 1544413, 616, 41202, 38290, 545, 40216, 1544416, 529, 192066, 1270, 753, 490, 29354, 485, 134533, 739, 1671022, 1379, 650, 90241, 1280, 1824, 1351, 1655, 280147, 46124, 69392, 239, 1309, 813, 37329, 571, 47478, 29391, 134375, 1409, 43765, 498, 147645, 1685, 72556, 51671, 723, 752, 1302, 28172, 483, 83558, 1750, 40091, 180588, 47846, 370622, 740, 726, 472, 457921, 38284, 68892, 1313, 477, 756689, 727, 1304, 1177728, 504, 29432, 666, 1396, 1871047, 1720, 161890, 735, 2055, 38287] 28 | "Nose": 29 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 30 | ids: [1591, 90241, 1280, 306, 760, 195105, 1673725, 74319, 478, 29394, 520, 40324, 28264, 39950, 38284, 1282, 31973, 1313, 72556, 727, 181487, 1304, 59823, 504, 65058, 105219, 615, 1328, 131111, 43990, 732, 286802, 33889] 31 | "Skin/Nail/Hair": 32 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 33 | ids: [282305, 1280, 94138, 131110, 1655, 1891644, 729, 1780, 29382, 33010, 202789, 38290, 33034, 37326, 28264, 1347369, 66228, 521392, 1766, 472, 169292, 1261, 1747, 45254, 1869190, 2047, 1817405, 1986155, 1282, 1270, 33918, 1314, 861, 43765, 281920, 29388, 663, 1352, 106654, 1260, 1931, 181487, 1276, 59823, 1965292, 132933, 1286, 1347368, 37923, 29432, 730, 36740, 470, 1622, 1781, 36809, 1288, 1698, 59561, 2035, 1720, 29506, 131111, 1283, 38289, 1648, 1273, 34062, 1292, 287, 1753, 1656, 1290, 71999, 672] 34 | "Oral": 35 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 36 | ids: [1613, 463, 306, 912594, 453, 28085, 29394, 478, 37330, 28131, 247, 1717, 1747, 2047, 358, 327575, 665914, 158, 43770, 76832, 419208, 249188, 544580, 203, 87883, 732, 651822, 207340, 43768, 1241978, 1785, 1590, 28137, 52227, 1501332, 80878, 82541, 192066, 341694, 38301, 272548, 596085, 303, 470, 131111, 739, 78259, 480035, 123899, 84112, 221027, 111015, 1280, 55565, 69392, 571, 28083, 1852361, 28126, 39950, 1310, 29391, 1409, 1397, 72556, 723, 105219, 342002, 1659, 230143, 93220, 84109, 28091, 257758, 449, 206, 459, 1389713, 157691, 573, 200, 28454, 472, 28119, 135080, 28133, 1313, 461393, 85698, 1078480, 504, 1396, 39778, 13689, 82633, 354243, 40542, 132249, 1290, 1017, 82380, 253, 47920, 34059, 205, 488, 28129, 113287, 135083, 1597, 29466, 81950, 194702, 615, 47312, 
1110546, 46125, 34062, 1380685, 539, 1931, 1874826, 419015, 273136, 446, 189722, 52768, 1529, 480, 520, 267212, 228603, 2126346, 1318, 42817, 1352, 569, 33033, 544581, 76124, 450, 371601, 81858, 1383, 1246, 114528, 69823, 1019, 486, 131110, 76122, 1309, 239, 78355, 2702, 52773, 40214, 40324, 28134, 1314, 519, 985002, 41986, 65058, 244292, 28087, 1302, 747, 1501329, 41976, 28112, 68766, 1389922, 1234680, 42895, 1305, 726, 35783, 39791, 110845, 135082, 44737, 648, 48296, 32013, 28449, 1559, 1465, 29313, 1871047, 817, 735, 719, 134537, 263, 28141, 851, 60552, 885, 796937, 95486, 582, 56811, 1522312, 487, 47715, 1596, 110505, 142586, 1960874, 38313, 143393, 702745, 55211, 860, 51160, 529, 2054, 489, 223392, 47671, 490, 472693, 59505, 28125, 1490, 485, 228599, 1656, 2104, 93218, 90241, 577, 29446, 93219, 518, 37329, 1303, 76759, 82203, 47884, 1306, 45634, 1402, 158823, 43765, 43769, 51671, 33028, 28110, 1624, 238, 1924944, 204, 43675, 1795, 671224, 28132, 243701, 43997, 28095, 305719, 28214, 237576, 79263, 78258, 206043, 1308, 68892, 189723, 633701, 727, 626084, 439703, 502, 796942, 1720, 43990, 407975, 556499, 109790, 329, 1756149, 29341, 33010, 618, 39777, 1660, 114702, 33053, 754, 133926, 181487, 132933, 2094119, 1871052, 947033, 319706, 447, 80866, 128780, 76123, 1292, 287, 1343, 71451, 2079439, 84163, 28037, 114527, 616, 32002, 28136, 495, 714, 29363, 520603, 56774, 824, 177972, 1689, 319709, 850, 216816, 1778, 540, 327574, 1379, 199, 1351, 1655, 46124, 1287736, 68891, 1506, 1697053, 84521, 182337, 28264, 135079, 29317, 1498, 431269, 29523, 37637, 1309795, 1502, 156978, 1328, 1015, 292, 240125, 1547448, 562, 293, 45242, 1944660, 1831, 1296, 157687, 729, 671218, 61645, 135487, 638849, 1018, 106648, 607712, 1911679, 38284, 467210, 589436, 134034, 2382124, 837, 41200, 66851, 143361, 228604, 218538, 58172, 134534, 36809, 1472, 86185, 2055, 391, 341722] 37 | "Saliva": 38 | url: "https://bacdive.dsmz.de/search?search=taxid:{}" 39 | ids: [152331, 113107, 157688, 979627, 45634, 60133, 157687, 1624, 1583331, 1632, 249188] 40 | -------------------------------------------------------------------------------- /grimer-mgnify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import scripts.mgnify_download 4 | import grimer.grimer 5 | import argparse 6 | import os 7 | import glob 8 | 9 | parser = argparse.ArgumentParser(description='grimer-mgnify') 10 | parser.add_argument('-i', '--mgnify-study-accession', required=True, type=str, help="MGnify study accession (e.g. 
MGYS00002462)") 11 | parser.add_argument('-g', '--grimer-params', type=str, help="Extra params for grimer") 12 | parser.add_argument('-o', '--output-prefix', type=str, help="Output prefix for files and report") 13 | args = parser.parse_args() 14 | 15 | if args.output_prefix: 16 | prefix = args.output_prefix 17 | else: 18 | prefix = args.mgnify_study_accession 19 | 20 | # download files 21 | print("Downloading files for study accession " + args.mgnify_study_accession) 22 | scripts.mgnify_download.main(['-i', args.mgnify_study_accession, '-o', prefix, '-v']) 23 | 24 | files = filter(os.path.isfile, glob.glob(prefix + '*taxonomy_abundances*')) 25 | # Sort files by size ASC 26 | files = sorted(files, key=lambda x: os.stat(x).st_size) 27 | md = glob.glob(prefix + '*_metadata.tsv*') 28 | 29 | if args.grimer_params: 30 | grimer_params = args.grimer_params.split(" ") 31 | else: 32 | grimer_params = [] 33 | grimer.grimer.main(["-i", files[-1], 34 | "-m", md[-1], 35 | "-c", 'config/default.yaml', 36 | "-f", ";", 37 | "--obs-replace", "^.+__", "", "_", " ", 38 | "-r", "superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species", 39 | "-t", "ncbi", 40 | "-o", prefix + ".html", 41 | "--title", "MGnify study accession " + args.mgnify_study_accession, 42 | ] + grimer_params) 43 | -------------------------------------------------------------------------------- /grimer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import grimer.grimer 3 | grimer.grimer.main() 4 | -------------------------------------------------------------------------------- /grimer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/__init__.py -------------------------------------------------------------------------------- /grimer/cds.py: -------------------------------------------------------------------------------- 1 | #General 2 | import pandas as pd 3 | import numpy as np 4 | from math import pi 5 | 6 | #Internal 7 | from grimer.func import print_df, transform_table, print_log, format_js_toString 8 | 9 | #Bokeh 10 | from bokeh.models import ColumnDataSource 11 | 12 | 13 | def dict_taxname(tax, taxids): 14 | """ 15 | mapping taxids to names 16 | (or names to names if taxid is not used) 17 | """ 18 | id_name = {} 19 | for i in taxids: 20 | n = tax.name(i) if tax else i 21 | id_name[i] = n if n else i 22 | return id_name 23 | 24 | 25 | def cds_plot_references(table, tax, references): 26 | # Stacked list of references, accounting for lineage matches 27 | # index -> observations (repeated) 28 | # columns -> "rank", "ref", "direct", "parent" 29 | clist = [] 30 | if references is not None: 31 | for rank in table.ranks(): 32 | for obs in table.observations(rank): 33 | for desc, ref in references.items(): 34 | direct = ref.get_refs_count(obs, direct=True) 35 | parent = ref.get_refs_count(obs, parents=True) 36 | if direct + parent > 0: 37 | clist.append([obs, rank, desc, direct, parent]) 38 | 39 | df_references = pd.DataFrame(clist, columns=["obs", "rank", "ref", "direct", "parent"]) 40 | df_references.set_index('obs', inplace=True) 41 | 42 | print_df(df_references, "cds_p_references") 43 | return ColumnDataSource(df_references) 44 | 45 | 46 | def cds_annotations(table, references, controls, decontam, control_samples): 47 | # Stacked matrix of true annotations (omit false) 48 | # index -> taxids 49 | # columns -> 
rank, annot 50 | 51 | df_annotations = pd.DataFrame(columns=["rank", "annot", "factors", "ov", "tv"]) 52 | for i, rank in enumerate(table.ranks()): 53 | # Generate a DataFrame to use as source in tables 54 | df_rank = pd.DataFrame(index=table.observations(rank)) 55 | 56 | if decontam is not None: 57 | contaminants = decontam.get_contaminants(rank, df_rank.index).values 58 | if contaminants.any(): 59 | df_rank["decontam"] = decontam.get_pscore(rank, df_rank.index)[contaminants] 60 | 61 | if references is not None: 62 | for desc, ref in references.items(): 63 | df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) 64 | df_rank.loc[df_rank[desc] == 0, desc] = np.nan 65 | 66 | if controls is not None: 67 | for desc, ctrl in controls.items(): 68 | control_table = table.get_subtable(samples=control_samples[desc], rank=rank) 69 | freq_perc_control = control_table.gt(0).sum(axis=0) / control_table.shape[0] 70 | df_rank[desc] = table.observations(rank).map(freq_perc_control).to_list() 71 | 72 | df_rank = pd.DataFrame(df_rank.stack(), columns=["ov"]).reset_index(1) 73 | df_rank.rename(columns={"level_1": "annot"}, inplace=True) 74 | 75 | # add transformed values to fit same scale on heatmap 76 | # Decontam reverse p-score normalized 77 | if not df_rank[df_rank["annot"] == "decontam"].empty: 78 | min_val = df_rank[df_rank["annot"] == "decontam"]["ov"].min() 79 | max_val = df_rank[df_rank["annot"] == "decontam"]["ov"].max() 80 | df_rank.loc[df_rank["annot"] == "decontam", "tv"] = 1 - ((df_rank[df_rank["annot"] == "decontam"]["ov"] - min_val) / (max_val - min_val)) 81 | 82 | # max references divided by max 83 | if references is not None: 84 | for desc, ref in references.items(): 85 | if not df_rank[df_rank["annot"] == desc].empty: 86 | max_val = df_rank[df_rank["annot"] == desc]["ov"].max() 87 | df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] / max_val 88 | 89 | # keep same percentage 90 | if controls is not None: 91 | for desc, ctrl in controls.items(): 92 | if not df_rank.loc[df_rank["annot"] == desc].empty: 93 | df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] 94 | 95 | df_rank["rank"] = rank # set rank 96 | df_rank["factors"] = df_rank.index if i == 0 else "" # initialize just for first rank (save space) 97 | 98 | # Concat in the main df 99 | df_annotations = pd.concat([df_annotations, df_rank], axis=0) 100 | 101 | print_df(df_annotations, "cds_p_annotations") 102 | return ColumnDataSource(df_annotations) 103 | 104 | 105 | def cds_obstable(table, tax, references, controls, control_samples, decontam): 106 | # index unique taxids 107 | # col|... values to plot to columns in the datatable 108 | # tax|... 
auxiliary lineage of taxa entries 109 | # aux|ref auxiliary references identifiers 110 | 111 | df_obstable = pd.DataFrame() 112 | # Create unified DataFrame with all ranks used 113 | for rank in table.ranks(): 114 | # Generate a DataFrame to use as source in tables 115 | df_rank = pd.DataFrame(index=table.observations(rank)) 116 | df_rank["col|rank"] = rank 117 | if tax: 118 | df_rank["col|name"] = table.observations(rank).map(lambda txid: tax.name(txid) if tax.name(txid) else txid).to_list() 119 | else: 120 | df_rank["col|name"] = table.observations(rank) 121 | 122 | # Frequency of taxa among all samples 123 | df_rank["col|frequency_perc"] = table.get_frequency_perc(rank) 124 | df_rank["col|counts_perc_avg"] = table.get_counts_perc_avg_samples(rank) 125 | # Average percentage of counts among all samples 126 | df_rank["col|total_counts"] = table.get_counts(rank) 127 | 128 | # If active - add decontam True/False results 129 | if decontam: 130 | df_rank["col|decontam"] = decontam.get_contaminants(rank, df_rank.index) 131 | 132 | # Add a column for each Annotation source 133 | if references is not None: 134 | for desc, ref in references.items(): 135 | df_rank["col|" + desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)).to_list() 136 | 137 | # Add a column for each Control source 138 | if controls is not None: 139 | # calculate frequency for each group of control provided 140 | for desc, ctrl in controls.items(): 141 | control_table = table.get_subtable(samples=control_samples[desc], rank=rank) 142 | freq_perc_control = control_table.gt(0).sum(axis=0) / control_table.shape[0] 143 | df_rank["col|" + desc] = table.observations(rank).map(freq_perc_control).fillna(0).to_list() 144 | 145 | # Add col for each rank with parent taxid if exists, linking entries in their lineage for filtering and plotting 146 | for other_rank in table.ranks(): 147 | if table.ranks().index(other_rank) > table.ranks().index(rank): 148 | df_rank["tax|" + other_rank] = "" 149 | elif other_rank != rank: 150 | df_rank["tax|" + other_rank] = table.observations(rank).map(lambda txid: table.get_lineage(txid, rank, other_rank)).fillna("") 151 | else: 152 | df_rank["tax|" + other_rank] = df_rank.index 153 | # Sort values by frequency to show on table 154 | df_rank.sort_values(by="col|frequency_perc", ascending=False, inplace=True) 155 | 156 | # Concat in the main df 157 | df_obstable = pd.concat([df_obstable, df_rank], axis=0) 158 | 159 | print_df(df_obstable, "cds_m_obstable") 160 | return ColumnDataSource(df_obstable) 161 | 162 | 163 | def cds_sampletable(table): 164 | # index unique sample-ids 165 | # col|... 
values to plot to columns in the datatable 166 | 167 | df_sampletable = pd.DataFrame(index=table.samples) 168 | df_sampletable["col|total"] = table.get_total() if not table.normalized else 0 169 | df_sampletable["col|assigned"] = table.get_assigned() if not table.normalized else 0 170 | df_sampletable["col|assigned_perc"] = table.get_assigned_perc() 171 | df_sampletable["col|unassigned"] = table.get_unassigned() if not table.normalized else 0 172 | df_sampletable["col|unassigned_perc"] = table.get_unassigned_perc() 173 | 174 | # assigned by rank 175 | for rank in table.ranks(): 176 | df_sampletable["col|" + rank] = table.data[rank].sum(axis=1).divide(table.get_total(), axis=0) 177 | 178 | df_sampletable.fillna(0, inplace=True) 179 | 180 | print_df(df_sampletable, "cds_p_sampletable") 181 | return ColumnDataSource(df_sampletable) 182 | 183 | 184 | def cds_samplebars(table): 185 | # index unique sample-ids 186 | # aux| auxiliary values (not plotted) 187 | # bar| values plotted as bars (sample counts) 188 | # tax| values plotted as circles (taxa value) 189 | 190 | df_bars = pd.DataFrame(index=table.samples) 191 | # factors: set the x-axis reference for plotting, it can be dynamically changed (with groups) 192 | df_bars["aux|factors"] = df_bars.index 193 | df_bars["bar|unassigned"] = table.get_unassigned() 194 | # Initialize with assigned counts of the first rank 195 | df_bars["bar|selected"] = table.get_subtable(table.ranks()[0]).sum(axis=1) 196 | # Total assigned minus assigned at the selected rank 197 | df_bars["bar|others"] = (table.get_total() - table.get_unassigned()) - df_bars["bar|selected"] 198 | # Add empty cols for taxa values, to be dynamically inserted (None to avoid printing 0) 199 | for rank in table.ranks(): 200 | df_bars["tax|" + rank] = None 201 | 202 | print_df(df_bars, "cds_p_samplebars") 203 | return ColumnDataSource(df_bars) 204 | 205 | 206 | def cds_samples(table, references, controls, decontam): 207 | # index unique sample-ids 208 | # aux| auxiliary values (not plotted) 209 | # cnt| count values to be copied/transformed to bars 210 | 211 | df_samples = pd.DataFrame(index=table.samples) 212 | # index to retrieve default input order 213 | df_samples["aux|input_order"] = range(df_samples.shape[0], 0, -1) 214 | df_samples["cnt|total"] = table.get_total() 215 | df_samples["cnt|unassigned"] = table.get_unassigned() 216 | 217 | # Keep total number of assignments for calculations 218 | df_samples["cnt|assigned"] = table.get_total() - table.get_unassigned() 219 | 220 | # Add rank-specific assignments 221 | for rank in table.ranks(): 222 | df_samples["cnt|" + rank + "|assigned"] = table.data[rank].sum(axis=1) 223 | 224 | # Add counts specific to sources 225 | source_list = [] 226 | if references is not None: 227 | source_list.append(references.items()) 228 | if controls is not None: 229 | source_list.append(controls.items()) 230 | 231 | for sources in source_list: 232 | for desc, src in sources: 233 | for rank in table.ranks(): 234 | idx = table.observations(rank).map(lambda x: src.get_refs_count(x, direct=True)) >= 1 235 | df_samples["cnt|" + rank + "|" + desc] = table.data[rank][table.observations(rank)[idx]].sum(axis=1) 236 | 237 | if decontam: 238 | contaminants = decontam.get_contaminant_list() 239 | for rank in table.ranks(): 240 | idx = table.observations(rank).isin(contaminants) 241 | df_samples["cnt|" + rank + "|decontam"] = table.data[rank][table.observations(rank)[idx]].sum(axis=1) 242 | 243 | # fill NaN with zero so bars do not "disappear" when plotting 244 | df_samples.fillna(0, 
inplace=True) 245 | 246 | print_df(df_samples, "cds_d_samples") 247 | return ColumnDataSource(df_samples) 248 | 249 | 250 | def cds_metadata(metadata): 251 | # index -> sample-ids 252 | # columns -> metadata fields 253 | # values -> metadata values 254 | df_md = metadata.get_data() 255 | print_df(df_md, "cds_d_metadata") 256 | return ColumnDataSource(df_md) 257 | 258 | 259 | def cds_plot_metadata(metadata, max_metadata_cols): 260 | # index (unique sample-ids) 261 | # md0, md1, ..., md(max_metadata_cols) 262 | # values (metadata field, metadata values) 263 | 264 | df_plot_md = pd.DataFrame(index=metadata.data.index, columns=["factors"] + [str(i) for i in range(1, max_metadata_cols + 1)]) 265 | df_plot_md["factors"] = df_plot_md.index 266 | # Fill in only first metadata field 267 | first_field = metadata.get_col_headers()[0] 268 | 269 | df_plot_md["1"] = [(first_field, format_js_toString(md_value)) for md_value in metadata.get_col(first_field)] 270 | 271 | # Fill with empty strings to match js output when not selected 272 | df_plot_md.fillna("", inplace=True) 273 | 274 | print_df(df_plot_md, "cds_p_metadata") 275 | return ColumnDataSource(df_plot_md) 276 | 277 | 278 | def cds_plot_decontam(decontam): 279 | # index unique sample-ids 280 | # concentrations from decontam inputs 281 | # controls from decontam inputs 282 | # counts: field to be dynamically filled with click on obstable 283 | df_decontam = decontam.get_data() 284 | df_decontam["controls"] = df_decontam["controls"].map({True: 'Control', False: 'Sample'}) 285 | df_decontam["counts"] = None 286 | print_df(df_decontam, "cds_p_decontam") 287 | return ColumnDataSource(df_decontam) 288 | 289 | 290 | def cds_decontam(decontam, ranks): 291 | """ 292 | cds based on a dict with valid values to plot model lines 293 | {taxid: (contam_y1, contam_y2, non_contam_y, pval)} 294 | """ 295 | dict_coord_mod = {} 296 | for rank in ranks: 297 | df_valid_vals = decontam.rank[rank].dropna(subset=['contam']) 298 | pval = decontam.get_pscore(rank, df_valid_vals.index) 299 | vals = list(zip(df_valid_vals["contam"], df_valid_vals["contam_2"], df_valid_vals["non.contam"], pval)) 300 | dict_coord_mod.update(dict(zip(df_valid_vals.index, vals))) 301 | 302 | print_df(dict_coord_mod, "cds_d_decontam_models") 303 | return ColumnDataSource(dict_coord_mod) 304 | 305 | 306 | def cds_plot_decontam_models(decontam): 307 | """ 308 | cds based on a dict with 3 pairs of values to plot. 
x is shared among y_cont and y_noncont 309 | # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]} 310 | """ 311 | dict_decontam_models = {} 312 | dict_decontam_models["x"] = [decontam.get_data()["concentration"].min(), 313 | decontam.get_data()["concentration"].max()] 314 | dict_decontam_models["y_cont"] = [None, None] 315 | dict_decontam_models["y_noncont"] = [None, None] 316 | print_df(dict_decontam_models, "cds_p_decontam_models") 317 | return ColumnDataSource(dict_decontam_models) 318 | 319 | 320 | def dict_sampleobs(table): 321 | # dict with raw counts (not storing zeros) 322 | # dict_sampleobs[rank][obs][sample] = count 323 | dict_sampleobs = {} 324 | for rank in table.ranks(): 325 | dict_sampleobs[rank] = {} 326 | for obs, sample_val in table.data[rank].to_dict().items(): 327 | dict_sampleobs[rank][obs] = {} 328 | for sample, val in sample_val.items(): 329 | if val > 0: 330 | dict_sampleobs[rank][obs][sample] = val 331 | 332 | print_df(dict_sampleobs, "dict_d_sampleobs") 333 | return dict_sampleobs 334 | 335 | 336 | def cds_heatmap(table, transformation, show_zeros): 337 | # Stacked matrix of raw counts + transformed value 338 | # index -> sample-ids (repeated) 339 | # obs 340 | # rank 341 | # ov -> original value (raw counts) 342 | # tv -> transformed values (user choice: log10, clr, ...) 343 | 344 | df_heatmap = pd.DataFrame(columns=["obs", "rank", "ov", "tv", "factors_sample", "factors_obs"]) 345 | for i, rank in enumerate(table.ranks()): 346 | stacked_rank_df = pd.DataFrame(table.data[rank].stack(), columns=["ov"]).reset_index(1) 347 | # Rename first col to obs 348 | stacked_rank_df.rename(columns={stacked_rank_df.columns[0]: "obs"}, inplace=True) 349 | stacked_rank_df["rank"] = rank 350 | tv = transform_table(table.data[rank], table.get_total(), transformation, table.zerorep) 351 | stacked_rank_df["tv"] = tv.stack().values 352 | #Drop zeros based on original counts 353 | if not show_zeros: 354 | stacked_rank_df = stacked_rank_df[stacked_rank_df["ov"] > 0] 355 | # initialize factors only for first rank 356 | #stacked_rank_df["factors_sample"] = stacked_rank_df.index 357 | #stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] 358 | stacked_rank_df["factors_sample"] = stacked_rank_df.index if i == 0 else "" 359 | stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] if i == 0 else "" 360 | 361 | df_heatmap = pd.concat([df_heatmap, stacked_rank_df], axis=0) 362 | 363 | df_heatmap.drop('ov', axis=1, inplace=True) 364 | print_df(df_heatmap, "cds_p_heatmap") 365 | return ColumnDataSource(df_heatmap) 366 | 367 | 368 | def dict_hcluster(table, hcluster): 369 | # keys -> combination of hclusters 370 | # values -> sorted sample-ids 371 | 372 | leaves_x = {} 373 | # default order 374 | leaves_y = {"default": table.samples.to_list()} 375 | 376 | for rank in hcluster: 377 | # default order for each rank 378 | leaves_x["default|" + rank] = table.observations(rank).to_list() 379 | for method in hcluster[rank]: 380 | for metric in hcluster[rank][method]: 381 | # key 382 | key = rank + "|" + method + "|" + metric 383 | # samples 384 | leaves_y[key] = hcluster[rank][method][metric]["y"]["index"] 385 | # taxa 386 | leaves_x[key] = hcluster[rank][method][metric]["x"]["index"] 387 | 388 | print_df(leaves_x, "dict_d_hcluster_x") 389 | print_df(leaves_y, "dict_d_hcluster_y") 390 | return leaves_x, leaves_y 391 | 392 | 393 | def cds_plot_dendro(): 394 | # Empty CDS {"x": [], "y": [], "c": []} 395 | dendro_x = {"x": [], "y": [], "c": []} 396 | dendro_y = {"x": [], "y": [], "c": []} 397 | 
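    # --- Editor's note: illustrative sketch, not part of the original file ---
    # The "xs"/"ys"/"colors" lists consumed by dict_dendro() below are the kind
    # of coordinates scipy produces. A minimal sketch, assuming a per-rank
    # counts matrix `mat` (how GRIMER actually fills the `dendro` dict is
    # defined elsewhere in the codebase, likely grimer/func.py):
    #
    #   from scipy.cluster.hierarchy import linkage, dendrogram
    #   z = linkage(mat, method="complete", metric="euclidean")
    #   d = dendrogram(z, no_plot=True)
    #   xs, ys, colors = d["icoord"], d["dcoord"], d["color_list"]
    #
    # Each entry of xs/ys holds the four coordinates of one dendrogram bracket,
    # which Bokeh can then draw with multi_line() from the CDS returned here.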
print_df(dendro_x, "cds_p_dendro_x") 398 | print_df(dendro_y, "cds_p_dendro_y") 399 | return ColumnDataSource(dendro_x), ColumnDataSource(dendro_y) 400 | 401 | 402 | def dict_dendro(table, dendro): 403 | # dict_d_dedro_x and dict_d_dedro_y: 404 | # key -> key + "|x" , key + "|y" , key + "|c" 405 | # value -> list of lists (x and y) or list (c) 406 | dict_d_dedro_y = {} 407 | dict_d_dedro_x = {} 408 | 409 | for rank in dendro: 410 | for method in dendro[rank]: 411 | for metric in dendro[rank][method]: 412 | # key 413 | key = rank + "|" + method + "|" + metric 414 | # dendrogram values 415 | dict_d_dedro_y[key + "|x"] = dendro[rank][method][metric]["y"]["xs"] 416 | dict_d_dedro_y[key + "|y"] = dendro[rank][method][metric]["y"]["ys"] 417 | dict_d_dedro_y[key + "|c"] = dendro[rank][method][metric]["y"]["colors"] 418 | dict_d_dedro_x[key + "|x"] = dendro[rank][method][metric]["x"]["xs"] 419 | dict_d_dedro_x[key + "|y"] = dendro[rank][method][metric]["x"]["ys"] 420 | dict_d_dedro_x[key + "|c"] = dendro[rank][method][metric]["x"]["colors"] 421 | 422 | return dict_d_dedro_x, dict_d_dedro_y 423 | 424 | 425 | def dict_topobs(table, top_obs_bars): 426 | dict_top_taxa = {} 427 | for rank in table.ranks(): 428 | dict_top_taxa[rank] = table.get_top(rank, top_obs_bars) 429 | print_df(dict_top_taxa, "dict_d_topobs") 430 | return dict_top_taxa 431 | 432 | 433 | def dict_refs(table, references): 434 | # dict with information about sources and references 435 | # references can be repeated among descriptions, sources and taxids 436 | # {taxid: {source: {desc: [refs]}} 437 | d_refs = {} 438 | # Get only valid taxids 439 | used_ids = set() 440 | for rank in table.ranks(): 441 | used_ids.update(table.observations(rank)) 442 | 443 | if references is not None: 444 | for i in used_ids: 445 | for sname, s in references.items(): 446 | for ref, descs in s.get_refs_desc(i, direct=True).items(): 447 | for desc in descs: 448 | # Only add items if they have a reference to it 449 | if i not in d_refs: 450 | d_refs[i] = {} 451 | if sname not in d_refs[i]: 452 | d_refs[i][sname] = {} 453 | if desc not in d_refs[i][sname]: 454 | d_refs[i][sname][desc] = [] 455 | d_refs[i][sname][desc].append(ref) 456 | 457 | print_df(d_refs, "dict_d_refs") 458 | return d_refs 459 | 460 | 461 | def cds_correlation(table, corr): 462 | df_corr = pd.DataFrame(columns=["taxid", "rank", "rho"]) 463 | for rank in table.ranks(): 464 | stacked_rank_df = pd.DataFrame(corr[rank]["rho"], index=corr[rank]["observations"], columns=corr[rank]["observations"]).stack(dropna=False).reset_index(1) 465 | stacked_rank_df.rename(columns={"level_1": "taxid"}, inplace=True) 466 | stacked_rank_df.rename(columns={0: "rho"}, inplace=True) 467 | stacked_rank_df["rank"] = rank 468 | 469 | # Drop NA for rho (missing values and upper triangular matrix) 470 | stacked_rank_df.dropna(subset=['rho'], inplace=True) 471 | 472 | df_corr = pd.concat([df_corr, stacked_rank_df], axis=0) 473 | 474 | print_df(df_corr, "cds_p_correlation") 475 | return ColumnDataSource(df_corr) 476 | 477 | 478 | def cds_obsbars(table, top_obs_bars): 479 | # index (unique sample-ids) 480 | # cols: 1, 2, ..., top_obs_bars, unassigned, others, factors 481 | 482 | #Load with data from first rank 483 | top_taxids = table.get_top(table.ranks()[0], top_obs_bars) 484 | df_obsbars = table.get_subtable(taxids=top_taxids, rank=table.ranks()[0], keep_shape=True) 485 | df_obsbars.rename(columns={c: str(i) for i, c in enumerate(df_obsbars.columns)}, inplace=True) 486 | # Complete table with None values 487 | ncol = 
len(df_obsbars.columns) 488 | while ncol < top_obs_bars: 489 | df_obsbars[str(ncol)] = 0 490 | ncol += 1 491 | # "others" accounts for filtered taxa (not in the top) and the leftover counts without assignment at the rank 492 | df_obsbars["others"] = table.get_total() - table.get_unassigned() - df_obsbars.sum(axis=1) 493 | df_obsbars["unassigned"] = table.get_unassigned() 494 | df_obsbars = transform_table(df_obsbars, table.get_total(), "norm", 0) * 100 495 | df_obsbars["factors"] = df_obsbars.index.to_list() 496 | 497 | print_df(df_obsbars, "cds_p_obsbars") 498 | return ColumnDataSource(df_obsbars) 499 | 500 | 501 | def cds_mgnify(mgnify, table, tax): 502 | # index (taxa, level, lineage) 503 | # count for each combination of index 504 | 505 | df_mgnify = pd.DataFrame(columns=["taxa", "level", "lineage", "count", "angle"]) 506 | 507 | # Match uids (taxid or names) from input and keep only found elements 508 | uids = [txid for rank in table.ranks() for txid in table.observations(rank)] 509 | df_tmp = mgnify[mgnify['taxa'].isin(uids)] 510 | 511 | # reset index to properly concatenate later with biome lineages 512 | df_tmp.reset_index(drop=True, inplace=True) 513 | 514 | if df_tmp.empty: 515 | print_log("could not find matching entries on MGnify") 516 | return None 517 | 518 | # Split biome lineage 519 | biome_levels = df_tmp['biome'].str.split(':', expand=True) 520 | n_levels = biome_levels.shape[1] 521 | 522 | # Rename levels with full lineage, starting from second level 523 | biome_lineage = pd.DataFrame(biome_levels[1]) 524 | for l in range(2, n_levels): 525 | biome_lineage[l] = pd.Series(biome_levels[[i for i in range(1, l + 1)]].values.tolist()).str.join(':') 526 | 527 | # Concat back 528 | df_tmp = pd.concat([biome_lineage, df_tmp], axis=1) 529 | 530 | # for each biome level (ignoring root 0) 531 | for l in range(1, n_levels): 532 | # group counts by biome, and fix fields 533 | df_biome = df_tmp.groupby(["taxa", l]).sum(numeric_only=True) 534 | df_biome["level"] = str(l) 535 | df_biome.reset_index(inplace=True) 536 | df_biome.rename(columns={l: "lineage"}, inplace=True) 537 | 538 | # Calculate angle for each taxa/level for wedges 539 | total_taxa_level = df_biome.groupby("taxa").sum(numeric_only=True).to_dict()["count"] 540 | df_biome["angle"] = (df_biome['count'] / df_biome['taxa'].map(total_taxa_level)) * (2 * pi) 541 | 542 | # Group to the final df 543 | df_mgnify = pd.concat([df_mgnify, df_biome], axis=0, ignore_index=True) 544 | 545 | # set index 546 | df_mgnify.set_index('taxa', inplace=True) 547 | 548 | print_df(df_mgnify, "cds_p_mgnify") 549 | return ColumnDataSource(df_mgnify) 550 | -------------------------------------------------------------------------------- /grimer/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from scipy.spatial.distance import _METRICS_NAMES 4 | from scipy.cluster.hierarchy import _LINKAGE_METHODS 5 | 6 | 7 | class Config: 8 | 9 | version = "1.1.0" 10 | default_rank_name = "default" 11 | output_plots = ["overview", "samples", "heatmap", "correlation"] 12 | transformations = ["none", "norm", "log", "clr"] 13 | taxonomy = ["ncbi", "gtdb", "silva", "greengenes", "ott"] 14 | 15 | def __new__(self, argv=None): 16 | 17 | formatter_class = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=100) 18 | parser = argparse.ArgumentParser(prog="grimer", 19 | description=logo(self.version), 20 | formatter_class=formatter_class) 21 | 22 | required_group = 
parser.add_argument_group('required arguments') 23 | required_group.add_argument('-i', '--input-file', required=True, type=str, help="Tab-separated file with a table of counts (observation table, count table, contingency table, ...) or a .biom file. By default, rows contain observations and columns contain samples (use --transpose if your file is reversed). The first column and first row are used as headers.") 24 | 25 | main_group = parser.add_argument_group('main arguments') 26 | main_group.add_argument('-m', '--metadata-file', type=str, help="Tab-separated file with metadata. Rows should contain samples and columns the metadata fields. QIIME2 metadata format is accepted, with an extra row to define categorical and numerical fields. If --input-file is a .biom file, metadata will be extracted from it if available.") 27 | main_group.add_argument('-c', '--config', type=str, help="Configuration file with definitions of references, controls and external tools.") 28 | main_group.add_argument('-t', '--taxonomy', type=str, default=None, help="Enable taxonomic analysis, convert entries and annotate samples. Files will be automatically downloaded and parsed. Optionally, stored files can be provided with --taxonomy-files.", choices=Config.taxonomy) 29 | main_group.add_argument('-b', '--taxonomy-files', nargs="*", type=str, default=[], help="Specific taxonomy files to use with --taxonomy.") 30 | main_group.add_argument('-r', '--ranks', nargs="*", default=[Config.default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + Config.default_rank_name + "' to use entries from the table directly.") 31 | 32 | output_group = parser.add_argument_group('output arguments') 33 | output_group.add_argument('-l', '--title', type=str, default="", help="Title to display on the top of the report.") 34 | output_group.add_argument('-p', '--output-plots', nargs="*", type=str, default=Config.output_plots, help="Plots to generate.", choices=Config.output_plots) 35 | output_group.add_argument('-o', '--output-html', type=str, default="output.html", help="Filename of the HTML report output.") 36 | output_group.add_argument('--full-offline', default=False, action='store_true', help="Embed Bokeh javascript library in the output file. Output will be around 1.5MB bigger but it will work without internet connection. ~your report will live forever~") 37 | 38 | data_group = parser.add_argument_group('general data options') 39 | data_group.add_argument('-g', '--mgnify', default=False, action='store_true', help="Plot MGnify annotations. Requires a --config file with a parsed MGnify database.") 40 | data_group.add_argument('-d', '--decontam', default=False, action='store_true', help="Run DECONTAM and generate plots. Requires a --config file with a DECONTAM configuration.") 41 | data_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hierarchical multi-level table where the observation headers are separated by the indicated separator char (usually ';' or '|').") 42 | data_group.add_argument('-y', '--values', default=None, type=str, help="Force 'count' or 'normalized' data parsing. 
Empty to auto-detect.") 43 | data_group.add_argument('-w', '--cumm-levels', default=False, action='store_true', help="Activate if the input table already has cumulative values on parent taxonomic levels.") 44 | data_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table before parsing (if samples are listed on columns and observations on rows).") 45 | data_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unassigned/unclassified counts.") 46 | data_group.add_argument('-z', '--replace-zeros', type=str, default="1000", help="Treat zeros in the input table. INT (add 'smallest count' divided by INT to every value), FLOAT (add FLOAT to every value). Default: 1000") 47 | data_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on observation labels/headers (supports regex). Example: '_' ' ' will replace underscores with spaces, '^.+__' '' will remove the matching regex. Several pairs of instructions are supported.") 48 | data_group.add_argument('--sample-replace', nargs="*", type=str, default=[], help="Replace values on sample labels/headers (supports regex). Example: '_' ' ' will replace underscores with spaces, '^.+__' '' will remove the matching regex. Several pairs of instructions are supported.") 49 | data_group.add_argument('--min-frequency', type=float, help="Define minimum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") 50 | data_group.add_argument('--max-frequency', type=float, help="Define maximum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") 51 | data_group.add_argument('--min-count', type=float, help="Define minimum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") 52 | data_group.add_argument('--max-count', type=float, help="Define maximum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") 53 | 54 | sample_group = parser.add_argument_group('Samples options') 55 | sample_group.add_argument('-j', '--top-obs-bars', type=int, default=20, help="Number of top abundant observations to show in the Samples panel, based on the avg. percentage counts/sample.") 56 | 57 | heatmap_group = parser.add_argument_group('Heatmap and clustering options') 58 | heatmap_group.add_argument('-a', '--transformation', type=str, default="log", help="Transformation of counts for Heatmap. none (counts), norm (percentage), log (log10), clr (centre log ratio).", choices=Config.transformations) 59 | heatmap_group.add_argument('-e', '--metadata-cols', type=int, default=3, help="Available metadata cols to be selected on the Heatmap panel. Higher values will slow down the report navigation.") 60 | heatmap_group.add_argument('--optimal-ordering', default=False, action='store_true', help="Activate optimal_ordering on the scipy linkage method; takes longer for a large number of samples.") 61 | heatmap_group.add_argument('--show-zeros', default=False, action='store_true', help="Do not skip zeros on the heatmap plot. The file will be bigger and interaction with the heatmap slower. 
By default, zeros will be omitted.") 62 | heatmap_group.add_argument('--linkage-methods', type=str, nargs="*", default=["complete"], choices=list(_LINKAGE_METHODS)) 63 | heatmap_group.add_argument('--linkage-metrics', type=str, nargs="*", default=["euclidean"], choices=_METRICS_NAMES) 64 | heatmap_group.add_argument('--skip-dendrogram', default=False, action='store_true', help="Disable dendogram plots for clustering.") 65 | 66 | correlation_group = parser.add_argument_group('Correlation options') 67 | correlation_group.add_argument('-x', '--top-obs-corr', type=int, default=50, help="Number of top abundant observations to build the correlationn matrix, based on the avg. percentage counts/sample. 0 for all") 68 | 69 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + Config.version) 70 | parser.add_argument('-D', '--debug', default=False, action='store_true', help=argparse.SUPPRESS) 71 | 72 | return parser.parse_args(argv) 73 | 74 | 75 | def logo(version): 76 | print("") 77 | print(" ▄████ ██▀███ ██▓ ███▄ ▄███▓▓█████ ██▀███ ") 78 | print(" ██▒ ▀█▒▓██ ▒ ██▒▓██▒▓██▒▀█▀ ██▒▓█ ▀ ▓██ ▒ ██▒") 79 | print(" ▒██░▄▄▄░▓██ ░▄█ ▒▒██▒▓██ ▓██░▒███ ▓██ ░▄█ ▒") 80 | print(" ░▓█ ██▓▒██▀▀█▄ ░██░▒██ ▒██ ▒▓█ ▄ ▒██▀▀█▄ ") 81 | print(" ░▒▓███▀▒░██▓ ▒██▒░██░▒██▒ ░██▒░▒████▒░██▓ ▒██▒") 82 | print(" ░▒ ▒ ░ ▒▓ ░▒▓░░▓ ░ ▒░ ░ ░░░ ▒░ ░░ ▒▓ ░▒▓░") 83 | print(" ░ ░ ░▒ ░ ▒░ ▒ ░░ ░ ░ ░ ░ ░ ░▒ ░ ▒░") 84 | print(" ░ ░ ░ ░░ ░ ▒ ░░ ░ ░ ░░ ░ ") 85 | print(" ░ ░ ░ ░ ░ ░ ░ ") 86 | print(" version " + str(version)) 87 | print("") 88 | print("") 89 | -------------------------------------------------------------------------------- /grimer/css/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/css/__init__.py -------------------------------------------------------------------------------- /grimer/css/popup.css: -------------------------------------------------------------------------------- 1 | /* (A) WRAPPER */ 2 | #pop-up { 3 | position: fixed; 4 | top: 0; left: 0; 5 | z-index: 999; 6 | width: 100vw; 7 | height: 100vh; 8 | background: rgba(0, 0, 0, 0.2); 9 | visibility: hidden; 10 | opacity: 0; 11 | transition: opacity 0.1s; 12 | } 13 | #pop-up.open { 14 | visibility: visible; 15 | opacity: 1; 16 | } 17 | 18 | /* (B) BOX */ 19 | #pop-box { 20 | position: relative; 21 | max-width: 70%; 22 | background: #fff; 23 | margin: 50vh auto 0 auto; 24 | transform: translateY(-50%); 25 | } 26 | 27 | /* (C) TITLE */ 28 | #pop-title { 29 | padding: 5px; 30 | margin: 0; 31 | background: #868b8e; 32 | color: #fff; 33 | } 34 | 35 | /* (D) TEXT */ 36 | #pop-text { 37 | border: 2px solid #868b8e; 38 | padding: 10px; 39 | margin: 0; 40 | background: #fff; 41 | color: #555; 42 | } 43 | 44 | /* (E) CLOSE BUTTON */ 45 | #pop-close { 46 | position: absolute; 47 | top: 0; right: 5px; 48 | padding: 2px; 49 | color: #fff; 50 | font-size: 32px; 51 | cursor: pointer; 52 | } 53 | -------------------------------------------------------------------------------- /grimer/decontam.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class Decontam: 5 | cols_rank = ["freq", "prev", "p.freq", "p.prev", "p", "contaminant"] 6 | 7 | def __init__(self, df_concentration_controls): 8 | self.data = df_concentration_controls 9 | self.rank = {} 10 | 11 | def __repr__(self): 12 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] 
13 | return 'Decontam({})'.format(', '.join(args)) 14 | 15 | def add_rank_results(self, rank, decontam_out_file, decontam_mod_file): 16 | self.rank[rank] = pd.read_table(decontam_out_file, sep='\t', header=0, skiprows=0, index_col=0, names=self.cols_rank, dtype={0: str}) 17 | 18 | # Parse models enforcing index as string 19 | mod = pd.read_table(decontam_mod_file, sep='\t', header=0, skiprows=0, index_col=0, dtype={0: str}) 20 | 21 | # Remove point counter at the end (.1 or .1000) 22 | mod.index = mod.index.map(lambda txid: txid[:-5] if txid.endswith(".1000") else txid[:-2]).to_list() 23 | 24 | # Merge first point of model 25 | self.rank[rank] = self.rank[rank].merge(mod.iloc[0::2, 0], left_index=True, right_index=True) 26 | 27 | # Merge second point of model and non-contaminant line 28 | self.rank[rank] = self.rank[rank].merge(mod.iloc[1::2, :], suffixes=["", "_2"], left_index=True, right_index=True) 29 | 30 | def add_rank_empty(self, rank, idx): 31 | self.rank[rank] = pd.DataFrame(index=idx, columns=self.cols_rank + ["contam", "contam_2", "non.contam"]) 32 | self.rank[rank]["contaminant"] = False 33 | 34 | def get_data(self): 35 | return self.data.fillna(False) 36 | 37 | def get_contaminants(self, rank, idx): 38 | return self.rank[rank].reindex(idx)["contaminant"] 39 | 40 | def get_pscore(self, rank, idx): 41 | return self.rank[rank].reindex(idx)["p"] 42 | 43 | def get_contaminant_list(self): 44 | clist = [] 45 | for r in self.rank: 46 | clist.extend(self.rank[r].index[self.rank[r]["contaminant"] == True].to_list()) 47 | return clist 48 | -------------------------------------------------------------------------------- /grimer/func.py: -------------------------------------------------------------------------------- 1 | #General 2 | import numpy as np 3 | import os 4 | import sys 5 | import subprocess 6 | import shlex 7 | import pandas as pd 8 | from pandas.api.types import is_numeric_dtype 9 | import yaml 10 | 11 | #Internal 12 | from grimer.config import Config 13 | from grimer.decontam import Decontam 14 | from grimer.metadata import Metadata 15 | from grimer.reference import Reference 16 | from grimer.table import Table 17 | 18 | # Bokeh 19 | from bokeh.palettes import Category10, Category20, Colorblind, linear_palette, Turbo256 20 | 21 | # MultiTax 22 | from multitax import * 23 | 24 | #biom 25 | import biom 26 | 27 | # scikit-bio 28 | from skbio.stats.composition import clr 29 | 30 | # Scipy 31 | import scipy.cluster.hierarchy as sch 32 | 33 | 34 | def parse_config_file(config): 35 | cfg = None 36 | if config: 37 | try: 38 | with open(config, 'r') as file: 39 | cfg = yaml.safe_load(file) 40 | except Exception as e: 41 | print_log("Failed loading configuration file [" + config + "], skipping") 42 | print_log(str(e)) 43 | else: 44 | print_log("Not provided, skipping") 45 | return cfg 46 | 47 | 48 | def parse_taxonomy(taxonomy, taxonomy_files): 49 | tax = None 50 | if taxonomy is not None: 51 | try: 52 | if not taxonomy_files: 53 | print_log("Downloading taxonomy") 54 | if taxonomy == "ncbi": 55 | tax = NcbiTx(files=taxonomy_files, extended_names=True) 56 | elif taxonomy == "gtdb": 57 | tax = GtdbTx(files=taxonomy_files) 58 | elif taxonomy == "silva": 59 | tax = SilvaTx(files=taxonomy_files) 60 | elif taxonomy == "greengenes": 61 | tax = GreengenesTx(files=taxonomy_files) 62 | elif taxonomy == "ott": 63 | tax = OttTx(files=taxonomy_files, extended_names=True) 64 | else: 65 | raise Exception("Unknown taxonomy: " + taxonomy) 66 | except Exception as e: 67 | print_log("Failed loading " + taxonomy + " taxonomy, skipping")
68 | print_log(str(e)) 69 | else: 70 | print_log("Not provided, skipping") 71 | return tax 72 | 73 | 74 | def parse_table(args, tax): 75 | # Specific default params if biom file is provided 76 | if args.input_file.endswith(".biom"): 77 | if not args.level_separator: 78 | args.level_separator = ";" 79 | args.transpose = True 80 | 81 | # Read and return full table with separated total and unassigned counts (sharing same index) 82 | table_df, total, unassigned = parse_input_file(args.input_file, args.unassigned_header, args.transpose, args.sample_replace, args.cumm_levels) 83 | 84 | if table_df.empty: 85 | raise Exception("Error parsing input file") 86 | 87 | # Define if table is already normalized (0-100) or has count data 88 | if args.values == "count": 89 | normalized = False 90 | elif args.values == "normalized": 91 | normalized = True 92 | elif (table_df.sum(axis=1).round() == 100).all() or (table_df % 1 != 0).any().any(): 93 | normalized = True 94 | else: 95 | normalized = False 96 | 97 | # Zero replacement 98 | try: 99 | replace_zero_value = table_df[table_df.gt(0)].min().min() / int(args.replace_zeros) 100 | except: 101 | replace_zero_value = float(args.replace_zeros) 102 | if replace_zero_value == 1 and args.transformation == "log": 103 | replace_zero_value = 0.999999 # Do not allow value 1 using log 104 | 105 | # Split table into ranks. Ranks are either in the headers in multi-level tables or will be created for a one-level table 106 | if args.level_separator: 107 | ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace, args.cumm_levels) 108 | else: 109 | ranked_tables, lineage = parse_single_table(table_df, args.ranks, tax, Config.default_rank_name) 110 | 111 | if not ranked_tables: 112 | raise Exception("Error parsing input file") 113 | 114 | table = Table(table_df.index, total, unassigned, lineage, normalized, replace_zero_value) 115 | 116 | print_log("") 117 | print_log("Total valid samples: " + str(len(table.samples))) 118 | # Check for long sample headers, which may break some plots 119 | long_sample_headers = [h for h in table_df.index if len(h) > 70] 120 | if long_sample_headers: 121 | print_log("Long sample labels/headers detected, plots may break: ") 122 | print_log("\n".join(long_sample_headers)) 123 | print_log("") 124 | 125 | for r, t in ranked_tables.items(): 126 | print_log("--- " + r + " ---") 127 | filtered_trimmed_t = trim_table(filter_input_table(t, total, args.min_frequency, args.max_frequency, args.min_count, args.max_count, normalized)) 128 | if filtered_trimmed_t.empty: 129 | print_log("No valid entries, skipping") 130 | else: 131 | # Add the filtered and trimmed table for this rank 132 | table.add_rank(r, filtered_trimmed_t) 133 | print_log("Total valid observations: " + str(len(table.observations(r)))) 134 | 135 | print_log("") 136 | 137 | if not normalized: 138 | print_log("Total assigned (counts): " + str(table.get_total().sum() - table.get_unassigned().sum())) 139 | print_log("Total unassigned (counts): " + str(table.get_unassigned().sum())) 140 | print_log("") 141 | 142 | return table 143 | 144 |
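# Illustrative layout of a hierarchical multi-level --input-file handled by
# parse_table above (hypothetical lineages and counts, --level-separator ";";
# rows are observations and columns are samples, as in the default layout):
#   obs                                    sample1  sample2
#   Bacteria;Firmicutes;Staphylococcus          10        0
#   Bacteria;Proteobacteria;Escherichia          3        8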
145 | def parse_metadata(args, samples): 146 | metadata = None 147 | 148 | # Parse metadata as DataFrame (md) 149 | md = pd.DataFrame() 150 | if args.metadata_file: 151 | # Parse table as dataframe 152 | md = pd.read_table(args.metadata_file, sep='\t', header=0, skiprows=0, index_col=0, dtype={0: str}) 153 | elif args.input_file.endswith(".biom"): 154 | try: 155 | biom_in = biom.load_table(args.input_file) 156 | if biom_in.metadata() is not None: 157 | md = biom_in.metadata_to_dataframe(axis="sample") 158 | except: 159 | print_log("Error parsing metadata from BIOM file, skipping") 160 | return None 161 | 162 | if md.empty: 163 | print_log("No valid metadata, skipping") 164 | return None 165 | 166 | # Enforce string index 167 | md.index = md.index.astype('str') 168 | 169 | # Define column types and remove the types row from the metadata if present 170 | md_types = define_metadata_types(md) 171 | 172 | # types defined on file 173 | if str(md.index[0]).startswith("#"): 174 | # Drop row with types from main data 175 | md.drop(md_types.name, inplace=True) 176 | # Enforce column type on dataframe 177 | md[md_types[md_types == "categorical"].index] = md[md_types[md_types == "categorical"].index].astype(str) 178 | md[md_types[md_types == "numeric"].index] = md[md_types[md_types == "numeric"].index].apply(pd.to_numeric) 179 | 180 | # Convert datatypes to adequate numeric values (int, float) 181 | md = md.convert_dtypes(infer_objects=False, convert_string=False, convert_boolean=False) 182 | # Re-convert everything to object to standardize (int64 NA is not serializable on bokeh) 183 | md = md.astype("object") 184 | 185 | # Remove empty fields 186 | null_cols = md.isna().all(axis=0) 187 | if any(null_cols): 188 | md = md.loc[:, ~null_cols] 189 | md_types = md_types[~null_cols] 190 | print_log(str(sum(null_cols)) + " metadata fields removed without valid values") 191 | 192 | # Convert NaN on categorical to "" 193 | md[md_types[md_types == "categorical"].index] = md[md_types[md_types == "categorical"].index].fillna('') 194 | # Convert boolean from categorical to String 195 | mask = md[md_types[md_types == "categorical"].index].applymap(type) != bool 196 | md[md_types[md_types == "categorical"].index] = md[md_types[md_types == "categorical"].index].where(mask, md[md_types[md_types == "categorical"].index].replace({True: 'True', False: 'False'})) 197 | 198 | # Remove names 199 | md.index.names = [None] 200 | md_types.name = None 201 | 202 | # Sort and filter by given samples 203 | md = md.reindex(samples) 204 | 205 | # Check if metadata and samples match 206 | null_rows = md.isna().all(axis=1) 207 | if any(null_rows): 208 | # Do not remove, just inform user 209 | #md = md.loc[~null_rows, :] 210 | print_log(str(sum(null_rows)) + " samples without valid metadata") 211 | 212 | if md.empty or sum(null_rows) == md.shape[0]: 213 | print_log("No valid metadata, skipping") 214 | return None 215 | 216 | metadata = Metadata(md, md_types) 217 | print_log("Samples: " + str(metadata.data.shape[0])) 218 | print_log("Numeric Fields: " + str(metadata.get_data("numeric").shape[1])) 219 | print_log("Categorical Fields: " + str(metadata.get_data("categorical").shape[1])) 220 | return metadata 221 | 222 | 223 | def define_metadata_types(metadata): 224 | # Define all COLUMN TYPES as default 225 | types = pd.Series(Metadata.default_type, index=metadata.columns) 226 | # Set types 227 | if str(metadata.index[0]).startswith("#"): 228 | # types defined on file: get values defined on the first row 229 | types = metadata.iloc[0] 230 | # Validate declared types 231 | idx_valid = types.isin(Metadata.valid_types) 232 | if not idx_valid.all(): 233 | print_log("Invalid metadata types replaced by: " + Metadata.default_type) 234 | types[~idx_valid] = Metadata.default_type 235 | else: 236 | # guessed types from read_table 237 | types[metadata.dtypes.map(is_numeric_dtype)] = "numeric" 238 | 239 | return types 240 | 241 | 242 | def parse_references(cfg, tax, taxonomy, ranks): 243 | references = None
244 | if cfg is not None and "references" in cfg: 245 | if taxonomy == "ncbi": 246 | references = {} 247 | for desc, sf in cfg["references"].items(): 248 | references[desc] = Reference(file=sf) 249 | if tax: 250 | # Update taxids / get taxid from name 251 | references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) 252 | for i in list(references[desc].ids.keys()): 253 | # lineage of all parent nodes (without itself) 254 | for l in tax.lineage(i)[:-1]: 255 | references[desc].add_parent(l, i) 256 | else: 257 | print_log("References only possible with ncbi taxonomy, skipping") 258 | else: 259 | print_log("No references defined in the configuration file, skipping") 260 | return references 261 | 262 | 263 | def parse_controls(cfg, table, metadata): 264 | controls = None 265 | control_samples = None 266 | if cfg is not None and "controls" in cfg: 267 | controls = {} 268 | control_samples = {} 269 | for desc, c in cfg["controls"].items(): 270 | samples = set() 271 | if isinstance(c, str): 272 | # If str, it's a file with one sample per line 273 | with open(c, "r") as file: 274 | samples = file.read().splitlines() 275 | elif isinstance(c, dict): 276 | # if a dict, several metadata fields:values can be provided to set control samples 277 | for field, val in c.items(): 278 | if field not in metadata.get_col_headers(): 279 | print_log("Could not find " + field + " in the metadata, skipping for control " + desc) 280 | else: 281 | for v in val: 282 | samples.update(metadata.get_subset(field, v).index) 283 | 284 | if samples: 285 | obs = set() 286 | valid_samples = set() 287 | for rank in table.ranks(): 288 | # Retrieve sub-table for every rank 289 | control_table = table.get_subtable(rank, samples=samples) 290 | obs.update(control_table.columns.to_list()) 291 | valid_samples.update(control_table.index.to_list()) 292 | # Add control observations as a reference 293 | controls[desc] = Reference(ids=obs) 294 | control_samples[desc] = list(valid_samples) 295 | print_log(desc + ": " + str(len(valid_samples)) + " samples / " + str(len(obs)) + " observations") 296 | else: 297 | print_log("Could not identify control input " + desc) 298 | 299 | else: 300 | print_log("No controls defined in the configuration file, skipping") 301 | 302 | return controls, control_samples 303 | 304 | 305 | def parse_mgnify(run_mgnify, cfg, tax, ranks): 306 | mgnify = None 307 | if run_mgnify: 308 | if cfg is not None and "mgnify" in cfg["external"]: 309 | try: 310 | mgnify = pd.read_table(cfg["external"]["mgnify"], header=None, names=["rank", "taxa", "biome", "count"]) 311 | except Exception as e: 312 | print_log("Failed parsing MGnify database file [" + cfg["external"]["mgnify"] + "], skipping") 313 | print_log(str(e)) 314 | return None 315 | # Filter to keep only used ranks, if provided 316 | if ranks: 317 | mgnify = mgnify.loc[mgnify['rank'].isin(ranks)] 318 | mgnify.reset_index(drop=True, inplace=True) 319 | # Convert taxids if tax is provided 320 | if tax: 321 | updated_nodes = update_tax_nodes([tuple(x) for x in mgnify[["rank", "taxa"]].to_numpy()], tax) 322 | mgnify["taxa"] = mgnify[["rank", "taxa"]].apply(lambda rt: updated_nodes[(rt[0], rt[1])] if updated_nodes[(rt[0], rt[1])] is not None else rt[1], axis=1) 323 | if mgnify.empty: 324 | mgnify = None 325 | print_log("No matches with MGnify database, skipping") 326 | else: 327 | print_log("Not defined in the configuration file, skipping") 328 | else: 329 | print_log("Not activated, skipping") 330 | return mgnify 331 | 332 | 333 | def run_correlation(table, 
top_obs_corr): 334 | corr = {} 335 | for rank in table.ranks(): 336 | corr[rank] = {} 337 | if top_obs_corr: 338 | top_taxids = sorted(table.get_top(rank, top_obs_corr)) 339 | matrix = table.get_subtable(taxids=top_taxids, rank=rank) 340 | else: 341 | top_taxids = sorted(table.observations(rank)) 342 | matrix = table.data[rank] 343 | 344 | corr[rank]["observations"] = top_taxids 345 | corr[rank]["rho"] = [] 346 | # No correlation with just one observation 347 | if len(matrix.columns) >= 2: 348 | rho = pairwise_rho(transform_table(matrix, 0, "clr", table.zerorep).values) 349 | if len(matrix.columns) == 2: 350 | # With only 2 observations, keep a single rho value 351 | # re-formatted in a matrix shape 352 | rho = np.array([[np.nan, np.nan], [rho[1, 0], np.nan]]) 353 | else: 354 | # fill upper triangular matrix (mirrored values) with nan to be ignored by pandas 355 | # to save half of the space 356 | rho[np.triu_indices(rho.shape[0])] = np.nan 357 | 358 | corr[rank]["rho"] = rho 359 | 360 | return corr 361 | 362 | 363 | def parse_input_file(input_file, unassigned_header, transpose, sample_replace, cumm_levels): 364 | 365 | if input_file.endswith(".biom"): 366 | table_df = biom.load_table(input_file).to_dataframe(dense=True) 367 | else: 368 | # Default input_file: index=observations, columns=samples 369 | # table_df should have samples on indices and observations on columns 370 | table_df = pd.read_table(input_file, sep='\t', index_col=0, dtype={0: str}).transpose().fillna(0) 371 | # Enforce string observations 372 | table_df.columns = table_df.columns.astype(str) 373 | 374 | # If the user provided a reversed table, transpose it back 375 | if transpose: 376 | table_df = table_df.transpose() 377 | 378 | # Remove the index header 379 | table_df.index.name = None 380 | 381 | # Replace text on sample labels 382 | if sample_replace: 383 | print_log("Replacing sample values:") 384 | before_replace = table_df.head(1).index 385 | # get index as series to use the replace method 386 | new_index = table_df.reset_index()["index"].replace(regex=dict(zip(sample_replace[::2], sample_replace[1::2]))) 387 | table_df.set_index(new_index, inplace=True) 388 | for b, a in zip(before_replace, table_df.head(1).index): 389 | print_log(" " + b + " -> " + a) 390 | print_log(" ...") 391 | 392 | # Sum totals before splitting unassigned counts or filtering 393 | if cumm_levels: 394 | total = table_df.max(axis=1) 395 | else: 396 | total = table_df.sum(axis=1) 397 | 398 | # Unique unassigned/unclassified entries for the table 399 | # Separate unassigned count columns from the main data frame 400 | unassigned = pd.Series(0, index=table_df.index) 401 | if unassigned_header: 402 | for header in unassigned_header: 403 | if header in table_df.columns: 404 | if isinstance(table_df[header], pd.DataFrame): 405 | # Sum in case there are several equally named headers 406 | unassigned += table_df[header].sum(axis=1) 407 | else: 408 | # return a pd.Series 409 | unassigned += table_df[header] 410 | table_df.drop(columns=header, inplace=True) 411 | else: 412 | print_log("'" + header + "' header not found") 413 | 414 | if unassigned.sum() == 0: 415 | print_log("No unassigned entries defined") 416 | 417 | print_log("Trimming table") 418 | table_df = trim_table(table_df) 419 | 420 | # Filter based on the final table 421 | unassigned = unassigned.reindex(table_df.index) 422 | total = total.reindex(table_df.index) 423 | 424 | return table_df, total, unassigned 425 | 426 | 427 | def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count, normalized):
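    """
    Filter observations (table columns) by count and frequency thresholds,
    mirroring the --min/--max-count and --min/--max-frequency arguments:
    values between 0 and 1 are interpreted as percentages (of total counts
    or of the number of samples), values above 1 as absolute cut-offs.
    """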
429 | if min_count: 430 | cnt = table_df.sum().sum() 431 | if min_count < 1: 432 | table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df 433 | table_df = table_df[table_df_norm >= min_count].fillna(0) 434 | elif min_count > 1: 435 | table_df = table_df[table_df >= min_count].fillna(0) 436 | print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --min-count " + str(min_count)) 437 | 438 | if max_count: 439 | cnt = table_df.sum().sum() 440 | if max_count < 1: 441 | table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df 442 | table_df = table_df[table_df_norm <= max_count].fillna(0) 443 | elif max_count > 1: 444 | table_df = table_df[table_df <= max_count].fillna(0) 445 | print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --max-count " + str(max_count)) 446 | 447 | if min_frequency: 448 | cnt = table_df.shape[1] 449 | table_df_freq = table_df.gt(0).sum(axis=0) 450 | if min_frequency < 1: 451 | table_df_freq = table_df_freq / table_df.shape[0] 452 | table_df = table_df.loc[:, table_df_freq >= min_frequency] 453 | elif min_frequency > 1: 454 | table_df = table_df.loc[:, table_df_freq >= min_frequency] 455 | print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --min-frequency " + str(min_frequency)) 456 | 457 | if max_frequency: 458 | cnt = table_df.shape[1] 459 | table_df_freq = table_df.gt(0).sum(axis=0) 460 | if max_frequency < 1: 461 | table_df_freq = table_df_freq / table_df.shape[0] 462 | table_df = table_df.loc[:, table_df_freq <= max_frequency] 463 | elif max_frequency > 1: 464 | table_df = table_df.loc[:, table_df_freq <= max_frequency] 465 | print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --max-frequency " + str(max_frequency)) 466 | 467 | return table_df 468 | 469 | 470 | def trim_table(table_df): 471 | # Check for cols/rows with sum zero 472 | zero_rows = table_df.sum(axis=1).eq(0) 473 | if any(zero_rows): 474 | table_df = table_df.loc[~zero_rows, :] 475 | print_log(str(sum(zero_rows)) + " samples with only zeros removed") 476 | 477 | zero_cols = table_df.sum(axis=0).eq(0) 478 | if any(zero_cols): 479 | table_df = table_df.loc[:, ~zero_cols] 480 | print_log(str(sum(zero_cols)) + " observations with only zeros removed") 481 | 482 | return table_df 483 | 484 | 485 | def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace, cumm_levels): 486 | from grimer.grimer import _debug 487 | 488 | # Transpose table (observations as index) and expand ranks in columns 489 | ranks_df = table_df.T.index.str.split(level_separator, expand=True).to_frame(index=False) 490 | 491 | # For every pair of replace arguments 492 | if obs_replace: 493 | print_log("Replacing observation values:") 494 | before_replace = ranks_df.dropna().head(1).values[0] 495 | ranks_df.replace(regex=dict(zip(obs_replace[::2], obs_replace[1::2])), inplace=True) 496 | for b, a in zip(before_replace, ranks_df.dropna().head(1).values[0]): 497 | print_log(" " + b + " -> " + a) 498 | print_log(" ...") 499 | 500 | # Replace entirely blank or empty entries with NaN 501 | ranks_df = ranks_df.replace(r'^\s*$', np.nan, regex=True) 502 | 503 | # Set rank names, matching user defined or default 504 | user_ranks = False 505 | if len(ranks) == ranks_df.shape[1]: 506 | parsed_ranks = {r: ranks[r] for r in range(ranks_df.shape[1])} 507 | user_ranks = True 508 | else: 509 | print_log("Ranks provided (" + str(len(ranks)) + ") do not match file (" + str(ranks_df.shape[1]) + " levels). Using default named ranks.")
Using default named ranks.") 510 | parsed_ranks = {r: "rank-" + str(r) for r in range(ranks_df.shape[1])} 511 | ranks_df.rename(columns=parsed_ranks, inplace=True) 512 | 513 | # Update taxids 514 | if tax is not None: 515 | unmatched_nodes = 0 516 | for i, r in parsed_ranks.items(): 517 | rank_nodes = ranks_df[r].dropna().unique() 518 | 519 | # If there is at least one valid entry 520 | if rank_nodes.any(): 521 | # If user-provided ranks are matching, update nodes with rank 522 | if user_ranks: 523 | updated_nodes = {node: unode for (rank, node), unode in update_tax_nodes([(r, n) for n in rank_nodes], tax).items()} 524 | else: 525 | updated_nodes = update_tax_nodes(rank_nodes, tax) 526 | 527 | # Add nan to keep missing ranks (different than tax.undefined_node [None] which will keep the name) 528 | updated_nodes[np.nan] = np.nan 529 | ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] is not None else t) 530 | del updated_nodes[np.nan] 531 | 532 | unmatched_nodes += list(updated_nodes.values()).count(tax.undefined_node) 533 | 534 | if unmatched_nodes: 535 | print_log(str(unmatched_nodes) + " observations not found in taxonomy (but kept)") 536 | 537 | # Check unique lineage 538 | for i, r in parsed_ranks.items(): 539 | if i > 0: 540 | lin_count = ranks_df.iloc[:, :i+1].drop_duplicates().groupby(r).count() 541 | invalid = lin_count[(lin_count > 1).any(axis=1)].index.to_list() 542 | if invalid: 543 | print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r) 544 | if _debug: 545 | print_log(",".join(invalid) + " observations removed with invalid lineage at " + r) 546 | # Set to NaN to keep shape of ranks_df 547 | ranks_df.loc[ranks_df[r].isin(invalid), r] = np.nan 548 | 549 | ranked_tables = {} 550 | for i, r in parsed_ranks.items(): 551 | # ranks_df and table_df.T have the same shape 552 | ranked_table_df = pd.concat([ranks_df[r], table_df.T.reset_index(drop=True)], axis=1) 553 | if cumm_levels: 554 | ranked_tables[r] = ranked_table_df.groupby([r], dropna=True).max().T 555 | else: 556 | ranked_tables[r] = ranked_table_df.groupby([r], dropna=True).sum().T 557 | ranked_tables[r].columns.name = None 558 | 559 | lineage = ranks_df 560 | return ranked_tables, lineage 561 | 562 | 563 | def parse_single_table(table_df, ranks, tax, default_rank_name): 564 | 565 | # Update taxids 566 | if tax is not None: 567 | print(table_df) 568 | updated_nodes = update_tax_nodes(table_df.columns, tax) 569 | unmatched_nodes = list(updated_nodes.values()).count(tax.undefined_node) 570 | if unmatched_nodes: 571 | print_log(str(unmatched_nodes) + " observations not found in taxonomy") 572 | 573 | for node, upd_node in updated_nodes.items(): 574 | if upd_node is not None and upd_node != node: 575 | # If updated node is a merge on an existing taxid, sum values 576 | if upd_node in table_df: 577 | table_df[upd_node] += table_df[node] 578 | table_df.drop(columns=node, inplace=True) 579 | print_log("Updated and merged taxonomic nodes: " + node + " -> " + upd_node) 580 | else: 581 | table_df.rename(columns={node: upd_node}, inplace=True) 582 | print_log("Updated taxonomic node: " + node + " -> " + upd_node) 583 | 584 | 585 | # Generate ranks 586 | ranked_tables = {} 587 | for rank in ranks: 588 | # Special case for "default" rank 589 | if rank == default_rank_name: 590 | ranked_tables[rank] = table_df 591 | else: 592 | taxid_parent_rank = {i: tax.parent_rank(tax.latest(i), rank) for i in table_df.columns} 593 | rank_df = pd.DataFrame(index=table_df.index) 594 | for taxid, 
594 | for taxid, parent_rank_taxid in taxid_parent_rank.items(): 595 | if parent_rank_taxid is None: 596 | #no_rank += 1 597 | continue 598 | if parent_rank_taxid not in rank_df: 599 | rank_df[parent_rank_taxid] = 0 600 | rank_df[parent_rank_taxid] += table_df[taxid] 601 | 602 | if not rank_df.empty: 603 | ranked_tables[rank] = rank_df 604 | 605 | 606 | # Generate lineage 607 | if tax: 608 | lineage = pd.DataFrame(list(map(lambda t: tax.lineage(t, ranks=list(ranked_tables.keys())), table_df.columns)), columns=list(ranked_tables.keys())) 609 | else: 610 | lineage = pd.DataFrame() 611 | 612 | return ranked_tables, lineage 613 | 614 | 615 | def transform_table(df, total_counts, transformation, replace_zero_value): 616 | # Special case clr with one observation (results in zeros) 617 | if transformation == "clr" and df.shape[1] == 1: 618 | print_log("WARNING: using log instead of clr with one observation") 619 | transformation = "log" 620 | 621 | if transformation == "log": 622 | transformed_df = (df + replace_zero_value).apply(np.log10) 623 | elif transformation == "clr": 624 | transformed_df = pd.DataFrame(clr(df + replace_zero_value), index=df.index, columns=df.columns) 625 | elif transformation == "norm": 626 | transformed_df = df.divide(total_counts, axis=0) + replace_zero_value 627 | else: 628 | transformed_df = df + replace_zero_value 629 | 630 | return transformed_df 631 | 632 | 633 | def update_tax_nodes(nodes, tax): 634 | """ 635 | nodes can be a list of strings (taxids or names) or a list of tuples (rank, taxid/name). 636 | Returns a dictionary mapping nodes to updated nodes (or None). 637 | First looks up by id; if nothing is found, looks up by unique name. 638 | """ 639 | 640 | updated_nodes = {} 641 | for node in nodes: 642 | if isinstance(node, tuple): 643 | r = node[0] 644 | n = node[1] 645 | else: 646 | r = None 647 | n = node 648 | 649 | # Either returns same node, updated or tax.undefined_node (None) 650 | updated_taxid = tax.latest(n) 651 | if updated_taxid: 652 | # Assign updated or same taxid 653 | updated_nodes[node] = updated_taxid 654 | else: 655 | names = tax.search_name(n, rank=r, exact=True) 656 | # Assign taxid if found unique name only 657 | if names and len(names) == 1: 658 | updated_nodes[node] = names[0] 659 | else: 660 | updated_nodes[node] = tax.undefined_node 661 | 662 | return updated_nodes 663 | 664 | 665 | def run_decontam(run_decontam, cfg, table, metadata, control_samples, script_dir): 666 | 667 | if not run_decontam: 668 | print_log("Not activated, skipping") 669 | return None 670 | 671 | if cfg is None: 672 | print_log("Not defined in the configuration file, skipping") 673 | return None 674 | 675 | df_decontam = pd.DataFrame(index=table.samples, columns=["concentration", "controls"]) 676 | cfg_decontam = cfg["external"]["decontam"] 677 | tmp_output_prefix = "tmp_" 678 | 679 | # Collect metadata for DECONTAM (concentrations for the frequency method, control samples for the prevalence method) 680 | out_table = tmp_output_prefix + "table_counts.tsv" 681 | out_concentration = tmp_output_prefix + "concentration_counts.tsv" 682 | out_controls = tmp_output_prefix + "control_samples_list.txt" 683 | if cfg_decontam["method"] in ["frequency", "combined"]: 684 | out_concentration = tmp_output_prefix + "concentration_counts.tsv" 685 | # Load frequency file, if provided 686 | if "frequency_file" in cfg_decontam: 687 | if os.path.isfile(cfg_decontam["frequency_file"]): 688 | # Load concentrations from file and sort (reindex) based on table inputs 689 | df_decontam["concentration"] =
pd.read_table(cfg_decontam["frequency_file"], sep='\t', header=None, skiprows=0, index_col=0).reindex(table.samples) 690 | # If any entry is unknown, input is incomplete 691 | if df_decontam["concentration"].isnull().values.any(): 692 | print_log("File " + cfg_decontam["frequency_file"] + " is incomplete (Missing: " + ",".join(df_decontam[df_decontam.isnull().any(axis=1)].index.to_list()) + "), skipping") 693 | return None 694 | else: 695 | print_log("File " + cfg_decontam["frequency_file"] + " not found, skipping") 696 | return None 697 | elif "frequency_metadata" in cfg_decontam: 698 | if cfg_decontam["frequency_metadata"] in metadata.get_col_headers(): 699 | # Get concentrations from metadata 700 | df_decontam["concentration"] = metadata.get_col(cfg_decontam["frequency_metadata"]) 701 | else: 702 | print_log("Could not find " + cfg_decontam["frequency_metadata"] + " in the metadata, skipping.") 703 | return None 704 | elif not table.normalized: 705 | # Use total from table 706 | print_log("No concentration provided, using total counts as concentration (frequency for DECONTAM)") 707 | df_decontam["concentration"] = table.get_total() 708 | else: 709 | print_log("Cannot run DECONTAM without defined concentration and normalized input values, skipping") 710 | return None 711 | # Print concentrations to file 712 | df_decontam["concentration"].to_csv(out_concentration, sep="\t", header=False, index=True) 713 | 714 | if cfg_decontam["method"] in ["prevalence", "combined"]: 715 | control_list = set() 716 | if "prevalence_file" in cfg_decontam: 717 | for file in cfg_decontam["prevalence_file"]: 718 | if os.path.isfile(file): 719 | # Load controls from file 720 | control_list.update([line.rstrip() for line in open(file)]) 721 | else: 722 | print_log("File not found " + file) 723 | elif "prevalence_metadata" in cfg_decontam: 724 | # if a dict, several metadata fields:values can be provided to set control samples 725 | for field, val in cfg_decontam["prevalence_metadata"].items(): 726 | if field not in metadata.get_col_headers(): 727 | print_log("Could not find " + field + " in the metadata, skipping for decontam (prevalence)") 728 | else: 729 | for v in val: 730 | control_list.update(metadata.get_subset(field, v).index) 731 | else: 732 | # Use all samples passed as controls 733 | for cs in control_samples.values(): 734 | control_list.update(cs) 735 | 736 | # Select valid controls 737 | df_decontam["controls"] = table.samples.isin(control_list) 738 | 739 | if df_decontam["controls"].any(): 740 | print_log(str(df_decontam["controls"].sum()) + " valid control samples to be used by DECONTAM") 741 | outf = open(out_controls, "w") 742 | print("\n".join(df_decontam.index[df_decontam["controls"]]), file=outf) 743 | outf.close() 744 | else: 745 | print_log("Could not find valid control entries, skipping") 746 | return None 747 | 748 | decontam = Decontam(df_decontam) 749 | # Run DECONTAM for each rank 750 | for rank in table.ranks(): 751 | if len(table.observations(rank)) == 1: 752 | decontam.add_rank_empty(rank, table.observations(rank)) 753 | else: 754 | # normalize and write temporary table for each rank 755 | if not table.normalized: 756 | transform_table(table.data[rank], table.get_total()[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) 757 | else: 758 | table.data[rank].to_csv(out_table, sep="\t", header=True, index=True) 759 |
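            # Illustrative command assembled below (temporary file names are the
            # actual ones written above; method and threshold values come from the config):
            #   scripts/run_decontam.R --resout tmp_decontam_out.tsv --modout tmp_decontam_mod.tsv \
            #     --counts tmp_table_counts.tsv --concentrations tmp_concentration_counts.tsv \
            #     --controls tmp_control_samples_list.txt --method combined --threshold 0.1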
"decontam_out.tsv", 762 | "--modout " + tmp_output_prefix + "decontam_mod.tsv", 763 | "--counts " + out_table, 764 | "--concentrations " + out_concentration if cfg_decontam["method"] in ["frequency", "combined"] else "", 765 | "--controls " + out_controls if cfg_decontam["method"] in ["prevalence", "combined"] else "", 766 | "--method " + cfg_decontam["method"], 767 | "--threshold " + str(cfg_decontam["threshold"])]) 768 | stdout, stderr = run_cmd(cmd) 769 | 770 | decontam.add_rank_results(rank, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv") 771 | 772 | for file in [out_table, out_concentration, out_controls, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv"]: 773 | if os.path.isfile(file): 774 | os.remove(file) 775 | 776 | return decontam 777 | 778 | 779 | def run_hclustering(table, linkage_methods, linkage_metrics, transformation, skip_dendrogram, optimal_ordering): 780 | hcluster = {} 781 | dendro = {} 782 | 783 | for rank in table.ranks(): 784 | 785 | # Get .values of transform, numpy array 786 | matrix = transform_table(table.data[rank], table.get_total(), transformation, table.zerorep).values 787 | 788 | hcluster[rank] = {} 789 | dendro[rank] = {} 790 | for method in linkage_methods: 791 | hcluster[rank][method] = {} 792 | dendro[rank][method] = {} 793 | for metric in linkage_metrics: 794 | hcluster[rank][method][metric] = {} 795 | hcluster[rank][method][metric]["x"] = {} 796 | hcluster[rank][method][metric]["y"] = {} 797 | 798 | #H.clustering, returning dendrogram 799 | # Only one observation does not cluster 800 | if matrix.shape[1] > 1: 801 | x = sch.dendrogram(sch.linkage(matrix.transpose(), method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True) 802 | hcluster[rank][method][metric]["x"]["index"] = table.observations(rank)[x["leaves"]].to_list() 803 | else: 804 | hcluster[rank][method][metric]["x"]["index"] = table.observations(rank).to_list() 805 | 806 | # Only one samples does not cluster 807 | if matrix.shape[0] > 1: 808 | y = sch.dendrogram(sch.linkage(matrix, method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True) 809 | hcluster[rank][method][metric]["y"]["index"] = table.samples[y["leaves"]].to_list() 810 | else: 811 | hcluster[rank][method][metric]["y"]["index"] = table.samples.to_list() 812 | 813 | if not skip_dendrogram: 814 | dendro[rank][method][metric] = {} 815 | dendro[rank][method][metric]["y"] = {} 816 | dendro[rank][method][metric]["x"] = {} 817 | 818 | # Save dendrogram values and colors 819 | xs, ys, colors = [[]] * 3 820 | if matrix.shape[1] > 1: 821 | xs, ys, colors = dendro_lines_color(x, "x") 822 | dendro[rank][method][metric]["x"]["xs"] = xs 823 | dendro[rank][method][metric]["x"]["ys"] = ys 824 | dendro[rank][method][metric]["x"]["colors"] = colors 825 | if matrix.shape[0] > 1: 826 | xs, ys, colors = dendro_lines_color(y, "y") 827 | dendro[rank][method][metric]["y"]["xs"] = xs 828 | dendro[rank][method][metric]["y"]["ys"] = ys 829 | dendro[rank][method][metric]["y"]["colors"] = colors 830 | 831 | return hcluster, dendro 832 | 833 | 834 | def dendro_lines_color(dendro, axis): 835 | icoord = pd.DataFrame(dendro["icoord"]) 836 | icoord = icoord * ((len(dendro["icoord"]) + 0.5) / icoord.max().max()) 837 | icoord = icoord.values.tolist() 838 | if axis == "y": 839 | dcoord = dendro["dcoord"] 840 | else: 841 | dcoord = [[-j for j in i] for i in dendro['dcoord']] 842 | 843 | color_list = dendro["color_list"] 844 | unique_colors = sorted(set(color_list)) 
845 | cp = make_color_palette(len(unique_colors)) 846 | colors = [cp[unique_colors.index(colorid)] for colorid in color_list] 847 | 848 | if axis == "y": 849 | return dcoord, icoord, colors 850 | else: 851 | return icoord, dcoord, colors 852 | 853 | 854 | def pairwise_vlr(mat): 855 | cov = np.cov(mat.T, ddof=1) 856 | diagonal = np.diagonal(cov) 857 | return -2 * cov + diagonal[:, np.newaxis] + diagonal 858 | 859 | 860 | def pairwise_rho(mat): 861 | variances = np.var(mat, axis=0, ddof=1) 862 | return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances))
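# pairwise_vlr/pairwise_rho above compute the variance of log-ratios (VLR) and the
# proportionality coefficient rho between all pairs of observations (columns) of a
# clr-transformed matrix, as used by run_correlation. A minimal sketch with
# synthetic values (illustration only):
#   mat = clr(np.array([[1., 2., 4.], [2., 4., 8.], [4., 8., 16.]]))
#   rho = pairwise_rho(mat)  # symmetric; rho[i, j] near 1 = observations i, j proportional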
print("-----------------------------------------------") 961 | 962 | 963 | def print_logo_cli(version): 964 | print_log("==================") 965 | print_log(" ╔═╗╦═╗╦╔╦╗╔═╗╦═╗ ") 966 | print_log(" ║ ╦╠╦╝║║║║║╣ ╠╦╝ ") 967 | print_log(" ╚═╝╩╚═╩╩ ╩╚═╝╩╚═ ") 968 | print_log(" v" + version) 969 | print_log("==================") 970 | -------------------------------------------------------------------------------- /grimer/grimer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | _debug = False 3 | 4 | #General 5 | import sys 6 | 7 | #Internal 8 | from grimer.callbacks import * 9 | from grimer.cds import * 10 | from grimer.config import Config, logo 11 | from grimer.layout import * 12 | from grimer.plots import * 13 | from grimer.func import * 14 | 15 | #Bokeh 16 | from bokeh.io import save 17 | from bokeh.plotting import output_file 18 | 19 | 20 | def main(argv=sys.argv[1:]): 21 | """ 22 | GRIMER code overview 23 | 1) Load data/analysis: parse configuration, load files and run analysis into data objects 24 | e.g. args.input_file to Table() and decontam 25 | 2) Generata data sources: Convert objects and analysis int cds/dict 26 | e.g. table to cds_m_obstable 27 | 3) Plot elements: plot figures and widgets based on cds/dict (and some objects) 28 | e.g cds_m_obstable to ele["obstable"]["fig"] 29 | 4) Link javascript: link data sources and javascript custom callbacks 30 | 5) Draw layout: Put elements into layout scheme and generate report 31 | """ 32 | 33 | # Parse CLI arguments 34 | args = Config(argv) 35 | 36 | # Setup global _debug variable to be used by other files with #from grimer.grimer import _debug 37 | global _debug 38 | _debug = args.debug 39 | # Define path of running script to get static files 40 | script_dir, _ = os.path.split(__file__) 41 | 42 | # 1) Load data/analysis 43 | # If not parsed, skipped or error, var is None 44 | cfg = None 45 | tax = None 46 | table = None 47 | metadata = None 48 | references = None 49 | controls = None 50 | control_samples = None 51 | hcluster = None 52 | dendro = None 53 | corr = None 54 | 55 | print_log("- Parsing configuration file") 56 | cfg = parse_config_file(args.config) 57 | 58 | print_log("- Parsing taxonomy") 59 | tax = parse_taxonomy(args.taxonomy, args.taxonomy_files) 60 | 61 | print_log("- Parsing input table") 62 | try: 63 | table = parse_table(args, tax) 64 | except Exception as e: 65 | print(e) 66 | return 1 67 | 68 | print_log("- Parsing metadata") 69 | metadata = parse_metadata(args, table.samples.to_list()) 70 | 71 | print_log("- Parsing references") 72 | references = parse_references(cfg, tax, args.taxonomy, table.ranks()) 73 | 74 | print_log("- Parsing controls") 75 | controls, control_samples = parse_controls(cfg, table, metadata) 76 | 77 | print_log("- Parsing MGnify database") 78 | mgnify = parse_mgnify(args.mgnify, cfg, tax, table.ranks()) 79 | 80 | print_log("- Running DECONTAM") 81 | decontam = run_decontam(args.decontam, cfg, table, metadata, control_samples, script_dir) 82 | 83 | print_log("- Running hierarchical clustering") 84 | hcluster, dendro = run_hclustering(table, args.linkage_methods, args.linkage_metrics, args.transformation, args.skip_dendrogram, args.optimal_ordering) 85 | 86 | print_log("- Running correlation") 87 | corr = run_correlation(table, args.top_obs_corr) 88 | 89 | # 2) Generata data sources: 90 | # cds (ColumnDataSource) and dict containers: data structures loaded and parsed by bokehjs 91 | # "cds" for matrix like dataframes with fixed 
column sizes 92 | # "dict" for variable column sizes 93 | # _p_ : plot -> direct source of figures either pre-loaded or empty 94 | # _d_ : data -> auxiliary containers to be used/shared among plots 95 | # usually by copying and/or transforming values into a _p_ container 96 | # _m_ : mixed -> contain both plot and data properties 97 | 98 | print_log("- Generating data sources") 99 | # _m_ 100 | # df: index (unique observations), col|..., tax|..., aux|ref 101 | cds_m_obstable = cds_obstable(table, tax, references, controls, control_samples, decontam) 102 | # _p_ 103 | # df: index (unique sample-ids), aux|..., bar|..., tax|... 104 | cds_p_samplebars = cds_samplebars(table) 105 | # stacked: index (repeated observations), rank, ref, direct, parent 106 | cds_p_references = cds_plot_references(table, tax, references) 107 | # matrix: index (unique sample-ids), concentrations, controls, counts 108 | cds_p_decontam = cds_plot_decontam(decontam) if decontam else None 109 | # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]} 110 | cds_p_decontam_models = cds_plot_decontam_models(decontam) if decontam else None 111 | # stacked: index (taxa, level, lineage), count, perc 112 | cds_p_mgnify = cds_mgnify(mgnify, table, tax) if mgnify is not None else None 113 | # stacked: index (repeated sample-ids), obs, rank, ov, tv 114 | cds_p_heatmap = cds_heatmap(table, args.transformation, args.show_zeros) 115 | # matrix: index (unique sample-ids), md0, md1, ..., md(args.metadata_cols) -> (metadata field, metadata values) 116 | cds_p_metadata = cds_plot_metadata(metadata, args.metadata_cols) if metadata else None 117 | # stacked: index (repeated observations), rank, annot 118 | cds_p_annotations = cds_annotations(table, references, controls, decontam, control_samples) 119 | # empty matrix {"x": [], "y": [], "c": []} 120 | cds_p_dendro_x, cds_p_dendro_y = cds_plot_dendro() if not args.skip_dendrogram else [None, None] 121 | # stacked: index (repeated observations), other observation, rank, rho 122 | cds_p_correlation = cds_correlation(table, corr) 123 | # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors 124 | cds_p_obsbars = cds_obsbars(table, args.top_obs_bars) 125 | # df: index (unique sample-ids), col|...
126 | cds_p_sampletable = cds_sampletable(table) 127 | # _d_ 128 | # df: index (unique sample-ids), aux|..., cnt|..., 129 | cds_d_samples = cds_samples(table, references, controls, decontam) 130 | # matrix: index (unique sample-ids) x columns (metadata fields) -> metadata values 131 | cds_d_metadata = cds_metadata(metadata) if metadata else None 132 | # {taxid: (contam_y1, contam_y2, non_contam_y, pval)} 133 | cds_d_decontam = cds_decontam(decontam, table.ranks()) if decontam else None 134 | # key = rank + "|" + method + "|" + metric 135 | # y: {"default": sorted sample-ids, key: sorted sample-ids, ...} 136 | # x: {"default|rank": sorted sample-ids, key: sorted sample-ids, ...} 137 | dict_d_hcluster_x, dict_d_hcluster_y = dict_hcluster(table, hcluster) 138 | # {key+"|x": x-values, key+"|y": y-values , key+"|c": colors} 139 | dict_d_dedro_x, dict_d_dedro_y = dict_dendro(table, dendro) if not args.skip_dendrogram else [None, None] 140 | # {taxid: name} 141 | dict_d_taxname = dict_taxname(tax, [txid for rank in table.ranks() for txid in table.observations(rank)]) 142 | # {rank: [taxid1,taxid2, ..., taxid(top_obs_bars)]} 143 | dict_d_topobs = dict_topobs(table, args.top_obs_bars) 144 | # {taxid: {source: {desc: [refs]}} 145 | dict_d_refs = dict_refs(table, references) 146 | # dict: {rank: {obs: {sample: count}}} 147 | dict_d_sampleobs = dict_sampleobs(table) 148 | 149 | # 3) Plot elements 150 | print_log("- Plotting elements") 151 | 152 | # Defined fixed layout and plot sizes 153 | sizes = {} 154 | sizes["overview_top_panel_height"] = 300 155 | sizes["overview_top_panel_width_left"] = 250 156 | sizes["overview_top_panel_width_right"] = 450 157 | 158 | # Elements to plot 159 | # ele[name]["fig"] -> main figure/element 160 | # ele[name]["filter"] -> filter to the figure 161 | # ele[name]["wid"][widget1] -> widgets to the figure 162 | ele = {} 163 | 164 | # obstable 165 | ele["obstable"] = {} 166 | ele["obstable"]["fig"], ele["obstable"]["filter"] = plot_obstable(sizes, cds_m_obstable, table.ranks(), references, controls) 167 | ele["obstable"]["wid"] = plot_obstable_widgets(sizes, dict_d_taxname, max(cds_m_obstable.data["col|total_counts"])) 168 | 169 | # infopanel 170 | ele["infopanel"] = {} 171 | ele["infopanel"]["textarea"] = plot_infopanel() 172 | 173 | # references 174 | ele["references"] = {} 175 | if references: 176 | ele["references"]["fig"], ele["references"]["filter"] = plot_references(sizes, table, cds_p_references, dict_d_taxname) 177 | else: 178 | ele["references"]["fig"], ele["references"]["filter"] = None, None 179 | ele["references"]["wid"] = plot_references_widgets(sizes, references) 180 | 181 | # mgnify 182 | ele["mgnify"] = {} 183 | if cds_p_mgnify: 184 | ele["mgnify"]["fig"], ele["mgnify"]["filter"] = plot_mgnify(sizes, cds_p_mgnify) 185 | else: 186 | ele["mgnify"]["fig"], ele["mgnify"]["filter"] = None, None 187 | ele["mgnify"]["wid"] = plot_mgnify_widgets() 188 | 189 | # decontam 190 | ele["decontam"] = {} 191 | ele["decontam"]["wid"] = {} 192 | if decontam: 193 | ele["decontam"]["fig"] = plot_decontam(sizes, cds_p_decontam, cds_p_decontam_models, table.get_min_valid_count_perc()) 194 | else: 195 | ele["decontam"]["fig"] = None 196 | ele["decontam"]["wid"] = plot_decontam_widgets(sizes) 197 | 198 | # samplebars 199 | ele["samplebars"] = {} 200 | ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, table) 201 | ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, references, 
controls, decontam, table.normalized) 202 | 203 | # sampletable 204 | ele["sampletable"] = {} 205 | ele["sampletable"]["fig"] = plot_sampletable(cds_p_sampletable, sizes, table.ranks()) 206 | ele["sampletable"]["wid"] = plot_sampletable_widgets(sizes, max(cds_p_sampletable.data["col|total"]), metadata) 207 | 208 | # heatmap 209 | tools_heatmap = "hover,save,box_zoom,reset,crosshair,box_select" 210 | ele["heatmap"] = {} 211 | ele["heatmap"]["fig"] = plot_heatmap(table, cds_p_heatmap, tools_heatmap, args.transformation, dict_d_taxname) 212 | ele["heatmap"]["wid"] = plot_heatmap_widgets(table.ranks(), args.linkage_methods, args.linkage_metrics, references, controls, metadata, decontam) 213 | 214 | # metadata (heatmap) 215 | ele["metadata"] = {} 216 | ele["metadata"]["wid"] = {} 217 | if metadata: 218 | ele["metadata"]["fig"], ele["metadata"]["wid"] = plot_metadata(ele["heatmap"]["fig"], tools_heatmap, metadata, cds_d_metadata, cds_p_metadata) 219 | else: 220 | ele["metadata"]["fig"] = Spacer() 221 | ele["metadata"]["wid"]["metadata_multiselect"] = Spacer() 222 | ele["metadata"]["wid"]["legend_colorbars"] = Spacer() 223 | ele["metadata"]["wid"]["toggle_legend"] = Spacer() 224 | 225 | # annotations 226 | ele["annotations"] = {} 227 | if cds_p_annotations.data["index"].size: 228 | ele["annotations"]["fig"] = plot_annotations(ele["heatmap"]["fig"], tools_heatmap, cds_p_annotations, dict_d_taxname) 229 | else: 230 | ele["annotations"]["fig"] = Spacer() 231 | 232 | # dendrograms 233 | ele["dendrox"] = {} 234 | ele["dendroy"] = {} 235 | if not args.skip_dendrogram: 236 | ele["dendrox"]["fig"], ele["dendroy"]["fig"] = plot_dendrogram(ele["heatmap"]["fig"], tools_heatmap, cds_p_dendro_x, cds_p_dendro_y) 237 | else: 238 | ele["dendrox"]["fig"] = Spacer() 239 | ele["dendroy"]["fig"] = Spacer() 240 | 241 | # correlation 242 | ele["correlation"] = {} 243 | ele["correlation"]["fig"], ele["correlation"]["filter"] = plot_correlation(cds_p_correlation, table.ranks(), dict_d_taxname) 244 | ele["correlation"]["wid"] = plot_correlation_widgets(table.ranks(), args.top_obs_corr) 245 | 246 | # obsbars 247 | ele["obsbars"] = {} 248 | ele["obsbars"]["wid"] = plot_obsbars_widgets(table.ranks(), metadata, dict_d_topobs, dict_d_taxname, args.top_obs_bars) 249 | ele["obsbars"]["fig"], ele["obsbars"]["legend"] = plot_obsbars(cds_p_obsbars, dict_d_topobs, table.ranks(), args.top_obs_bars, dict_d_taxname, ele["obsbars"]["wid"]["rank_select"]) 250 | 251 | #4) Link javascript: 252 | print_log("- Linking javascript") 253 | 254 | link_obstable_filter(ele, cds_m_obstable, table.ranks()) 255 | link_obstable_samplebars(ele, 256 | cds_m_obstable, 257 | cds_p_samplebars, 258 | cds_d_samples, 259 | dict_d_sampleobs, 260 | cds_d_metadata, 261 | cds_p_decontam, 262 | cds_p_decontam_models, 263 | cds_d_decontam, 264 | cds_p_references, 265 | table.ranks(), 266 | table.get_min_valid_count_perc(), 267 | table.get_total().max(), 268 | cds_p_mgnify, 269 | dict_d_refs, 270 | dict_d_taxname) 271 | link_heatmap_widgets(ele, 272 | cds_d_samples, 273 | cds_d_metadata, 274 | cds_p_metadata, 275 | dict_d_hcluster_x, 276 | dict_d_hcluster_y, 277 | cds_p_dendro_x, 278 | cds_p_dendro_y, 279 | dict_d_dedro_x, 280 | dict_d_dedro_y, 281 | cds_p_annotations, 282 | cds_m_obstable, 283 | cds_p_heatmap, 284 | table.ranks(), 285 | dict_d_taxname) 286 | link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, args.metadata_cols) 287 | link_correlation_widgets(ele, cds_p_correlation) 288 | link_obsbars_widgets(ele, 289 | cds_p_obsbars, 290 | dict_d_topobs, 
291 | dict_d_sampleobs, 292 | cds_d_samples, 293 | args.top_obs_bars, 294 | dict_d_taxname, 295 | cds_d_metadata, 296 | cds_p_sampletable) 297 | link_sampletable_select(ele, cds_p_sampletable, cds_d_metadata) 298 | 299 | # 5) Draw layout 300 | print_log("- Drawing layout") 301 | logo_path = os.path.join(script_dir, "img", "logo.png") 302 | 303 | final_layout = make_layout(ele, sizes, Config.version, logo_path, args.title, args.output_plots) 304 | if final_layout is None: 305 | return 1 306 | 307 | template = include_scripts({os.path.join(script_dir, "js", "func.js"): "script", 308 | os.path.join(script_dir, "js", "popup.js"): "script", 309 | os.path.join(script_dir, "css", "popup.css"): "style"}) 310 | 311 | if args.full_offline: 312 | mode = "inline" # configure to provide entire Bokeh JS and CSS inline 313 | elif _debug: 314 | mode = "absolute-dev" # non-minimized - configure to load from the installed Bokeh library static directory 315 | else: 316 | mode = "cdn" # configure to load Bokeh JS and CSS from https://cdn.bokeh.org 317 | 318 | # setup output file and JS mode 319 | print_log("- Saving report") 320 | output_file(args.output_html, title="GRIMER" if not args.title else "GRIMER - " + args.title, mode=mode) 321 | save(final_layout, template=template) 322 | print_log("File: " + args.output_html) 323 | file_size_bytes = os.path.getsize(args.output_html) 324 | print_log("Size: " + str(file_size_bytes) + " bytes (" + '{0:.2f} MB'.format(file_size_bytes / float(1024 ** 2)) + ")") 325 | 326 | if __name__ == "__main__": 327 | main() 328 | -------------------------------------------------------------------------------- /grimer/img/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/img/__init__.py -------------------------------------------------------------------------------- /grimer/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/img/logo.png -------------------------------------------------------------------------------- /grimer/js/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/js/__init__.py -------------------------------------------------------------------------------- /grimer/js/func.js: -------------------------------------------------------------------------------- 1 | function sort_numeric(a, b){ return a - b; } 2 | function sort_string(a, b){ return a.localeCompare(b); } 3 | 4 | function grimer_sort(factors, sort_col, sort_mode="numeric", desc=false, group_col1=[], group_col2=[], index=[]) { 5 | //sort_mode : numeric, string 6 | 7 | // subset data if index provided 8 | if(index.length){ 9 | factors = index.map( s => factors[s] ); 10 | sort_col = index.map( s => sort_col[s] ); 11 | if(group_col1.length){ 12 | group_col1 = index.map( s => group_col1[s] ); 13 | } 14 | if(group_col2.length){ 15 | group_col2 = index.map( s => group_col2[s] ); 16 | } 17 | } 18 | 19 | // Generate numerical index to sort arrays 20 | var idx = new Array(factors.length); 21 | for (var i = 0; i < idx.length; ++i) idx[i] = i; 22 | //If numeric, replace NaN with sortable value (-Infinity) to be at the end of the sorted array 23 | if (sort_mode=="numeric"){ 24 | sort_col = 
sort_col.map(function(v){ return isNaN(v) ? -Infinity : v }) 25 | } 26 | 27 | if(group_col1.length && group_col2.length){ 28 | if (sort_mode=="numeric" && desc==false) 29 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[b],sort_col[a])); 30 | else if (sort_mode=="numeric" && desc==true) 31 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[a],sort_col[b])); 32 | else if (sort_mode=="string" && desc==false) 33 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[a],sort_col[b])); 34 | else if (sort_mode=="string" && desc==true) 35 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[b],sort_col[a])); 36 | }else if(group_col1.length){ 37 | if (sort_mode=="numeric" && desc==false) 38 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[b],sort_col[a])); 39 | else if (sort_mode=="numeric" && desc==true) 40 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[a],sort_col[b])); 41 | else if (sort_mode=="string" && desc==false) 42 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[a],sort_col[b])); 43 | else if (sort_mode=="string" && desc==true) 44 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[b],sort_col[a])); 45 | }else{ 46 | if (sort_mode=="numeric" && desc==false) 47 | idx.sort((a, b) => sort_numeric(sort_col[b],sort_col[a])); 48 | else if (sort_mode=="numeric" && desc==true) 49 | idx.sort((a, b) => sort_numeric(sort_col[a],sort_col[b])); 50 | else if (sort_mode=="string" && desc==false) 51 | idx.sort((a, b) => sort_string(sort_col[a],sort_col[b])); 52 | else if (sort_mode=="string" && desc==true) 53 | idx.sort((a, b) => sort_string(sort_col[b],sort_col[a])); 54 | } 55 | 56 | var sorted_factors = new Array(idx.length); 57 | for (var i = 0; i < idx.length; ++i) sorted_factors[i] = factors[idx[i]]; 58 | return sorted_factors; 59 | } 60 | 61 | function table_to_tsv(source, cols, headers, selected) { 62 | 63 | var rows_idx = [] 64 | if(selected==true){ 65 | //remove undefined from selected if present 66 | rows_idx = source.selected.indices.filter(function( element ) { 67 | return element !== undefined; 68 | }); 69 | } 70 | else{ 71 | // include all rows 72 | for (let i = 0; i < source.get_length(); i++) { 73 | rows_idx.push(i); 74 | } 75 | } 76 | 77 | const lines = [headers.join('\t')] 78 | for (let i = 0; i < rows_idx.length; i++) { 79 | let row = []; 80 | for (let j = 0; j < cols.length; j++) { 81 | row.push(source.data[cols[j]][rows_idx[i]].toString()) 82 | } 83 | lines.push(row.join('\t')) 84 | } 85 | return lines.join('\n').concat('\n') 86 | } 87 | 88 | function save_file(filename, filetext){ 89 | const blob = new Blob([filetext], { type: 'text/csv;charset=utf-8;' }) 90 | //addresses IE 91 | if (navigator.msSaveBlob) { 92 | navigator.msSaveBlob(blob, filename) 93 | } else { 94 | const link = document.createElement('a') 95 | link.href = URL.createObjectURL(blob) 96 | link.download = filename 97 | link.target = '_blank' 98 | link.style.visibility = 'hidden' 99 | link.dispatchEvent(new MouseEvent('click')) 100 | } 101 | } -------------------------------------------------------------------------------- /grimer/js/popup.js: 
-------------------------------------------------------------------------------- 1 | var pop = { 2 | // (A) ATTACH POPUP HTML 3 | pWrap : null, // HTML popup wrapper 4 | pBox : null, // HTML popup box 5 | pTitle : null, // HTML popup title 6 | pText : null, // HTML popup text 7 | pClose : null, // HTML close button 8 | init : function () { 9 | // (A1) POPUP WRAPPER 10 | pop.pWrap = document.createElement("div"); 11 | pop.pWrap.id = "pop-up"; 12 | document.body.appendChild(pop.pWrap); 13 | 14 | // (A2) POPUP BOX 15 | pop.pBox = document.createElement("div"); 16 | pop.pBox.id = "pop-box"; 17 | pop.pWrap.appendChild(pop.pBox); 18 | 19 | // (A3) TITLE 20 | pop.pTitle = document.createElement("h2"); 21 | pop.pTitle.id = "pop-title"; 22 | pop.pBox.appendChild(pop.pTitle); 23 | 24 | // (A4) TEXT 25 | pop.pText = document.createElement("p"); 26 | pop.pText.id = "pop-text"; 27 | pop.pBox.appendChild(pop.pText); 28 | 29 | // (A5) CLOSE 30 | pop.pClose = document.createElement("div"); 31 | pop.pClose.id = "pop-close"; 32 | pop.pClose.innerHTML = "☒"; 33 | pop.pClose.onclick = pop.close; 34 | pop.pBox.appendChild(pop.pClose); 35 | }, 36 | 37 | // (B) OPEN POPUP 38 | open : function (title, text) { 39 | pop.pTitle.innerHTML = title; 40 | pop.pText.innerHTML = text; 41 | pop.pWrap.classList.add("open"); 42 | }, 43 | 44 | // (C) CLOSE POPUP 45 | close : function () { 46 | pop.pWrap.classList.remove("open"); 47 | } 48 | }; 49 | window.addEventListener("DOMContentLoaded", pop.init); 50 | -------------------------------------------------------------------------------- /grimer/layout.py: -------------------------------------------------------------------------------- 1 | from bokeh.layouts import column, row, gridplot 2 | from bokeh.models import Spacer, Tabs, Panel, Div 3 | from grimer.func import print_log 4 | import base64 5 | 6 | 7 | def make_layout(ele, sizes, version, logo_path, title, output_plots): 8 | 9 | main_panels = {} 10 | if "overview" in output_plots: 11 | filterwidgets = column(ele["obstable"]["wid"]["frequency_spinner"], 12 | ele["obstable"]["wid"]["counts_perc_avg_spinner"], 13 | ele["obstable"]["wid"]["total_counts_spinner"], 14 | ele["obstable"]["wid"]["name_multichoice"], 15 | row(ele["obstable"]["wid"]["help_button"], ele["obstable"]["wid"]["export_dropdown"])) 16 | filterwidgetstabs = Tabs(tabs=[Panel(child=filterwidgets, title="Filter")], 17 | sizing_mode="fixed", 18 | height=sizes["overview_top_panel_height"] + 20, 19 | width=sizes["overview_top_panel_width_left"]) 20 | info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")] 21 | if ele["references"]["fig"]: 22 | info_tabs.append(Panel(child=column(ele["references"]["fig"], 23 | row(ele["references"]["wid"]["references_select"], 24 | ele["references"]["wid"]["help_button"]) 25 | ), title="References")) 26 | if ele["mgnify"]["fig"]: 27 | info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], 28 | row(ele["mgnify"]["wid"]["biome_spinner"], 29 | ele["mgnify"]["wid"]["help_button"]) 30 | ), title="MGnify")) 31 | if ele["decontam"]["fig"]: 32 | info_tabs.append(Panel(child=column(ele["decontam"]["fig"], 33 | row(ele["decontam"]["wid"]["pscore_text"], 34 | ele["decontam"]["wid"]["pscore_input"], 35 | ele["decontam"]["wid"]["help_button"]) 36 | ), title="DECONTAM")) 37 | infotabs = Tabs(tabs=info_tabs, 38 | sizing_mode="fixed", 39 | height=sizes["overview_top_panel_height"] + 20, 40 | width=sizes["overview_top_panel_width_right"]) 41 | row_obstable = row(filterwidgetstabs, 42 | ele["obstable"]["fig"], 43 | infotabs, 44 | 
sizing_mode="stretch_width") 45 | row_barpot = column(row(ele["samplebars"]["fig"]), 46 | row(ele["samplebars"]["wid"]["y1_select"], 47 | ele["samplebars"]["wid"]["annotbar_rank_select"], 48 | ele["samplebars"]["wid"]["annotbar_select"], 49 | ele["samplebars"]["wid"]["groupby1_select"], 50 | ele["samplebars"]["wid"]["groupby2_select"], 51 | ele["samplebars"]["wid"]["sort_select"], 52 | ele["samplebars"]["wid"]["y2_select"], 53 | ele["samplebars"]["wid"]["help_button"]), 54 | ele["samplebars"]["wid"]["toggle_label"]) 55 | main_panels["overview"] = Panel(child=column(row_obstable, row_barpot, sizing_mode="stretch_width"), title="Overview") 56 | 57 | if "samples" in output_plots: 58 | selectwidgets = column(ele["sampletable"]["wid"]["total_counts_spinner"], 59 | ele["sampletable"]["wid"]["assigned_spinner"], 60 | ele["sampletable"]["wid"]["metadata_multichoice"], 61 | row(ele["sampletable"]["wid"]["help_button"], ele["sampletable"]["wid"]["export_dropdown"])) 62 | selectwidgetstabs = Tabs(tabs=[Panel(child=selectwidgets, title="Select")], 63 | sizing_mode="fixed", 64 | height=sizes["overview_top_panel_height"] + 20, 65 | width=sizes["overview_top_panel_width_left"]) 66 | row_sampletable = row(selectwidgetstabs, 67 | ele["sampletable"]["fig"], 68 | sizing_mode="stretch_width") 69 | row_obsbars = column(row(ele["obsbars"]["fig"]), 70 | row(ele["obsbars"]["wid"]["rank_select"], 71 | ele["obsbars"]["wid"]["groupby1_select"], 72 | ele["obsbars"]["wid"]["groupby2_select"], 73 | ele["obsbars"]["wid"]["sort_select"], 74 | ele["obsbars"]["wid"]["help_button"]), 75 | ele["obsbars"]["wid"]["toggle_label"]) 76 | main_panels["samples"] = Panel(child=column(row_sampletable, row_obsbars, sizing_mode="stretch_width"), title="Samples") 77 | 78 | if "heatmap" in output_plots: 79 | row_heatmap = gridplot([[ele["heatmap"]["fig"], ele["dendroy"]["fig"], ele["metadata"]["fig"]], 80 | [ele["dendrox"]["fig"]], 81 | [ele["annotations"]["fig"], None, ele["heatmap"]["wid"]["help_button"]]], 82 | toolbar_location='right', 83 | merge_tools=True) 84 | row_heatmap_widgets = row(column(ele["heatmap"]["wid"]["rank_select"], 85 | ele["heatmap"]["wid"]["toggle_label"], 86 | width=300), 87 | row(column(ele["heatmap"]["wid"]["x_groupby_select"], 88 | ele["heatmap"]["wid"]["x_sort_select"]), 89 | column(ele["heatmap"]["wid"]["y_groupby_select"], 90 | ele["heatmap"]["wid"]["y_sort_select"]), 91 | sizing_mode="stretch_width"), 92 | column(ele["metadata"]["wid"]["metadata_multiselect"], 93 | ele["metadata"]["wid"]["toggle_legend"], 94 | sizing_mode="stretch_height", 95 | width=300)) 96 | main_panels["heatmap"] = Panel(child=column(row_heatmap, row_heatmap_widgets, sizing_mode="stretch_width"), title="Heatmap") 97 | 98 | if "correlation" in output_plots: 99 | row_correlation = row(column(ele["correlation"]["wid"]["rank_select"], 100 | ele["correlation"]["wid"]["neg_slider"], 101 | ele["correlation"]["wid"]["pos_slider"], 102 | ele["correlation"]["wid"]["toggle_label"], 103 | ele["correlation"]["wid"]["help_button"]), 104 | ele["correlation"]["fig"]) 105 | main_panels["correlation"] = Panel(child=column(row_correlation, sizing_mode="stretch_width"), title="Correlation") 106 | 107 | if not main_panels: 108 | print_log("No valid plots to output") 109 | return None 110 | else: 111 | # Add plots in user chosen order 112 | tabs = [main_panels[p] for p in output_plots] 113 | 114 | main_tab = Tabs(tabs=tabs) 115 | logo_base64 = base64.b64encode(open(logo_path, 'rb').read()) # encode to base64 116 | logo_base64 = logo_base64.decode() # 
convert to string 117 | logo_div = Div(text='<img src="data:image/png;base64,' + logo_base64 + '">' + '<a href="https://github.com/pirovc/grimer" target="_blank">v' + version + '</a>', width=300, height=40, sizing_mode="fixed") 118 | if title: 119 | title_div = Div(text='<h2>' + title + '</h2>
', height=40, sizing_mode="stretch_width") 120 | else: 121 | title_div = Spacer() 122 | final = column([row(logo_div, title_div), main_tab], sizing_mode="stretch_width") 123 | 124 | return final 125 | -------------------------------------------------------------------------------- /grimer/metadata.py: -------------------------------------------------------------------------------- 1 | class Metadata: 2 | valid_types = ["categorical", "numeric"] 3 | default_type = "categorical" 4 | 5 | def __init__(self, md, types): 6 | self.data = md 7 | self.types = types 8 | 9 | def __repr__(self): 10 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] 11 | return 'Metadata({})'.format(', '.join(args)) 12 | 13 | def get_col_headers(self): 14 | return self.data.columns 15 | 16 | def get_data(self, metadata_type: str=None): 17 | if metadata_type is not None: 18 | return self.data[self.types[self.types == metadata_type].index] 19 | else: 20 | return self.data 21 | 22 | def get_col(self, col): 23 | return self.data[col] 24 | 25 | def get_unique_values(self, col): 26 | return self.get_col(col).dropna().unique() 27 | 28 | def get_type(self, col): 29 | return self.types[col] 30 | 31 | def get_subset(self, column, value): 32 | return self.data[self.data[column] == value] 33 | -------------------------------------------------------------------------------- /grimer/reference.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | class Reference: 5 | def __init__(self, file: str=None, ids: list=[]): 6 | self.ids = {} # {refid: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)}} 7 | self.parents = {} # {parent_id: set(refids)} 8 | 9 | if file is not None: 10 | self.parse(file) 11 | elif ids: 12 | for i in ids: 13 | self.add(i, "", "") 14 | 15 | def __repr__(self): 16 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] 17 | return 'Reference({})'.format(', '.join(args)) 18 | 19 | def add(self, i, ref: str=None, desc: str=None): 20 | if i not in self.ids: 21 | self.ids[i] = {} 22 | if ref is not None: 23 | if ref not in self.ids[i]: 24 | self.ids[i][ref] = set() 25 | if desc is not None: 26 | self.ids[i][ref].add(desc) 27 | 28 | def add_parent(self, parent, refid): 29 | if parent not in self.parents: 30 | self.parents[parent] = set() 31 | self.parents[parent].add(refid) 32 | 33 | def parse(self, file): 34 | with open(file, 'r') as fh: 35 | if file.endswith(".yml") or file.endswith(".yaml"): 36 | src = yaml.safe_load(fh) 37 | for desc, val in src.items(): 38 | for ref, v in val.items(): 39 | for i in map(str, v["ids"]): 40 | self.add(i, (ref, v["url"]), desc) 41 | else: 42 | for line in fh: 43 | main_id = line.rstrip() 44 | self.add(main_id) 45 | 46 | def update_taxids(self, taxid_updated): 47 | for node, upd_node in taxid_updated.items(): 48 | if upd_node is not None and upd_node != node: 49 | print("Updated taxonomic node: " + node + " -> " + upd_node) 50 | self.add(upd_node) 51 | self.ids[upd_node].update(self.ids[node]) 52 | del self.ids[node] 53 | 54 | def get_refs_desc(self, i, direct: bool=False, parents: bool=False): 55 | refs_desc = {} 56 | if direct and i in self.ids: 57 | refs_desc.update(self.ids[i]) 58 | if parents and i in self.parents: 59 | for refid in self.parents[i]: 60 | refs_desc.update(self.ids[refid]) 61 | return refs_desc 62 | 63 | def get_refs_count(self, i, direct: bool=False, parents: bool=False): 64 | return len(self.get_refs_desc(i, direct, parents)) 65 | 
-------------------------------------------------------------------------------- /grimer/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/scripts/__init__.py -------------------------------------------------------------------------------- /grimer/scripts/run_decontam.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library("optparse") 4 | library("decontam") 5 | library("reshape2") 6 | library("stats") 7 | #library(ggplot2); packageVersion("ggplot2") 8 | 9 | parser <- OptionParser() 10 | parser <- add_option(parser, c("-o", "--resout"), default="decontam_out.tsv", type="character", help="File to output results") 11 | parser <- add_option(parser, c("-d", "--modout"), default="decontam_mod.tsv", type="character", help="File to output models") 12 | parser <- add_option(parser, c("-i", "--counts"), default="", type="character", help="Input count table") 13 | parser <- add_option(parser, c("-c", "--concentrations"), default="", type="character", help="Input table with DNA concentration") 14 | parser <- add_option(parser, c("-n", "--controls"), default="", type="character", help="Input list with control sample ids") 15 | parser <- add_option(parser, c("-m", "--method"), default="frequency", type="character", help="Method to use: frequency, prevalence, combined") 16 | parser <- add_option(parser, c("-t", "--threshold"), default=0.1, type="double", help="Threshold") 17 | parser <- add_option(parser, c("-v", "--verbose"), action="store_true", default=TRUE, help="Print extra output [default]") 18 | args <- parse_args(parser) 19 | 20 | generate_plot_frequency_values <- function(seqtab, taxa, conc, neg=NULL, normalize=FALSE, showModels=TRUE, log=TRUE, facet=TRUE){ 21 | # Code copied and adapted from https://github.com/benjjneb/decontam/blob/master/R/plotting.R 22 | # v1.1.2 6a242fc7fc452a971b7b60b6757ea81a86ade7b5 23 | 24 | #print(seqtab) 25 | 26 | if(any(rowSums(seqtab) == 0)) { # Catch and remove zero-count samples 27 | zero.count <- rowSums(seqtab) == 0 28 | seqtab <- seqtab[!zero.count,] 29 | conc <- conc[!zero.count] 30 | if(!is.null(neg)) neg <- neg[!zero.count] 31 | warning("Removed ", sum(zero.count), " samples with zero total counts (or frequency).") 32 | } 33 | 34 | if(normalize) seqtab <- sweep(seqtab, 1, rowSums(seqtab), "/") 35 | if(!(is.numeric(conc) && all(conc>0))) stop("conc must be positive numeric.") 36 | if(is.null(neg)) neg <- rep(FALSE, length(conc)) # Don't ignore any samples 37 | if(is.character(taxa)) { 38 | seqtab <- seqtab[,colnames(seqtab) %in% taxa,drop=FALSE] 39 | } else { 40 | stop("taxa must be a vector of taxa names.") 41 | } 42 | ntax.plot <- ncol(seqtab) 43 | if(ntax.plot == 0) stop("None of the provided taxa were present in seqtab.") 44 | # Prepare plotting data.frame 45 | plotdf <- cbind(data.frame(seqtab, check.names=FALSE), DNA_conc=conc, Type=ifelse(neg, "Negative", "Sample")) 46 | plot_melt <- melt(plotdf, measure.vars=1:ntax.plot, variable.name="taxa", value.name="taxon_abundance") 47 | taxon_levels <- taxa 48 | 49 | plot_melt$taxa <- factor(plot_melt$taxa, levels = taxon_levels) 50 | if(showModels) { 51 | mod_melts <- split(plot_melt, plot_melt$taxa) 52 | logc <- log(seq(min(plotdf$DNA_conc), max(plotdf$DNA_conc), length.out=1000)) 53 | for(tax in names(mod_melts)) { 54 | newdata <- data.frame(logc=logc, taxa=tax, DNA_conc=exp(logc)) 55 | freq 
<- mod_melts[[tax]]$taxon_abundance 56 | conc <- mod_melts[[tax]]$DNA_conc 57 | df <- data.frame(logc=log(conc), logf=log(freq)) 58 | df <- df[!neg | is.na(neg),] 59 | df <- df[freq>0,] 60 | if(sum(freq>0)>1) { 61 | lm1 <- lm(logf~offset(-1*logc), data=df) 62 | lm0 <- lm(logf~1, data=df) 63 | newdata$contam <- exp(predict(lm1, newdata=newdata)) 64 | newdata$non.contam <- exp(predict(lm0, newdata=newdata)) 65 | } else { 66 | newdata$contam <- NA 67 | newdata$non.contam <- NA 68 | } 69 | mod_melts[[tax]] <- newdata 70 | } 71 | mod_melt <- do.call(rbind, mod_melts) 72 | } 73 | 74 | # p1 <- ggplot(data=plot_melt, aes_string("DNA_conc", "taxon_abundance")) + xlab("DNA Concentration") 75 | # p1 <- p1 + ylab(ifelse(normalize, "Frequency", "Relative Abundance")) 76 | # if(log) p1 <- p1 + scale_x_log10() 77 | # if(log) p1 <- p1 + scale_y_log10() 78 | # if(nlevels(factor(neg))>1) p1 <- p1 + aes_string(color="Type") 79 | # if(facet && ntax.plot > 1) p1 <- p1 + facet_wrap(~taxa) 80 | # if(showModels) p1 <- p1 + geom_line(data=mod_melt, aes_string(y="contam"), color="red", linetype="solid") 81 | # if(showModels) p1 <- p1 + geom_line(data=mod_melt, aes_string(y="non.contam"), color="black", linetype="dashed") 82 | # p1 + geom_point() 83 | # ggsave("test.png") 84 | 85 | # Get first and last points of the models 86 | idx <- sort(c(seq(1, length(mod_melt$taxa), 1000), seq(1000, length(mod_melt$taxa), 1000))) 87 | return(mod_melt[idx,c("contam","non.contam")]) 88 | } 89 | 90 | # Load count table 91 | count_table <- read.table(file=args$counts, sep='\t', header=TRUE, check.names=FALSE) 92 | rows_table <- count_table[,1] 93 | count_matrix <- data.matrix(data.frame(count_table[,-1], row.names = rows_table, check.names=FALSE)) 94 | 95 | # Load concentration table 96 | if(!args$concentrations==""){ 97 | concentrations <- read.table(file=args$concentrations, sep='\t', header=FALSE, check.names=FALSE) 98 | concentrations_list <- concentrations[ (concentrations[, "V1"] %in% rows_table) , "V2"] 99 | } 100 | 101 | # Load list of controls 102 | if(!args$controls==""){ 103 | controls <- read.table(file=args$controls, sep='\t', header=FALSE, check.names=FALSE) 104 | controls_index <- rows_table %in% controls[ , "V1"] 105 | } 106 | 107 | # Run DECONTAM 108 | if (args$method=="frequency"){ 109 | decontam_out <- isContaminant(count_matrix, normalize=FALSE, conc=concentrations_list, method="frequency", threshold=args$threshold) 110 | }else if (args$method=="prevalence") { 111 | decontam_out <- isContaminant(count_matrix, normalize=FALSE, neg=controls_index, method="prevalence", threshold=args$threshold) 112 | }else if (args$method=="combined") { 113 | decontam_out <- isContaminant(count_matrix, normalize=FALSE, neg=controls_index, conc=concentrations_list, method="combined", threshold=args$threshold) 114 | } 115 | 116 | write.table(decontam_out, file=args$resout, sep="\t", quote=FALSE) 117 | 118 | models <- generate_plot_frequency_values(count_matrix, colnames(count_table[-1]), normalize=FALSE, conc=concentrations_list) 119 | 120 | write.table(models, file=args$modout, sep="\t", quote=FALSE) 121 | -------------------------------------------------------------------------------- /grimer/table.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | 4 | class Table: 5 | def __init__(self, samples, total, unassigned, lineage, normalized, zerorep): 6 | # Ordered dict to keep rank insert order 7 | self.data = OrderedDict() 8 | self.lineage = lineage 9 | 
self.samples = samples 10 | self.total = total 11 | self.unassigned = unassigned 12 | self.normalized = normalized 13 | self.zerorep = zerorep 14 | 15 | def __repr__(self): 16 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] 17 | return 'Table({})'.format(', '.join(args)) 18 | 19 | def add_rank(self, rank, table): 20 | self.data[rank] = table 21 | 22 | def observations(self, rank): 23 | return self.data[rank].columns 24 | 25 | def ranks(self): 26 | return list(self.data.keys()) 27 | 28 | def get_min_valid_count_perc(self): 29 | return min([self.get_counts_perc(rank)[self.get_counts_perc(rank) > 0].min().min() for rank in self.ranks()]) 30 | 31 | def get_total(self): 32 | return self.total 33 | 34 | def get_unassigned(self): 35 | return self.unassigned 36 | 37 | def get_assigned(self): 38 | return self.get_total() - self.get_unassigned() 39 | 40 | def get_unassigned_perc(self): 41 | return self.get_unassigned().divide(self.get_total(), axis=0) if not self.normalized else self.get_unassigned().divide(100, axis=0) 42 | 43 | def get_assigned_perc(self): 44 | return self.get_assigned().divide(self.get_total(), axis=0) if not self.normalized else self.get_assigned().divide(100, axis=0) 45 | 46 | def get_lineage(self, taxid, rank, other_rank): 47 | # get lineage up-to requested rank 48 | return self.lineage[self.lineage[rank] == taxid][other_rank].values[0] 49 | 50 | def get_frequency(self, rank): 51 | return self.data[rank].gt(0).sum(axis=0) 52 | 53 | def get_frequency_perc(self, rank): 54 | return self.get_frequency(rank) / len(self.samples) 55 | 56 | def get_counts(self, rank): 57 | return self.data[rank].sum(axis=0) if not self.normalized else 0 58 | 59 | def get_counts_perc(self, rank): 60 | return self.data[rank].divide(self.get_total(), axis=0) if not self.normalized else self.data[rank].divide(100, axis=0) 61 | 62 | def get_counts_perc_avg_samples(self, rank): 63 | return self.get_counts_perc(rank).sum(axis=0) / len(self.samples) 64 | 65 | def get_top(self, rank, n): 66 | return sorted(self.get_counts_perc_avg_samples(rank).sort_values(ascending=False).index[:n].to_list()) 67 | 68 | def get_subtable(self, rank, samples: list=[], taxids: list=[], keep_shape: bool=False): 69 | subtable = self.data[rank] 70 | 71 | if samples: 72 | valid_samples = [] 73 | for s in samples: 74 | if s in self.samples: 75 | valid_samples.append(s) 76 | 77 | if valid_samples: 78 | subtable = subtable.loc[subtable.index.intersection(valid_samples)] 79 | if not keep_shape: 80 | subtable = subtable.loc[:, subtable.sum(axis=0) > 0] 81 | else: 82 | return None 83 | 84 | if taxids: 85 | subtable = subtable[taxids].copy() 86 | if not keep_shape: 87 | subtable = subtable.loc[subtable[taxids].sum(axis=1) > 0, :] 88 | 89 | return subtable 90 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: GRIMER 2 | theme: readthedocs 3 | nav: 4 | - GRIMER: index.md 5 | - Importing files: importing.md 6 | - Configuration file: config.md 7 | - GRIMER Reports - User Manual: manual.md 8 | - GRIMER Reports - Examples: examples.md 9 | plugins: 10 | - glightbox # pip install mkdocs-glightbox 11 | markdown_extensions: 12 | - attr_list -------------------------------------------------------------------------------- /scripts/bacdive_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | from 
multitax import NcbiTx 4 | import sys 5 | 6 | ## TODO 7 | ## filter infection? 8 | ## find names? 9 | 10 | data = {("Host_Human-HostBodySite_Limb", "Limbs"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=39&filters%5B1%5D%5Bcat3%5D=&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1", 11 | ("Host_Human-HostBodySite_Organ_Ear", "Ear"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=209&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1", 12 | ("Host_Human-HostBodySite_Organ_Eye", "Eye"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=210&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1", 13 | ("Host_Human-HostBodySite_Organ_Nose", "Nose"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=217&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1", 14 | ("Host_Human-HostBodySite_Organ_SkinNailHair", "Skin/Nail/Hair"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=219&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1", 15 | ("Host_Human-HostBodySite_Organ_OralCavityAndAirways", "Oral"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=41&filters%5B1%5D%5Bcat3%5D=&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1", 16 | ("Host_Human-HostBodyProduct_OralCavityAndAirways_Saliva", "Saliva"): 
"https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=6&filters%5B1%5D%5Bcat2%5D=47&filters%5B1%5D%5Bcat3%5D=276&filters%5B1%5D%5Bcolor%5D=6&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1"} 17 | 18 | tax = NcbiTx(extended_names=True) 19 | 20 | print('"Human-related bacterial isolates from BacDive":') 21 | 22 | for (search, name), url in data.items(): 23 | print(' "' + name + '":') 24 | print(' url: "https://bacdive.dsmz.de/search?search=taxid:{}"') 25 | parsed_ids = set() 26 | df = pd.read_table(url, sep=",", index_col=0).dropna(subset=["Species"]) 27 | for species in df.Species.unique(): 28 | taxids = tax.search_name(species, rank="species", exact=True) 29 | if not taxids: 30 | sys.stderr.write("Species name not found: " + species + "\n") 31 | elif len(taxids) > 1: 32 | sys.stderr.write("Species with ambiguous name: " + species + "\n") 33 | else: 34 | parsed_ids.add(taxids[0]) 35 | print(" ids: [" + ", ".join(parsed_ids) + "]") 36 | -------------------------------------------------------------------------------- /scripts/ehomd_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import sys 4 | import urllib.request 5 | import re 6 | 7 | 8 | def get_taxid(url): 9 | try: 10 | sys.stderr.write(url + "\n") 11 | assembly_stats = url + "/" + url.split("/")[-1] + "_assembly_stats.txt" 12 | filedata = urllib.request.urlopen(assembly_stats).read().decode() 13 | x = re.search("# Taxid:[\s0-9]*\\r\\n", filedata) 14 | if x: 15 | return re.findall("\d+", x.group())[0] 16 | else: 17 | return None 18 | except: 19 | return None 20 | 21 | # Can be Oral, Nasal or both ("Nasal,Oral") 22 | habitats = ["Oral", "Nasal"] 23 | data = "http://www.ehomd.org/ftp/genomes/PROKKA/current/SEQID_info.csv" 24 | 25 | df = pd.read_table(data, sep=",", usecols=["Habitat", "Sequence_Source"]) 26 | df = df[df["Habitat"].isin(habitats + ["Nasal,Oral"])].drop_duplicates() 27 | df["taxid"] = df["Sequence_Source"].map(get_taxid) 28 | 29 | print('"Human Oral Microbiome Database (eHOMD)":') 30 | for h in habitats: 31 | print(' "' + h + '":') 32 | parsed_ids = set(df.taxid[df.Habitat.str.contains(h)]) 33 | print(' url: "http://www.ehomd.org/?name=HOMD"') 34 | print(" ids: [" + ", ".join(parsed_ids) + "]") 35 | 36 | sys.stderr.write("Could not retrieve taxid for: " + "\n".join(df[df.taxid.isna()]["Sequence_Source"].to_list()) + "\n") 37 | -------------------------------------------------------------------------------- /scripts/env.yaml: -------------------------------------------------------------------------------- 1 | name: grimer_dev 2 | channels: 3 | - defaults 4 | - bioconda 5 | - conda-forge 6 | dependencies: 7 | - pandas 8 | - jsonapi-client>=0.9.7 #mgnify scripts 9 | -------------------------------------------------------------------------------- /scripts/mgnify_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import pandas as pd 5 | import sys 6 | import os 7 | import pickle 8 | import gzip 9 | from urllib.parse import urlencode 10 | from jsonapi_client import Session, Filter 11 | from glob import glob 12 | 13 | """ 14 | Script to download taxonomy abundance files and metadata from MGnify 
15 | - It always downloads latest available results based on the pipeline version 16 | 17 | Example dump: seq -f "MGYS%08g" 1 5724 | xargs -P 8 -I {} ./mgnify_download.py -i {} -v -g -o mgnify_dump/ > mgnify_dump.log 2>&1 & 18 | """ 19 | 20 | 21 | def main(argv=sys.argv[1:]): 22 | 23 | API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest/' 24 | 25 | parser = argparse.ArgumentParser(description='grimer-download-mgnify') 26 | parser.add_argument('-i', '--study-accession', required=True, type=str, help="MGnify study accession (e.g. MGYS00002462)") 27 | parser.add_argument('-g', '--gzip', default=False, action='store_true', help="Gzip downloaded files") 28 | parser.add_argument('-v', '--verbose', default=False, action='store_true', help="Verbose output") 29 | parser.add_argument('-o', '--output-prefix', type=str, help="Output prefix for downloaded files. Default: --study-accession") 30 | args = parser.parse_args(argv) 31 | 32 | study_accession = args.study_accession 33 | if args.output_prefix: 34 | prefix = args.output_prefix + study_accession 35 | else: 36 | prefix = study_accession 37 | gz = args.gzip 38 | 39 | md_file = prefix + "_metadata.tsv" 40 | out_file = prefix + ".pkl" 41 | if gz: 42 | out_file = out_file + ".gz" 43 | md_file = md_file + ".gz" 44 | 45 | # Check if files exist and skip 46 | tax_files = glob(prefix + "*_taxonomy_abundances_*") 47 | if tax_files and os.path.isfile(out_file) and os.path.isfile(md_file): 48 | print(study_accession, "Warning: files already exist") 49 | return 50 | 51 | with Session(API_BASE) as s: 52 | # Get main study resource 53 | try: 54 | study = s.get('studies', study_accession).resource 55 | if args.verbose: 56 | print(study.accession, "SAMPLES:" + str(study.samples_count), sep="\t", end="\t") 57 | except: 58 | print(study_accession, "Error: Study accession not found") 59 | sys.exit(1) 60 | 61 | # Save study info as a dict in a pkl file 62 | f = gzip.open(out_file, 'wb') if gz else open(out_file, "wb") 63 | pickle.dump(study.json, file=f) 64 | f.close() 65 | 66 | # Get all taxonomic tables for the highest version of the pipeline 67 | highest_version = 0 68 | table_version = {} 69 | for download in study.downloads: 70 | label = download.description.label 71 | #["Taxonomic assignments", 72 | #"Taxonomic assignments SSU", 73 | #"Taxonomic assignments LSU" 74 | #"Taxonomic assignments UNITE", 75 | #"Taxonomic assignments ITSoneDB"] 76 | if "Taxonomic assignments" in label: 77 | version = float(download.pipeline.id) 78 | if version not in table_version: 79 | table_version[version] = [] 80 | table_version[version].append(download.url) 81 | if version > highest_version: 82 | highest_version = version 83 | 84 | if not table_version: 85 | print("Error: No taxonomic assignments for this study to download") 86 | sys.exit(1) 87 | else: 88 | table_urls = table_version[highest_version] 89 | 90 | # Get all available samples in one go and collect metadata 91 | params = { 92 | 'study_accession': study_accession, 93 | 'page_size': study.samples_count, 94 | } 95 | fltr = Filter(urlencode(params)) 96 | 97 | metadata = {} 98 | for sample in s.iterate('samples', fltr): 99 | # TODO: how to access runs faster, sample.runs is too slow 100 | #nruns += len(sample.runs) 101 | metadata[sample.accession] = {} 102 | for md in sample.sample_metadata: 103 | metadata[sample.accession][md["key"]] = md["value"] 104 | # Add sample description and name as metadata 105 | metadata[sample.accession]['sample-desc'] = sample.sample_desc 106 | 
metadata[sample.accession]['sample-name'] = sample.sample_name 107 | 108 | # Get link between sample accession and run accession 109 | # TODO treat multiple runs per sample 110 | run_sample_accession = {} 111 | try: 112 | for run in s.iterate('runs', fltr): 113 | run_sample_accession[run.sample.id] = run.id 114 | except: 115 | print("Error: Could not retrieve run accession", sep="\t", end="\t") 116 | 117 | # Write metadata 118 | md_df = pd.DataFrame.from_dict(metadata).T 119 | if run_sample_accession: 120 | mapped_accessions = md_df.index.isin(run_sample_accession.keys()) 121 | if args.verbose: 122 | print("MAPPED:" + str(sum(mapped_accessions)), sep="\t", end="\t") 123 | md_df.index = md_df.index.map(lambda x: run_sample_accession[x] if x in run_sample_accession else x) 124 | else: 125 | if args.verbose: 126 | print("Warning: No mapping between accessions of samples and metadata", sep="\t", end="\t") 127 | 128 | if args.verbose: 129 | print("METADATA:" + str(md_df.shape[1]), sep="\t", end="\t") 130 | md_df.to_csv(md_file, compression="gzip" if gz else None, sep="\t") 131 | 132 | # Read and write tables 133 | for table_url in table_urls: 134 | try: 135 | t = pd.read_table(table_url) 136 | if args.verbose: 137 | print("OK:" + table_url, end=";") 138 | # Print original 139 | filename = prefix + "_" + os.path.basename(table_url) 140 | t.to_csv(filename if not gz else filename + ".gz", compression="gzip" if gz else None, sep="\t", index=False) 141 | except: 142 | if args.verbose: 143 | print("INVALID:" + table_url, end=";") 144 | 145 | if args.verbose: 146 | print() 147 | 148 | 149 | if __name__ == "__main__": 150 | main() 151 | -------------------------------------------------------------------------------- /scripts/mgnify_extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import gzip 4 | import os 5 | import pickle 6 | import argparse 7 | 8 | 9 | def main(): 10 | version = "1.0.0" 11 | 12 | parser = argparse.ArgumentParser(description='mgnify_extract') 13 | parser.add_argument('-f', '--input-folder', required=True, type=str, help="Folder with files generated by mgnify_download.py") 14 | parser.add_argument('-t', '--top-taxa', default=10, type=int, help="Top taxa to use for each study. 0 for everything. 
Default 10.") 15 | parser.add_argument('-o', '--output-taxa-counts', type=str, default="taxa_counts.tsv") 16 | parser.add_argument('-b', '--output-biome-counts', type=str, default="") 17 | 18 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version) 19 | args = parser.parse_args() 20 | 21 | acc_files = select_files(args.input_folder) 22 | print("Number of files found: ", len(acc_files)) 23 | 24 | rank_id_name = {0: "superkingdom", 25 | 1: "kingdom", 26 | 2: "phylum", 27 | 3:"class", 28 | 4:"order", 29 | 5:"family", 30 | 6:"genus", 31 | 7:"species"} 32 | taxa_biome = {} 33 | no_biome = [] 34 | biome_count = {} 35 | 36 | for study_accession, study_table in acc_files.items(): 37 | print(study_accession) 38 | 39 | study_file = args.input_folder + "/" + study_accession + ".pkl.gz" 40 | 41 | if os.path.isfile(study_file): 42 | with gzip.open(study_file) as f: 43 | study = pickle.load(f) 44 | else: 45 | no_biome.append(study_accession) 46 | continue 47 | 48 | study_biomes = [] 49 | if 'biomes' in study['relationships']: 50 | for b in study['relationships']['biomes']['data']: 51 | biome = b['id'] 52 | if biome not in taxa_biome: taxa_biome[biome] = {} 53 | if biome not in biome_count: biome_count[biome] = 0 54 | study_biomes.append(biome) 55 | biome_count[biome]+=1 56 | else: 57 | no_biome.append(study_accession) 58 | continue 59 | 60 | pipeline_version = float(study_table[-10:-7]) 61 | 62 | t = pd.read_table(study_table) 63 | 64 | sample_col = '#SampleID' 65 | if sample_col not in t.columns: 66 | # older files have a different header 67 | sample_col = 'taxonomy' 68 | 69 | # expand ranks in columns 70 | ranks = t[sample_col].str.split(';', expand=True) 71 | 72 | # replace empty for unclassified 73 | ranks.replace(regex={r'^.+__$': 'unclassified'}, inplace=True) 74 | 75 | # Replace "s__" at the beggining and _ for " " 76 | ranks.replace(regex={r'^.+__': '', '_': ' '}, inplace=True) 77 | 78 | # Pipeline <= 3.0 reports only species specific name, need to merge 79 | if pipeline_version <= 3.0: 80 | if 6 in ranks and 7 in ranks: 81 | #print(ranks) 82 | # Replace unclassified with None 83 | ranks[6] = ranks[6].replace("unclassified", None) 84 | ranks[7] = ranks[7].replace("unclassified", None) 85 | ranks["species"] = ranks[6] + " " + ranks[7] 86 | ranks.drop(columns=7, inplace=True) 87 | ranks.rename(columns={"species": 7}, inplace=True) 88 | #print(ranks) 89 | 90 | t = pd.concat([ranks, t], axis=1) 91 | t.drop(columns=sample_col, inplace=True) 92 | top_taxa_rank = {} 93 | 94 | for r in range(ranks.shape[1]): 95 | rank_table = t.groupby([r]).sum().T 96 | # Do not count for unclassified taxa 97 | if "unclassified" in rank_table.columns: 98 | rank_table.drop(columns="unclassified", inplace=True) 99 | if "Unclassified" in rank_table.columns: 100 | rank_table.drop(columns="Unclassified", inplace=True) 101 | 102 | max_count = rank_table.max().max() 103 | n_samples = rank_table.shape[0] 104 | avg_perc_taxa = ((rank_table/max_count).sum(axis=0) / n_samples).sort_values(ascending=False) 105 | if args.top_taxa: 106 | top_taxa_rank[r] = avg_perc_taxa.iloc[:args.top_taxa].index.to_list() 107 | else: 108 | top_taxa_rank[r] = avg_perc_taxa.index.to_list() 109 | 110 | # Study can have multiple biomes, for each count 111 | for biome in study_biomes: 112 | for rank, taxa in top_taxa_rank.items(): 113 | r = rank_id_name[rank] 114 | for t in taxa: 115 | if (r,t) not in taxa_biome[biome]: taxa_biome[biome][(r,t)] = 0 116 | taxa_biome[biome][(r,t)]+=1 117 | 118 | biomes_df = 
pd.DataFrame.from_dict(taxa_biome).T 119 | stacked = pd.DataFrame(biomes_df.T.stack(), columns=["count"]) 120 | 121 | stacked.to_csv(args.output_taxa_counts, sep="\t", header=None) 122 | 123 | if no_biome: 124 | print("Skipped " + str(len(no_biome)) + " studies without a study file or defined biome") 125 | 126 | if args.output_biome_counts: 127 | with open(args.output_biome_counts, "w") as bf: 128 | for biome, cnt in biome_count.items(): 129 | print(biome, cnt, sep="\t", file=bf) 130 | 131 | 132 | def select_files(input_folder): 133 | """ 134 | Return a dict with {accession: taxonomy abundance file} 135 | One file per accession (biggest in size) 136 | """ 137 | tax_ab_files = {} 138 | for file in os.listdir(input_folder): 139 | ffile = input_folder + "/" + file 140 | bname = os.path.basename(file) 141 | 142 | if "taxonomy_abundances" in file: 143 | accession = bname.split("_")[0] 144 | # Add file and filesize 145 | if accession not in tax_ab_files: tax_ab_files[accession] = tuple(["", 0]) 146 | fsize = os.path.getsize(ffile) 147 | if fsize > tax_ab_files[accession][1]: 148 | tax_ab_files[accession] = (ffile, fsize) 149 | 150 | return {k: v[0] for k, v in tax_ab_files.items()} 151 | 152 | 153 | if __name__ == "__main__": 154 | main() 155 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | 5 | from setuptools import setup 6 | 7 | 8 | def read(filename): 9 | filename = os.path.join(os.path.dirname(__file__), filename) 10 | text_type = type(u"") 11 | with io.open(filename, mode="r", encoding='utf-8') as fd: 12 | return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read()) 13 | 14 | setup( 15 | name="grimer", 16 | version="1.1.0", 17 | url="https://www.github.com/pirovc/grimer", 18 | license='MIT', 19 | author="Vitor C. Piro", 20 | author_email="pirovc@posteo.net", 21 | description="GRIMER: contamination detection and microbiome exploration", 22 | long_description=read("README.md"), 23 | packages=['grimer'], 24 | package_data={ 25 | 'grimer': ['css/*', 'img/*', 'js/*', 'scripts/*'] 26 | }, 27 | entry_points={'console_scripts': ['grimer=grimer.grimer:main']}, 28 | classifiers=[ 29 | 'License :: OSI Approved :: MIT License', 30 | 'Programming Language :: Python :: 3.5', 31 | 'Programming Language :: Python :: 3.6', 32 | 'Programming Language :: Python :: 3.7', 33 | 'Programming Language :: Python :: 3.8', 34 | 'Programming Language :: Python :: 3.9', 35 | ], 36 | ) 37 | --------------------------------------------------------------------------------