├── .gitignore
├── LICENSE
├── README.md
├── config
└── default.yaml
├── docs
├── config.md
├── examples.md
├── img
│ ├── correlation.png
│ ├── correlation_genus.png
│ ├── heatmap.png
│ ├── heatmap_cluster.png
│ ├── heatmap_group.png
│ ├── help.png
│ ├── logo_panels.png
│ ├── overview.png
│ ├── overview_bars.png
│ ├── overview_decontam.png
│ ├── overview_info.png
│ ├── overview_mgnify.png
│ ├── overview_mgnify_2.png
│ ├── overview_references.png
│ ├── overview_table.png
│ ├── overview_table_strep.png
│ ├── samples.png
│ ├── samples_bars.png
│ ├── samples_table.png
│ └── tools.png
├── importing.md
├── index.md
└── manual.md
├── env.yaml
├── files
├── README.md
├── contaminants.yml
├── human-related.yml
└── mgnify5989.tsv
├── grimer-mgnify.py
├── grimer.py
├── grimer
├── __init__.py
├── callbacks.py
├── cds.py
├── config.py
├── css
│ ├── __init__.py
│ └── popup.css
├── decontam.py
├── func.py
├── grimer.py
├── img
│ ├── __init__.py
│ └── logo.png
├── js
│ ├── __init__.py
│ ├── func.js
│ └── popup.js
├── layout.py
├── metadata.py
├── plots.py
├── reference.py
├── scripts
│ ├── __init__.py
│ └── run_decontam.R
└── table.py
├── mkdocs.yml
├── scripts
├── bacdive_download.py
├── ehomd_download.py
├── env.yaml
├── mgnify_download.py
└── mgnify_extract.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 |
3 | # Packages
4 | *.egg
5 | *.egg-info
6 | dist
7 | build
8 | eggs
9 | parts
10 | bin
11 | var
12 | sdist
13 | develop-eggs
14 | .installed.cfg
15 | lib
16 | lib64
17 | __pycache__
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 pirovc.github.io
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | GRIMER performs analysis of microbiome studies and generates a portable and interactive dashboard integrating annotation, taxonomy and metadata with focus on contamination detection.
4 |
5 | - [Installation, user manual](https://pirovc.github.io/grimer/)
6 | - [Live examples](https://pirovc.github.io/grimer/examples/)
7 | - [Pre-print](https://doi.org/10.1101/2021.06.22.449360)
8 |
9 |
10 | 
11 |
12 | ## Powered by
13 |
14 |
15 | [
](https://bokeh.org)
16 | [
](https://pandas.org)
17 | [
](https://scipy.org)
18 | [
](https://scikit-bio.org)
19 |
--------------------------------------------------------------------------------
/config/default.yaml:
--------------------------------------------------------------------------------
1 | references:
2 | "Contaminants": "files/contaminants.yml"
3 | "Human-related": "files/human-related.yml"
4 |
5 | # controls:
6 | # "Negative Controls": "path/file1.tsv"
7 | # "Positve Controls":
8 | # "Metadata_Field":
9 | # - "Metadata_Value1"
10 | # - "Metadata_Value2"
11 |
12 | external:
13 | mgnify: "files/mgnify5989.tsv"
14 | decontam:
15 | threshold: 0.1 # [0-1] P* hyperparameter
16 | method: "frequency" # frequency, prevalence, combined
17 | # # frequency (default: use sum of counts)
18 | # frequency_file: "path/file1.txt"
19 | # frequency_metadata: "Field1"
20 | # # prevalence (default: use all controls)
21 | # prevalence_file:
22 | # - "path/file1.txt"
23 | # - "path/file2.txt"
24 | # prevalence_metadata:
25 | # "Field1":
26 | # - "ValueA"
27 | # - "ValueB"
28 | # "Field2":
29 | # - "ValueC"
30 |
31 |
--------------------------------------------------------------------------------
/docs/config.md:
--------------------------------------------------------------------------------
1 | # Configuration file
2 |
3 | GRIMER uses a configuration file to set reference sources of annotation (e.g. contaminants), controls and external tools (decontam, mgnify). The configuration can be provided with the argument `-c/--config` and it should be in the [YAML](https://yaml.org/){ target="_blank" } format.
4 |
5 | A basic example of a configuration file:
6 |
7 | ```yaml
8 | references:
9 | "Contaminants": "files/contaminants.yml"
10 | "Human-related": "files/human-related.yml"
11 |
12 | controls:
13 | "Negative Controls": "path/file1.tsv"
14 | "Positve Controls":
15 | "Metadata_Field":
16 | - "Metadata_Value1"
17 | - "Metadata_Value2"
18 |
19 | external:
20 | mgnify: "files/mgnify5989.tsv"
21 | decontam:
22 | threshold: 0.1
23 | method: "frequency"
24 | ```
25 |
26 | ## references
27 |
28 | References can be provided as an external `.yml/.yaml` file in a specific format (see below) or as a text file with one taxonomic identifier or taxonomic name per line.
29 |
30 | ```yaml
31 | "General Description":
32 | "Specific description":
33 | url: "www.website.com?id={}"
34 | ids: [1,2,3]
35 | ```
36 |
37 | A real example of saliva organisms extracted from BacDive (NCBI taxonomic ids):
38 |
39 | ```yaml
40 | "Human-related bacterial isolates from BacDive":
41 | "Saliva":
42 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
43 | ids: [152331, 113107, 157688, 979627, 45634, 60133, 157687, 1624, 1583331, 1632, 249188]
44 | ```
45 |
46 | Common contaminants compiled from the literature and human-related possible sources of contamination are available in the [GRIMER repository](https://github.com/pirovc/grimer/tree/main/files){ target="_blank" }. For more information, please refer to the [pre-print](https://doi.org/10.1101/2021.06.22.449360){ target="_blank" }. If the target study overlaps with some of those annotation (e.g. study of human skin), related entries can be easily removed from the provided files to not generate redundant annotations.
47 |
48 | ## controls
49 |
50 | Several control groups can be provided to annotate samples. They can be provided as a file with one sample identifier per line:
51 |
52 | ```yaml
53 | controls:
54 | "Controls": "controls.txt"
55 | ```
56 |
57 | or directly from the metadata (`-m/--metadata-file`) as a field and value(s) information:
58 |
59 | ```yaml
60 | controls:
61 | "Other Controls":
62 | "sample_type": # field
63 | - "blank" # value
64 | - "control" # value
65 | ```
66 |
67 | Both methods can be combined into one configuration file.
68 |
69 | ## external
70 |
71 | Set the configuration and functionality of external tools executed by GRIMER.
72 |
73 | ### mgnify
74 |
75 | GRIMER uses a parsed MGnify database to annotate observations and link them to the respective MGnify repository, reporting most common biome occurrences. Instructions on how to re-generate the parsed database from MGnify can be found [here](https://github.com/pirovc/grimer/tree/main/files#mgnify){ target="_blank" }.
76 |
77 | A [pre-parsed database](https://raw.githubusercontent.com/pirovc/grimer/main/files/mgnify5989.tsv){ target="_blank" } is available in the GRIMER repository (generated on 2022-03-09). To use it, please set the file in the configuration as follows and activate it with the `-g/--mgnify` when running GRIMER.
78 |
79 | ```yaml
80 | external:
81 | mgnify: "files/mgnify5989.tsv"
82 | ```
83 |
84 | ### decontam
85 |
86 | GRIMER can run [DECONTAM](https://benjjneb.github.io/decontam/){ target="_blank" } with `-d/--decontam`, but some configuration is necessary. It is possible to set the threshold (P* hyperparameter) and the method (frequency, prevalence, combined).
87 |
88 | For the frequency/combined method, DNA frequencies for each sample have to be provided either in a `.tsv` file (sample identifier frequency) or as a metadata field. If none is provided, the sum of all counts in the input table is used for the frequency calculation.
89 |
90 | For the prevalence/combined method, file(s) with a list of sample identifiers or a metadata field/value can be provided. If none is provided, all samples defined in the "controls" are considered for the prevalence calculation.
91 |
92 | Below an example of how to set-up the configuration file for DECONTAM:
93 |
94 | ```yaml
95 | external:
96 | decontam:
97 | threshold: 0.1 # P* hyperparameter threshold, values between 0 and 1
98 | method: "frequency" # Options: frequency, prevalence, combined
99 | frequency_file: "path/file1.txt"
100 | # frequency_metadata: "Field1"
101 | # prevalence_file:
102 | # - "path/file1.txt"
103 | # - "path/file2.txt"
104 | prevalence_metadata:
105 | "Field1":
106 | - "ValueA"
107 | - "ValueB"
108 | "Field2":
109 | - "ValueC"
110 | ```
111 |
112 | ## Using the configuration file
113 |
114 | Example [UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom){ target="_blank" } file from [microbiomedb.org](https://microbiomedb.org){ target="_blank" }
115 |
116 | config.yml (external .yml files are available in the [GRIMER repository](https://github.com/pirovc/grimer/tree/main/files){ target="_blank" })
117 |
118 | ```yml
119 | references:
120 | "Contaminants": "files/contaminants.yml"
121 | "Human-related": "files/human-related.yml"
122 |
123 | external:
124 | mgnify: "files/mgnify5989.tsv"
125 | decontam:
126 | threshold: 0.1 # [0-1] P* hyperparameter
127 | method: "frequency" # frequency, prevalence, combined
128 | ```
129 |
130 | Running GRIMER with DECONTAM and MGnify integration
131 |
132 | ```bash
133 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom \
134 | --config config.yml \
135 | --decontam --mgnify \
136 | --taxonomy ncbi \
137 | --ranks superkingdom phylum class order family genus species
138 | ```
--------------------------------------------------------------------------------
/docs/examples.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | Examples of reports generated with [GRIMER](https://github.com/pirovc/grimer)
4 |
5 | ---
6 |
7 | ### Data analysis from Leiby et al. "Lack of detection of a human placenta microbiome in samples from preterm and term deliveries"
8 |
9 | ***original publication: [10.1186/s40168-018-0575-4](https://doi.org/10.1186/s40168-018-0575-4){ target="_blank" }***
10 |
11 | **[GRIMER report MGS](https://pirovc.github.io/grimer-reports/placenta/placenta_mgs.html){ target="_blank" }**
12 |
13 | **[GRIMER report AMPLICON](https://pirovc.github.io/grimer-reports/placenta/placenta_amplicon.html){ target="_blank" }**
14 |
15 |
16 | commands used to create report
17 |
18 | ```bash
19 | # Download files (table, metadata and config)
20 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/placenta/placenta_files.tar.gz
21 | tar xf placenta_files.tar.gz
22 |
23 | # Run GRIMER
24 | # AMPLICON
25 | grimer --config placenta_amplicon_config.yaml \
26 | --input-file placenta_amplicon_table.tsv \
27 | --metadata-file placenta_metadata.tsv \
28 | --taxonomy ncbi \
29 | --ranks superkingdom phylum class order family genus species \
30 | --level-separator ";" \
31 | --obs-replace "^.+__" "" "_" " " \
32 | --unassigned-header "Unassigned" \
33 | --decontam --mgnify --transpose \
34 | --title "Placenta study AMPLICON - Leiby, J.S. et al 2018" \
35 | --output-html placenta_amplicon.html
36 |
37 | # MGS
38 | grimer --config placenta_mgs_config.yaml \
39 | --input-file placenta_mgs_table.tsv \
40 | --metadata-file placenta_metadata.tsv \
41 | --taxonomy ncbi \
42 | --ranks superkingdom phylum class order family genus species \
43 | --level-separator "|" \
44 | --unassigned-header "unassigned" \
45 | --decontam --mgnify \
46 | --title "Placenta study MGS - Leiby, J.S. et al 2018" \
47 | --output-html placenta_mgs.html
48 | ```
49 |
50 |
51 | ---
52 |
53 | ### KatharoSeq analysis from Minich et al. "KatharoSeq Enables High-Throughput Microbiome Analysis from Low-Biomass Samples"
54 |
55 | ***original publication: [10.1128/mSystems.00218-17](https://doi.org/10.1128/mSystems.00218-17){ target="_blank" }***
56 |
57 | **[GRIMER report](https://pirovc.github.io/grimer-reports/katharoseq/katharoseq.html){ target="_blank" }**
58 |
59 |
60 | commands used to create report
61 |
62 | ```bash
63 | # Download files (table, metadata and config)
64 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/katharoseq/katharoseq_files.tar.gz
65 | tar xf katharoseq_files.tar.gz
66 |
67 | # Run GRIMER
68 | grimer --config katharoseq_config.yaml \
69 | --input-file katharoseq_table.tsv \
70 | --metadata-file katharoseq_metadata.tsv \
71 | --transformation clr \
72 | --obs-replace "^.+__" "" "_" " " \
73 | --taxonomy ncbi \
74 | --ranks superkingdom phylum class order family genus species \
75 | --level-separator ";" \
76 | --decontam --mgnify \
77 | --title "KatharoSeq - Minich et al. 2018" \
78 | --output-html katharoseq.html
79 | ```
80 |
81 |
82 |
83 | ---
84 |
85 | ### Preterm Infant Resistome downloaded from [MicrobiomeDB](https://microbiomedb.org/mbio/app/record/dataset/DS_82fe0308e2){ target="_blank" }
86 |
87 | ***original publication: [10.1038/nmicrobiol.2016.24](https://doi.org/10.1038/nmicrobiol.2016.24){ target="_blank" }***
88 |
89 | **[GRIMER report](https://pirovc.github.io/grimer-reports/microbiomedb/ResistomeAmplicon.html){ target="_blank" }**
90 |
91 |
92 | commands used to create report
93 |
94 | ```bash
95 | # Download files (table, metadata and config) - Original source: https://microbiomedb.org/common/downloads/release-22/82fe0308e2032de2041694df6592ba542ea84b86/ResistomeAmplicon.16s_DADA2.taxon_abundance.biom
96 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/microbiomedb/microbiomedb_files.tar.gz
97 | tar xf microbiomedb_files.tar.gz
98 |
99 | # Run GRIMER
100 | grimer --config ResistomeAmplicon.16s_DADA2_config.yaml \
101 | --input-file ResistomeAmplicon.16s_DADA2.taxon_abundance.biom \
102 | --taxonomy ncbi \
103 | --ranks superkingdom phylum class order family genus species \
104 | --decontam --mgnify \
105 | --title "MicrobiomeDB Preterm Infant Resistome (V4)" \
106 | --output-html ResistomeAmplicon.html
107 | ```
108 |
109 |
110 |
111 | ---
112 |
113 | ### Antibiotic induced changes in the microbiota disrupt redox dynamics in the gut downloaded from [MGnify](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005180){ target="_blank" }
114 |
115 | ***original publication [10.7554/elife.35987](https://doi.org/10.7554/elife.35987){ target="_blank" }***
116 |
117 | **[GRIMER report](https://pirovc.github.io/grimer-reports/mgnify/MGYS00005180.html){ target="_blank" }**
118 |
119 |
120 | commands used to create report
121 |
122 | ```bash
123 | # Script to download files and generate GRIMER report from any MGnify study accession
124 | # Requires "jsonapi-client>=0.9.7" (conda install "jsonapi-client>=0.9.7")
125 | ./grimer-mgnify.py -i MGYS00005180 -o MGYS00005180 -g "--decontam --mgnify"
126 |
127 | # Or directly from files
128 | wget https://raw.githubusercontent.com/pirovc/grimer-reports/main/mgnify/mgnify_files.tar.gz
129 | tar xf mgnify_files.tar.gz
130 | # Run GRIMER
131 | grimer --config MGYS00005180_config.yaml \
132 | --input-file MGYS00005180_ERP108433_taxonomy_abundances_SSU_v4.1.tsv \
133 | --metadata-file MGYS00005180_metadata.tsv \
134 | --obs-replace "^.+__" "" "_" " " \
135 | --taxonomy ncbi \
136 | --ranks superkingdom kingdom phylum class order family genus species \
137 | --level-separator ";" \
138 | --decontam --mgnify \
139 | --title "MGnify study accession MGYS00005180" \
140 | --output-html MGYS00005180.html
141 | ```
142 |
143 |
144 |
145 | ---
--------------------------------------------------------------------------------
/docs/img/correlation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/correlation.png
--------------------------------------------------------------------------------
/docs/img/correlation_genus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/correlation_genus.png
--------------------------------------------------------------------------------
/docs/img/heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/heatmap.png
--------------------------------------------------------------------------------
/docs/img/heatmap_cluster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/heatmap_cluster.png
--------------------------------------------------------------------------------
/docs/img/heatmap_group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/heatmap_group.png
--------------------------------------------------------------------------------
/docs/img/help.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/help.png
--------------------------------------------------------------------------------
/docs/img/logo_panels.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/logo_panels.png
--------------------------------------------------------------------------------
/docs/img/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview.png
--------------------------------------------------------------------------------
/docs/img/overview_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_bars.png
--------------------------------------------------------------------------------
/docs/img/overview_decontam.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_decontam.png
--------------------------------------------------------------------------------
/docs/img/overview_info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_info.png
--------------------------------------------------------------------------------
/docs/img/overview_mgnify.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_mgnify.png
--------------------------------------------------------------------------------
/docs/img/overview_mgnify_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_mgnify_2.png
--------------------------------------------------------------------------------
/docs/img/overview_references.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_references.png
--------------------------------------------------------------------------------
/docs/img/overview_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_table.png
--------------------------------------------------------------------------------
/docs/img/overview_table_strep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/overview_table_strep.png
--------------------------------------------------------------------------------
/docs/img/samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/samples.png
--------------------------------------------------------------------------------
/docs/img/samples_bars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/samples_bars.png
--------------------------------------------------------------------------------
/docs/img/samples_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/samples_table.png
--------------------------------------------------------------------------------
/docs/img/tools.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/docs/img/tools.png
--------------------------------------------------------------------------------
/docs/importing.md:
--------------------------------------------------------------------------------
1 | # Importing files
2 |
3 | GRIMER is independent of any quantification method and requires a contingency table with raw counts of observations/components for each samples/compositions in the study. Observations are usually, but not limited to, taxonomic entries (e.g. genus, species, strains), operational taxonomic units (OTUs), amplicon sequence variants (ASVs), metagenome-assembled genomes (MAGs) or sequence features.
4 |
5 | GRIMER `--input-file` accepts a file with tab-separated values (.tsv) containing a table of counts (Observation table, Count table, Contingency Tables, ...) or a [.biom](https://biom-format.org/){ target="_blank" } file.
6 |
7 | ## The Biological Observation Matrix file (.biom)
8 |
9 | GRIMER parses [BIOM](https://biom-format.org/){ target="_blank" } files and affiliated metadata, if available. Alternatively, an external metadata file can be provided with `-m/--metadata` and will take precedence over the .biom metadata.
10 |
11 | Example [UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom){ target="_blank" } file from [microbiomedb.org](https://microbiomedb.org){ target="_blank" }
12 |
13 | - Default report (no taxonomy)
14 |
15 | ```bash
16 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom
17 | ```
18 |
19 | - Integrated NCBI taxonomy (will translate names to taxonomy ids)
20 |
21 | ```bash
22 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom \
23 | --taxonomy ncbi \
24 | --ranks superkingdom phylum class order family genus species
25 | ```
26 |
27 | - Using an external metadata file ([UgandaMaternalV3V4.16s_DADA2.sample_details.tsv](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.sample_details.tsv){ target="_blank" })
28 |
29 | ```bash
30 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.biom \
31 | --metadata-file UgandaMaternalV3V4.16s_DADA2.sample_details.tsv \
32 | --taxonomy ncbi \
33 | --ranks superkingdom phylum class order family genus species
34 | ```
35 |
36 | ## tab-separated file (.tsv)
37 |
38 | GRIMER parses .tsv files with single taxonomic identifier/names annotations or with multi-level (e.g.: lineage) taxonomic annotated observations.
39 |
40 | - Rows contain observations and columns contain samples (use `--transpose` if your file is reversed)
41 | - First column and first row are used as headers
42 | - Taxonomy integration: files can have either taxonomic identifiers (NCBI, e.g.: 562) or taxonomic names (NCBI, e.g.: Escherichia coli or GTDB, e.g.: s__Escherichia coli)
43 |
44 | ### Multi-level annotations (e.g. Bacteria;Proteobacteria;Gammaproteobacteria...)
45 |
46 | - Example [UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv){ target="_blank" } file from [microbiomedb.org](https://microbiomedb.org){ target="_blank" }
47 |
48 |
49 | ```bash
50 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv \
51 | --level-separator ";"
52 | ```
53 |
54 | - With metadata ([UgandaMaternalV3V4.16s_DADA2.sample_details.tsv](https://microbiomedb.org/common/downloads/release-31/c66d2dc8473138e3a737ef2ad0b25f1e6e9c0f22/UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv))
55 |
56 | ```bash
57 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv \
58 | --level-separator ";" \
59 | --metadata-file UgandaMaternalV3V4.16s_DADA2.sample_details.tsv
60 | ```
61 |
62 | - With integrated NCBI taxonomy (will translate names to taxids)
63 |
64 | ```bash
65 | grimer --input-file UgandaMaternalV3V4.16s_DADA2.taxon_abundance.tsv \
66 | --level-separator ";" \
67 | --metadata-file UgandaMaternalV3V4.16s_DADA2.sample_details.tsv \
68 | --taxonomy ncbi \
69 | --ranks superkingdom phylum class order family genus species
70 | ```
71 |
72 | ### Single level annotations (e.g. Neisseria animalis)
73 |
74 | - Example [ERP108433_phylum_taxonomy_abundances_SSU_v4.1.tsv](https://www.ebi.ac.uk/metagenomics/api/v1/studies/MGYS00005180/pipelines/4.1/file/ERP108433_phylum_taxonomy_abundances_SSU_v4.1.tsv) from [MGnify](https://www.ebi.ac.uk/metagenomics), phylum level only
75 |
76 | ```bash
77 | # Removing first column with kingdom
78 | cut -f 2- ERP108433_phylum_taxonomy_abundances_SSU_v4.1.tsv > ERP108433_phylum_taxonomy_abundances_SSU_v4.1_parsed.tsv
79 | # Set identifier for unassigned observations as "Unassigned" (many occurences, will be summed)
80 | grimer --input-file ERP108433_phylum_taxonomy_abundances_SSU_v4.1_parsed.tsv \
81 | --unassigned-header "Unassigned"
82 | ```
83 |
84 | - Re-generating taxonomic lineage from single annotations (in this case only superkingdom)
85 |
86 | ```bash
87 | grimer --input-file ERP108433_phylum_taxonomy_abundances_SSU_v4.1_parsed.tsv \
88 | --unassigned-header "Unassigned" \
89 | --taxonomy ncbi \
90 | --ranks superkingdom phylum
91 | ```
92 |
93 | ## From commonly used tools/sources
94 |
95 | ### ganon
96 |
97 | ```bash
98 | ganon table --input *.tre \
99 | --output-file ganon_table.tsv \
100 | --header taxid \
101 | --rank species
102 |
103 | grimer --input-file ganon_table.tsv \
104 | --taxonomy ncbi \
105 | --ranks superkingdom phylum class order family genus species
106 | ```
107 |
108 | ### MetaPhlAn
109 |
110 | ```bash
111 | # merge_metaphlan_tables.py is available with the metaphlan package
112 | merge_metaphlan_tables.py *.tsv | head -n+2 > metaphlan_table.tsv
113 |
114 | grimer --input-file metaphlan_table.tsv \
115 | --level-separator "|" \
116 | --obs-replace '^.+__' '' '_' ' ' \
117 | --taxonomy ncbi \
118 | --ranks superkingdom phylum class order family genus species
119 | ```
120 |
121 | ### QIIME2 feature table (.qza)
122 |
123 | - Example [feature-table.qza](https://docs.qiime2.org/2022.8/data/tutorials/exporting/feature-table.qza) from [QIIME2 docs](https://docs.qiime2.org/2022.8/tutorials/exporting/#exporting-a-feature-table)
124 |
125 | ```bash
126 | qiime tools export --input-path feature-table.qza --output-path exported-feature-table
127 | grimer --input-file exported-feature-table/feature-table.biom
128 | ```
129 |
130 | ### phyloseq
131 |
132 |
133 | ```R
134 | #source("http://bioconductor.org/biocLite.R")
135 | #biocLite("biomformat")
136 | #biocLite('phyloseq')
137 | library("biomformat")
138 | library('phyloseq')
139 | data(soilrep)
140 | b <- make_biom(data = otu_table(soilrep))
141 | write_biom(b, 'out.biom')
142 | ```
143 |
144 | ```bash
145 | grimer --input-file out.biom
146 | ```
147 |
148 | ### MGnify
149 |
150 | - `grimer-mgnify.py` will download and generate a GRIMER report for any MGnify study accession (e.g. MGYS00006024)
151 |
152 | ```bash
153 | # Install API dependency
154 | conda install "jsonapi-client>=0.9.7"
155 | ./grimer-mgnify.py -i MGYS00006024 -o out_folder_mgnify/
156 | ```
157 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # GRIMER
2 |
3 |
4 |
5 | ## About
6 |
7 | GRIMER is a tool that performs automated analyses and generates a portable and interactive dashboard integrating annotation, taxonomy and metadata. It unifies several sources of evidence to help detect contamination. GRIMER is independent of quantification methods and directly analyses contingency tables to create an interactive and offline report. Reports can be created in seconds and are accessible for non-specialists, providing an intuitive set of charts to explore data distribution among observations and samples and its connections with external sources.
8 |
9 | - More information about the method: [pre-print](https://doi.org/10.1101/2021.06.22.449360){ target="_blank" }
10 | - Source-code: [GitHub repository](https://github.com/pirovc/grimer){ target="_blank" }
11 |
12 | ## Installation
13 |
14 | Via conda
15 |
16 | ```bash
17 | conda install -c bioconda -c conda-forge grimer
18 | ```
19 |
20 | or locally installing only dependencies via conda:
21 |
22 | ```bash
23 | git clone https://github.com/pirovc/grimer.git
24 | cd grimer
25 | conda env create -f env.yaml # or mamba env create -f env.yaml
26 | conda activate grimer # or source activate grimer
27 | python setup.py install --record files.txt # Uninstall: xargs rm -rf < files.txt
28 | grimer -h
29 | ```
30 |
31 | ## Basic Usage
32 |
33 | - In-depth examples of input files: [Importing files](importing)
34 | - Complete examples of usage with real files: [Examples](examples)
35 |
36 |
37 | Tab-separated input table
38 |
39 | ```bash
40 | grimer -i input_table.tsv
41 | ```
42 |
43 | BIOM file
44 | ```bash
45 | grimer -i myfile.biom
46 | ```
47 |
48 | Tab-separated input table with taxonomic annotated observations (e.g. sk__Bacteria;k__;p__Actinobacteria;c__Actinobacteria...)
49 | ```bash
50 | grimer -i input_table.tsv -f ";"
51 | ```
52 |
53 | Tab-separated input table with metadata
54 | ```bash
55 | grimer -i input_table.tsv -m metadata.tsv
56 | ```
57 |
58 | With taxonomy integration (ncbi)
59 | ```bash
60 | grimer -i input_table.tsv -m metadata.tsv -t ncbi #optional -b taxdump.tar.gz
61 | ```
62 |
63 | With configuration file to setup external tools, references and annotations
64 | ```bash
65 | grimer -i input_table.tsv -m metadata.tsv -t ncbi -c config/default.yaml -d -g
66 | ```
67 |
68 | ## Parameters
69 |
70 |
71 | ▄████ ██▀███ ██▓ ███▄ ▄███▓▓█████ ██▀███
72 | ██▒ ▀█▒▓██ ▒ ██▒▓██▒▓██▒▀█▀ ██▒▓█ ▀ ▓██ ▒ ██▒
73 | ▒██░▄▄▄░▓██ ░▄█ ▒▒██▒▓██ ▓██░▒███ ▓██ ░▄█ ▒
74 | ░▓█ ██▓▒██▀▀█▄ ░██░▒██ ▒██ ▒▓█ ▄ ▒██▀▀█▄
75 | ░▒▓███▀▒░██▓ ▒██▒░██░▒██▒ ░██▒░▒████▒░██▓ ▒██▒
76 | ░▒ ▒ ░ ▒▓ ░▒▓░░▓ ░ ▒░ ░ ░░░ ▒░ ░░ ▒▓ ░▒▓░
77 | ░ ░ ░▒ ░ ▒░ ▒ ░░ ░ ░ ░ ░ ░ ░▒ ░ ▒░
78 | ░ ░ ░ ░░ ░ ▒ ░░ ░ ░ ░░ ░
79 | ░ ░ ░ ░ ░ ░ ░
80 | version 1.1.0
81 |
82 |
83 | usage: grimer [-h] -i INPUT_FILE [-m METADATA_FILE] [-c CONFIG]
84 | [-t {ncbi,gtdb,silva,greengenes,ott}] [-b [TAXONOMY_FILES ...]] [-r [RANKS ...]]
85 | [-l TITLE] [-p [{overview,samples,heatmap,correlation} ...]] [-o OUTPUT_HTML]
86 | [--full-offline] [-g] [-d] [-f LEVEL_SEPARATOR] [-y VALUES] [-w] [-s]
87 | [-u [UNASSIGNED_HEADER ...]] [-z REPLACE_ZEROS] [--obs-replace [OBS_REPLACE ...]]
88 | [--sample-replace [SAMPLE_REPLACE ...]] [--min-frequency MIN_FREQUENCY]
89 | [--max-frequency MAX_FREQUENCY] [--min-count MIN_COUNT] [--max-count MAX_COUNT]
90 | [-j TOP_OBS_BARS] [-a {none,norm,log,clr}] [-e METADATA_COLS] [--optimal-ordering]
91 | [--show-zeros]
92 | [--linkage-methods [{single,complete,average,centroid,median,ward,weighted} ...]]
93 | [--linkage-metrics [{braycurtis,canberra,chebyshev,cityblock,correlation,cosine,dice,euclidean,hamming,jaccard,jensenshannon,kulsinski,kulczynski1,mahalanobis,minkowski,rogerstanimoto,russellrao,seuclidean,sokalmichener,sokalsneath,sqeuclidean,yule} ...]]
94 | [--skip-dendrogram] [-x TOP_OBS_CORR] [-v]
95 |
96 | optional arguments:
97 | -h, --help show this help message and exit
98 | -v, --version show program's version number and exit
99 |
100 | required arguments:
101 | -i INPUT_FILE, --input-file INPUT_FILE
102 | Tab-separatad file with table with counts (Observation table, Count table,
103 | Contingency Tables, ...) or .biom file. By default rows contain observations
104 | and columns contain samples (use --transpose if your file is reversed). The
105 | first column and first row are used as headers. (default: None)
106 |
107 | main arguments:
108 | -m METADATA_FILE, --metadata-file METADATA_FILE
109 | Tab-separated file with metadata. Rows should contain samples and columns
110 | the metadata fields. QIIME2 metadata format is accepted, with an extra row
111 | to define categorical and numerical fields. If --input-file is a .biom file,
112 | metadata will be extracted from it if available. (default: None)
113 | -c CONFIG, --config CONFIG
114 | Configuration file with definitions of references, controls and external
115 | tools. (default: None)
116 | -t {ncbi,gtdb,silva,greengenes,ott}, --taxonomy {ncbi,gtdb,silva,greengenes,ott}
117 | Enable taxonomic analysis, convert entries and annotate samples. Files will
118 | be automatically downloaded and parsed. Optionally, stored files can be
119 | provided with --taxonomy-files. (default: None)
120 | -b [TAXONOMY_FILES ...], --taxonomy-files [TAXONOMY_FILES ...]
121 | Specific taxonomy files to use with --taxonomy. (default: [])
122 | -r [RANKS ...], --ranks [RANKS ...]
123 | Taxonomic ranks to generate visualizations. Use 'default' to use entries
124 | from the table directly. (default: ['default'])
125 |
126 | output arguments:
127 | -l TITLE, --title TITLE
128 | Title to display on the top of the report. (default: )
129 | -p [{overview,samples,heatmap,correlation} ...], --output-plots [{overview,samples,heatmap,correlation} ...]
130 | Plots to generate. (default: ['overview', 'samples', 'heatmap',
131 | 'correlation'])
132 | -o OUTPUT_HTML, --output-html OUTPUT_HTML
133 | Filename of the HTML report output. (default: output.html)
134 | --full-offline Embed Bokeh javascript library in the output file. Output will be around
135 | 1.5MB bigger but it will work without internet connection. ~your report will
136 | live forever~ (default: False)
137 |
138 | general data options:
139 | -g, --mgnify Plot MGnify, requires --config file with parsed MGnify database. (default:
140 | False)
141 | -d, --decontam Run DECONTAM and generate plots. requires --config file with DECONTAM
142 | configuration. (default: False)
143 | -f LEVEL_SEPARATOR, --level-separator LEVEL_SEPARATOR
144 | If provided, consider --input-table to be a hierarchical multi-level table
145 | where the observations headers are separated by the indicated separator char
146 | (usually ';' or '|') (default: None)
147 | -y VALUES, --values VALUES
148 | Force 'count' or 'normalized' data parsing. Empty to auto-detect. (default:
149 | None)
150 | -w, --cumm-levels Activate if input table has already cummulative values on parent taxonomic
151 | levels. (default: False)
152 | -s, --transpose Transpose --input-table before parsing (if samples are listed on columns and
153 | observations on rows) (default: False)
154 | -u [UNASSIGNED_HEADER ...], --unassigned-header [UNASSIGNED_HEADER ...]
155 | Define one or more header names containing unsassinged/unclassified counts.
156 | (default: None)
157 | -z REPLACE_ZEROS, --replace-zeros REPLACE_ZEROS
158 | Treat zeros in the input table. INT (add 'smallest count' divided by INT to
159 | every value), FLOAT (add FLOAT to every value). Default: 1000 (default:
160 | 1000)
161 | --obs-replace [OBS_REPLACE ...]
162 | Replace values on observations labels/headers (supports regex). Example: '_'
163 | ' ' will replace underscore with spaces, '^.+__' '' will remove the matching
164 | regex. Several pairs of instructions are supported. (default: [])
165 | --sample-replace [SAMPLE_REPLACE ...]
166 | Replace values on sample labels/headers (supports regex). Example: '_' ' '
167 | will replace underscore with spaces, '^.+__' '' will remove the matching
168 | regex. Several pairs of instructions are supported. (default: [])
169 | --min-frequency MIN_FREQUENCY
170 | Define minimum number/percentage of samples containing an observation to
171 | keep the observation [values between 0-1 for percentage, >1 specific
172 | number]. (default: None)
173 | --max-frequency MAX_FREQUENCY
174 | Define maximum number/percentage of samples containing an observation to
175 | keep the observation [values between 0-1 for percentage, >1 specific
176 | number]. (default: None)
177 | --min-count MIN_COUNT
178 | Define minimum number/percentage of counts to keep an observation [values
179 | between 0-1 for percentage, >1 specific number]. (default: None)
180 | --max-count MAX_COUNT
181 | Define maximum number/percentage of counts to keep an observation [values
182 | between 0-1 for percentage, >1 specific number]. (default: None)
183 |
184 | Samples options:
185 | -j TOP_OBS_BARS, --top-obs-bars TOP_OBS_BARS
186 | Number of top abundant observations to show in the Samples panel, based on
187 | the avg. percentage counts/sample. (default: 20)
188 |
189 | Heatmap and clustering options:
190 | -a {none,norm,log,clr}, --transformation {none,norm,log,clr}
191 | Transformation of counts for Heatmap. none (counts), norm (percentage), log
192 | (log10), clr (centre log ratio). (default: log)
193 | -e METADATA_COLS, --metadata-cols METADATA_COLS
194 | Available metadata cols to be selected on the Heatmap panel. Higher values
195 | will slow down the report navigation. (default: 3)
196 | --optimal-ordering Activate optimal_ordering on scipy linkage method, takes longer for large
197 | number of samples. (default: False)
198 | --show-zeros Do not skip zeros on heatmap plot. File will be bigger and iteraction with
199 | heatmap slower. By default, zeros will be omitted. (default: False)
200 | --linkage-methods [{single,complete,average,centroid,median,ward,weighted} ...]
201 | --linkage-metrics [{braycurtis,canberra,chebyshev,cityblock,correlation,cosine,dice,euclidean,hamming,jaccard,jensenshannon,kulsinski,kulczynski1,mahalanobis,minkowski,rogerstanimoto,russellrao,seuclidean,sokalmichener,sokalsneath,sqeuclidean,yule} ...]
202 | --skip-dendrogram Disable dendogram plots for clustering. (default: False)
203 |
204 | Correlation options:
205 | -x TOP_OBS_CORR, --top-obs-corr TOP_OBS_CORR
206 | Number of top abundant observations to build the correlationn matrix, based
207 | on the avg. percentage counts/sample. 0 for all (default: 50)
208 |
209 |
210 | ## Powered by
211 |
212 | [
](https://bokeh.org)
213 | [
](https://pandas.org)
214 | [
](https://scipy.org)
215 | [
](https://scikit-bio.org)
216 |
--------------------------------------------------------------------------------
/docs/manual.md:
--------------------------------------------------------------------------------
1 | # GRIMER Reports - User Manual
2 |
3 | ---
4 |
5 | *For this manual, the metagenomics analysis is based on data from Leiby et al. "Lack of detection of a human placenta microbiome in samples from preterm and term deliveries"*
6 |
7 | - **[GRIMER report MGS Leiby et al.](https://pirovc.github.io/grimer-reports/placenta/placenta_mgs.html){ target="_blank" }**
8 |
9 | ---
10 |
11 | GRIMER report contains 4 main panels: [Overview](#overview), [Samples](#samples), [Heatmap](#heatmap), and [Correlation](#correlation). Every panel has one or more visualization and widgets to select, filter, group, and modify its contents.
12 |
13 |
14 |
15 | - Panels can be reported independently with `-p/--output-plots`
16 | - Help buttons
provide details and information about the plot/analysis
17 | - All plots have built-in tools to export a png, show/hide tooltips, zoom in/out, select entries, among other features
18 |
19 | ## Overview
20 |
21 | The Overview panel shows an individual summary for each observation, related annotations and their distribution among samples.
22 |
23 |
24 |
25 | ### Table
26 |
27 | On the top, a table will list the observations of the study (e.g. OTUS, species). If taxons are the observations, entries can be divided into taxonomic ranks.
28 |
29 |
30 |
31 | - It is possible to filter the items listed on the table using the widgets on the rigth
32 | - Each entry will contain some details about the observations (e.g. Frequency among samples, total counts, ...)
33 | - Selecting an item on the table will activate further details of the observation in the other plots of the panel
34 |
35 | For example, the genus *Streptococcus*:
36 |
37 |
38 |
39 | - Appears on 61% of the samples of this study.
40 | - Has an average of 2.5% relative abundance among all samples.
41 | - Was reported in 5 studies as a "common contaminant".
42 | - It is highly present in water and negative control samples.
43 | - It was detected as a possible contaminant by the DECONTAM method.
44 |
45 | ### Side plots
46 |
47 | On the top right, additional plots and information are display once an observation is selected on the Table. In this example, the *Streptococcus* genus is selected.
48 |
49 | #### Info
50 |
51 |
52 |
53 | - Further information about the observation and related references is displayed. In this case, common contaminants sources.
54 |
55 | #### References
56 |
57 |
58 |
59 | - This plot shows the number of counts of the observation in the provided references (and the counts on the taxonomic lineage).
60 | - In the example above, the genus *Streptococcus* was reported 5 times directly in one of the reference sets (common contaminants), and 3 times as parent (some species of *Streptococcus* were reported as contaminants).
61 |
62 | #### MGnify
63 |
64 |
65 |
66 | - This plot shows the number of studies in the MGnify database for the selected observation.
67 | - *Streptococcus* was reported in 316 studies for the biome "Host Associated:Human".
68 | - In-detail biome levels can be selected to define more specific groups. In the biome level 5 (see below), *Streptococcus* was reported to be mostly found in Fecal samples among all MGnify studies.
69 |
70 |
71 |
72 | #### DECONTAM
73 |
74 |
75 |
76 | - This plot can be used to verify the DECONTAM output.
77 | - It shows the proportion of counts of the selected observation (y-axis) against DNA Concentration (if provided) or Total number of counts (x-axis) of each sample, both in log10 scale.
78 | - Controls samples are displayed in a different color.
79 | - An indication of contamination can be defined when counts are inversely proportional to DNA concentration. The red and black dotted lines are the expected models for contamination and non-contamination, respectively, based on the data of the study. A good indication for contamination is when the counts (excluding the control samples) "fit" the red line model.
80 | - The P-score statistic is not a P-value and it is not associated with any guarantees on the type 1 error rate. Small scores indicate the contaminant model is a better fit, and high scores indicate that the non-contaminant model is a better fit.
81 | - More details about the DECONTAM method and output can be found [here](https://benjjneb.github.io/decontam/vignettes/decontam_intro.html){ target="_blank" }.
82 |
83 | ### Sample bars
84 |
85 | This plot summarizes samples content, annotated with general classification metrics (left y-axis). Annotations can be selected on the bottom dropdown lists. Once an observation is selected on the top table, this plot will also show the count of the observation for each sample (right y-axis).
86 |
87 |
88 |
89 | - Bars are showing total number of counts for each sample and are annotated with the percentage of "human-related" taxa, provided as a reference.
90 | - The x-axis is grouped by two metadata variables: Type and Case/control. Each sub-group is sorted based on the number of counts (in this case reads).
91 | - Yellow circles (right y-axis) are showing the amount of the selected observation (*Streptococcus*) for each one of the samples in a log scale.
92 | - Parent taxonomic ranks can be activated on the top-rigth legend.
93 |
94 | ## Samples
95 |
96 | In-depth evaluation of individual samples can be performed in this panel.
97 |
98 |
99 |
100 | ### Table
101 |
102 |
103 |
104 | - The top table lists all samples in the study, with information about assigned and unassigned counts.
105 | - Further information for the abundance of each taxonomic rank is display if enabled for the report.
106 | - Rows of the table can be selected using the widgets on the right (or manually with the checkboxes). Selected items will be displayed in the bar plot. In the example above, only samples belonging to the "Maternal Saliva" category for the metadata field "Type" are selected.
107 |
108 | ### Bars
109 |
110 |
111 |
112 | - Observation bars showing proportions of top most abundant taxa. The number of items to be displayed can defined with the parameter `-j/--top-obs-bars`
113 | - In the example above, genus level proportions are displayed only for the items selected in the table.
114 | - The bars are grouped by Case/Control and antibiotic usage. Samples are sorted by the *Streptococcus* (1) abundances within each group.
115 |
116 | ## Heatmap
117 |
118 | Several transformations (`-a/--transformation`) can be applied to the data (normalization, log, center log ratio) to be further visualized in the Heatmap panel. Hierarchical clustering, grouping and sorting options can be independently selected for samples and observations to enable pattern detection (e.g. batch effects, treatment effects etc).
119 |
120 |
121 |
122 | - The heatmap shows values for samples (y-axis) and observations (x-axis).
123 | - Side panels link metadata and annotation information to the heatmap axis.
124 | - By default, all external references are displayed. Metadata field(s) can manually selected in bottom-right list.
125 |
126 | ### Clustering
127 |
128 |
129 |
130 | - Heatmap data can be sorted by hierarchical/agglomerative clustering.
131 | - More clustering methods and metrics can be generated using the parameter `--linkage-methods` and `--linkage-metrics`.
132 | - Additional dendrograms are displayed on the clustered axis.
133 | - Here is possible to explore effects of clusters of data in the external panels (annotations and metadata).
134 |
135 | ### Grouping
136 |
137 |
138 |
139 | - Grouping of the heatmap data can be done by taxonomic ranks for observation and metadata for samples.
140 | - Data can be further sorted among groups.
141 |
142 | ## Correlation
143 |
144 |
145 |
146 | Correlation between all observations in the study are plotted as a heatmap matrix. Positive or negative correlations among observations point to concurrent signals in the microbiome analysis (e.g. certain species with similar abudances in the study).
147 |
148 |
149 |
150 | - In the example above, only top observations are displayed. This can be changed with the parameter `-x/--top-obs-corr`.
151 | - Only highly positive or negative correlated organisms are displayed: -1 to -0.8 and 0.8 to 1, respectively
152 | - Organisms highly correlated can be further investigated in the Overview and Heatmap panels.
--------------------------------------------------------------------------------
/env.yaml:
--------------------------------------------------------------------------------
1 | name: grimer
2 | channels:
3 | - defaults
4 | - bioconda
5 | - conda-forge
6 | dependencies:
7 | - bokeh==2.2.3
8 | - pandas
9 | - numpy
10 | - scipy>=1.6.0
11 | - scikit-bio>=0.5.6
12 | - multitax>=1.2.1
13 | - markdown
14 | - biom-format>=2.1.10
15 | - r-base>=4.0.0 #DECONTAM
16 | - bioconductor-decontam==1.10.0 #DECONTAM
17 | - r-optparse==1.6.6 #DECONTAM
18 | - jinja2==3.0.3 # newer version do not work with bokeh==2.2.3
--------------------------------------------------------------------------------
/files/README.md:
--------------------------------------------------------------------------------
1 | # GRIMER References and other files
2 |
3 | ## Reference file format
4 |
5 | 1) File with a list (one per line) of taxonomic identifiers or taxonomic names
6 |
7 | 2) or formatted `.yml` file:
8 |
9 | ```yaml
10 | "General Description":
11 | "Specific description":
12 | url: "www.website.com?id={}"
13 | ids: [1,2,3]
14 | ```
15 |
16 | The url can be a link to the entries listed on the id. Use the `{}` as a placeholder for the id. Example: `https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={}`
17 |
18 | The files should be provided in the main configuration file for grimer as follows:
19 |
20 | ```yaml
21 | references:
22 | "Contaminants": "files/contaminants.yml"
23 | "Human-related": "files/human-related.yml"
24 | "CUSTOM CONTAMINANTS": "file.txt"
25 | "LAB RELATED BACTERIA": "another_file.yml"
26 | ```
27 |
28 | ### contaminants.yml
29 |
30 | Last update: 2022-03-09
31 |
32 | Manually curated from diverse publications:
33 |
34 | | Organism group | Genus | Species | Reference |
35 | |----------------|-------|---------|-----------|
36 | | Bacteria | 6 | 0 | 1998 Tanner, M.A. et al. |
37 | | Bacteria | 0 | 10 | 2002 Kulakov, L.A. et al. |
38 | | Bacteria | 4 | 0 | 2003 Grahn, N. et al. |
39 | | Bacteria | 16 | 0 | 2006 Barton, H.A. et al. |
40 | | Bacteria | 11 | 1 | 2014 Laurence, M. et al.|
41 | | Bacteria | 92 | 0 | 2014 Salter, S.J. et al. |
42 | | Bacteria | 7 | 0 | 2015 Jervis-Bardy, J. et al. |
43 | | Bacteria | 28 | 0 | 2015 Jousselin, E. et al. |
44 | | Bacteria | 77 | 127 | 2016 Glassing, A. et al.|
45 | | Bacteria | 23 | 0 | 2016 Lauder, A.P. et al. |
46 | | Bacteria | 6 | 0 | 2016 Lazarevic, V. et al. |
47 | | Bacteria | 62 | 0 | 2017 Salter, S.J. et al. |
48 | | Bacteria | 0 | 122 | 2018 Kirstahler, P. et al. |
49 | | Bacteria | 34 | 0 | 2018 Stinson, L.F. et al. |
50 | | Bacteria | 18 | 0 | 2019 Stinson, L.F. et al. |
51 | | Bacteria | 52 | 2 | 2019 Weyrich, L.S. et al. |
52 | | Bacteria | 8 | 26 | 2019 de Goffau, M.C. et al. |
53 | | Bacteria | 15 | 93 | 2020 Nejman D. et al. |
54 | | Viruses | 0 | 1 | 2015 Kjartansdóttir, K.R. et al. |
55 | | Viruses | 0 | 1 | 2015 Mukherjee, S. et al. |
56 | | Viruses | 0 | 291 | 2019 Asplund, M. et al. |
57 | | Eukaryota | 0 | 3 | 2016 Czurda, S. et al. |
58 | | Eukaryota | 0 | 1 | PRJNA168|
59 | | Total (unique) | 210 | 627 | |
60 |
61 | ### human-related.yml
62 |
63 | Last update: 2022-03-09
64 |
65 | Manually curated from from: Byrd, A., Belkaid, Y. & Segre, J. The human skin microbiome. Nat Rev Microbiol 16, 143–155 (2018). https://doi.org/10.1038/nrmicro.2017.157
66 |
67 | ```yaml
68 | "Top organisms form the human skin microbiome":
69 | "Bacteria":
70 | url: "https://doi.org/10.1038/nrmicro.2017.157"
71 | ids: [257758, 225324, 169292, 161879, 146827, 43765, 38304, 38287, 38286, 29466, 29388, 28037, 1747, 1305, 1303, 1290, 1282, 1270]
72 | "Eukarya":
73 | url: "https://doi.org/10.1038/nrmicro.2017.157"
74 | ids: [2510778, 1047171, 379413, 119676, 117179, 76777, 76775, 76773, 44058, 41880, 36894, 34391, 31312, 5480, 5068, 3074, 2762]
75 | "Viruses":
76 | url: "https://doi.org/10.1038/nrmicro.2017.157"
77 | ids: [185639, 746832, 10566, 493803, 10279, 746830, 746831, 46771]
78 | ```
79 |
80 | BacDive and eHOMD specific subsets. Dump date: 2022-03-09
81 |
82 | ```bash
83 | scripts/bacdive_download.py
84 | scripts/ehomd_download.py
85 | ```
86 |
87 | ## MGnify
88 |
89 | The downloaded MGnify database file should be provided in the main configuration file for grimer as follows:
90 |
91 | ```yaml
92 | external:
93 | mgnify: "files/mgnify5989.tsv"
94 | ```
95 | ### mgnify.tsv
96 |
97 | MGnify dump date: 2022-03-09 (latest study accession MGYS00005989)
98 |
99 | ```bash
100 | seq -f "MGYS%08g" 256 5989 | xargs -P 24 -I {} scripts/mgnify_download.py -i {} -v -g -o mgnify_dump_5989/ > mgnify_dump_5989.log 2>|1 |
101 | scripts/mgnify_extract.py -f mgnify_dump_5989 -t 10 -o files/mgnify.tsv
102 | ```
103 |
--------------------------------------------------------------------------------
/files/contaminants.yml:
--------------------------------------------------------------------------------
1 | "Common Bacterial contaminants":
2 | "2019 de Goffau, M.C. et al.":
3 | url: "http://doi.org/10.1038/s41586-019-1451-5"
4 | ids: [407, 335058, 504481, 1747, 40324, 1033, 38304, 28037, 470, 29448, 1828, 92793, 75, 375, 180282, 851, 301302, 853, 816, 33870, 85698, 87883, 147207, 68909, 1043493, 293256, 1134405, 410, 321895, 432308, 1416628, 1314, 1343, 69359]
5 | "2018 Kirstahler, P. et al.":
6 | url: "http://doi.org/10.1038/s41598-018-22416-4"
7 | ids: [1747, 40324, 470, 29448, 294, 1828, 39491, 40214, 28090, 134533, 108981, 202956, 239935, 28117, 28116, 818, 820, 161879, 33011, 80866, 853, 823, 821, 296, 303, 34073, 2559073, 1055192, 106648, 1075768, 1076, 1112209, 1131812, 1150298, 1159870, 1160721, 1198452, 1217692, 1276755, 1320556, 134534, 1353941, 136273, 1435036, 147645, 1492737, 1492738, 1497615, 1504823, 1509403, 1519439, 1538644, 1619232, 162426, 1646498, 165179, 1654716, 1665556, 1678129, 169292, 1706231, 1714344, 172088, 1736272, 1736280, 1736296, 1736316, 1736528, 1736532, 1740090, 1833, 1835254, 192843, 202952, 202954, 211589, 216465, 245, 246602, 246787, 247, 266749, 2702, 2736, 285, 28901, 29536, 310297, 310298, 33010, 34062, 346179, 362413, 362418, 370974, 38303, 387661, 40520, 418240, 46506, 47920, 503361, 50340, 52133, 529884, 53412, 55197, 55508, 5665, 64974, 70863, 75659, 756892, 76773, 80878, 80882, 86182, 96345, 986, 989370, 991, 99158]
8 | "2015 Jervis-Bardy, J. et al.":
9 | url: "http://doi.org/10.1186/s40168-015-0083-8"
10 | ids: [286, 48736, 59732, 335058, 41275, 28100, 34072]
11 | "2003 Grahn, N. et al.":
12 | url: "http://doi.org/10.1016/S0378-1097(02)01190-4"
13 | ids: [286, 48736, 40323, 338]
14 | "2020 Nejman D. et al.":
15 | url: "http://doi.org/10.1126/science.aay9189"
16 | ids: [561, 59732, 504481, 1747, 40324, 501783, 68287, 34072, 38304, 28037, 470, 29448, 294, 1828, 56946, 375, 180282, 225324, 729, 31998, 1282, 1290, 40214, 28090, 134533, 108981, 202956, 161879, 80866, 296, 303, 34073, 222991, 958, 232523, 81, 987053, 35812, 74030, 469322, 213484, 86669, 471, 106649, 40215, 358, 475299, 1036779, 151416, 993502, 328552, 43992, 1698, 74316, 293, 41276, 155892, 225991, 106592, 428988, 135517, 77097, 115555, 29570, 115553, 71999, 72000, 1358, 333297, 560405, 381630, 334852, 944322, 269069, 84292, 82380, 1270, 205844, 94625, 94626, 34004, 47496, 431058, 431059, 98513, 129817, 316, 76761, 256325, 228654, 190721, 329, 38313, 623, 323621, 352475, 382, 258, 59803, 185950, 93064, 68569, 370959, 172044, 117207, 33050, 1304, 1305]
17 | "2019 Weyrich, L.S. et al.":
18 | url: "http://doi.org/10.1111/1755-0998.13011"
19 | ids: [561, 286, 407, 48736, 40323, 469, 59732, 237, 335058, 1743, 13687, 374, 32008, 283, 1716, 44249, 504481, 1279, 1301, 1654, 1386, 501783, 106589, 212791, 1350, 129337, 724, 1578, 1357, 68287, 838, 222, 1485, 46913, 2745, 1269, 165696, 84567, 165695, 343873, 358705, 239935, 59753, 29330, 174708, 57495, 332102, 1243, 665874, 68, 2742, 1847, 114248, 94008]
20 | "2014 Laurence, M. et al.":
21 | url: "http://doi.org/10.1371/journal.pone.0097876"
22 | ids: [561, 286, 48736, 40323, 237, 13687, 374, 32008, 379, 222, 357, 1230476]
23 | "1998 Tanner, M.A. et al.":
24 | url: "http://doi.org/10.1128/AEM.64.8.3110-3113.1998"
25 | ids: [561, 40323, 469, 963, 75654, 88]
26 | "2014 Salter, S.J. et al.":
27 | url: "http://doi.org/10.1186/s12915-014-0087-z"
28 | ids: [561, 286, 407, 48736, 40323, 469, 59732, 237, 335058, 1743, 13687, 374, 32008, 283, 1716, 963, 44249, 1301, 1033, 1663, 1386, 55080, 41275, 106589, 212791, 149698, 68287, 28100, 379, 1827, 34072, 46123, 12916, 92793, 182269, 75, 281915, 46913, 75654, 29580, 32257, 57493, 88, 1269, 165696, 846, 84567, 125216, 165695, 165697, 343873, 338, 2040, 76890, 12960, 146937, 532, 84756, 85413, 1696, 2755, 77583, 2034, 1298, 80865, 37914, 120831, 547, 66831, 1860, 274591, 1004300, 53457, 131079, 16, 378210, 33882, 29404, 64001, 354354, 528, 376469, 265, 361607, 47494, 52972, 83618, 497, 215579, 1054211, 2060, 401469]
29 | "2016 Lauder, A.P. et al.":
30 | url: "http://doi.org/10.1186/s40168-016-0172-3"
31 | ids: [407, 469, 237, 1743, 504481, 1301, 1654, 165779, 1350, 724, 1578, 838, 1380, 1016, 33042, 39948, 117563, 32067, 906, 482, 836, 32207, 29465]
32 | "2015 Jousselin, E. et al.":
33 | url: "http://doi.org/10.1111/1755-0998.12478"
34 | ids: [286, 407, 48736, 469, 1743, 13687, 1716, 44249, 1279, 165779, 1663, 55080, 129337, 1357, 1827, 2745, 165697, 150247, 32199, 568987, 435913, 70774, 89966, 171, 20, 529883, 613, 39643]
35 | "2016 Lazarevic, V. et al.":
36 | url: "http://doi.org/10.1186/s12866-016-0689-4"
37 | ids: [561, 286, 1279, 1357, 1827, 57493]
38 | "2006 Barton, H.A. et al.":
39 | url: "http://doi.org/10.1016/j.mimet.2005.10.005"
40 | ids: [561, 407, 48736, 40323, 469, 13687, 283, 963, 41275, 106589, 149698, 379, 12916, 357, 2282523, 1257]
41 | "2016 Glassing, A. et al.":
42 | url: "http://doi.org/10.1186/s13099-016-0103-7"
43 | ids: [561, 286, 407, 40323, 59732, 237, 335058, 1743, 374, 32008, 283, 1716, 963, 44249, 1279, 1301, 1747, 40324, 1654, 1033, 165779, 1663, 1386, 55080, 501783, 212791, 1350, 129337, 724, 1578, 149698, 28100, 838, 38304, 28037, 294, 46123, 1380, 182269, 1016, 1485, 33042, 281915, 39948, 117563, 29580, 32257, 32067, 906, 482, 846, 836, 125216, 32207, 29465, 56946, 358705, 225324, 851, 729, 31998, 301302, 1282, 1290, 39491, 28117, 28116, 818, 820, 33011, 823, 821, 201096, 244127, 572511, 356778, 135858, 990721, 216851, 236752, 946234, 46466, 43994, 1506553, 28050, 437755, 119852, 100175, 577310, 53370, 46205, 255204, 747294, 1567, 295418, 97050, 84108, 28453, 432330, 1522, 39492, 33038, 46125, 114702, 1655, 1656, 46353, 172371, 33029, 54007, 160404, 92442, 1402, 115979, 35841, 216816, 138336, 136996, 51101, 54914, 60550, 198252, 337315, 1017, 2718, 1492, 29363, 74426, 117506, 39791, 43768, 401472, 38305, 218538, 309120, 39486, 84112, 208479, 1351, 1352, 1547, 562, 564, 1379, 84135, 362076, 249058, 46124, 137732, 272239, 726, 863372, 154046, 261299, 29581, 505, 573, 467210, 1624, 1382, 40542, 1613, 172827, 374425, 61654, 144191, 483, 484, 669464, 33033, 2741, 204516, 28101, 28124, 28127, 840, 28132, 28135, 106588, 47883, 76731, 204525, 172042, 1660, 69823, 615, 102148, 1283, 28035, 29389, 45634, 230120, 230123, 1302, 68892, 1338, 1309, 1303, 257758, 1308, 157076, 154288, 39778, 29466]
44 | "2017 Salter, S.J. et al.":
45 | url: "http://doi.org/10.1371/journal.pntd.0005975"
46 | ids: [50709, 299566, 1375, 2040, 507, 31988, 165779, 161492, 150247, 92793, 374, 55080, 1696, 41275, 369926, 32008, 194, 2717, 75, 10, 59732, 1716, 37914, 231454, 423604, 212791, 117563, 963, 1004300, 682522, 1357, 149698, 906, 68287, 407, 33882, 1839, 528, 376469, 84567, 335058, 28100, 838, 286, 83618, 48736, 379, 1835, 45669, 22, 28453, 13687, 40323, 1054211, 13275, 33057, 157, 213484, 29465, 1827, 265, 1386]
47 | "2018 Stinson, L.F. et al.":
48 | url: "http://doi.org/10.3389/fmicb.2018.00270"
49 | ids: [1696, 1716, 43668, 37914, 1269, 32207, 1743, 836, 838, 1016, 308865, 1386, 2755, 1279, 66831, 1350, 1578, 1301, 29465, 374, 407, 434, 165696, 13687, 283, 80865, 93681, 48736, 570, 713, 469, 212791, 286, 40323]
50 | "2019 Stinson, L.F. et al.":
51 | url: "http://doi.org/10.1111/lam.13091"
52 | ids: [561, 335058, 407, 13687, 407, 374, 165696, 222, 1716, 547, 48736, 1004302, 1827, 1743, 1269, 204456, 106589, 1678]
53 | "2002 Kulakov, L.A. et al.":
54 | url: "http://doi.org/10.1128/AEM.68.4.1548-1555.2002"
55 | ids: [329, 376, 239, 36773, 69392, 1785, 1409, 304, 28214, 294]
56 | "Common Viral contaminants":
57 | "2019 Asplund, M. et al.":
58 | url: "http://doi.org/10.1016/j.cmi.2019.04.028"
59 | ids: [12071, 742919, 11103, 31647, 1678143, 10298, 10376, 10359, 11676, 129951, 10583, 31552, 10798, 11908, 585044, 518981, 1225745, 11620, 1891767, 493803, 11033, 159150, 35306, 68887, 11870, 11958, 11861, 11946, 11864, 363745, 363020, 242521, 11866, 11960, 31668, 31669, 31670, 11867, 11955, 11874, 11876, 11878, 11885, 36381, 11886, 11888, 269447, 269448, 11950, 11948, 1332312, 354090, 11884, 1352534, 1395610, 1395611, 1395612, 1395613, 1395614, 1395615, 1395616, 1395617, 1395618, 1395619, 1395620, 1341019, 11801, 11809, 1511763, 1394983, 697906, 1072204, 1148801, 1574422, 12104, 763552, 10264, 85708, 759804, 28344, 85506, 33747, 10345, 285986, 220638, 1154691, 185638, 1169627, 1045778, 185636, 72201, 345198, 176652, 1301280, 68347, 1618248, 1618254, 10288, 198112, 1454023, 1454024, 1454025, 1278278, 1278246, 1278252, 1278247, 1278248, 1278249, 1278250, 1278251, 399781, 1278255, 346932, 1278261, 1278263, 1278265, 1474867, 1379694, 1521385, 1521387, 1521389, 938081, 938082, 880162, 251749, 455370, 169864, 1379788, 1608440, 642253, 642255, 1224510, 1592207, 1592212, 1592083, 1592085, 1592086, 1592088, 1592093, 1592095, 1592096, 1592081, 1843761, 1519405, 1557033, 1608451, 664785, 1435438, 1170653, 40979, 12235, 12138, 11987, 51680, 12056, 146500, 554168, 212035, 1269028, 693272, 1420594, 1094892, 1128140, 1235314, 1128143, 1128151, 1128131, 1450746, 1461100, 181522, 1424633, 1010698, 1299317, 1450749, 1416631, 1128422, 1034806, 1592112, 1592113, 1592127, 938080, 1074214, 1519385, 1519387, 1519389, 1519390, 1519395, 1519396, 1519397, 186617, 1262072, 1407671, 743583, 340016, 745107, 745102, 745100, 1416009, 1187128, 889876, 760732, 1243183, 1229760, 1481186, 1505225, 1560342, 233894, 115987, 260149, 227470, 926067, 1127514, 1296654, 294382, 1486657, 1084719, 10756, 1486662, 1285382, 1497851, 1127515, 145579, 263375, 764562, 1133292, 1133022, 242527, 260373, 279280, 644524, 242861, 1132026, 1357714, 1197951, 1327981, 1327976, 1327979, 1327992, 1328030, 1327990, 1327980, 1327972, 1327982, 1327995, 1327983, 1327970, 1327971, 756279, 1327977, 1327993, 1328029, 1327975, 1327974, 1327985, 756280, 756282, 1527524, 1540094, 1042123, 541865, 1567016, 765765, 1176422, 1327037, 1162295, 1141135, 1141136, 335924, 536444, 929832, 682650, 1137745, 536473, 749413, 1477406, 1048515, 1048516, 1048517, 1048520, 1048521, 1537091, 1264700, 1609634, 1455074, 414970, 10863, 10864, 1222338, 1147148, 1237364, 1414766, 1977402, 948870, 1524881, 10665, 10760, 1147094, 1429767, 925983, 925984, 1527519, 1527506, 1229753, 1540097, 1540098, 1054461, 1391223, 294631, 1325731, 908819, 1458858, 1458842, 90963, 1536592, 1527515, 551895, 1129191, 139872, 201847, 287412, 1262517, 754044, 1385658, 1176423, 889949, 446529, 1034128, 1056830, 1089119, 1486472, 1034111, 205879, 1340709, 1567475, 1472912, 1204539, 1399915, 1283076, 1283077, 1168479, 1168478, 440250, 400567, 994601, 1465639, 889956, 445700, 444862, 536454, 445688, 444861, 1229794, 1229793, 1229792, 1229791, 1229790, 1229789, 1229786, 1229787, 1229788, 1229784, 1229782, 376758, 1498188, 504501, 504553, 1235647, 1235648, 1235649, 1235650, 1235653, 1235654, 1235655, 1235656, 1235657, 877240, 754052, 1316739, 347326, 1235689, 31535, 757342, 582345, 1462581, 386793, 1204517, 347327, 1335230, 743813, 1348912, 1327964, 270673, 188350, 1541891, 169683, 998086, 1500757, 1458843, 1129146, 1279082, 1114179, 1548900, 1231048, 1548901, 1449437, 1548918, 1476390, 462590, 754048, 948071, 1481785, 1417599, 1131316, 691965, 136084, 754067, 1161935, 1173749, 1173761, 1173759, 1173762, 590739, 1406795, 1141134, 1204529, 1540099, 1168549, 866889, 1458859, 1458860, 1458861, 10761, 754060, 1524882, 1357423, 373126, 1150991, 1195080, 320843, 55510, 1434319, 320850, 369581, 537874, 1208587, 1566990, 10732, 490913, 1526550, 1340810, 756277, 753084, 753085, 756275, 1026955, 1340812, 238854, 555387, 754042, 444860, 981335, 469660, 215796, 1478972, 1385659, 926697, 336724, 278008, 1211417, 271647, 754075, 573173, 573174, 979525, 979534, 1529058, 1283071, 573176, 1589298, 1076759, 1461743, 1150989, 754058, 754051, 929835, 1414739, 754072, 1524880, 194802, 1168281, 1204514, 1188795, 331278]
60 | "2015 Mukherjee, S. et al.":
61 | url: "http://doi.org/10.1186/1944-3277-10-18"
62 | ids: [10847]
63 | "2015 Kjartansdóttir, K.R. et al.":
64 | url: "https://doi.org/10.1073/pnas.1423756112"
65 | ids: [322019]
66 | "Common Eukaryotic contaminants":
67 | "PRJNA168":
68 | url: "https://www.ncbi.nlm.nih.gov/genome/guide/human/"
69 | ids: [9606]
70 | "2016 Czurda, S. et al.":
71 | url: "https://doi.org/10.1128/JCM.02112-15"
72 | ids: [1895944, 76775, 5308]
73 |
--------------------------------------------------------------------------------
/files/human-related.yml:
--------------------------------------------------------------------------------
1 | "Top organisms form the human skin microbiome":
2 | "Bacteria":
3 | url: "https://doi.org/10.1038/nrmicro.2017.157"
4 | ids: [257758, 225324, 169292, 161879, 146827, 43765, 38304, 38287, 38286, 29466, 29388, 28037, 1747, 1305, 1303, 1290, 1282, 1270]
5 | "Eukarya":
6 | url: "https://doi.org/10.1038/nrmicro.2017.157"
7 | ids: [2510778, 1047171, 379413, 119676, 117179, 76777, 76775, 76773, 44058, 41880, 36894, 34391, 31312, 5480, 5068, 3074, 2762]
8 | "Viruses":
9 | url: "https://doi.org/10.1038/nrmicro.2017.157"
10 | ids: [185639, 746832, 10566, 493803, 10279, 746830, 746831, 46771]
11 | "Human Oral Microbiome Database (eHOMD)":
12 | "Oral":
13 | url: "http://www.ehomd.org/?name=HOMD"
14 | ids: [712116, 469621, 888056, 767100, 1194526, 2081962, 1547448, 1225197, 936596, 1074118, 1321781, 947828, 1403335, 1046629, 39950, 1242967, 1287474, 1074106, 999424, 319701, 999429, 546268, 927666, 1401072, 857100, 1035189, 638301, 857154, 997347, 1125718, 525375, 1403338, 942513, 1227262, 1411915, 1074166, 575614, 888062, 1125712, 1236516, 936561, 486408, 546269, 1225205, 1095741, 1041521, 712710, 596330, 210007, 655813, 553178, 562981, 1321822, 907491, 553199, 1125701, 857291, 546274, 1227269, 1074104, 749551, 1236508, 1074116, 712362, 553198, 1028803, 1889813, 1125724, 857111, 1225192, 608534, 575612, 1423814, 370554, 1028802, 1244083, 904338, 1167010, 1051006, 999432, 1035196, 592010, 546266, 553220, 864567, 1302863, 862966, 1002365, 1407647, 1051972, 525376, 1440770, 471876, 321967, 487215, 1074175, 546270, 866778, 857131, 1242969, 1225193, 888809, 242619, 712368, 596323, 282402, 122586, 1048332, 680646, 712528, 592028, 1236497, 862969, 626523, 1395125, 888727, 1125725, 1074183, 1423782, 888815, 1366052, 1123249, 1032505, 1316596, 767031, 888743, 158, 1074122, 1266997, 1266996, 1321779, 679199, 754507, 1316593, 1122984, 370551, 904317, 563032, 1190621, 1074119, 1046624, 706439, 203275, 713051, 553201, 999430, 1095731, 857133, 767029, 907492, 686659, 1211023, 1321782, 742820, 1347790, 997353, 879310, 629741, 888728, 28129, 1225202, 1122987, 1321815, 857099, 888808, 1051985, 1321818, 1411148, 1257041, 944564, 1035197, 1127692, 1434264, 1074095, 1074173, 1225204, 1785995, 857129, 944565, 186103, 1125719, 1401068, 630588, 1448849, 634176, 556263, 888814, 712938, 712623, 1074176, 1225187, 1423799, 1114967, 28137, 1440768, 857135, 28112, 1074121, 999438, 393480, 641149, 888061, 1309, 1073367, 1030843, 888721, 553184, 909952, 888825, 1236517, 2081702, 1227276, 1297564, 1074178, 1074124, 1292047, 160491, 1035195, 1114965, 702439, 1307428, 209882, 411466, 1122993, 679198, 696216, 187101, 888811, 712361, 857105, 1243032, 652722, 1257040, 679196, 857149, 1311575, 562982, 521097, 760570, 732, 1122989, 470565, 935589, 491076, 888054, 469607, 1304, 575611, 457405, 1661745, 596315, 909420, 1035185, 1074138, 626522, 1203258, 712122, 883167, 1236518, 889204, 1203602, 374833, 684066, 1074105, 575615, 679201, 649743, 1167007, 1448850, 1155071, 1225188, 944560, 1074165, 999436, 1074156, 575590, 523794, 1739435, 562973, 521095, 857113, 883109, 907488, 888833, 489653, 712466, 122587, 596319, 1127690, 885272, 1125722, 888057, 706436, 1440771, 469602, 1234601, 857125, 1095748, 1283280, 1074179, 1225191, 1000588, 525378, 1035190, 857147, 748671, 888812, 546265, 997830, 871541, 684738, 907490, 936589, 1074066, 1120979, 272556, 1225186, 712357, 568704, 649764, 634994, 1009852, 764544, 1108963, 857140, 1401077, 871237, 591365, 1307427, 999414, 880592, 334390, 2093824, 768728, 1074144, 324831, 857138, 553171, 999422, 888052, 649742, 1161424, 45634, 1167628, 864568, 999428, 1074101, 712538, 1125723, 553175, 1225196, 1334627, 857148, 999423, 861450, 28132, 546271, 1203259, 544580, 712411, 626369, 1074167, 1122982, 679192, 857137, 862513, 746361, 2748316, 585503, 873513, 1161421, 997352, 1321775, 1739279, 76859, 999435, 1321823, 1177574, 546273, 888813, 1128111, 1122986, 1401073, 1307443, 997356, 546262, 888049, 1074109, 471872, 857102, 1074190, 935599, 889201, 754505, 1739543, 1225200, 592026, 857123, 837, 272622, 1257037, 1120943, 712624, 1125717, 857151, 796943, 857290, 1074160, 868129, 907487, 2572089, 1074128, 857108, 712435, 888810, 679193, 767453, 1074148, 857134, 1095752, 1125702, 712363, 1074137, 1321821, 1095750, 861454, 553174, 1710, 1028806, 762965, 1292048, 1074143, 1401079, 52773, 1074151, 864570, 1073372, 28131, 1074159, 1095729, 370552, 272831, 435830, 1115809, 1225194, 360104, 596324, 1074108, 706433, 1318634, 447456, 1005704, 857152, 1074184, 653386, 1074092, 1074112, 857153, 596322, 1114969, 469599, 857110, 2748317, 619693, 1028804, 585501, 1125700, 1383, 999431, 1256219, 431947, 668336, 768726, 76123, 566549, 1227268, 1321772, 1434258, 904306, 1256230, 1095733, 857146, 641147, 1248420, 641143, 521393, 1122174, 768727, 999437, 1074115, 525326, 1074134, 1403949, 1859694, 1074120, 1074157, 1122980, 1074162, 1120957, 1316254, 114527, 1257038, 1074149, 1321816, 1203603, 1074186, 1125699, 638300, 754506, 76857, 1104322, 221027, 712961, 1321774, 714, 1123317, 1074161, 40543, 1074123, 1000590, 537973, 1035194, 1321784, 861455, 1316933, 1225201, 1000570, 889206, 713059, 435838, 873517, 1074155, 1307442, 399795, 706437, 999426, 712310, 1095747, 714315, 1122949, 1074136, 1346615, 1095730, 888060, 1434263, 857120, 1297567, 1434260, 1074126, 985008, 679195, 1115803, 999427, 511691, 164, 671214, 857112, 706434, 1161902, 857104, 857132, 693991, 888050, 1074146, 796942, 999434, 553207, 469604, 630527, 1111678, 176279, 1029822, 457403, 1073353, 671211, 1437447, 35519, 679188, 1705617, 1167009, 1227264, 1120944, 857106, 1403829, 873533, 887325, 702437, 888746, 944557, 857142, 857103, 797473, 1074130, 1161422, 1074171, 1074153, 857130, 929793, 546263, 1884263, 1273133, 699187, 1127691, 862968, 908937, 190304, 857136, 713030, 1074129, 198466, 1198676, 857144, 1321786, 1125720, 176090, 469601, 1074182, 888019, 907486, 604162, 864563, 888832, 1216362, 869214, 857126, 160490, 1095738, 1122172, 76856, 1095742, 679194, 936563, 694569, 866776, 1750, 887901, 272623, 1073366, 1411021, 243275, 1236504, 857109, 888051, 1122985, 1234877, 1074170, 1225203, 596329, 479436, 1321820, 1297566, 1074154, 547045, 1120942, 857143, 742814, 1307444, 1321817, 1227272, 568703, 575593, 638302, 887929, 435832, 1200793, 1123310, 1074168, 363952, 712982, 907493, 1095739, 712471, 1095740, 1256223, 1031709, 1035193, 1122171, 862515, 1089447, 176280, 411465, 1074113, 862967, 1074107, 649760, 857155, 857119, 857115, 883094, 1257042, 553219, 1434265, 554406, 347253, 1226633, 857116, 1157946, 1660, 1074111, 1159208, 1235815, 1074127, 686660, 1410950, 1225190, 1434262, 887898, 929102, 1074494, 1257039, 633147, 1353243, 702438, 1123263, 267747, 1035184, 546264, 905067, 1081904, 1227266, 1122991, 1074140, 563033, 2572088, 2572087, 712150, 1074135, 883092, 645512, 360105, 1088720, 651822, 879309, 857121, 861452, 596320, 416870, 1434261, 45243, 662598, 66851, 888059, 525374, 857101, 1042402, 592031, 28133, 712711, 1074102, 1127699, 999415, 1195243, 888741, 1321814, 1434259, 370553, 857139, 388919, 1091045, 447455, 679200, 286636, 999439, 1440769, 546275, 596317, 857122, 525361, 649761, 888742, 888816, 1227270, 712633, 57171, 883158, 762948, 1203550, 1074093, 857150, 857128, 525337, 904294, 712365, 1074100, 857124, 1074125, 857117, 904296, 1225199, 1287476, 1005705, 1074185, 857292, 768724, 798300, 1227271, 562983, 1095744, 553177, 857141, 857114, 907489, 1138874, 1127694, 1411022, 857118, 857107, 864565, 1120941, 1074164, 77917, 246198, 1225189, 193567, 1297565, 1415626, 1074114, 999440, 1227261, 1127693, 857127, 1127696, 1028805, 469378, 888055, 935598, 1035188, 1095743, 1078483, 1225195, 1074180, 762963, 1074177, 467705, 857145, 999425, 862970, 1125721, 1127695, 1121268, 1404260, 525283, 525325, 352165, 712991, 620833, 553218, 1074169, 1074181, 1114966, 862971, 1293577, 888048, 1167008, 999433, 1403336, 1122994, 1185324, 1073362, 293653, 1225198, 2748177, 1074163, 362948]
15 | "Nasal":
16 | url: "http://www.ehomd.org/?name=HOMD"
17 | ids: [406556, 516950, 1203561, 418127, 282458, 883103, 512767, 497962, 1715217, 1236608, 282459, 1069628, 857571, 553567, 1739317, 857577, 451515, 656912, 760791, 487214, 374927, 1069626, 452948, 553573, 1203622, 656913, 450394, 869216, 553594, 374933, 512566, 374932, 760746, 488222, 548474, 456482, 521005, 869309, 525381, 857575, 1203559, 1069625, 359786, 1203627, 760809, 453362, 406557, 71421, 553574, 574093, 869215, 196620, 553583, 760861, 488221, 512769, 1203566, 1203632, 189423, 406563, 1834153, 1203619, 553580, 406558, 553590, 561276, 869269, 455227, 453361, 760810, 478, 553592, 281310, 548473, 374928, 548470, 1069623, 548475, 553581, 374931, 158879, 553577, 488223, 553571, 553588, 857578, 480, 553601, 857574, 262728, 886289, 585161, 453366, 171101, 760834, 1203625, 497980, 857579, 453365, 521004, 406561, 262727, 375177, 359787, 375063, 374930, 1203557, 158878, 935897, 760787, 453363, 406560, 487213, 857576, 595501, 553596, 497963, 273036, 93061, 512768, 681288, 1121367, 553565, 90241, 1203562, 406562, 727, 170187, 1130804, 93062, 426430, 1715123, 866630, 553568, 857581, 406559, 857573, 451516, 857572, 375432, 1203624, 862964, 373153, 546342, 703339, 453364]
18 | "Human-related bacterial isolates from BacDive":
19 | "Limbs":
20 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
21 | ids: [178214, 52132, 386414, 306, 146827, 38303, 137732, 326522, 760, 755171, 82380, 38304, 1504, 478, 47920, 33010, 37326, 354351, 488, 1717, 33935, 1747, 33007, 1660, 1667, 614, 31973, 358, 29466, 69968, 1141657, 754, 479117, 43770, 1286, 652, 411577, 90245, 487, 1713, 43767, 47312, 59561, 630, 479, 156979, 1648, 1513, 732, 1292, 287, 539, 38313, 150055, 1890675, 291112, 13076, 2014, 1529, 420404, 1785, 196, 1245, 220685, 620903, 53437, 1977869, 217204, 180332, 38875, 400946, 495, 84698, 316, 1766, 28189, 161902, 192066, 714, 2054, 1282, 749, 74706, 38301, 753, 1352, 587, 490, 670, 283734, 29354, 303, 326523, 36740, 470, 28125, 485, 47917, 521520, 120957, 131111, 739, 511, 38289, 550, 200476, 1379, 158822, 220687, 53462, 123899, 650, 84112, 1280, 192, 1536, 1509, 131110, 1351, 46124, 239, 71254, 29380, 78355, 37329, 1506, 1697053, 1303, 158877, 1348, 502790, 28264, 66228, 24, 29317, 1402, 676, 1314, 29391, 1409, 488730, 82347, 193461, 501496, 53972, 43765, 411570, 1365628, 147645, 29388, 28035, 33968, 51671, 33028, 37637, 361500, 65058, 646, 730, 105219, 70348, 752, 1328, 1015, 292, 28450, 28091, 747, 28132, 1273, 755172, 28038, 28188, 33889, 672, 40091, 1296, 53363, 1710, 1547, 180588, 729, 370622, 1430326, 135487, 1305, 644, 90239, 206506, 472, 169292, 39791, 669, 38284, 108980, 1239307, 68892, 28090, 44737, 504, 1891233, 58172, 48296, 29432, 28449, 1311, 41276, 1781, 36809, 1720, 322095, 1034, 565, 1701, 391, 82633, 40542, 310300, 1290, 34105]
22 | "Ear":
23 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
24 | ids: [38313, 1280, 306, 35703, 1776741, 760, 28037, 72557, 480, 319939, 68766, 2702, 1661, 1872515, 44750, 1639, 1014, 32002, 545, 28264, 199591, 1353, 267212, 43263, 316, 1869190, 1747, 1314, 52769, 33007, 134375, 285091, 89093, 29379, 29321, 678932, 184870, 674, 47770, 29388, 1313, 663, 1725, 51671, 753, 217203, 727, 85698, 585, 53364, 670, 666, 105219, 678, 90245, 1311, 1898, 292, 93220, 36809, 59561, 87883, 156979, 131111, 739, 511, 419475, 1895474, 293, 287, 1343, 1421, 38287, 123899, 1652]
25 | "Eye":
26 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
27 | ids: [760, 253, 38304, 154288, 478, 29394, 37330, 37326, 247, 488, 2047, 1671023, 759851, 197575, 945844, 47312, 1401, 59561, 479, 2035, 46125, 34062, 732, 1578165, 207340, 161879, 539, 1931, 187491, 28037, 480, 420404, 1544413, 616, 41202, 38290, 545, 40216, 1544416, 529, 192066, 1270, 753, 490, 29354, 485, 134533, 739, 1671022, 1379, 650, 90241, 1280, 1824, 1351, 1655, 280147, 46124, 69392, 239, 1309, 813, 37329, 571, 47478, 29391, 134375, 1409, 43765, 498, 147645, 1685, 72556, 51671, 723, 752, 1302, 28172, 483, 83558, 1750, 40091, 180588, 47846, 370622, 740, 726, 472, 457921, 38284, 68892, 1313, 477, 756689, 727, 1304, 1177728, 504, 29432, 666, 1396, 1871047, 1720, 161890, 735, 2055, 38287]
28 | "Nose":
29 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
30 | ids: [1591, 90241, 1280, 306, 760, 195105, 1673725, 74319, 478, 29394, 520, 40324, 28264, 39950, 38284, 1282, 31973, 1313, 72556, 727, 181487, 1304, 59823, 504, 65058, 105219, 615, 1328, 131111, 43990, 732, 286802, 33889]
31 | "Skin/Nail/Hair":
32 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
33 | ids: [282305, 1280, 94138, 131110, 1655, 1891644, 729, 1780, 29382, 33010, 202789, 38290, 33034, 37326, 28264, 1347369, 66228, 521392, 1766, 472, 169292, 1261, 1747, 45254, 1869190, 2047, 1817405, 1986155, 1282, 1270, 33918, 1314, 861, 43765, 281920, 29388, 663, 1352, 106654, 1260, 1931, 181487, 1276, 59823, 1965292, 132933, 1286, 1347368, 37923, 29432, 730, 36740, 470, 1622, 1781, 36809, 1288, 1698, 59561, 2035, 1720, 29506, 131111, 1283, 38289, 1648, 1273, 34062, 1292, 287, 1753, 1656, 1290, 71999, 672]
34 | "Oral":
35 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
36 | ids: [1613, 463, 306, 912594, 453, 28085, 29394, 478, 37330, 28131, 247, 1717, 1747, 2047, 358, 327575, 665914, 158, 43770, 76832, 419208, 249188, 544580, 203, 87883, 732, 651822, 207340, 43768, 1241978, 1785, 1590, 28137, 52227, 1501332, 80878, 82541, 192066, 341694, 38301, 272548, 596085, 303, 470, 131111, 739, 78259, 480035, 123899, 84112, 221027, 111015, 1280, 55565, 69392, 571, 28083, 1852361, 28126, 39950, 1310, 29391, 1409, 1397, 72556, 723, 105219, 342002, 1659, 230143, 93220, 84109, 28091, 257758, 449, 206, 459, 1389713, 157691, 573, 200, 28454, 472, 28119, 135080, 28133, 1313, 461393, 85698, 1078480, 504, 1396, 39778, 13689, 82633, 354243, 40542, 132249, 1290, 1017, 82380, 253, 47920, 34059, 205, 488, 28129, 113287, 135083, 1597, 29466, 81950, 194702, 615, 47312, 1110546, 46125, 34062, 1380685, 539, 1931, 1874826, 419015, 273136, 446, 189722, 52768, 1529, 480, 520, 267212, 228603, 2126346, 1318, 42817, 1352, 569, 33033, 544581, 76124, 450, 371601, 81858, 1383, 1246, 114528, 69823, 1019, 486, 131110, 76122, 1309, 239, 78355, 2702, 52773, 40214, 40324, 28134, 1314, 519, 985002, 41986, 65058, 244292, 28087, 1302, 747, 1501329, 41976, 28112, 68766, 1389922, 1234680, 42895, 1305, 726, 35783, 39791, 110845, 135082, 44737, 648, 48296, 32013, 28449, 1559, 1465, 29313, 1871047, 817, 735, 719, 134537, 263, 28141, 851, 60552, 885, 796937, 95486, 582, 56811, 1522312, 487, 47715, 1596, 110505, 142586, 1960874, 38313, 143393, 702745, 55211, 860, 51160, 529, 2054, 489, 223392, 47671, 490, 472693, 59505, 28125, 1490, 485, 228599, 1656, 2104, 93218, 90241, 577, 29446, 93219, 518, 37329, 1303, 76759, 82203, 47884, 1306, 45634, 1402, 158823, 43765, 43769, 51671, 33028, 28110, 1624, 238, 1924944, 204, 43675, 1795, 671224, 28132, 243701, 43997, 28095, 305719, 28214, 237576, 79263, 78258, 206043, 1308, 68892, 189723, 633701, 727, 626084, 439703, 502, 796942, 1720, 43990, 407975, 556499, 109790, 329, 1756149, 29341, 33010, 618, 39777, 1660, 114702, 33053, 754, 133926, 181487, 132933, 2094119, 1871052, 947033, 319706, 447, 80866, 128780, 76123, 1292, 287, 1343, 71451, 2079439, 84163, 28037, 114527, 616, 32002, 28136, 495, 714, 29363, 520603, 56774, 824, 177972, 1689, 319709, 850, 216816, 1778, 540, 327574, 1379, 199, 1351, 1655, 46124, 1287736, 68891, 1506, 1697053, 84521, 182337, 28264, 135079, 29317, 1498, 431269, 29523, 37637, 1309795, 1502, 156978, 1328, 1015, 292, 240125, 1547448, 562, 293, 45242, 1944660, 1831, 1296, 157687, 729, 671218, 61645, 135487, 638849, 1018, 106648, 607712, 1911679, 38284, 467210, 589436, 134034, 2382124, 837, 41200, 66851, 143361, 228604, 218538, 58172, 134534, 36809, 1472, 86185, 2055, 391, 341722]
37 | "Saliva":
38 | url: "https://bacdive.dsmz.de/search?search=taxid:{}"
39 | ids: [152331, 113107, 157688, 979627, 45634, 60133, 157687, 1624, 1583331, 1632, 249188]
40 |
--------------------------------------------------------------------------------
/grimer-mgnify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import scripts.mgnify_download
4 | import grimer.grimer
5 | import argparse
6 | import os
7 | import glob
8 |
9 | parser = argparse.ArgumentParser(description='grimer-mgnify')
10 | parser.add_argument('-i', '--mgnify-study-accession', required=True, type=str, help="MGnify study accession (e.g. MGYS00002462)")
11 | parser.add_argument('-g', '--grimer-params', type=str, help="Extra params for grimer")
12 | parser.add_argument('-o', '--output-prefix', type=str, help="Output prefix for files and report")
13 | args = parser.parse_args()
14 |
15 | if args.output_prefix:
16 | prefix = args.output_prefix
17 | else:
18 | prefix = args.mgnify_study_accession
19 |
20 | # download files
21 | print("Downloading files for study accession " + args.mgnify_study_accession)
22 | scripts.mgnify_download.main(['-i', args.mgnify_study_accession, '-o', prefix, '-v'])
23 |
24 | files = filter(os.path.isfile, glob.glob(prefix + '*taxonomy_abundances*'))
25 | # Sort files by size ASC
26 | files = sorted(files, key=lambda x: os.stat(x).st_size)
27 | md = glob.glob(prefix + '*_metadata.tsv*')
28 |
29 | if args.grimer_params:
30 | grimer_params = args.grimer_params.split(" ")
31 | else:
32 | grimer_params = []
33 | grimer.grimer.main(["-i", files[-1],
34 | "-m", md[-1],
35 | "-c", 'config/default.yaml',
36 | "-f", ";",
37 | "--obs-replace", "^.+__", "", "_", " ",
38 | "-r", "superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species",
39 | "-t", "ncbi",
40 | "-o", prefix + ".html",
41 | "--title", "MGnify study accession " + args.mgnify_study_accession,
42 | ] + grimer_params)
43 |
--------------------------------------------------------------------------------
/grimer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import grimer.grimer
3 | grimer.grimer.main()
4 |
--------------------------------------------------------------------------------
/grimer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/__init__.py
--------------------------------------------------------------------------------
/grimer/cds.py:
--------------------------------------------------------------------------------
1 | #General
2 | import pandas as pd
3 | import numpy as np
4 | from math import pi
5 |
6 | #Internal
7 | from grimer.func import print_df, transform_table, print_log, format_js_toString
8 |
9 | #Bokeh
10 | from bokeh.models import ColumnDataSource
11 |
12 |
13 | def dict_taxname(tax, taxids):
14 | """
15 | mapping taxids to names
16 | (or names to names if taxid is not used)
17 | """
18 | id_name = {}
19 | for i in taxids:
20 | n = tax.name(i) if tax else i
21 | id_name[i] = n if n else i
22 | return id_name
23 |
24 |
25 | def cds_plot_references(table, tax, references):
26 | # Stacked list of references, accounting for lineage matches
27 | # index -> observations (repeated)
28 | # columns -> "rank", "ref", "direct", "parent"
29 | clist = []
30 | if references is not None:
31 | for rank in table.ranks():
32 | for obs in table.observations(rank):
33 | for desc, ref in references.items():
34 | direct = ref.get_refs_count(obs, direct=True)
35 | parent = ref.get_refs_count(obs, parents=True)
36 | if direct + parent > 0:
37 | clist.append([obs, rank, desc, direct, parent])
38 |
39 | df_references = pd.DataFrame(clist, columns=["obs", "rank", "ref", "direct", "parent"])
40 | df_references.set_index('obs', inplace=True)
41 |
42 | print_df(df_references, "cds_p_references")
43 | return ColumnDataSource(df_references)
44 |
45 |
46 | def cds_annotations(table, references, controls, decontam, control_samples):
47 | # Stacked matrix of true annotations (omit false)
48 | # index -> taxids
49 | # columns -> rank, annot
50 |
51 | df_annotations = pd.DataFrame(columns=["rank", "annot", "factors", "ov", "tv"])
52 | for i, rank in enumerate(table.ranks()):
53 | # Generate a DataFrame to use as source in tables
54 | df_rank = pd.DataFrame(index=table.observations(rank))
55 |
56 | if decontam is not None:
57 | contaminants = decontam.get_contaminants(rank, df_rank.index).values
58 | if contaminants.any():
59 | df_rank["decontam"] = decontam.get_pscore(rank, df_rank.index)[contaminants]
60 |
61 | if references is not None:
62 | for desc, ref in references.items():
63 | df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True))
64 | df_rank.loc[df_rank[desc] == 0, desc] = np.nan
65 |
66 | if controls is not None:
67 | for desc, ctrl in controls.items():
68 | control_table = table.get_subtable(samples=control_samples[desc], rank=rank)
69 | freq_perc_control = control_table.gt(0).sum(axis=0) / control_table.shape[0]
70 | df_rank[desc] = table.observations(rank).map(freq_perc_control).to_list()
71 |
72 | df_rank = pd.DataFrame(df_rank.stack(), columns=["ov"]).reset_index(1)
73 | df_rank.rename(columns={"level_1": "annot"}, inplace=True)
74 |
75 | # add transformed values to fit same scale on heatmap
76 | # Decontam reverse p-score normalized
77 | if not df_rank[df_rank["annot"] == "decontam"].empty:
78 | min_val = df_rank[df_rank["annot"] == "decontam"]["ov"].min()
79 | max_val = df_rank[df_rank["annot"] == "decontam"]["ov"].max()
80 | df_rank.loc[df_rank["annot"] == "decontam", "tv"] = 1 - ((df_rank[df_rank["annot"] == "decontam"]["ov"] - min_val) / (max_val - min_val))
81 |
82 | # max references divided by max
83 | if references is not None:
84 | for desc, ref in references.items():
85 | if not df_rank[df_rank["annot"] == desc].empty:
86 | max_val = df_rank[df_rank["annot"] == desc]["ov"].max()
87 | df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] / max_val
88 |
89 | # keep same percentage
90 | if controls is not None:
91 | for desc, ctrl in controls.items():
92 | if not df_rank.loc[df_rank["annot"] == desc].empty:
93 | df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"]
94 |
95 | df_rank["rank"] = rank # set rank
96 | df_rank["factors"] = df_rank.index if i == 0 else "" # initialize just for first rank (save space)
97 |
98 | # Concat in the main df
99 | df_annotations = pd.concat([df_annotations, df_rank], axis=0)
100 |
101 | print_df(df_annotations, "cds_p_annotations")
102 | return ColumnDataSource(df_annotations)
103 |
104 |
105 | def cds_obstable(table, tax, references, controls, control_samples, decontam):
106 | # index unique taxids
107 | # col|... values to plot to columns in the datatable
108 | # tax|... auxiliary lineage of taxa entries
109 | # aux|ref auxiliary references identifiers
110 |
111 | df_obstable = pd.DataFrame()
112 | # Create unified DataFrame with all ranks used
113 | for rank in table.ranks():
114 | # Generate a DataFrame to use as source in tables
115 | df_rank = pd.DataFrame(index=table.observations(rank))
116 | df_rank["col|rank"] = rank
117 | if tax:
118 | df_rank["col|name"] = table.observations(rank).map(lambda txid: tax.name(txid) if tax.name(txid) else txid).to_list()
119 | else:
120 | df_rank["col|name"] = table.observations(rank)
121 |
122 | # Frequency of taxa among all samples
123 | df_rank["col|frequency_perc"] = table.get_frequency_perc(rank)
124 | df_rank["col|counts_perc_avg"] = table.get_counts_perc_avg_samples(rank)
125 | # Average percentage of counts among all samples
126 | df_rank["col|total_counts"] = table.get_counts(rank)
127 |
128 | # If active - add decontam True/False results
129 | if decontam:
130 | df_rank["col|decontam"] = decontam.get_contaminants(rank, df_rank.index)
131 |
132 | # Add a column for each Annotation source
133 | if references is not None:
134 | for desc, ref in references.items():
135 | df_rank["col|" + desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)).to_list()
136 |
137 | # Add a column for each Control source
138 | if controls is not None:
139 | # calculate frequency for each group of control provided
140 | for desc, ctrl in controls.items():
141 | control_table = table.get_subtable(samples=control_samples[desc], rank=rank)
142 | freq_perc_control = control_table.gt(0).sum(axis=0) / control_table.shape[0]
143 | df_rank["col|" + desc] = table.observations(rank).map(freq_perc_control).fillna(0).to_list()
144 |
145 | # Add col for each rank with parent taxid if exists, linking entries in their lineage for filtering and plotting
146 | for other_rank in table.ranks():
147 | if table.ranks().index(other_rank) > table.ranks().index(rank):
148 | df_rank["tax|" + other_rank] = ""
149 | elif other_rank != rank:
150 | df_rank["tax|" + other_rank] = table.observations(rank).map(lambda txid: table.get_lineage(txid, rank, other_rank)).fillna("")
151 | else:
152 | df_rank["tax|" + other_rank] = df_rank.index
153 | # Sort values by frequency to show on table
154 | df_rank.sort_values(by="col|frequency_perc", ascending=False, inplace=True)
155 |
156 | # Concat in the main df
157 | df_obstable = pd.concat([df_obstable, df_rank], axis=0)
158 |
159 | print_df(df_obstable, "cds_m_obstable")
160 | return ColumnDataSource(df_obstable)
161 |
162 |
163 | def cds_sampletable(table):
164 | # index unique sample-ids
165 | # col|... values to plot to columns in the datatable
166 |
167 | df_sampletable = pd.DataFrame(index=table.samples)
168 | df_sampletable["col|total"] = table.get_total() if not table.normalized else 0
169 | df_sampletable["col|assigned"] = table.get_assigned() if not table.normalized else 0
170 | df_sampletable["col|assigned_perc"] = table.get_assigned_perc()
171 | df_sampletable["col|unassigned"] = table.get_unassigned() if not table.normalized else 0
172 | df_sampletable["col|unassigned_perc"] = table.get_unassigned_perc()
173 |
174 | # assigned by rank
175 | for rank in table.ranks():
176 | df_sampletable["col|" + rank] = table.data[rank].sum(axis=1).divide(table.get_total(), axis=0)
177 |
178 | df_sampletable.fillna(0, inplace=True)
179 |
180 | print_df(df_sampletable, "cds_p_sampletable")
181 | return ColumnDataSource(df_sampletable)
182 |
183 |
184 | def cds_samplebars(table):
185 | # index unique sample-ids
186 | # aux| auxiliary values (not plotted)
187 | # bar| values plotted as bars (sample counts)
188 | # tax| values plotted as circles (taxa value)
189 |
190 | df_bars = pd.DataFrame(index=table.samples)
191 | # factors: set the x-axis reference for plotting, it can be dinamically changed (with groups)
192 | df_bars["aux|factors"] = df_bars.index
193 | df_bars["bar|unassigned"] = table.get_unassigned()
194 | # Initialized with "Assigned" of first rank
195 | df_bars["bar|selected"] = table.get_subtable(table.ranks()[0]).sum(axis=1)
196 | # Total assigned - assigned to rank
197 | df_bars["bar|others"] = (table.get_total() - table.get_unassigned()) - df_bars["bar|selected"]
198 | # Add empty cols for taxa values, to be dynamically inserted (None to avoid printing 0)
199 | for rank in table.ranks():
200 | df_bars["tax|" + rank] = None
201 |
202 | print_df(df_bars, "cds_p_samplebars")
203 | return ColumnDataSource(df_bars)
204 |
205 |
206 | def cds_samples(table, references, controls, decontam):
207 | # index unique sample-ids
208 | # aux| auxiliary values (not plotted)
209 | # cnt| count values to be copied/traansformed to bars
210 |
211 | df_samples = pd.DataFrame(index=table.samples)
212 | # index to retrieve default input order
213 | df_samples["aux|input_order"] = range(df_samples.shape[0], 0, -1)
214 | df_samples["cnt|total"] = table.get_total()
215 | df_samples["cnt|unassigned"] = table.get_unassigned()
216 |
217 | # Keep total number of assignemnts for calculations
218 | df_samples["cnt|assigned"] = table.get_total() - table.get_unassigned()
219 |
220 | # Add specific rank assignements
221 | for rank in table.ranks():
222 | df_samples["cnt|" + rank + "|assigned"] = table.data[rank].sum(axis=1)
223 |
224 | # Add counts specific to sources
225 | source_list = []
226 | if references is not None:
227 | source_list.append(references.items())
228 | if controls is not None:
229 | source_list.append(controls.items())
230 |
231 | for sources in source_list:
232 | for desc, src in sources:
233 | for rank in table.ranks():
234 | idx = table.observations(rank).map(lambda x: src.get_refs_count(x, direct=True)) >= 1
235 | df_samples["cnt|" + rank + "|" + desc] = table.data[rank][table.observations(rank)[idx]].sum(axis=1)
236 |
237 | if decontam:
238 | contaminants = decontam.get_contaminant_list()
239 | for rank in table.ranks():
240 | idx = table.observations(rank).isin(contaminants)
241 | df_samples["cnt|" + rank + "|decontam"] = table.data[rank][table.observations(rank)[idx]].sum(axis=1)
242 |
243 | # fill NaN with zero so bars do not "dissapear" when plotting
244 | df_samples.fillna(0, inplace=True)
245 |
246 | print_df(df_samples, "cds_d_samples")
247 | return ColumnDataSource(df_samples)
248 |
249 |
250 | def cds_metadata(metadata):
251 | # index -> sample-ids
252 | # columns -> metadata fields
253 | # values -> metadata values
254 | df_md = metadata.get_data()
255 | print_df(df_md, "cds_d_metadata")
256 | return ColumnDataSource(df_md)
257 |
258 |
259 | def cds_plot_metadata(metadata, max_metadata_cols):
260 | # index (unique sample-ids)
261 | # md0, md1, ..., md(max_metadata_cols)
262 | # values (metadata field, metadata values)
263 |
264 | df_plot_md = pd.DataFrame(index=metadata.data.index, columns=["factors"] + [str(i) for i in range(1, max_metadata_cols + 1)])
265 | df_plot_md["factors"] = df_plot_md.index
266 | # Fill in only first metadata field
267 | first_field = metadata.get_col_headers()[0]
268 |
269 | df_plot_md["1"] = [(first_field, format_js_toString(md_value)) for md_value in metadata.get_col(first_field)]
270 |
271 | # Fill with empty strings to match js output when not selected
272 | df_plot_md.fillna("", inplace=True)
273 |
274 | print_df(df_plot_md, "cds_p_metadata")
275 | return ColumnDataSource(df_plot_md)
276 |
277 |
278 | def cds_plot_decontam(decontam):
279 | # index unique sample-ids
280 | # concentrations from decontam inputs
281 | # controls from decontam inputs
282 | # counts: field to be dynamically filled with click on obstable
283 | df_decontam = decontam.get_data()
284 | df_decontam["controls"] = df_decontam["controls"].map({True: 'Control', False: 'Sample'})
285 | df_decontam["counts"] = None
286 | print_df(df_decontam, "cds_p_decontam")
287 | return ColumnDataSource(df_decontam)
288 |
289 |
290 | def cds_decontam(decontam, ranks):
291 | """
292 | cds based on a dict with valid values to plot model lines
293 | {taxid: (contam_y1, contam_y2, non_contam_y, pval)}
294 | """
295 | dict_coord_mod = {}
296 | for rank in ranks:
297 | df_valid_vals = decontam.rank[rank].dropna(subset=['contam'])
298 | pval = decontam.get_pscore(rank, df_valid_vals.index)
299 | vals = list(zip(df_valid_vals["contam"], df_valid_vals["contam_2"], df_valid_vals["non.contam"], pval))
300 | dict_coord_mod.update(dict(zip(df_valid_vals.index, vals)))
301 |
302 | print_df(dict_coord_mod, "cds_d_decontam_models")
303 | return ColumnDataSource(dict_coord_mod)
304 |
305 |
306 | def cds_plot_decontam_models(decontam):
307 | """
308 | cds based on a dict with 3 pairs of values to plot. x is shared among y_cont and y_noncont
309 | # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]}
310 | """
311 | dict_decontam_models = {}
312 | dict_decontam_models["x"] = [decontam.get_data()["concentration"].min(),
313 | decontam.get_data()["concentration"].max()]
314 | dict_decontam_models["y_cont"] = [None, None]
315 | dict_decontam_models["y_noncont"] = [None, None]
316 | print_df(dict_decontam_models, "cds_p_decontam_models")
317 | return ColumnDataSource(dict_decontam_models)
318 |
319 |
320 | def dict_sampleobs(table):
321 | # dict with raw counts (not storing zeros)
322 | # dict_sampleobs[rank][obs][sample] = count
323 | dict_sampleobs = {}
324 | for rank in table.ranks():
325 | dict_sampleobs[rank] = {}
326 | for obs, sample_val in table.data[rank].to_dict().items():
327 | dict_sampleobs[rank][obs] = {}
328 | for sample, val in sample_val.items():
329 | if val > 0:
330 | dict_sampleobs[rank][obs][sample] = val
331 |
332 | print_df(dict_sampleobs, "dict_d_sampleobs")
333 | return dict_sampleobs
334 |
335 |
336 | def cds_heatmap(table, transformation, show_zeros):
337 | # Stacked matrix of raw counts + transformed value
338 | # index -> sample-ids (repeated)
339 | # obs
340 | # rank
341 | # ov -> original value (raw counts)
342 | # tv -> transformed values (user choice: log10, clr, ...)
343 |
344 | df_heatmap = pd.DataFrame(columns=["obs", "rank", "ov", "tv", "factors_sample", "factors_obs"])
345 | for i, rank in enumerate(table.ranks()):
346 | stacked_rank_df = pd.DataFrame(table.data[rank].stack(), columns=["ov"]).reset_index(1)
347 | # Rename first col to obs
348 | stacked_rank_df.rename(columns={stacked_rank_df.columns[0]: "obs"}, inplace=True)
349 | stacked_rank_df["rank"] = rank
350 | tv = transform_table(table.data[rank], table.get_total(), transformation, table.zerorep)
351 | stacked_rank_df["tv"] = tv.stack().values
352 | #Drop zeros based on original counts
353 | if not show_zeros:
354 | stacked_rank_df = stacked_rank_df[stacked_rank_df["ov"] > 0]
355 | # initialize factors only for first rank
356 | #stacked_rank_df["factors_sample"] = stacked_rank_df.index
357 | #stacked_rank_df["factors_obs"] = stacked_rank_df["obs"]
358 | stacked_rank_df["factors_sample"] = stacked_rank_df.index if i == 0 else ""
359 | stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] if i == 0 else ""
360 |
361 | df_heatmap = pd.concat([df_heatmap, stacked_rank_df], axis=0)
362 |
363 | df_heatmap.drop('ov', axis=1, inplace=True)
364 | print_df(df_heatmap, "cds_p_heatmap")
365 | return ColumnDataSource(df_heatmap)
366 |
367 |
368 | def dict_hcluster(table, hcluster):
369 | # keys -> combination of hclusters
370 | # values -> sorted sample-ids
371 |
372 | leaves_x = {}
373 | # default order
374 | leaves_y = {"default": table.samples.to_list()}
375 |
376 | for rank in hcluster:
377 | # default order for each rank
378 | leaves_x["default|" + rank] = table.observations(rank).to_list()
379 | for method in hcluster[rank]:
380 | for metric in hcluster[rank][method]:
381 | # key
382 | key = rank + "|" + method + "|" + metric
383 | # samples
384 | leaves_y[key] = hcluster[rank][method][metric]["y"]["index"]
385 | # taxa
386 | leaves_x[key] = hcluster[rank][method][metric]["x"]["index"]
387 |
388 | print_df(leaves_x, "dict_d_hcluster_x")
389 | print_df(leaves_y, "dict_d_hcluster_y")
390 | return leaves_x, leaves_y
391 |
392 |
393 | def cds_plot_dendro():
394 | # Empty CDS {"x": [], "y": [], "c": []}
395 | dendro_x = {"x": [], "y": [], "c": []}
396 | dendro_y = {"x": [], "y": [], "c": []}
397 | print_df(dendro_x, "cds_p_dendro_x")
398 | print_df(dendro_y, "cds_p_dendro_y")
399 | return ColumnDataSource(dendro_x), ColumnDataSource(dendro_y)
400 |
401 |
402 | def dict_dendro(table, dendro):
403 | # dict_d_dedro_x and dict_d_dedro_y:
404 | # key -> key + "|x" , key + "|y" , key + "|c"
405 | # value -> list of lists (x and y) or list (c)
406 | dict_d_dedro_y = {}
407 | dict_d_dedro_x = {}
408 |
409 | for rank in dendro:
410 | for method in dendro[rank]:
411 | for metric in dendro[rank][method]:
412 | # key
413 | key = rank + "|" + method + "|" + metric
414 | # dendrogram values
415 | dict_d_dedro_y[key + "|x"] = dendro[rank][method][metric]["y"]["xs"]
416 | dict_d_dedro_y[key + "|y"] = dendro[rank][method][metric]["y"]["ys"]
417 | dict_d_dedro_y[key + "|c"] = dendro[rank][method][metric]["y"]["colors"]
418 | dict_d_dedro_x[key + "|x"] = dendro[rank][method][metric]["x"]["xs"]
419 | dict_d_dedro_x[key + "|y"] = dendro[rank][method][metric]["x"]["ys"]
420 | dict_d_dedro_x[key + "|c"] = dendro[rank][method][metric]["x"]["colors"]
421 |
422 | return dict_d_dedro_x, dict_d_dedro_y
423 |
424 |
425 | def dict_topobs(table, top_obs_bars):
426 | dict_top_taxa = {}
427 | for rank in table.ranks():
428 | dict_top_taxa[rank] = table.get_top(rank, top_obs_bars)
429 | print_df(dict_top_taxa, "dict_d_topobs")
430 | return dict_top_taxa
431 |
432 |
433 | def dict_refs(table, references):
434 | # dict with information about sources and references
435 | # references can be repeated among descriptions, sources and taxids
436 | # {taxid: {source: {desc: [refs]}}
437 | d_refs = {}
438 | # Get only valid taxids
439 | used_ids = set()
440 | for rank in table.ranks():
441 | used_ids.update(table.observations(rank))
442 |
443 | if references is not None:
444 | for i in used_ids:
445 | for sname, s in references.items():
446 | for ref, descs in s.get_refs_desc(i, direct=True).items():
447 | for desc in descs:
448 | # Only add items if they have a reference to it
449 | if i not in d_refs:
450 | d_refs[i] = {}
451 | if sname not in d_refs[i]:
452 | d_refs[i][sname] = {}
453 | if desc not in d_refs[i][sname]:
454 | d_refs[i][sname][desc] = []
455 | d_refs[i][sname][desc].append(ref)
456 |
457 | print_df(d_refs, "dict_d_refs")
458 | return d_refs
459 |
460 |
461 | def cds_correlation(table, corr):
462 | df_corr = pd.DataFrame(columns=["taxid", "rank", "rho"])
463 | for rank in table.ranks():
464 | stacked_rank_df = pd.DataFrame(corr[rank]["rho"], index=corr[rank]["observations"], columns=corr[rank]["observations"]).stack(dropna=False).reset_index(1)
465 | stacked_rank_df.rename(columns={"level_1": "taxid"}, inplace=True)
466 | stacked_rank_df.rename(columns={0: "rho"}, inplace=True)
467 | stacked_rank_df["rank"] = rank
468 |
469 | # Drop NA for rho (missing values and upper triangular matrix)
470 | stacked_rank_df.dropna(subset=['rho'], inplace=True)
471 |
472 | df_corr = pd.concat([df_corr, stacked_rank_df], axis=0)
473 |
474 | print_df(df_corr, "cds_p_correlation")
475 | return ColumnDataSource(df_corr)
476 |
477 |
478 | def cds_obsbars(table, top_obs_bars):
479 | # index (unique sample-ids)
480 | # cols: 1, 2, ..., top_obs_bars, unassigned, others, factors
481 |
482 | #Load with data from first rank
483 | top_taxids = table.get_top(table.ranks()[0], top_obs_bars)
484 | df_obsbars = table.get_subtable(taxids=top_taxids, rank=table.ranks()[0], keep_shape=True)
485 | df_obsbars.rename(columns={c: str(i) for i, c in enumerate(df_obsbars.columns)}, inplace=True)
486 | # Complete table with None values
487 | ncol = len(df_obsbars.columns)
488 | while ncol < top_obs_bars:
489 | df_obsbars[str(ncol)] = 0
490 | ncol += 1
491 | # Other account for filtered taxa (not on top) and left over percentage for the rank without assignment
492 | df_obsbars["others"] = table.get_total() - table.get_unassigned() - df_obsbars.sum(axis=1)
493 | df_obsbars["unassigned"] = table.get_unassigned()
494 | df_obsbars = transform_table(df_obsbars, table.get_total(), "norm", 0) * 100
495 | df_obsbars["factors"] = df_obsbars.index.to_list()
496 |
497 | print_df(df_obsbars, "cds_p_obsbars")
498 | return ColumnDataSource(df_obsbars)
499 |
500 |
501 | def cds_mgnify(mgnify, table, tax):
502 | # index (taxa, level, lineage)
503 | # count for each combination of index
504 |
505 | df_mgnify = pd.DataFrame(columns=["taxa", "level", "lineage", "count", "angle"])
506 |
507 | # Match uids (taxid or names) from input and keep only found elements
508 | uids = [txid for rank in table.ranks() for txid in table.observations(rank)]
509 | df_tmp = mgnify[mgnify['taxa'].isin(uids)]
510 |
511 | # reset index to properly concate later with biome lineages
512 | df_tmp.reset_index(drop=True, inplace=True)
513 |
514 | if df_tmp.empty:
515 | print_log("could not find matching entries on MGnify")
516 | return None
517 |
518 | # Split biome lineage
519 | biome_levels = df_tmp['biome'].str.split(':', expand=True)
520 | n_levels = biome_levels.shape[1]
521 |
522 | # Rename levels with full lineage, starting from second level
523 | biome_lineage = pd.DataFrame(biome_levels[1])
524 | for l in range(2, n_levels):
525 | biome_lineage[l] = pd.Series(biome_levels[[i for i in range(1, l + 1)]].values.tolist()).str.join(':')
526 |
527 | # Concat back
528 | df_tmp = pd.concat([biome_lineage, df_tmp], axis=1)
529 |
530 | # for each biome level (ignoring root 0)
531 | for l in range(1, n_levels):
532 | # group counts by biome, and fix fields
533 | df_biome = df_tmp.groupby(["taxa", l]).sum(numeric_only=True)
534 | df_biome["level"] = str(l)
535 | df_biome.reset_index(inplace=True)
536 | df_biome.rename(columns={l: "lineage"}, inplace=True)
537 |
538 | # Calculate angle for each taxa/level for wedges
539 | total_taxa_level = df_biome.groupby("taxa").sum(numeric_only=True).to_dict()["count"]
540 | df_biome["angle"] = (df_biome['count'] / df_biome['taxa'].map(total_taxa_level)) * (2 * pi)
541 |
542 | # Group to the final df
543 | df_mgnify = pd.concat([df_mgnify, df_biome], axis=0, ignore_index=True)
544 |
545 | # set index
546 | df_mgnify.set_index('taxa', inplace=True)
547 |
548 | print_df(df_mgnify, "cds_p_mgnify")
549 | return ColumnDataSource(df_mgnify)
550 |
--------------------------------------------------------------------------------
/grimer/config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | from scipy.spatial.distance import _METRICS_NAMES
4 | from scipy.cluster.hierarchy import _LINKAGE_METHODS
5 |
6 |
7 | class Config:
8 |
9 | version = "1.1.0"
10 | default_rank_name = "default"
11 | output_plots = ["overview", "samples", "heatmap", "correlation"]
12 | transformations = ["none", "norm", "log", "clr"]
13 | taxonomy = ["ncbi", "gtdb", "silva", "greengenes", "ott"]
14 |
15 | def __new__(self, argv=None):
16 |
17 | formatter_class = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=100)
18 | parser = argparse.ArgumentParser(prog="grimer",
19 | description=logo(self.version),
20 | formatter_class=formatter_class)
21 |
22 | required_group = parser.add_argument_group('required arguments')
23 | required_group.add_argument('-i', '--input-file', required=True, type=str, help="Tab-separatad file with table with counts (Observation table, Count table, Contingency Tables, ...) or .biom file. By default rows contain observations and columns contain samples (use --transpose if your file is reversed). The first column and first row are used as headers.")
24 |
25 | main_group = parser.add_argument_group('main arguments')
26 | main_group.add_argument('-m', '--metadata-file', type=str, help="Tab-separated file with metadata. Rows should contain samples and columns the metadata fields. QIIME2 metadata format is accepted, with an extra row to define categorical and numerical fields. If --input-file is a .biom file, metadata will be extracted from it if available.")
27 | main_group.add_argument('-c', '--config', type=str, help="Configuration file with definitions of references, controls and external tools.")
28 | main_group.add_argument('-t', '--taxonomy', type=str, default=None, help="Enable taxonomic analysis, convert entries and annotate samples. Files will be automatically downloaded and parsed. Optionally, stored files can be provided with --taxonomy-files.", choices=Config.taxonomy)
29 | main_group.add_argument('-b', '--taxonomy-files', nargs="*", type=str, default=[], help="Specific taxonomy files to use with --taxonomy.")
30 | main_group.add_argument('-r', '--ranks', nargs="*", default=[Config.default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + Config.default_rank_name + "' to use entries from the table directly.")
31 |
32 | output_group = parser.add_argument_group('output arguments')
33 | output_group.add_argument('-l', '--title', type=str, default="", help="Title to display on the top of the report.")
34 | output_group.add_argument('-p', '--output-plots', nargs="*", type=str, default=Config.output_plots, help="Plots to generate.", choices=Config.output_plots)
35 | output_group.add_argument('-o', '--output-html', type=str, default="output.html", help="Filename of the HTML report output.")
36 | output_group.add_argument('--full-offline', default=False, action='store_true', help="Embed Bokeh javascript library in the output file. Output will be around 1.5MB bigger but it will work without internet connection. ~your report will live forever~")
37 |
38 | data_group = parser.add_argument_group('general data options')
39 | data_group.add_argument('-g', '--mgnify', default=False, action='store_true', help="Plot MGnify, requires --config file with parsed MGnify database.")
40 | data_group.add_argument('-d', '--decontam', default=False, action='store_true', help="Run DECONTAM and generate plots. requires --config file with DECONTAM configuration.")
41 | data_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hierarchical multi-level table where the observations headers are separated by the indicated separator char (usually ';' or '|')")
42 | data_group.add_argument('-y', '--values', default=None, type=str, help="Force 'count' or 'normalized' data parsing. Empty to auto-detect.")
43 | data_group.add_argument('-w', '--cumm-levels', default=False, action='store_true', help="Activate if input table has already cummulative values on parent taxonomic levels.")
44 | data_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table before parsing (if samples are listed on columns and observations on rows)")
45 | data_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unsassinged/unclassified counts.")
46 | data_group.add_argument('-z', '--replace-zeros', type=str, default="1000", help="Treat zeros in the input table. INT (add 'smallest count' divided by INT to every value), FLOAT (add FLOAT to every value). Default: 1000")
47 | data_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on observations labels/headers (supports regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex. Several pairs of instructions are supported.")
48 | data_group.add_argument('--sample-replace', nargs="*", type=str, default=[], help="Replace values on sample labels/headers (supports regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex. Several pairs of instructions are supported.")
49 | data_group.add_argument('--min-frequency', type=float, help="Define minimum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].")
50 | data_group.add_argument('--max-frequency', type=float, help="Define maximum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].")
51 | data_group.add_argument('--min-count', type=float, help="Define minimum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].")
52 | data_group.add_argument('--max-count', type=float, help="Define maximum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].")
53 |
54 | sample_group = parser.add_argument_group('Samples options')
55 | sample_group.add_argument('-j', '--top-obs-bars', type=int, default=20, help="Number of top abundant observations to show in the Samples panel, based on the avg. percentage counts/sample.")
56 |
57 | heatmap_group = parser.add_argument_group('Heatmap and clustering options')
58 | heatmap_group.add_argument('-a', '--transformation', type=str, default="log", help="Transformation of counts for Heatmap. none (counts), norm (percentage), log (log10), clr (centre log ratio).", choices=Config.transformations)
59 | heatmap_group.add_argument('-e', '--metadata-cols', type=int, default=3, help="Available metadata cols to be selected on the Heatmap panel. Higher values will slow down the report navigation.")
60 | heatmap_group.add_argument('--optimal-ordering', default=False, action='store_true', help="Activate optimal_ordering on scipy linkage method, takes longer for large number of samples.")
61 | heatmap_group.add_argument('--show-zeros', default=False, action='store_true', help="Do not skip zeros on heatmap plot. File will be bigger and iteraction with heatmap slower. By default, zeros will be omitted.")
62 | heatmap_group.add_argument('--linkage-methods', type=str, nargs="*", default=["complete"], choices=list(_LINKAGE_METHODS))
63 | heatmap_group.add_argument('--linkage-metrics', type=str, nargs="*", default=["euclidean"], choices=_METRICS_NAMES)
64 | heatmap_group.add_argument('--skip-dendrogram', default=False, action='store_true', help="Disable dendogram plots for clustering.")
65 |
66 | correlation_group = parser.add_argument_group('Correlation options')
67 | correlation_group.add_argument('-x', '--top-obs-corr', type=int, default=50, help="Number of top abundant observations to build the correlationn matrix, based on the avg. percentage counts/sample. 0 for all")
68 |
69 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + Config.version)
70 | parser.add_argument('-D', '--debug', default=False, action='store_true', help=argparse.SUPPRESS)
71 |
72 | return parser.parse_args(argv)
73 |
74 |
75 | def logo(version):
76 | print("")
77 | print(" ▄████ ██▀███ ██▓ ███▄ ▄███▓▓█████ ██▀███ ")
78 | print(" ██▒ ▀█▒▓██ ▒ ██▒▓██▒▓██▒▀█▀ ██▒▓█ ▀ ▓██ ▒ ██▒")
79 | print(" ▒██░▄▄▄░▓██ ░▄█ ▒▒██▒▓██ ▓██░▒███ ▓██ ░▄█ ▒")
80 | print(" ░▓█ ██▓▒██▀▀█▄ ░██░▒██ ▒██ ▒▓█ ▄ ▒██▀▀█▄ ")
81 | print(" ░▒▓███▀▒░██▓ ▒██▒░██░▒██▒ ░██▒░▒████▒░██▓ ▒██▒")
82 | print(" ░▒ ▒ ░ ▒▓ ░▒▓░░▓ ░ ▒░ ░ ░░░ ▒░ ░░ ▒▓ ░▒▓░")
83 | print(" ░ ░ ░▒ ░ ▒░ ▒ ░░ ░ ░ ░ ░ ░ ░▒ ░ ▒░")
84 | print(" ░ ░ ░ ░░ ░ ▒ ░░ ░ ░ ░░ ░ ")
85 | print(" ░ ░ ░ ░ ░ ░ ░ ")
86 | print(" version " + str(version))
87 | print("")
88 | print("")
89 |
--------------------------------------------------------------------------------
/grimer/css/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/css/__init__.py
--------------------------------------------------------------------------------
/grimer/css/popup.css:
--------------------------------------------------------------------------------
1 | /* (A) WRAPPER */
2 | #pop-up {
3 | position: fixed;
4 | top: 0; left: 0;
5 | z-index: 999;
6 | width: 100vw;
7 | height: 100vh;
8 | background: rgba(0, 0, 0, 0.2);
9 | visibility: hidden;
10 | opacity: 0;
11 | transition: opacity 0.1s;
12 | }
13 | #pop-up.open {
14 | visibility: visible;
15 | opacity: 1;
16 | }
17 |
18 | /* (B) BOX */
19 | #pop-box {
20 | position: relative;
21 | max-width: 70%;
22 | background: #fff;
23 | margin: 50vh auto 0 auto;
24 | transform: translateY(-50%);
25 | }
26 |
27 | /* (C) TITLE */
28 | #pop-title {
29 | padding: 5px;
30 | margin: 0;
31 | background: #868b8e;
32 | color: #fff;
33 | }
34 |
35 | /* (D) TEXT */
36 | #pop-text {
37 | border: 2px solid #868b8e;
38 | padding: 10px;
39 | margin: 0;
40 | background: #fff;
41 | color: #555;
42 | }
43 |
44 | /* (E) CLOSE BUTTON */
45 | #pop-close {
46 | position: absolute;
47 | top: 0; right: 5px;
48 | padding: 2px;
49 | color: #fff;
50 | font-size: 32px;
51 | cursor: pointer;
52 | }
53 |
--------------------------------------------------------------------------------
/grimer/decontam.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | class Decontam:
5 | cols_rank = ["freq", "prev", "p.freq", "p.prev", "p", "contaminant"]
6 |
7 | def __init__(self, df_concentration_controls):
8 | self.data = df_concentration_controls
9 | self.rank = {}
10 |
11 | def __repr__(self):
12 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()]
13 | return 'Decontam({})'.format(', '.join(args))
14 |
15 | def add_rank_results(self, rank, decontam_out_file, decontam_mod_file):
16 | self.rank[rank] = pd.read_table(decontam_out_file, sep='\t', header=0, skiprows=0, index_col=0, names=self.cols_rank, dtype={0: str})
17 |
18 | # Parse models enforcing index as string
19 | mod = pd.read_table(decontam_mod_file, sep='\t', header=0, skiprows=0, index_col=0, dtype={0: str})
20 |
21 | # Remove point counter at end (.1 or .1000)
22 | mod.index = mod.index.map(lambda txid: txid[:-5] if txid.endswith(".1000") else txid[:-2]).to_list()
23 |
24 | # Merge first point of model
25 | self.rank[rank] = self.rank[rank].merge(mod.iloc[0::2, 0], left_index=True, right_index=True)
26 |
27 | # Merge second point of model and non-contant line
28 | self.rank[rank] = self.rank[rank].merge(mod.iloc[1::2, :], suffixes=["", "_2"], left_index=True, right_index=True)
29 |
30 | def add_rank_empty(self, rank, idx):
31 | self.rank[rank] = pd.DataFrame(index=idx, columns=self.cols_rank + ["contam", "contam_2", "non.contam"])
32 | self.rank[rank]["contaminant"] = False
33 |
34 | def get_data(self):
35 | return self.data.fillna(False)
36 |
37 | def get_contaminants(self, rank, idx):
38 | return self.rank[rank].reindex(idx)["contaminant"]
39 |
40 | def get_pscore(self, rank, idx):
41 | return self.rank[rank].reindex(idx)["p"]
42 |
43 | def get_contaminant_list(self):
44 | clist = []
45 | for r in self.rank:
46 | clist.extend(self.rank[r].index[self.rank[r]["contaminant"] == True].to_list())
47 | return clist
48 |
--------------------------------------------------------------------------------
/grimer/func.py:
--------------------------------------------------------------------------------
1 | #General
2 | import numpy as np
3 | import os
4 | import sys
5 | import subprocess
6 | import shlex
7 | import pandas as pd
8 | from pandas.api.types import is_numeric_dtype
9 | import yaml
10 |
11 | #Internal
12 | from grimer.config import Config
13 | from grimer.decontam import Decontam
14 | from grimer.metadata import Metadata
15 | from grimer.reference import Reference
16 | from grimer.table import Table
17 |
18 | # Bokeh
19 | from bokeh.palettes import Category10, Category20, Colorblind, linear_palette, Turbo256
20 |
21 | # MultiTax
22 | from multitax import *
23 |
24 | #biom
25 | import biom
26 |
27 | # scikit-bio
28 | from skbio.stats.composition import clr
29 |
30 | # Scipy
31 | import scipy.cluster.hierarchy as sch
32 |
33 |
34 | def parse_config_file(config):
35 | cfg = None
36 | if config:
37 | try:
38 | with open(config, 'r') as file:
39 | cfg = yaml.safe_load(file)
40 | except Exception as e:
41 | print_log("Failed loading configuration file [" + config + "], skipping")
42 | print_log(str(e))
43 | else:
44 | print_log("Not provided, skipping")
45 | return cfg
46 |
47 |
48 | def parse_taxonomy(taxonomy, taxonomy_files):
49 | tax = None
50 | if taxonomy is not None:
51 | try:
52 | if not taxonomy_files:
53 | print_log("Downloading taxonomy")
54 | if taxonomy == "ncbi":
55 | tax = NcbiTx(files=taxonomy_files, extended_names=True)
56 | elif taxonomy == "gtdb":
57 | tax = GtdbTx(files=taxonomy_files)
58 | elif taxonomy == "silva":
59 | tax = SilvaTx(files=taxonomy_files)
60 | elif taxonomy == "greengenes":
61 | tax = GreengenesTx(files=taxonomy_files)
62 | elif taxonomy == "ott":
63 | tax = OttTx(files=taxonomy_files, extended_names=True)
64 | else:
65 | raise
66 | except Exception as e:
67 | print_log("Failed loading " + taxonomy + " taxonomy, skipping")
68 | print_log(str(e))
69 | else:
70 | print_log("Not provided, skipping")
71 | return tax
72 |
73 |
74 | def parse_table(args, tax):
75 | # Specific default params if biom file is provided
76 | if args.input_file.endswith(".biom"):
77 | if not args.level_separator:
78 | args.level_separator = ";"
79 | args.transpose = True
80 |
81 | # Read and return full table with separated total and unassigned counts (sharing same index)
82 | table_df, total, unassigned = parse_input_file(args.input_file, args.unassigned_header, args.transpose, args.sample_replace, args.cumm_levels)
83 |
84 | if table_df.empty:
85 | raise Exception("Error parsing input file")
86 |
87 | # Define if table is already normalized (0-100) or has count data
88 | if args.values == "count":
89 | normalized = False
90 | elif args.values == "normalized":
91 | normalized = True
92 | elif (table_df.sum(axis=1).round() == 100).all() or (table_df % 1 != 0).any().any():
93 | normalized = True
94 | else:
95 | normalized = False
96 |
97 | # Zero replacement
98 | try:
99 | replace_zero_value = table_df[table_df.gt(0)].min().min() / int(args.replace_zeros)
100 | except:
101 | replace_zero_value = float(args.replace_zeros)
102 | if replace_zero_value == 1 and args.transformation == "log":
103 | replace_zero_value = 0.999999 # Do not allow value 1 using log
104 |
105 | # Split table into ranks. Ranks are either in the headers in multi level tables or will be created for a one level table
106 | if args.level_separator:
107 | ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace, args.cumm_levels)
108 | else:
109 | ranked_tables, lineage = parse_single_table(table_df, args.ranks, tax, Config.default_rank_name)
110 |
111 | if not ranked_tables:
112 | raise Exception("Error parsing input file")
113 |
114 | table = Table(table_df.index, total, unassigned, lineage, normalized, replace_zero_value)
115 |
116 | print_log("")
117 | print_log("Total valid samples: " + str(len(table.samples)))
118 | # Check for long sample headers, break some plots
119 | long_sample_headers = [h for h in table_df.index if len(h) > 70]
120 | if long_sample_headers:
121 | print_log("Long sample labels/headers detected, plots may break: ")
122 | print_log("\n".join(long_sample_headers))
123 | print_log("")
124 |
125 | for r, t in ranked_tables.items():
126 | print_log("--- " + r + " ---")
127 | filtered_trimmed_t = trim_table(filter_input_table(t, total, args.min_frequency, args.max_frequency, args.min_count, args.max_count, normalized))
128 | if t.empty:
129 | print_log("No valid entries, skipping")
130 | else:
131 | # Trim table for empty zeros rows/cols
132 | table.add_rank(r, filtered_trimmed_t)
133 | print_log("Total valid observations: " + str(len(table.observations(r))))
134 |
135 | print_log("")
136 |
137 | if not normalized:
138 | print_log("Total assigned (counts): " + str(table.get_total().sum() - table.get_unassigned().sum()))
139 | print_log("Total unassigned (counts): " + str(table.get_unassigned().sum()))
140 | print_log("")
141 |
142 | return table
143 |
144 |
145 | def parse_metadata(args, samples):
146 | metadata = None
147 |
148 | # Parse metadata as DataFrame (md)
149 | md = pd.DataFrame()
150 | if args.metadata_file:
151 | # Parse table as dataframe
152 | md = pd.read_table(args.metadata_file, sep='\t', header=0, skiprows=0, index_col=0, dtype={0: str})
153 | elif args.input_file.endswith(".biom"):
154 | try:
155 | biom_in = biom.load_table(args.input_file)
156 | if biom_in.metadata() is not None:
157 | md = biom_in.metadata_to_dataframe(axis="sample")
158 | except:
159 | print_log("Error parsing metadata from BIOM file, skipping")
160 | return None
161 |
162 | if md.empty:
163 | print_log("No valid metadata, skipping")
164 | return None
165 |
166 | # Enforce string index
167 | md.index = md.index.astype('str')
168 |
169 | # Return type of columns, remove metadata row if present from metadata
170 | md_types = define_metadata_types(md)
171 |
172 | # types defined on file
173 | if str(md.index[0]).startswith("#"):
174 | # Drop row with types from main data
175 | md.drop(md_types.name, inplace=True)
176 | # Enforce column type on dataframe
177 | md[md_types[md_types == "categorical"].index] = md[md_types[md_types == "categorical"].index].astype(str)
178 | md[md_types[md_types == "numeric"].index] = md[md_types[md_types == "numeric"].index].apply(pd.to_numeric)
179 |
180 | # Convert datatypes to adequate numeric values (int, float)
181 | md = md.convert_dtypes(infer_objects=False, convert_string=False, convert_boolean=False)
182 | # Re-convert everything to object to standardize (int64 NA is not seriazable on bokeh)
183 | md = md.astype("object")
184 |
185 | # Remove empty fields
186 | null_cols = md.isna().all(axis=0)
187 | if any(null_cols):
188 | md = md.loc[:, ~null_cols]
189 | md_types = md_types[~null_cols]
190 | print_log(str(sum(null_cols)) + " metadata fields removed without valid values")
191 |
192 | # Convert NaN on categorical to ""
193 | md[md_types[md_types == "categorical"].index] = md[md_types[md_types == "categorical"].index].fillna('')
194 | # Convert boolean from categorical to String
195 | mask = md[md_types[md_types == "categorical"].index].applymap(type) != bool
196 | md[md_types[md_types == "categorical"].index] = md[md_types[md_types == "categorical"].index].where(mask, md[md_types[md_types == "categorical"].index].replace({True: 'True', False: 'False'}))
197 |
198 | # Remove names
199 | md.index.names = [None]
200 | md_types.name = None
201 |
202 | # sort and filter by given samples
203 | md = md.reindex(samples)
204 |
205 | # Check if matched metadata and samples
206 | null_rows = md.isna().all(axis=1)
207 | if any(null_rows):
208 | # Do not remove, just inform user
209 | #md = md.loc[~null_rows, :]
210 | print_log(str(sum(null_rows)) + " samples without valid metadata")
211 |
212 | if md.empty or sum(null_rows) == md.shape[0]:
213 | print_log("No valid metadata, skipping")
214 | return None
215 |
216 | metadata = Metadata(md, md_types)
217 | print_log("Samples: " + str(metadata.data.shape[0]))
218 | print_log("Numeric Fields: " + str(metadata.get_data("numeric").shape[1]))
219 | print_log("Categorical Fields: " + str(metadata.get_data("categorical").shape[1]))
220 | return metadata
221 |
222 |
223 | def define_metadata_types(metadata):
224 | # Define all COLUMN TYPES as default
225 | types = pd.Series(Metadata.default_type, index=metadata.columns)
226 | # Set types
227 | if str(metadata.index[0]).startswith("#"):
228 | # types defined on file: get values defined on the first row
229 | types = metadata.iloc[0]
230 | # Validate declared types
231 | idx_valid = types.isin(Metadata.valid_types)
232 | if not idx_valid.all():
233 | print_log("Invalid metadata types replaced by: " + Metadata.default_type)
234 | types[~idx_valid] = default_type
235 | else:
236 | # guessed types from read_table
237 | types[metadata.dtypes.map(is_numeric_dtype)] = "numeric"
238 |
239 | return types
240 |
241 |
242 | def parse_references(cfg, tax, taxonomy, ranks):
243 | references = None
244 | if cfg is not None and "references" in cfg:
245 | if taxonomy == "ncbi":
246 | references = {}
247 | for desc, sf in cfg["references"].items():
248 | references[desc] = Reference(file=sf)
249 | if tax:
250 | # Update taxids / get taxid from name
251 | references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax))
252 | for i in list(references[desc].ids.keys()):
253 | # lineage of all parent nodes (without itself)
254 | for l in tax.lineage(i)[:-1]:
255 | references[desc].add_parent(l, i)
256 | else:
257 | print_log("References only possible with ncbi taxonomy, skipping")
258 | else:
259 | print_log("No references defined in the configuration file, skipping")
260 | return references
261 |
262 |
263 | def parse_controls(cfg, table, metadata):
264 | controls = None
265 | control_samples = None
266 | if cfg is not None and "controls" in cfg:
267 | controls = {}
268 | control_samples = {}
269 | for desc, c in cfg["controls"].items():
270 | samples = set()
271 | if isinstance(c, str):
272 | # If str, it's a file with one sample per line
273 | with open(c, "r") as file:
274 | samples = file.read().splitlines()
275 | elif isinstance(c, dict):
276 | # if a dict, several metadata fields:values can be provided to set control samples
277 | for field, val in c.items():
278 | if field not in metadata.get_col_headers():
279 | print_log("Could not find " + field + " in the metadata, skipping for control " + desc)
280 | else:
281 | for v in val:
282 | samples.update(metadata.get_subset(field, v).index)
283 |
284 | if samples:
285 | obs = set()
286 | valid_samples = set()
287 | for rank in table.ranks():
288 | # Retrieve sub-table for every rank
289 | control_table = table.get_subtable(rank, samples=samples)
290 | obs.update(control_table.columns.to_list())
291 | valid_samples.update(control_table.index.to_list())
292 | # Add control observations as a reference
293 | controls[desc] = Reference(ids=obs)
294 | control_samples[desc] = list(valid_samples)
295 | print_log(desc + ": " + str(len(valid_samples)) + " samples / " + str(len(obs)) + " observations")
296 | else:
297 | print_log("Could not identify control input " + desc)
298 |
299 | else:
300 | print_log("No controls defined in the configuration file, skipping")
301 |
302 | return controls, control_samples
303 |
304 |
305 | def parse_mgnify(run_mgnify, cfg, tax, ranks):
306 | mgnify = None
307 | if run_mgnify:
308 | if cfg is not None and "mgnify" in cfg["external"]:
309 | try:
310 | mgnify = pd.read_table(cfg["external"]["mgnify"], header=None, names=["rank", "taxa", "biome", "count"])
311 | except Exception as e:
312 | print_log("Failed parsing MGnify database file [" + cfg["external"]["mgnify"] + "], skipping")
313 | print_log(str(e))
314 | return None
315 | # Filter to keep only used ranks, if provided
316 | if ranks:
317 | mgnify = mgnify.loc[mgnify['rank'].isin(ranks)]
318 | mgnify.reset_index(drop=True, inplace=True)
319 | # Convert taxids if tax is provided
320 | if tax:
321 | updated_nodes = update_tax_nodes([tuple(x) for x in mgnify[["rank", "taxa"]].to_numpy()], tax)
322 | mgnify["taxa"] = mgnify[["rank", "taxa"]].apply(lambda rt: updated_nodes[(rt[0], rt[1])] if updated_nodes[(rt[0], rt[1])] is not None else rt[1], axis=1)
323 | if mgnify.empty:
324 | mgnify = None
325 | print_log("No matches with MGnify database, skipping")
326 | else:
327 | print_log("Not defined in the configuration file, skipping")
328 | else:
329 | print_log("Not activated, skipping")
330 | return mgnify
331 |
332 |
333 | def run_correlation(table, top_obs_corr):
334 | corr = {}
335 | for rank in table.ranks():
336 | corr[rank] = {}
337 | if top_obs_corr:
338 | top_taxids = sorted(table.get_top(rank, top_obs_corr))
339 | matrix = table.get_subtable(taxids=top_taxids, rank=rank)
340 | else:
341 | top_taxids = sorted(table.observations(rank))
342 | matrix = table.data[rank]
343 |
344 | corr[rank]["observations"] = top_taxids
345 | corr[rank]["rho"] = []
346 | # No correlation with just one observation
347 | if len(matrix.columns) >= 2:
348 | rho = pairwise_rho(transform_table(matrix, 0, "clr", table.zerorep).values)
349 | if len(matrix.columns) == 2:
350 | # If there are only 2 observations, return in a float
351 | # re-format in a matrix shape
352 | rho = np.array([[np.nan, np.nan], [rho[1, 0], np.nan]])
353 | else:
354 | # fill upper triangular matrix (mirrored values) with nan to be ignored by pandas
355 | # to save half of the space
356 | rho[np.triu_indices(rho.shape[0])] = np.nan
357 |
358 | corr[rank]["rho"] = rho
359 |
360 | return corr
361 |
362 |
363 | def parse_input_file(input_file, unassigned_header, transpose, sample_replace, cumm_levels):
364 |
365 | if input_file.endswith(".biom"):
366 | table_df = biom.load_table(input_file).to_dataframe(dense=True)
367 | else:
368 | # Default input_file: index=observations, columns=samples
369 | # table_df should have samples on indices and observations on columns
370 | table_df = pd.read_table(input_file, sep='\t', index_col=0, dtype={0: str}).transpose().fillna(0)
371 | # Enforce string observations
372 | table_df.columns = table_df.columns.astype(str)
373 |
374 | # If user is providing a reverse table, turn back
375 | if transpose:
376 | table_df = table_df.transpose()
377 |
378 | # Remove header on rows
379 | table_df.index.name = None
380 |
381 | # Replace text on sample labels
382 | if sample_replace:
383 | print_log("Replacing sample values:")
384 | before_replace = table_df.head(1).index
385 | #get index as series to use replace method
386 | new_index = table_df.reset_index()["index"].replace(regex=dict(zip(sample_replace[::2], sample_replace[1::2])))
387 | table_df.set_index(new_index, inplace=True)
388 | for b, a in zip(before_replace, table_df.head(1).index):
389 | print_log(" " + b + " -> " + a)
390 | print_log(" ...")
391 |
392 | # Sum total before split unassigned or filter
393 | if cumm_levels:
394 | total = table_df.max(axis=1)
395 | else:
396 | total = table_df.sum(axis=1)
397 |
398 | # unique unassigned/unclassified for table
399 | # Separate unassigned counts column from main data frame
400 | unassigned = pd.Series(0, index=table_df.index)
401 | if unassigned_header:
402 | for header in unassigned_header:
403 | if header in table_df.columns:
404 | if isinstance(table_df[header], pd.DataFrame):
405 | # Sum in case there are several equally named headers
406 | unassigned += table_df[header].sum(axis=1)
407 | else:
408 | # return a pd.Series
409 | unassigned += table_df[header]
410 | table_df.drop(columns=header, inplace=True)
411 | else:
412 | print_log("'" + header + "' header not found")
413 |
414 | if unassigned.sum() == 0:
415 | print_log("No unassigned entries defined")
416 |
417 | print_log("Trimming table")
418 | table_df = trim_table(table_df)
419 |
420 | # Filter based on the final table
421 | unassigned = unassigned.reindex(table_df.index)
422 | total = total.reindex(table_df.index)
423 |
424 | return table_df, total, unassigned
425 |
426 |
427 | def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count, normalized):
428 |
429 | if min_count:
430 | cnt = table_df.sum().sum()
431 | if min_count < 1:
432 | table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df
433 | table_df = table_df[table_df_norm >= min_count].fillna(0)
434 | elif min_count > 1:
435 | table_df = table_df[table_df >= min_count].fillna(0)
436 | print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --min-count " + str(min_count))
437 |
438 | if max_count:
439 | cnt = table_df.sum().sum()
440 | if max_count < 1:
441 | table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df
442 | table_df = table_df[table_df_norm <= max_count].fillna(0)
443 | elif max_count > 1:
444 | table_df = table_df[table_df <= max_count].fillna(0)
445 | print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --max-count " + str(max_count))
446 |
447 | if min_frequency:
448 | cnt = table_df.shape[1]
449 | table_df_freq = table_df.gt(0).sum(axis=0)
450 | if min_frequency < 1:
451 | table_df_freq = table_df_freq / table_df.shape[0]
452 | table_df = table_df.loc[:, table_df_freq >= min_frequency]
453 | elif min_frequency > 1:
454 | table_df = table_df.loc[:, table_df_freq >= min_frequency]
455 | print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --min-frequency " + str(min_frequency))
456 |
457 | if max_frequency:
458 | cnt = table_df.shape[1]
459 | table_df_freq = table_df.gt(0).sum(axis=0)
460 | if max_frequency < 1:
461 | table_df_freq = table_df_freq / table_df.shape[0]
462 | table_df = table_df.loc[:, table_df_freq <= max_frequency]
463 | elif max_frequency > 1:
464 | table_df = table_df.loc[:, table_df_freq <= max_frequency]
465 | print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --max-frequency " + str(max_frequency))
466 |
467 | return table_df
468 |
469 |
470 | def trim_table(table_df):
471 | # Check for cols/rows with sum zero
472 | zero_rows = table_df.sum(axis=1).eq(0)
473 | if any(zero_rows):
474 | table_df = table_df.loc[~zero_rows, :]
475 | print_log(str(sum(zero_rows)) + " samples with only zero removed")
476 |
477 | zero_cols = table_df.sum(axis=0).eq(0)
478 | if any(zero_cols):
479 | table_df = table_df.loc[:, ~zero_cols]
480 | print_log(str(sum(zero_cols)) + " observations with only zero removed")
481 |
482 | return table_df
483 |
484 |
485 | def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace, cumm_levels):
486 | from grimer.grimer import _debug
487 |
488 | # Transpose table (obseravations as index) and expand ranks in columns
489 | ranks_df = table_df.T.index.str.split(level_separator, expand=True).to_frame(index=False)
490 |
491 | # For every pair of replace arguments
492 | if obs_replace:
493 | print_log("Replacing observation values:")
494 | before_replace = ranks_df.dropna().head(1).values[0]
495 | ranks_df.replace(regex=dict(zip(obs_replace[::2], obs_replace[1::2])), inplace=True)
496 | for b, a in zip(before_replace, ranks_df.dropna().head(1).values[0]):
497 | print_log(" " + b + " -> " + a)
498 | print_log(" ...")
499 |
500 | # replace entirely space or empty with NaN
501 | ranks_df = ranks_df.replace(r'^\s*$', np.nan, regex=True)
502 |
503 | # Set rank names, matching user defined or default
504 | user_ranks = False
505 | if len(ranks) == ranks_df.shape[1]:
506 | parsed_ranks = {r: ranks[r] for r in range(ranks_df.shape[1])}
507 | user_ranks = True
508 | else:
509 | print_log("Ranks provided (" + str(len(ranks)) + ") do not match file (" + str(ranks_df.shape[1]) + " levels). Using default named ranks.")
510 | parsed_ranks = {r: "rank-" + str(r) for r in range(ranks_df.shape[1])}
511 | ranks_df.rename(columns=parsed_ranks, inplace=True)
512 |
513 | # Update taxids
514 | if tax is not None:
515 | unmatched_nodes = 0
516 | for i, r in parsed_ranks.items():
517 | rank_nodes = ranks_df[r].dropna().unique()
518 |
519 | # If there is at least one valid entry
520 | if rank_nodes.any():
521 | # If user-provided ranks are matching, update nodes with rank
522 | if user_ranks:
523 | updated_nodes = {node: unode for (rank, node), unode in update_tax_nodes([(r, n) for n in rank_nodes], tax).items()}
524 | else:
525 | updated_nodes = update_tax_nodes(rank_nodes, tax)
526 |
527 | # Add nan to keep missing ranks (different than tax.undefined_node [None] which will keep the name)
528 | updated_nodes[np.nan] = np.nan
529 | ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] is not None else t)
530 | del updated_nodes[np.nan]
531 |
532 | unmatched_nodes += list(updated_nodes.values()).count(tax.undefined_node)
533 |
534 | if unmatched_nodes:
535 | print_log(str(unmatched_nodes) + " observations not found in taxonomy (but kept)")
536 |
537 | # Check unique lineage
538 | for i, r in parsed_ranks.items():
539 | if i > 0:
540 | lin_count = ranks_df.iloc[:, :i+1].drop_duplicates().groupby(r).count()
541 | invalid = lin_count[(lin_count > 1).any(axis=1)].index.to_list()
542 | if invalid:
543 | print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r)
544 | if _debug:
545 | print_log(",".join(invalid) + " observations removed with invalid lineage at " + r)
546 | # Set to NaN to keep shape of ranks_df
547 | ranks_df.loc[ranks_df[r].isin(invalid), r] = np.nan
548 |
549 | ranked_tables = {}
550 | for i, r in parsed_ranks.items():
551 | # ranks_df and table_df.T have the same shape
552 | ranked_table_df = pd.concat([ranks_df[r], table_df.T.reset_index(drop=True)], axis=1)
553 | if cumm_levels:
554 | ranked_tables[r] = ranked_table_df.groupby([r], dropna=True).max().T
555 | else:
556 | ranked_tables[r] = ranked_table_df.groupby([r], dropna=True).sum().T
557 | ranked_tables[r].columns.name = None
558 |
559 | lineage = ranks_df
560 | return ranked_tables, lineage
561 |
562 |
563 | def parse_single_table(table_df, ranks, tax, default_rank_name):
564 |
565 | # Update taxids
566 | if tax is not None:
567 | print(table_df)
568 | updated_nodes = update_tax_nodes(table_df.columns, tax)
569 | unmatched_nodes = list(updated_nodes.values()).count(tax.undefined_node)
570 | if unmatched_nodes:
571 | print_log(str(unmatched_nodes) + " observations not found in taxonomy")
572 |
573 | for node, upd_node in updated_nodes.items():
574 | if upd_node is not None and upd_node != node:
575 | # If updated node is a merge on an existing taxid, sum values
576 | if upd_node in table_df:
577 | table_df[upd_node] += table_df[node]
578 | table_df.drop(columns=node, inplace=True)
579 | print_log("Updated and merged taxonomic nodes: " + node + " -> " + upd_node)
580 | else:
581 | table_df.rename(columns={node: upd_node}, inplace=True)
582 | print_log("Updated taxonomic node: " + node + " -> " + upd_node)
583 |
584 |
585 | # Generate ranks
586 | ranked_tables = {}
587 | for rank in ranks:
588 | # Special case for "default" rank
589 | if rank == default_rank_name:
590 | ranked_tables[rank] = table_df
591 | else:
592 | taxid_parent_rank = {i: tax.parent_rank(tax.latest(i), rank) for i in table_df.columns}
593 | rank_df = pd.DataFrame(index=table_df.index)
594 | for taxid, parent_rank_taxid in taxid_parent_rank.items():
595 | if parent_rank_taxid is None:
596 | #no_rank += 1
597 | continue
598 | if parent_rank_taxid not in rank_df:
599 | rank_df[parent_rank_taxid] = 0
600 | rank_df[parent_rank_taxid] += table_df[taxid]
601 |
602 | if not rank_df.empty:
603 | ranked_tables[rank] = rank_df
604 |
605 |
606 | # Generate lineage
607 | if tax:
608 | lineage = pd.DataFrame(list(map(lambda t: tax.lineage(t, ranks=list(ranked_tables.keys())), table_df.columns)), columns=list(ranked_tables.keys()))
609 | else:
610 | lineage = pd.DataFrame()
611 |
612 | return ranked_tables, lineage
613 |
614 |
615 | def transform_table(df, total_counts, transformation, replace_zero_value):
616 | # Special case clr with one observation (result in zeros)
617 | if transformation == "clr" and df.shape[1] == 1:
618 | print_log("WARNING: using log instead of clr with one observation")
619 | transformation = "log"
620 |
621 | if transformation == "log":
622 | transformed_df = (df + replace_zero_value).apply(np.log10)
623 | elif transformation == "clr":
624 | transformed_df = pd.DataFrame(clr(df + replace_zero_value), index=df.index, columns=df.columns)
625 | elif transformation == "norm":
626 | transformed_df = df.divide(total_counts, axis=0) + replace_zero_value
627 | else:
628 | transformed_df = df + replace_zero_value
629 |
630 | return transformed_df
631 |
632 |
633 | def update_tax_nodes(nodes, tax):
634 | """
635 | nodes can be a list of strings: taxids or names or a list of tuples with (rank, taxid/name)
636 | Return a dictionary mapping nodes and updated nodes (or None)
637 | First look for id, if nothing found, lookup by unique name
638 | """
639 |
640 | updated_nodes = {}
641 | for node in nodes:
642 | if isinstance(node, tuple):
643 | r = node[0]
644 | n = node[1]
645 | else:
646 | r = None
647 | n = node
648 |
649 | # Either returns same node, updated or tax.undefined_node (None)
650 | updated_taxid = tax.latest(n)
651 | if updated_taxid:
652 | # Assign updated or same taxid
653 | updated_nodes[node] = updated_taxid
654 | else:
655 | names = tax.search_name(n, rank=r, exact=True)
656 | # Assign taxid if found unique name only
657 | if names and len(names) == 1:
658 | updated_nodes[node] = names[0]
659 | else:
660 | updated_nodes[node] = tax.undefined_node
661 |
662 | return updated_nodes
663 |
664 |
665 | def run_decontam(run_decontam, cfg, table, metadata, control_samples, script_dir):
666 |
667 | if not run_decontam:
668 | print_log("Not activated, skipping")
669 | return None
670 |
671 | if cfg is None:
672 | print_log("Not defined in the configuration file, skipping")
673 | return None
674 |
675 | df_decontam = pd.DataFrame(index=table.samples, columns=["concentration", "controls"])
676 | cfg_decontam = cfg["external"]["decontam"]
677 | tmp_output_prefix = "tmp_"
678 |
679 | # Collect metadata for DECONTAM (concentrations to use frequency and control for prevalence)
680 | out_table = tmp_output_prefix + "table_counts.tsv"
681 | out_concentration = tmp_output_prefix + "concentration_counts.tsv"
682 | out_controls = tmp_output_prefix + "control_samples_list.txt"
683 | if cfg_decontam["method"] in ["frequency", "combined"]:
684 | out_concentration = tmp_output_prefix + "concentration_counts.tsv"
685 | # Load frequency file, if provided
686 | if "frequency_file" in cfg_decontam:
687 | if os.path.isfile(cfg_decontam["frequency_file"]):
688 | # Load concentrations from file and sort (reindex) based on table inputs
689 | df_decontam["concentration"] = pd.read_table(cfg_decontam["frequency_file"], sep='\t', header=None, skiprows=0, index_col=0).reindex(table.samples)
690 | # If any entry is unknown, input is incomplete
691 | if df_decontam["concentration"].isnull().values.any():
692 | print_log("File " + cfg_decontam["frequency_file"] + " is incomplete (Missing: " + ",".join(df_decontam[df_decontam.isnull().any(axis=1)].index.to_list()) + "), skipping")
693 | return None
694 | else:
695 | print_log("File " + cfg_decontam["frequency_file"] + " not found, skipping")
696 | return None
697 | elif "frequency_metadata" in cfg_decontam:
698 | if cfg_decontam["frequency_metadata"] in metadata.get_col_headers():
699 | # Get concentrations from metadata
700 | df_decontam["concentration"] = metadata.get_col(cfg_decontam["frequency_metadata"])
701 | else:
702 | print_log("Could not find " + cfg_decontam["frequency_metadata"] + " in the metadata, skipping.")
703 | return None
704 | elif not table.normalized:
705 | # Use total from table
706 | print_log("No concentration provided, using total counts as concentration (frequency for DECONTAM)")
707 | df_decontam["concentration"] = table.get_total()
708 | else:
709 | print_log("Cannot run DECONTAM without defined concentration and normalized input values, skipping")
710 | return None
711 | # Print concentrations to file
712 | df_decontam["concentration"].to_csv(out_concentration, sep="\t", header=False, index=True)
713 |
714 | if cfg_decontam["method"] in ["prevalence", "combined"]:
715 | control_list = set()
716 | if "prevalence_file" in cfg_decontam:
717 | for file in cfg_decontam["prevalence_file"]:
718 | if os.path.isfile(file):
719 | # Load controls from file
720 | control_list.update([line.rstrip() for line in open(file)])
721 | else:
722 | print_log("File not found " + file)
723 | elif "prevalence_metadata" in cfg_decontam:
724 | # if a dict, several metadata fields:values can be provided to set control samples
725 | for field, val in cfg_decontam["prevalence_metadata"].items():
726 | if field not in metadata.get_col_headers():
727 | print_log("Could not find " + field + " in the metadata, skipping for decontam (prevalence)")
728 | else:
729 | for v in val:
730 | control_list.update(metadata.get_subset(field, v).index)
731 | else:
732 | # Use all samples passed as controls
733 | for cs in control_samples.values():
734 | control_list.update(cs)
735 |
736 | # Select valid controls
737 | df_decontam["controls"] = table.samples.isin(control_list)
738 |
739 | if df_decontam["controls"].any():
740 | print_log(str(df_decontam["controls"].sum()) + " valid control samples to be used by DECONTAM")
741 | outf = open(out_controls, "w")
742 | print("\n".join(df_decontam.index[df_decontam["controls"]]), file=outf)
743 | outf.close()
744 | else:
745 | print_log("Could not find valid control entries, skipping")
746 | return None
747 |
748 | decontam = Decontam(df_decontam)
749 | # Run DECONTAM for each for each
750 | for rank in table.ranks():
751 | if len(table.observations(rank)) == 1:
752 | decontam.add_rank_empty(rank, table.observations(rank))
753 | else:
754 | # normalize and write temporary table for each rank
755 | if not table.normalized:
756 | transform_table(table.data[rank], table.get_total()[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True)
757 | else:
758 | table.data[rank].to_csv(out_table, sep="\t", header=True, index=True)
759 |
760 | cmd = " ".join([os.path.join(script_dir, "scripts", "run_decontam.R"),
761 | "--resout " + tmp_output_prefix + "decontam_out.tsv",
762 | "--modout " + tmp_output_prefix + "decontam_mod.tsv",
763 | "--counts " + out_table,
764 | "--concentrations " + out_concentration if cfg_decontam["method"] in ["frequency", "combined"] else "",
765 | "--controls " + out_controls if cfg_decontam["method"] in ["prevalence", "combined"] else "",
766 | "--method " + cfg_decontam["method"],
767 | "--threshold " + str(cfg_decontam["threshold"])])
768 | stdout, stderr = run_cmd(cmd)
769 |
770 | decontam.add_rank_results(rank, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv")
771 |
772 | for file in [out_table, out_concentration, out_controls, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv"]:
773 | if os.path.isfile(file):
774 | os.remove(file)
775 |
776 | return decontam
777 |
778 |
779 | def run_hclustering(table, linkage_methods, linkage_metrics, transformation, skip_dendrogram, optimal_ordering):
780 | hcluster = {}
781 | dendro = {}
782 |
783 | for rank in table.ranks():
784 |
785 | # Get .values of transform, numpy array
786 | matrix = transform_table(table.data[rank], table.get_total(), transformation, table.zerorep).values
787 |
788 | hcluster[rank] = {}
789 | dendro[rank] = {}
790 | for method in linkage_methods:
791 | hcluster[rank][method] = {}
792 | dendro[rank][method] = {}
793 | for metric in linkage_metrics:
794 | hcluster[rank][method][metric] = {}
795 | hcluster[rank][method][metric]["x"] = {}
796 | hcluster[rank][method][metric]["y"] = {}
797 |
798 | #H.clustering, returning dendrogram
799 | # Only one observation does not cluster
800 | if matrix.shape[1] > 1:
801 | x = sch.dendrogram(sch.linkage(matrix.transpose(), method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True)
802 | hcluster[rank][method][metric]["x"]["index"] = table.observations(rank)[x["leaves"]].to_list()
803 | else:
804 | hcluster[rank][method][metric]["x"]["index"] = table.observations(rank).to_list()
805 |
806 | # Only one samples does not cluster
807 | if matrix.shape[0] > 1:
808 | y = sch.dendrogram(sch.linkage(matrix, method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True)
809 | hcluster[rank][method][metric]["y"]["index"] = table.samples[y["leaves"]].to_list()
810 | else:
811 | hcluster[rank][method][metric]["y"]["index"] = table.samples.to_list()
812 |
813 | if not skip_dendrogram:
814 | dendro[rank][method][metric] = {}
815 | dendro[rank][method][metric]["y"] = {}
816 | dendro[rank][method][metric]["x"] = {}
817 |
818 | # Save dendrogram values and colors
819 | xs, ys, colors = [[]] * 3
820 | if matrix.shape[1] > 1:
821 | xs, ys, colors = dendro_lines_color(x, "x")
822 | dendro[rank][method][metric]["x"]["xs"] = xs
823 | dendro[rank][method][metric]["x"]["ys"] = ys
824 | dendro[rank][method][metric]["x"]["colors"] = colors
825 | if matrix.shape[0] > 1:
826 | xs, ys, colors = dendro_lines_color(y, "y")
827 | dendro[rank][method][metric]["y"]["xs"] = xs
828 | dendro[rank][method][metric]["y"]["ys"] = ys
829 | dendro[rank][method][metric]["y"]["colors"] = colors
830 |
831 | return hcluster, dendro
832 |
833 |
834 | def dendro_lines_color(dendro, axis):
835 | icoord = pd.DataFrame(dendro["icoord"])
836 | icoord = icoord * ((len(dendro["icoord"]) + 0.5) / icoord.max().max())
837 | icoord = icoord.values.tolist()
838 | if axis == "y":
839 | dcoord = dendro["dcoord"]
840 | else:
841 | dcoord = [[-j for j in i] for i in dendro['dcoord']]
842 |
843 | color_list = dendro["color_list"]
844 | unique_colors = sorted(set(color_list))
845 | cp = make_color_palette(len(unique_colors))
846 | colors = [cp[unique_colors.index(colorid)] for colorid in color_list]
847 |
848 | if axis == "y":
849 | return dcoord, icoord, colors
850 | else:
851 | return icoord, dcoord, colors
852 |
853 |
854 | def pairwise_vlr(mat):
855 | cov = np.cov(mat.T, ddof=1)
856 | diagonal = np.diagonal(cov)
857 | return -2 * cov + diagonal[:, np.newaxis] + diagonal
858 |
859 |
860 | def pairwise_rho(mat):
861 | variances = np.var(mat, axis=0, ddof=1)
862 | return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances))
863 |
864 |
865 | def include_scripts(scripts):
866 | # Insert global js functions and css and return template
867 | template = "{% block postamble %}"
868 | for file, t in scripts.items():
869 | with open(file, 'r') as file:
870 | template += "<" + t + ">"
871 | template += "".join(file.readlines())
872 | template += "" + t + ">"
873 | template += "{% endblock %}"
874 | return template
875 |
876 |
877 | def format_js_toString(val):
878 | # Transform numeric value to float and string to match toString
879 | return str(float(val)) if isinstance(val, (int, float)) else str(val)
880 |
881 |
882 | def make_color_palette(n_colors, linear: bool=False, palette: dict=None):
883 | if isinstance(palette, dict) and n_colors <= max(palette.keys()):
884 | # Special case for 1 and 2 (not in palettes)
885 | palette = palette[3 if n_colors < 3 else n_colors]
886 |
887 | if linear or n_colors > 20:
888 | if not palette:
889 | palette = Turbo256
890 | if n_colors <= 256:
891 | return linear_palette(palette, n_colors)
892 | else:
893 | # Repeat colors
894 | return [palette[int(i * 256.0 / n_colors)] for i in range(n_colors)]
895 | else:
896 | # Select color palette based on number of requested colors
897 | # Return the closest palette with most distinc set of colors
898 | if not palette:
899 | if n_colors <= 8:
900 | palette = Colorblind[8]
901 | elif n_colors <= 10:
902 | palette = Category10[10]
903 | elif n_colors <= 20:
904 | palette = Category20[20]
905 | else:
906 | palette = Turbo256
907 |
908 | return palette[:n_colors]
909 |
910 | def run_cmd(cmd, print_stderr: bool=False, exit_on_error: bool=True):
911 | errcode = 0
912 | stdout = ""
913 | stderr = ""
914 | try:
915 | process = subprocess.Popen(shlex.split(cmd),
916 | universal_newlines=True,
917 | stdout=subprocess.PIPE,
918 | stderr=subprocess.PIPE)
919 | # wait for the process to terminate
920 | stdout, stderr = process.communicate()
921 | errcode = process.returncode
922 | if exit_on_error and errcode != 0:
923 | raise Exception()
924 | if print_stderr and stderr:
925 | print_log(stderr)
926 |
927 | except Exception as e:
928 | print_log('The following command failed to run:\n' + cmd)
929 | print_log(str(e))
930 | print_log("Error code: " + str(errcode))
931 | print_log("Out: ")
932 | if stdout:
933 | print_log(stdout)
934 | print_log("Error: ")
935 | if stderr:
936 | print_log(stderr)
937 | sys.exit(errcode)
938 |
939 | return stdout, stderr
940 |
941 |
942 | def print_log(text):
943 | sys.stderr.write(text + "\n")
944 | sys.stderr.flush()
945 |
946 |
947 | def print_df(df, name: str=None):
948 | from grimer.grimer import _debug
949 | if _debug:
950 | print(name)
951 | if isinstance(df, dict):
952 | if df:
953 | print(len(df.keys()), "keys:", list(df.keys())[0], "...", list(df.keys())[-1])
954 | #print(list(df.values())[0], "...", list(df.values())[-1])
955 | else:
956 | #print(df.columns)
957 | print(df.head())
958 | print(df.shape)
959 | print("size:", sys.getsizeof(df))
960 | print("-----------------------------------------------")
961 |
962 |
963 | def print_logo_cli(version):
964 | print_log("==================")
965 | print_log(" ╔═╗╦═╗╦╔╦╗╔═╗╦═╗ ")
966 | print_log(" ║ ╦╠╦╝║║║║║╣ ╠╦╝ ")
967 | print_log(" ╚═╝╩╚═╩╩ ╩╚═╝╩╚═ ")
968 | print_log(" v" + version)
969 | print_log("==================")
970 |
--------------------------------------------------------------------------------
/grimer/grimer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | _debug = False
3 |
4 | #General
5 | import sys
6 |
7 | #Internal
8 | from grimer.callbacks import *
9 | from grimer.cds import *
10 | from grimer.config import Config, logo
11 | from grimer.layout import *
12 | from grimer.plots import *
13 | from grimer.func import *
14 |
15 | #Bokeh
16 | from bokeh.io import save
17 | from bokeh.plotting import output_file
18 |
19 |
20 | def main(argv=sys.argv[1:]):
21 | """
22 | GRIMER code overview
23 | 1) Load data/analysis: parse configuration, load files and run analysis into data objects
24 | e.g. args.input_file to Table() and decontam
25 | 2) Generata data sources: Convert objects and analysis int cds/dict
26 | e.g. table to cds_m_obstable
27 | 3) Plot elements: plot figures and widgets based on cds/dict (and some objects)
28 | e.g cds_m_obstable to ele["obstable"]["fig"]
29 | 4) Link javascript: link data sources and javascript custom callbacks
30 | 5) Draw layout: Put elements into layout scheme and generate report
31 | """
32 |
33 | # Parse CLI arguments
34 | args = Config(argv)
35 |
36 | # Setup global _debug variable to be used by other files with #from grimer.grimer import _debug
37 | global _debug
38 | _debug = args.debug
39 | # Define path of running script to get static files
40 | script_dir, _ = os.path.split(__file__)
41 |
42 | # 1) Load data/analysis
43 | # If not parsed, skipped or error, var is None
44 | cfg = None
45 | tax = None
46 | table = None
47 | metadata = None
48 | references = None
49 | controls = None
50 | control_samples = None
51 | hcluster = None
52 | dendro = None
53 | corr = None
54 |
55 | print_log("- Parsing configuration file")
56 | cfg = parse_config_file(args.config)
57 |
58 | print_log("- Parsing taxonomy")
59 | tax = parse_taxonomy(args.taxonomy, args.taxonomy_files)
60 |
61 | print_log("- Parsing input table")
62 | try:
63 | table = parse_table(args, tax)
64 | except Exception as e:
65 | print(e)
66 | return 1
67 |
68 | print_log("- Parsing metadata")
69 | metadata = parse_metadata(args, table.samples.to_list())
70 |
71 | print_log("- Parsing references")
72 | references = parse_references(cfg, tax, args.taxonomy, table.ranks())
73 |
74 | print_log("- Parsing controls")
75 | controls, control_samples = parse_controls(cfg, table, metadata)
76 |
77 | print_log("- Parsing MGnify database")
78 | mgnify = parse_mgnify(args.mgnify, cfg, tax, table.ranks())
79 |
80 | print_log("- Running DECONTAM")
81 | decontam = run_decontam(args.decontam, cfg, table, metadata, control_samples, script_dir)
82 |
83 | print_log("- Running hierarchical clustering")
84 | hcluster, dendro = run_hclustering(table, args.linkage_methods, args.linkage_metrics, args.transformation, args.skip_dendrogram, args.optimal_ordering)
85 |
86 | print_log("- Running correlation")
87 | corr = run_correlation(table, args.top_obs_corr)
88 |
89 | # 2) Generata data sources:
90 | # cds (ColumnDataSource) and dict containers: data structures loaded and parsed by bokehjs
91 | # "cds" for matrix like dataframes with fixed column sizes
92 | # "dict" for variable column sizes
93 | # _p_ : plot -> direct source of figures either pre-loaded or empty
94 | # _d_ : data -> auxiliar containers to be used/shared among plots
95 | # usually by copying and/or transforming values into a _p_ container
96 | # _m_ : mixed -> contain both plot and data properties
97 |
98 | print_log("- Generating data sources")
99 | # _m_
100 | # df: index (unique observations), col|..., tax|..., aux|ref
101 | cds_m_obstable = cds_obstable(table, tax, references, controls, control_samples, decontam)
102 | # _p_
103 | # df: index (unique sample-ids), aux|..., bar|..., tax|...
104 | cds_p_samplebars = cds_samplebars(table)
105 | # stacked: index (repeated observations), rank, ref, direct, parent
106 | cds_p_references = cds_plot_references(table, tax, references)
107 | # matrix: index (unique sample-ids), concentrations, controls, counts
108 | cds_p_decontam = cds_plot_decontam(decontam) if decontam else None
109 | # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]}
110 | cds_p_decontam_models = cds_plot_decontam_models(decontam) if decontam else None
111 | # stacked: index (taxa, level, lineage), count, perc
112 | cds_p_mgnify = cds_mgnify(mgnify, table, tax) if mgnify is not None else None
113 | # stacked: index (repeated sample-ids), obs, rank, ov, tv
114 | cds_p_heatmap = cds_heatmap(table, args.transformation, args.show_zeros)
115 | # matrix: index (unique sample-ids), md0, md1, ..., md(args.metadata_cols) -> (metadata field, metadata values)
116 | cds_p_metadata = cds_plot_metadata(metadata, args.metadata_cols) if metadata else None
117 | # stacked: index (repeated observations), rank, annot
118 | cds_p_annotations = cds_annotations(table, references, controls, decontam, control_samples)
119 | # empty matrix {"x": [], "y": [], "c": []}
120 | cds_p_dendro_x, cds_p_dendro_y = cds_plot_dendro() if not args.skip_dendrogram else [None, None]
121 | # stacked: index (repeated observations), other observation, rank, rho
122 | cds_p_correlation = cds_correlation(table, corr)
123 | # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors
124 | cds_p_obsbars = cds_obsbars(table, args.top_obs_bars)
125 | # df: index (unique sample-ids), col|...
126 | cds_p_sampletable = cds_sampletable(table)
127 | # _d_
128 | # df: index (unique sample-ids), aux|..., cnt|...,
129 | cds_d_samples = cds_samples(table, references, controls, decontam)
130 | # matrix: index (unique sample-ids) x columns (metadata fields) -> metadata values
131 | cds_d_metadata = cds_metadata(metadata) if metadata else None
132 | # {taxid: (contam_y1, contam_y2, non_contam_y, pval)}
133 | cds_d_decontam = cds_decontam(decontam, table.ranks()) if decontam else None
134 | # key = rank + "|" + method + "|" + metric
135 | # y: {"default": sorted sample-ids, key: sorted sample-ids, ...}
136 | # x: {"default|rank": sorted sample-ids, key: sorted sample-ids, ...}
137 | dict_d_hcluster_x, dict_d_hcluster_y = dict_hcluster(table, hcluster)
138 | # {key+"|x": x-values, key+"|y": y-values , key+"|c": colors}
139 | dict_d_dedro_x, dict_d_dedro_y = dict_dendro(table, dendro) if not args.skip_dendrogram else [None, None]
140 | # {taxid: name}
141 | dict_d_taxname = dict_taxname(tax, [txid for rank in table.ranks() for txid in table.observations(rank)])
142 | # {rank: [taxid1,taxid2, ..., taxid(top_obs_bars)]}
143 | dict_d_topobs = dict_topobs(table, args.top_obs_bars)
144 | # {taxid: {source: {desc: [refs]}}
145 | dict_d_refs = dict_refs(table, references)
146 | # dict: {rank: {obs: {sample: count}}}
147 | dict_d_sampleobs = dict_sampleobs(table)
148 |
149 | # 3) Plot elements
150 | print_log("- Plotting elements")
151 |
152 | # Defined fixed layout and plot sizes
153 | sizes = {}
154 | sizes["overview_top_panel_height"] = 300
155 | sizes["overview_top_panel_width_left"] = 250
156 | sizes["overview_top_panel_width_right"] = 450
157 |
158 | # Elements to plot
159 | # ele[name]["fig"] -> main figure/element
160 | # ele[name]["filter"] -> filter to the figure
161 | # ele[name]["wid"][widget1] -> widgets to the figure
162 | ele = {}
163 |
164 | # obstable
165 | ele["obstable"] = {}
166 | ele["obstable"]["fig"], ele["obstable"]["filter"] = plot_obstable(sizes, cds_m_obstable, table.ranks(), references, controls)
167 | ele["obstable"]["wid"] = plot_obstable_widgets(sizes, dict_d_taxname, max(cds_m_obstable.data["col|total_counts"]))
168 |
169 | # infopanel
170 | ele["infopanel"] = {}
171 | ele["infopanel"]["textarea"] = plot_infopanel()
172 |
173 | # references
174 | ele["references"] = {}
175 | if references:
176 | ele["references"]["fig"], ele["references"]["filter"] = plot_references(sizes, table, cds_p_references, dict_d_taxname)
177 | else:
178 | ele["references"]["fig"], ele["references"]["filter"] = None, None
179 | ele["references"]["wid"] = plot_references_widgets(sizes, references)
180 |
181 | # mgnify
182 | ele["mgnify"] = {}
183 | if cds_p_mgnify:
184 | ele["mgnify"]["fig"], ele["mgnify"]["filter"] = plot_mgnify(sizes, cds_p_mgnify)
185 | else:
186 | ele["mgnify"]["fig"], ele["mgnify"]["filter"] = None, None
187 | ele["mgnify"]["wid"] = plot_mgnify_widgets()
188 |
189 | # decontam
190 | ele["decontam"] = {}
191 | ele["decontam"]["wid"] = {}
192 | if decontam:
193 | ele["decontam"]["fig"] = plot_decontam(sizes, cds_p_decontam, cds_p_decontam_models, table.get_min_valid_count_perc())
194 | else:
195 | ele["decontam"]["fig"] = None
196 | ele["decontam"]["wid"] = plot_decontam_widgets(sizes)
197 |
198 | # samplebars
199 | ele["samplebars"] = {}
200 | ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, table)
201 | ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, references, controls, decontam, table.normalized)
202 |
203 | # sampletable
204 | ele["sampletable"] = {}
205 | ele["sampletable"]["fig"] = plot_sampletable(cds_p_sampletable, sizes, table.ranks())
206 | ele["sampletable"]["wid"] = plot_sampletable_widgets(sizes, max(cds_p_sampletable.data["col|total"]), metadata)
207 |
208 | # heatmap
209 | tools_heatmap = "hover,save,box_zoom,reset,crosshair,box_select"
210 | ele["heatmap"] = {}
211 | ele["heatmap"]["fig"] = plot_heatmap(table, cds_p_heatmap, tools_heatmap, args.transformation, dict_d_taxname)
212 | ele["heatmap"]["wid"] = plot_heatmap_widgets(table.ranks(), args.linkage_methods, args.linkage_metrics, references, controls, metadata, decontam)
213 |
214 | # metadata (heatmap)
215 | ele["metadata"] = {}
216 | ele["metadata"]["wid"] = {}
217 | if metadata:
218 | ele["metadata"]["fig"], ele["metadata"]["wid"] = plot_metadata(ele["heatmap"]["fig"], tools_heatmap, metadata, cds_d_metadata, cds_p_metadata)
219 | else:
220 | ele["metadata"]["fig"] = Spacer()
221 | ele["metadata"]["wid"]["metadata_multiselect"] = Spacer()
222 | ele["metadata"]["wid"]["legend_colorbars"] = Spacer()
223 | ele["metadata"]["wid"]["toggle_legend"] = Spacer()
224 |
225 | # annotations
226 | ele["annotations"] = {}
227 | if cds_p_annotations.data["index"].size:
228 | ele["annotations"]["fig"] = plot_annotations(ele["heatmap"]["fig"], tools_heatmap, cds_p_annotations, dict_d_taxname)
229 | else:
230 | ele["annotations"]["fig"] = Spacer()
231 |
232 | # dendrograms
233 | ele["dendrox"] = {}
234 | ele["dendroy"] = {}
235 | if not args.skip_dendrogram:
236 | ele["dendrox"]["fig"], ele["dendroy"]["fig"] = plot_dendrogram(ele["heatmap"]["fig"], tools_heatmap, cds_p_dendro_x, cds_p_dendro_y)
237 | else:
238 | ele["dendrox"]["fig"] = Spacer()
239 | ele["dendroy"]["fig"] = Spacer()
240 |
241 | # correlation
242 | ele["correlation"] = {}
243 | ele["correlation"]["fig"], ele["correlation"]["filter"] = plot_correlation(cds_p_correlation, table.ranks(), dict_d_taxname)
244 | ele["correlation"]["wid"] = plot_correlation_widgets(table.ranks(), args.top_obs_corr)
245 |
246 | # obsbars
247 | ele["obsbars"] = {}
248 | ele["obsbars"]["wid"] = plot_obsbars_widgets(table.ranks(), metadata, dict_d_topobs, dict_d_taxname, args.top_obs_bars)
249 | ele["obsbars"]["fig"], ele["obsbars"]["legend"] = plot_obsbars(cds_p_obsbars, dict_d_topobs, table.ranks(), args.top_obs_bars, dict_d_taxname, ele["obsbars"]["wid"]["rank_select"])
250 |
251 | #4) Link javascript:
252 | print_log("- Linking javascript")
253 |
254 | link_obstable_filter(ele, cds_m_obstable, table.ranks())
255 | link_obstable_samplebars(ele,
256 | cds_m_obstable,
257 | cds_p_samplebars,
258 | cds_d_samples,
259 | dict_d_sampleobs,
260 | cds_d_metadata,
261 | cds_p_decontam,
262 | cds_p_decontam_models,
263 | cds_d_decontam,
264 | cds_p_references,
265 | table.ranks(),
266 | table.get_min_valid_count_perc(),
267 | table.get_total().max(),
268 | cds_p_mgnify,
269 | dict_d_refs,
270 | dict_d_taxname)
271 | link_heatmap_widgets(ele,
272 | cds_d_samples,
273 | cds_d_metadata,
274 | cds_p_metadata,
275 | dict_d_hcluster_x,
276 | dict_d_hcluster_y,
277 | cds_p_dendro_x,
278 | cds_p_dendro_y,
279 | dict_d_dedro_x,
280 | dict_d_dedro_y,
281 | cds_p_annotations,
282 | cds_m_obstable,
283 | cds_p_heatmap,
284 | table.ranks(),
285 | dict_d_taxname)
286 | link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, args.metadata_cols)
287 | link_correlation_widgets(ele, cds_p_correlation)
288 | link_obsbars_widgets(ele,
289 | cds_p_obsbars,
290 | dict_d_topobs,
291 | dict_d_sampleobs,
292 | cds_d_samples,
293 | args.top_obs_bars,
294 | dict_d_taxname,
295 | cds_d_metadata,
296 | cds_p_sampletable)
297 | link_sampletable_select(ele, cds_p_sampletable, cds_d_metadata)
298 |
299 | # 5) Draw layout
300 | print_log("- Drawing layout")
301 | logo_path = os.path.join(script_dir, "img", "logo.png")
302 |
303 | final_layout = make_layout(ele, sizes, Config.version, logo_path, args.title, args.output_plots)
304 | if final_layout is None:
305 | return 1
306 |
307 | template = include_scripts({os.path.join(script_dir, "js", "func.js"): "script",
308 | os.path.join(script_dir, "js", "popup.js"): "script",
309 | os.path.join(script_dir, "css", "popup.css"): "style"})
310 |
311 | if args.full_offline:
312 | mode = "inline" # configure to provide entire Bokeh JS and CSS inline
313 | elif _debug:
314 | mode = "absolute-dev" # non-minimized - configure to load from the installed Bokeh library static directory
315 | else:
316 | mode = "cdn" # configure to load Bokeh JS and CSS from https://cdn.bokeh.org
317 |
318 | # setup output file and JS mode
319 | print_log("- Saving report")
320 | output_file(args.output_html, title="GRIMER" if not args.title else "GRIMER - " + args.title, mode=mode)
321 | save(final_layout, template=template)
322 | print_log("File: " + args.output_html)
323 | file_size_bytes = os.path.getsize(args.output_html)
324 | print_log("Size: " + str(file_size_bytes) + " bytes (" + '{0:.2f} MB'.format(file_size_bytes / float(1024 ** 2)) + ")")
325 |
326 | if __name__ == "__main__":
327 | main()
328 |
--------------------------------------------------------------------------------
/grimer/img/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/img/__init__.py
--------------------------------------------------------------------------------
/grimer/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/img/logo.png
--------------------------------------------------------------------------------
/grimer/js/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/js/__init__.py
--------------------------------------------------------------------------------
/grimer/js/func.js:
--------------------------------------------------------------------------------
1 | function sort_numeric(a, b){ return a - b; }
2 | function sort_string(a, b){ return a.localeCompare(b); }
3 |
4 | function grimer_sort(factors, sort_col, sort_mode="numeric", desc=false, group_col1=[], group_col2=[], index=[]) {
5 | //sort_mode : numeric, string
6 |
7 | // subset data if index provided
8 | if(index.length){
9 | factors = index.map( s => factors[s] );
10 | sort_col = index.map( s => sort_col[s] );
11 | if(group_col1.length){
12 | group_col1 = index.map( s => group_col1[s] );
13 | }
14 | if(group_col2.length){
15 | group_col2 = index.map( s => group_col2[s] );
16 | }
17 | }
18 |
19 | // Generate numerical index to sort arrays
20 | var idx = new Array(factors.length);
21 | for (var i = 0; i < idx.length; ++i) idx[i] = i;
22 | //If numeric, replace NaN with sortable value (-Infinity) to be at the end of the sorted array
23 | if (sort_mode=="numeric"){
24 | sort_col = sort_col.map(function(v){ return isNaN(v) ? -Infinity : v })
25 | }
26 |
27 | if(group_col1.length && group_col2.length){
28 | if (sort_mode=="numeric" && desc==false)
29 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[b],sort_col[a]));
30 | else if (sort_mode=="numeric" && desc==true)
31 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[a],sort_col[b]));
32 | else if (sort_mode=="string" && desc==false)
33 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[a],sort_col[b]));
34 | else if (sort_mode=="string" && desc==true)
35 | idx.sort((a, b) => sort_string(group_col2[a],group_col2[b]) || sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[b],sort_col[a]));
36 | }else if(group_col1.length){
37 | if (sort_mode=="numeric" && desc==false)
38 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[b],sort_col[a]));
39 | else if (sort_mode=="numeric" && desc==true)
40 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_numeric(sort_col[a],sort_col[b]));
41 | else if (sort_mode=="string" && desc==false)
42 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[a],sort_col[b]));
43 | else if (sort_mode=="string" && desc==true)
44 | idx.sort((a, b) => sort_string(group_col1[a],group_col1[b]) || sort_string(sort_col[b],sort_col[a]));
45 | }else{
46 | if (sort_mode=="numeric" && desc==false)
47 | idx.sort((a, b) => sort_numeric(sort_col[b],sort_col[a]));
48 | else if (sort_mode=="numeric" && desc==true)
49 | idx.sort((a, b) => sort_numeric(sort_col[a],sort_col[b]));
50 | else if (sort_mode=="string" && desc==false)
51 | idx.sort((a, b) => sort_string(sort_col[a],sort_col[b]));
52 | else if (sort_mode=="string" && desc==true)
53 | idx.sort((a, b) => sort_string(sort_col[b],sort_col[a]));
54 | }
55 |
56 | var sorted_factors = new Array(idx.length);
57 | for (var i = 0; i < idx.length; ++i) sorted_factors[i] = factors[idx[i]];
58 | return sorted_factors;
59 | }
60 |
61 | function table_to_tsv(source, cols, headers, selected) {
62 |
63 | var rows_idx = []
64 | if(selected==true){
65 | //remove undefined from selected if present
66 | rows_idx = source.selected.indices.filter(function( element ) {
67 | return element !== undefined;
68 | });
69 | }
70 | else{
71 | // include all rows
72 | for (let i = 0; i < source.get_length(); i++) {
73 | rows_idx.push(i);
74 | }
75 | }
76 |
77 | const lines = [headers.join('\t')]
78 | for (let i = 0; i < rows_idx.length; i++) {
79 | let row = [];
80 | for (let j = 0; j < cols.length; j++) {
81 | row.push(source.data[cols[j]][rows_idx[i]].toString())
82 | }
83 | lines.push(row.join('\t'))
84 | }
85 | return lines.join('\n').concat('\n')
86 | }
87 |
88 | function save_file(filename, filetext){
89 | const blob = new Blob([filetext], { type: 'text/csv;charset=utf-8;' })
90 | //addresses IE
91 | if (navigator.msSaveBlob) {
92 | navigator.msSaveBlob(blob, filename)
93 | } else {
94 | const link = document.createElement('a')
95 | link.href = URL.createObjectURL(blob)
96 | link.download = filename
97 | link.target = '_blank'
98 | link.style.visibility = 'hidden'
99 | link.dispatchEvent(new MouseEvent('click'))
100 | }
101 | }
--------------------------------------------------------------------------------
/grimer/js/popup.js:
--------------------------------------------------------------------------------
1 | var pop = {
2 | // (A) ATTACH POPUP HTML
3 | pWrap : null, // HTML popup wrapper
4 | pBox : null, // HTML popup box
5 | pTitle : null, // HTML popup title
6 | pText : null, // HTML popup text
7 | pClose : null, // HTML close button
8 | init : function () {
9 | // (A1) POPUP WRAPPER
10 | pop.pWrap = document.createElement("div");
11 | pop.pWrap.id = "pop-up";
12 | document.body.appendChild(pop.pWrap);
13 |
14 | // (A2) POPUP BOX
15 | pop.pBox = document.createElement("div");
16 | pop.pBox.id = "pop-box";
17 | pop.pWrap.appendChild(pop.pBox);
18 |
19 | // (A3) TITLE
20 | pop.pTitle = document.createElement("h2");
21 | pop.pTitle.id = "pop-title";
22 | pop.pBox.appendChild(pop.pTitle);
23 |
24 | // (A4) TEXT
25 | pop.pText = document.createElement("p");
26 | pop.pText.id = "pop-text";
27 | pop.pBox.appendChild(pop.pText);
28 |
29 | // (A5) CLOSE
30 | pop.pClose = document.createElement("div");
31 | pop.pClose.id = "pop-close";
32 | pop.pClose.innerHTML = "☒";
33 | pop.pClose.onclick = pop.close;
34 | pop.pBox.appendChild(pop.pClose);
35 | },
36 |
37 | // (B) OPEN POPUP
38 | open : function (title, text) {
39 | pop.pTitle.innerHTML = title;
40 | pop.pText.innerHTML = text;
41 | pop.pWrap.classList.add("open");
42 | },
43 |
44 | // (C) CLOSE POPUP
45 | close : function () {
46 | pop.pWrap.classList.remove("open");
47 | }
48 | };
49 | window.addEventListener("DOMContentLoaded", pop.init);
50 |
--------------------------------------------------------------------------------
/grimer/layout.py:
--------------------------------------------------------------------------------
1 | from bokeh.layouts import column, row, gridplot
2 | from bokeh.models import Spacer, Tabs, Panel, Div
3 | from grimer.func import print_log
4 | import base64
5 |
6 |
7 | def make_layout(ele, sizes, version, logo_path, title, output_plots):
8 |
9 | main_panels = {}
10 | if "overview" in output_plots:
11 | filterwidgets = column(ele["obstable"]["wid"]["frequency_spinner"],
12 | ele["obstable"]["wid"]["counts_perc_avg_spinner"],
13 | ele["obstable"]["wid"]["total_counts_spinner"],
14 | ele["obstable"]["wid"]["name_multichoice"],
15 | row(ele["obstable"]["wid"]["help_button"], ele["obstable"]["wid"]["export_dropdown"]))
16 | filterwidgetstabs = Tabs(tabs=[Panel(child=filterwidgets, title="Filter")],
17 | sizing_mode="fixed",
18 | height=sizes["overview_top_panel_height"] + 20,
19 | width=sizes["overview_top_panel_width_left"])
20 | info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")]
21 | if ele["references"]["fig"]:
22 | info_tabs.append(Panel(child=column(ele["references"]["fig"],
23 | row(ele["references"]["wid"]["references_select"],
24 | ele["references"]["wid"]["help_button"])
25 | ), title="References"))
26 | if ele["mgnify"]["fig"]:
27 | info_tabs.append(Panel(child=column(ele["mgnify"]["fig"],
28 | row(ele["mgnify"]["wid"]["biome_spinner"],
29 | ele["mgnify"]["wid"]["help_button"])
30 | ), title="MGnify"))
31 | if ele["decontam"]["fig"]:
32 | info_tabs.append(Panel(child=column(ele["decontam"]["fig"],
33 | row(ele["decontam"]["wid"]["pscore_text"],
34 | ele["decontam"]["wid"]["pscore_input"],
35 | ele["decontam"]["wid"]["help_button"])
36 | ), title="DECONTAM"))
37 | infotabs = Tabs(tabs=info_tabs,
38 | sizing_mode="fixed",
39 | height=sizes["overview_top_panel_height"] + 20,
40 | width=sizes["overview_top_panel_width_right"])
41 | row_obstable = row(filterwidgetstabs,
42 | ele["obstable"]["fig"],
43 | infotabs,
44 | sizing_mode="stretch_width")
45 | row_barpot = column(row(ele["samplebars"]["fig"]),
46 | row(ele["samplebars"]["wid"]["y1_select"],
47 | ele["samplebars"]["wid"]["annotbar_rank_select"],
48 | ele["samplebars"]["wid"]["annotbar_select"],
49 | ele["samplebars"]["wid"]["groupby1_select"],
50 | ele["samplebars"]["wid"]["groupby2_select"],
51 | ele["samplebars"]["wid"]["sort_select"],
52 | ele["samplebars"]["wid"]["y2_select"],
53 | ele["samplebars"]["wid"]["help_button"]),
54 | ele["samplebars"]["wid"]["toggle_label"])
55 | main_panels["overview"] = Panel(child=column(row_obstable, row_barpot, sizing_mode="stretch_width"), title="Overview")
56 |
57 | if "samples" in output_plots:
58 | selectwidgets = column(ele["sampletable"]["wid"]["total_counts_spinner"],
59 | ele["sampletable"]["wid"]["assigned_spinner"],
60 | ele["sampletable"]["wid"]["metadata_multichoice"],
61 | row(ele["sampletable"]["wid"]["help_button"], ele["sampletable"]["wid"]["export_dropdown"]))
62 | selectwidgetstabs = Tabs(tabs=[Panel(child=selectwidgets, title="Select")],
63 | sizing_mode="fixed",
64 | height=sizes["overview_top_panel_height"] + 20,
65 | width=sizes["overview_top_panel_width_left"])
66 | row_sampletable = row(selectwidgetstabs,
67 | ele["sampletable"]["fig"],
68 | sizing_mode="stretch_width")
69 | row_obsbars = column(row(ele["obsbars"]["fig"]),
70 | row(ele["obsbars"]["wid"]["rank_select"],
71 | ele["obsbars"]["wid"]["groupby1_select"],
72 | ele["obsbars"]["wid"]["groupby2_select"],
73 | ele["obsbars"]["wid"]["sort_select"],
74 | ele["obsbars"]["wid"]["help_button"]),
75 | ele["obsbars"]["wid"]["toggle_label"])
76 | main_panels["samples"] = Panel(child=column(row_sampletable, row_obsbars, sizing_mode="stretch_width"), title="Samples")
77 |
78 | if "heatmap" in output_plots:
79 | row_heatmap = gridplot([[ele["heatmap"]["fig"], ele["dendroy"]["fig"], ele["metadata"]["fig"]],
80 | [ele["dendrox"]["fig"]],
81 | [ele["annotations"]["fig"], None, ele["heatmap"]["wid"]["help_button"]]],
82 | toolbar_location='right',
83 | merge_tools=True)
84 | row_heatmap_widgets = row(column(ele["heatmap"]["wid"]["rank_select"],
85 | ele["heatmap"]["wid"]["toggle_label"],
86 | width=300),
87 | row(column(ele["heatmap"]["wid"]["x_groupby_select"],
88 | ele["heatmap"]["wid"]["x_sort_select"]),
89 | column(ele["heatmap"]["wid"]["y_groupby_select"],
90 | ele["heatmap"]["wid"]["y_sort_select"]),
91 | sizing_mode="stretch_width"),
92 | column(ele["metadata"]["wid"]["metadata_multiselect"],
93 | ele["metadata"]["wid"]["toggle_legend"],
94 | sizing_mode="stretch_height",
95 | width=300))
96 | main_panels["heatmap"] = Panel(child=column(row_heatmap, row_heatmap_widgets, sizing_mode="stretch_width"), title="Heatmap")
97 |
98 | if "correlation" in output_plots:
99 | row_correlation = row(column(ele["correlation"]["wid"]["rank_select"],
100 | ele["correlation"]["wid"]["neg_slider"],
101 | ele["correlation"]["wid"]["pos_slider"],
102 | ele["correlation"]["wid"]["toggle_label"],
103 | ele["correlation"]["wid"]["help_button"]),
104 | ele["correlation"]["fig"])
105 | main_panels["correlation"] = Panel(child=column(row_correlation, sizing_mode="stretch_width"), title="Correlation")
106 |
107 | if not main_panels:
108 | print_log("No valid plots to output")
109 | return None
110 | else:
111 | # Add plots in user chosen order
112 | tabs = [main_panels[p] for p in output_plots]
113 |
114 | main_tab = Tabs(tabs=tabs)
115 | logo_base64 = base64.b64encode(open(logo_path, 'rb').read()) # encode to base64
116 | logo_base64 = logo_base64.decode() # convert to string
117 | logo_div = Div(text='
' + 'v' + version + '', width=300, height=40, sizing_mode="fixed")
118 | if title:
119 | title_div = Div(text='' + title + '
', height=40, sizing_mode="stretch_width")
120 | else:
121 | title_div = Spacer()
122 | final = column([row(logo_div, title_div), main_tab], sizing_mode="stretch_width")
123 |
124 | return final
125 |
--------------------------------------------------------------------------------
/grimer/metadata.py:
--------------------------------------------------------------------------------
1 | class Metadata:
2 | valid_types = ["categorical", "numeric"]
3 | default_type = "categorical"
4 |
5 | def __init__(self, md, types):
6 | self.data = md
7 | self.types = types
8 |
9 | def __repr__(self):
10 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()]
11 | return 'Metadata({})'.format(', '.join(args))
12 |
13 | def get_col_headers(self):
14 | return self.data.columns
15 |
16 | def get_data(self, metadata_type: str=None):
17 | if metadata_type is not None:
18 | return self.data[self.types[self.types == metadata_type].index]
19 | else:
20 | return self.data
21 |
22 | def get_col(self, col):
23 | return self.data[col]
24 |
25 | def get_unique_values(self, col):
26 | return self.get_col(col).dropna().unique()
27 |
28 | def get_type(self, col):
29 | return self.types[col]
30 |
31 | def get_subset(self, column, value):
32 | return self.data[self.data[column] == value]
33 |
--------------------------------------------------------------------------------
/grimer/reference.py:
--------------------------------------------------------------------------------
1 | import yaml
2 |
3 |
4 | class Reference:
5 | def __init__(self, file: str=None, ids: list=[]):
6 | self.ids = {} # {refid: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)}}
7 | self.parents = {} # {parent_id: set(refids)}
8 |
9 | if file is not None:
10 | self.parse(file)
11 | elif ids:
12 | for i in ids:
13 | self.add(i, "", "")
14 |
15 | def __repr__(self):
16 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()]
17 | return 'Reference({})'.format(', '.join(args))
18 |
19 | def add(self, i, ref: str=None, desc: str=None):
20 | if i not in self.ids:
21 | self.ids[i] = {}
22 | if ref is not None:
23 | if ref not in self.ids[i]:
24 | self.ids[i][ref] = set()
25 | if desc is not None:
26 | self.ids[i][ref].add(desc)
27 |
28 | def add_parent(self, parent, refid):
29 | if parent not in self.parents:
30 | self.parents[parent] = set()
31 | self.parents[parent].add(refid)
32 |
33 | def parse(self, file):
34 | with open(file, 'r') as fh:
35 | if file.endswith(".yml") or file.endswith(".yaml"):
36 | src = yaml.safe_load(fh)
37 | for desc, val in src.items():
38 | for ref, v in val.items():
39 | for i in map(str, v["ids"]):
40 | self.add(i, (ref, v["url"]), desc)
41 | else:
42 | for line in fh:
43 | main_id = line.rstrip()
44 | self.add(main_id)
45 |
46 | def update_taxids(self, taxid_updated):
47 | for node, upd_node in taxid_updated.items():
48 | if upd_node is not None and upd_node != node:
49 | print("Updated taxonomic node: " + node + " -> " + upd_node)
50 | self.add(upd_node)
51 | self.ids[upd_node].update(self.ids[node])
52 | del self.ids[node]
53 |
54 | def get_refs_desc(self, i, direct: bool=False, parents: bool=False):
55 | refs_desc = {}
56 | if direct and i in self.ids:
57 | refs_desc.update(self.ids[i])
58 | if parents and i in self.parents:
59 | for refid in self.parents[i]:
60 | refs_desc.update(self.ids[refid])
61 | return refs_desc
62 |
63 | def get_refs_count(self, i, direct: bool=False, parents: bool=False):
64 | return len(self.get_refs_desc(i, direct, parents))
65 |
--------------------------------------------------------------------------------
/grimer/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/grimer/317a7f337c0fea3502ff6c69e9f4002fcae903a6/grimer/scripts/__init__.py
--------------------------------------------------------------------------------
/grimer/scripts/run_decontam.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 |
3 | library("optparse")
4 | library("decontam")
5 | library("reshape2")
6 | library("stats")
7 | #library(ggplot2); packageVersion("ggplot2")
8 |
9 | parser <- OptionParser()
10 | parser <- add_option(parser, c("-o", "--resout"), default="decontam_out.tsv", type="character", help="File to output results")
11 | parser <- add_option(parser, c("-d", "--modout"), default="decontam_mod.tsv", type="character", help="File to output models")
12 | parser <- add_option(parser, c("-i", "--counts"), default="", type="character", help="Input count table")
13 | parser <- add_option(parser, c("-c", "--concentrations"), default="", type="character", help="Input table with DNA concentration")
14 | parser <- add_option(parser, c("-n", "--controls"), default="", type="character", help="Input list with control sample ids")
15 | parser <- add_option(parser, c("-m", "--method"), default="frequency", type="character", help="Method to use: frequecy, prevalence, combined")
16 | parser <- add_option(parser, c("-t", "--threshold"), default=0.1, type="double", help="Threshold")
17 | parser <- add_option(parser, c("-v", "--verbose"), action="store_true", default=TRUE, help="Print extra output [default]")
18 | args <- parse_args(parser)
19 |
20 | generate_plot_frequency_values <- function(seqtab, taxa, conc, neg=NULL, normalize=FALSE, showModels=TRUE, log=TRUE, facet=TRUE){
21 | # Code copied and adapted from https://github.com/benjjneb/decontam/blob/master/R/plotting.R
22 | # v1.1.2 6a242fc7fc452a971b7b60b6757ea81a86ade7b5
23 |
24 | #print(seqtab)
25 |
26 | if(any(rowSums(seqtab) == 0)) { # Catch and remove zero-count samples
27 | zero.count <- rowSums(seqtab) == 0
28 | seqtab <- seqtab[!zero.count,]
29 | conc <- conc[!zero.count]
30 | if(!is.null(neg)) neg <- neg[!zero.count]
31 | warning("Removed ", sum(zero.count), " samples with zero total counts (or frequency).")
32 | }
33 |
34 | if(normalize) seqtab <- sweep(seqtab, 1, rowSums(seqtab), "/")
35 | if(!(is.numeric(conc) && all(conc>0))) stop("conc must be positive numeric.")
36 | if(is.null(neg)) neg <- rep(FALSE, length(conc)) # Don't ignore any samples
37 | if(is.character(taxa)) {
38 | seqtab <- seqtab[,colnames(seqtab) %in% taxa,drop=FALSE]
39 | } else {
40 | stop("taxa must be a vector of taxa names.")
41 | }
42 | ntax.plot <- ncol(seqtab)
43 | if(ntax.plot == 0) stop("None of the provided taxa were present in seqtab.")
44 | # Prepare plotting data.frame
45 | plotdf <- cbind(data.frame(seqtab, check.names=FALSE), DNA_conc=conc, Type=ifelse(neg, "Negative", "Sample"))
46 | plot_melt <- melt(plotdf, measure.vars=1:ntax.plot, variable.name="taxa", value.name="taxon_abundance")
47 | taxon_levels <- taxa
48 |
49 | plot_melt$taxa <- factor(plot_melt$taxa, levels = taxon_levels)
50 | if(showModels) {
51 | mod_melts <- split(plot_melt, plot_melt$taxa)
52 | logc <- log(seq(min(plotdf$DNA_conc), max(plotdf$DNA_conc), length.out=1000))
53 | for(tax in names(mod_melts)) {
54 | newdata <- data.frame(logc=logc, taxa=tax, DNA_conc=exp(logc))
55 | freq <- mod_melts[[tax]]$taxon_abundance
56 | conc <- mod_melts[[tax]]$DNA_conc
57 | df <- data.frame(logc=log(conc), logf=log(freq))
58 | df <- df[!neg | is.na(neg),]
59 | df <- df[freq>0,]
60 | if(sum(freq>0)>1) {
61 | lm1 <- lm(logf~offset(-1*logc), data=df)
62 | lm0 <- lm(logf~1, data=df)
63 | newdata$contam <- exp(predict(lm1, newdata=newdata))
64 | newdata$non.contam <- exp(predict(lm0, newdata=newdata))
65 | } else {
66 | newdata$contam <- NA
67 | newdata$non.contam <- NA
68 | }
69 | mod_melts[[tax]] <- newdata
70 | }
71 | mod_melt <- do.call(rbind, mod_melts)
72 | }
73 |
74 | # p1 <- ggplot(data=plot_melt, aes_string("DNA_conc", "taxon_abundance")) + xlab("DNA Concentration")
75 | # p1 <- p1 + ylab(ifelse(normalize, "Frequency", "Relative Abundance"))
76 | # if(log) p1 <- p1 + scale_x_log10()
77 | # if(log) p1 <- p1 + scale_y_log10()
78 | # if(nlevels(factor(neg))>1) p1 <- p1 + aes_string(color="Type")
79 | # if(facet && ntax.plot > 1) p1 <- p1 + facet_wrap(~taxa)
80 | # if(showModels) p1 <- p1 + geom_line(data=mod_melt, aes_string(y="contam"), color="red", linetype="solid")
81 | # if(showModels) p1 <- p1 + geom_line(data=mod_melt, aes_string(y="non.contam"), color="black", linetype="dashed")
82 | # p1 + geom_point()
83 | # ggsave("test.png")
84 |
85 | # Get first and last points of the models
86 | idx <- sort(c(seq(1, length(mod_melt$taxa), 1000), seq(1000, length(mod_melt$taxa), 1000)))
87 | return(mod_melt[idx,c("contam","non.contam")])
88 | }
89 |
90 | # Load count table
91 | count_table <- read.table(file=args$counts, sep='\t', header=TRUE, check.names=FALSE)
92 | rows_table <- count_table[,1]
93 | count_matrix <- data.matrix(data.frame(count_table[,-1], row.names = rows_table, check.names=FALSE))
94 |
95 | # Load concentration table
96 | if(!args$concentrations==""){
97 | concentrations <- read.table(file=args$concentrations, sep='\t', header=FALSE, check.names=FALSE)
98 | concentrations_list <- concentrations[ (concentrations[, "V1"] %in% rows_table) , "V2"]
99 | }
100 |
101 | # Load list of controls
102 | if(!args$controls==""){
103 | controls <- read.table(file=args$controls, sep='\t', header=FALSE, check.names=FALSE)
104 | controls_index <- rows_table %in% controls[ , "V1"]
105 | }
106 |
107 | # Run DECONTAM
108 | if (args$method=="frequency"){
109 | decontam_out <- isContaminant(count_matrix, normalize=FALSE, conc=concentrations_list, method="frequency", threshold=args$threshold)
110 | }else if (args$method=="prevalence") {
111 | decontam_out <- isContaminant(count_matrix, normalize=FALSE, neg=controls_index, method="prevalence", threshold=args$threshold)
112 | }else if (args$method=="combined") {
113 | decontam_out <- isContaminant(count_matrix, normalize=FALSE, neg=controls_index, conc=concentrations_list, method="combined", threshold=args$threshold)
114 | }
115 |
116 | write.table(decontam_out, file=args$resout, sep="\t", quote=FALSE)
117 |
118 | models <- generate_plot_frequency_values(count_matrix, colnames(count_table[-1]), normalize=FALSE, conc=concentrations_list)
119 |
120 | write.table(models, file=args$modout, sep="\t", quote=FALSE)
121 |
--------------------------------------------------------------------------------
/grimer/table.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 |
4 | class Table:
5 | def __init__(self, samples, total, unassigned, lineage, normalized, zerorep):
6 | # Ordered dict to keep rank insert order
7 | self.data = OrderedDict()
8 | self.lineage = lineage
9 | self.samples = samples
10 | self.total = total
11 | self.unassigned = unassigned
12 | self.normalized = normalized
13 | self.zerorep = zerorep
14 |
15 | def __repr__(self):
16 | args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()]
17 | return 'Table({})'.format(', '.join(args))
18 |
19 | def add_rank(self, rank, table):
20 | self.data[rank] = table
21 |
22 | def observations(self, rank):
23 | return self.data[rank].columns
24 |
25 | def ranks(self):
26 | return list(self.data.keys())
27 |
28 | def get_min_valid_count_perc(self):
29 | return min([self.get_counts_perc(rank)[self.get_counts_perc(rank) > 0].min().min() for rank in self.ranks()])
30 |
31 | def get_total(self):
32 | return self.total
33 |
34 | def get_unassigned(self):
35 | return self.unassigned
36 |
37 | def get_assigned(self):
38 | return self.get_total() - self.get_unassigned()
39 |
40 | def get_unassigned_perc(self):
41 | return self.get_unassigned().divide(self.get_total(), axis=0) if not self.normalized else self.get_unassigned().divide(100, axis=0)
42 |
43 | def get_assigned_perc(self):
44 | return self.get_assigned().divide(self.get_total(), axis=0) if not self.normalized else self.get_assigned().divide(100, axis=0)
45 |
46 | def get_lineage(self, taxid, rank, other_rank):
47 | # get lineage up-to requested rank
48 | return self.lineage[self.lineage[rank] == taxid][other_rank].values[0]
49 |
50 | def get_frequency(self, rank):
51 | return self.data[rank].gt(0).sum(axis=0)
52 |
53 | def get_frequency_perc(self, rank):
54 | return self.get_frequency(rank) / len(self.samples)
55 |
56 | def get_counts(self, rank):
57 | return self.data[rank].sum(axis=0) if not self.normalized else 0
58 |
59 | def get_counts_perc(self, rank):
60 | return self.data[rank].divide(self.get_total(), axis=0) if not self.normalized else self.data[rank].divide(100, axis=0)
61 |
62 | def get_counts_perc_avg_samples(self, rank):
63 | return self.get_counts_perc(rank).sum(axis=0) / len(self.samples)
64 |
65 | def get_top(self, rank, n):
66 | return sorted(self.get_counts_perc_avg_samples(rank).sort_values(ascending=False).index[:n].to_list())
67 |
68 | def get_subtable(self, rank, samples: list=[], taxids: list=[], keep_shape: bool=False):
69 | subtable = self.data[rank]
70 |
71 | if samples:
72 | valid_samples = []
73 | for s in samples:
74 | if s in self.samples:
75 | valid_samples.append(s)
76 |
77 | if valid_samples:
78 | subtable = subtable.loc[subtable.index.intersection(valid_samples)]
79 | if not keep_shape:
80 | subtable = subtable.loc[:, subtable.sum(axis=0) > 0]
81 | else:
82 | return None
83 |
84 | if taxids:
85 | subtable = subtable[taxids].copy()
86 | if not keep_shape:
87 | subtable = subtable.loc[subtable[taxids].sum(axis=1) > 0, :]
88 |
89 | return subtable
90 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: GRIMER
2 | theme: readthedocs
3 | nav:
4 | - GRIMER: index.md
5 | - Importing files: importing.md
6 | - Configuration file: config.md
7 | - GRIMER Reports - User Manual: manual.md
8 | - GRIMER Reports - Examples: examples.md
9 | plugins:
10 | - glightbox # pip install mkdocs-glightbox
11 | markdown_extensions:
12 | - attr_list
--------------------------------------------------------------------------------
/scripts/bacdive_download.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pandas as pd
3 | from multitax import NcbiTx
4 | import sys
5 |
6 | ## TODO
7 | ## filter infection?
8 | ## find names?
9 |
10 | data = {("Host_Human-HostBodySite_Limb", "Limbs"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=39&filters%5B1%5D%5Bcat3%5D=&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1",
11 | ("Host_Human-HostBodySite_Organ_Ear", "Ear"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=209&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1",
12 | ("Host_Human-HostBodySite_Organ_Eye", "Eye"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=210&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1",
13 | ("Host_Human-HostBodySite_Organ_Nose", "Nose"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=217&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1",
14 | ("Host_Human-HostBodySite_Organ_SkinNailHair", "Skin/Nail/Hair"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=40&filters%5B1%5D%5Bcat3%5D=219&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1",
15 | ("Host_Human-HostBodySite_Organ_OralCavityAndAirways", "Oral"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=5&filters%5B1%5D%5Bcat2%5D=41&filters%5B1%5D%5Bcat3%5D=&filters%5B1%5D%5Bcolor%5D=5&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1",
16 | ("Host_Human-HostBodyProduct_OralCavityAndAirways_Saliva", "Saliva"): "https://bacdive.dsmz.de/isolation-sources/csv?filter-domain=&filter-phylum=&filter-class=&filter-ordo=&filter-family=&filter-genus=&filters%5B0%5D%5Bcat1%5D=4&filters%5B0%5D%5Bcat2%5D=29&filters%5B0%5D%5Bcat3%5D=&filters%5B0%5D%5Bcolor%5D=4&filters%5B1%5D%5Bcat1%5D=6&filters%5B1%5D%5Bcat2%5D=47&filters%5B1%5D%5Bcat3%5D=276&filters%5B1%5D%5Bcolor%5D=6&iso_3_country=&polygon-strain-ids=&sort_by_order=ASC&sort_by=st.species&pfc=&csv=1"}
17 |
18 | tax = NcbiTx(extended_names=True)
19 |
20 | print('"Human-related bacterial isolates from BacDive":')
21 |
22 | for (search, name), url in data.items():
23 | print(' "' + name + '":')
24 | print(' url: "https://bacdive.dsmz.de/search?search=taxid:{}"')
25 | parsed_ids = set()
26 | df = pd.read_table(url, sep=",", index_col=0).dropna(subset=["Species"])
27 | for species in df.Species.unique():
28 | taxids = tax.search_name(species, rank="species", exact=True)
29 | if not taxids:
30 | sys.stderr.write("Species name not found: " + species + "\n")
31 | elif len(taxids) > 1:
32 | sys.stderr.write("Species with ambiguous name: " + species + "\n")
33 | else:
34 | parsed_ids.add(taxids[0])
35 | print(" ids: [" + ", ".join(parsed_ids) + "]")
36 |
--------------------------------------------------------------------------------
/scripts/ehomd_download.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pandas as pd
3 | import sys
4 | import urllib.request
5 | import re
6 |
7 |
8 | def get_taxid(url):
9 | try:
10 | sys.stderr.write(url + "\n")
11 | assembly_stats = url + "/" + url.split("/")[-1] + "_assembly_stats.txt"
12 | filedata = urllib.request.urlopen(assembly_stats).read().decode()
13 | x = re.search("# Taxid:[\s0-9]*\\r\\n", filedata)
14 | if x:
15 | return re.findall("\d+", x.group())[0]
16 | else:
17 | return None
18 | except:
19 | return None
20 |
21 | # Can be Oral, Nasal or both ("Nasal,Oral")
22 | habitats = ["Oral", "Nasal"]
23 | data = "http://www.ehomd.org/ftp/genomes/PROKKA/current/SEQID_info.csv"
24 |
25 | df = pd.read_table(data, sep=",", usecols=["Habitat", "Sequence_Source"])
26 | df = df[df["Habitat"].isin(habitats + ["Nasal,Oral"])].drop_duplicates()
27 | df["taxid"] = df["Sequence_Source"].map(get_taxid)
28 |
29 | print('"Human Oral Microbiome Database (eHOMD)":')
30 | for h in habitats:
31 | print(' "' + h + '":')
32 | parsed_ids = set(df.taxid[df.Habitat.str.contains(h)])
33 | print(' url: "http://www.ehomd.org/?name=HOMD"')
34 | print(" ids: [" + ", ".join(parsed_ids) + "]")
35 |
36 | sys.stderr.write("Could not retrieve taxid for: " + "\n".join(df[df.taxid.isna()]["Sequence_Source"].to_list()) + "\n")
37 |
--------------------------------------------------------------------------------
/scripts/env.yaml:
--------------------------------------------------------------------------------
1 | name: grimer_dev
2 | channels:
3 | - defaults
4 | - bioconda
5 | - conda-forge
6 | dependencies:
7 | - pandas
8 | - jsonapi-client>=0.9.7 #mgnify scripts
9 |
--------------------------------------------------------------------------------
/scripts/mgnify_download.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import pandas as pd
5 | import sys
6 | import os
7 | import pickle
8 | import gzip
9 | from urllib.parse import urlencode
10 | from jsonapi_client import Session, Filter
11 | from glob import glob
12 |
13 | """
14 | Script to download taxonomy abundance files and metadata from MGnify
15 | - It always downloads latest available results based on the pipeline version
16 |
17 | Example dump: seq -f "MGYS%08g" 1 5724 | xargs -P 8 -I {} ./mgnify_download.py -i {} -v -g -o mgnify_dump/ > mgnify_dump.log 2>&1 &
18 | """
19 |
20 |
21 | def main(argv=sys.argv[1:]):
22 |
23 | API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest/'
24 |
25 | parser = argparse.ArgumentParser(description='grimer-download-mgnify')
26 | parser.add_argument('-i', '--study-accession', required=True, type=str, help="MGnify study accession (e.g. MGYS00002462)")
27 | parser.add_argument('-g', '--gzip', default=False, action='store_true', help="Gzip downloaded files")
28 | parser.add_argument('-v', '--verbose', default=False, action='store_true', help="Verbose output")
29 | parser.add_argument('-o', '--output-prefix', type=str, help="Output prefix for downloaded files. Default: --study-accession")
30 | args = parser.parse_args(argv)
31 |
32 | study_accession = args.study_accession
33 | if args.output_prefix:
34 | prefix = args.output_prefix + study_accession
35 | else:
36 | prefix = study_accession
37 | gz = args.gzip
38 |
39 | md_file = prefix + "_metadata.tsv"
40 | out_file = prefix + ".pkl"
41 | if gz:
42 | out_file = out_file + ".gz"
43 | md_file = md_file + ".gz"
44 |
45 | # Check if files exists and skip
46 | tax_files = glob(prefix + "*_taxonomy_abundances_*")
47 | if tax_files and os.path.isfile(out_file) and os.path.isfile(md_file):
48 | print(study_accession, "Warning: files already exist ")
49 | return
50 |
51 | with Session(API_BASE) as s:
52 | # Get main study resource
53 | try:
54 | study = s.get('studies', study_accession).resource
55 | if args.verbose:
56 | print(study.accession, "SAMPLES:" + str(study.samples_count), sep="\t", end="\t")
57 | except:
58 | print(study_accession, "Error: Study accession not found")
59 | sys.exit(1)
60 |
61 | # Save study info as a dict in a pkl file
62 | f = gzip.open(out_file, 'wb') if gz else open(out_file, "wb")
63 | pickle.dump(study.json, file=f)
64 | f.close()
65 |
66 | # Get all taxonomic tables for the highest version of the pipeline
67 | highest_version = 0
68 | table_version = {}
69 | for download in study.downloads:
70 | label = download.description.label
71 | #["Taxonomic assignments",
72 | #"Taxonomic assignments SSU",
73 | #"Taxonomic assignments LSU"
74 | #"Taxonomic assignments UNITE",
75 | #"Taxonomic assignments ITSoneDB"]
76 | if "Taxonomic assignments" in label:
77 | version = float(download.pipeline.id)
78 | if version not in table_version:
79 | table_version[version] = []
80 | table_version[version].append(download.url)
81 | if version > highest_version:
82 | highest_version = version
83 |
84 | if not table_version:
85 | print("Error: No taxonomic assignments for this study to download")
86 | sys.exit(1)
87 | else:
88 | table_urls = table_version[highest_version]
89 |
90 | # Get all available samples in one go and collect metadata
91 | params = {
92 | 'study_accession': study_accession,
93 | 'page_size': study.samples_count,
94 | }
95 | fltr = Filter(urlencode(params))
96 |
97 | metadata = {}
98 | for sample in s.iterate('samples', fltr):
99 | # TODO: how to access runs faster, sample.runs is too slow
100 | #nruns += len(sample.runs)
101 | metadata[sample.accession] = {}
102 | for md in sample.sample_metadata:
103 | metadata[sample.accession][md["key"]] = md["value"]
104 | # Add sample description, name and name as metadata
105 | metadata[sample.accession]['sample-desc'] = sample.sample_desc
106 | metadata[sample.accession]['sample-name'] = sample.sample_name
107 |
108 | # Get link sample accession, run accession
109 | # TODO treat multiple runs per sample
110 | run_sample_accesion = {}
111 | try:
112 | for run in s.iterate('runs', fltr):
113 | run_sample_accesion[run.sample.id] = run.id
114 | except:
115 | print("Error: Could not retrieve run accession", sep="\t", end="\t")
116 |
117 | # Write metadata
118 | md_df = pd.DataFrame.from_dict(metadata).T
119 | if run_sample_accesion:
120 | mapped_accessions = md_df.index.isin(run_sample_accesion.keys())
121 | if args.verbose:
122 | print("MAPPED:" + str(sum(mapped_accessions)), sep="\t", end="\t")
123 | md_df.index = md_df.index.map(lambda x: run_sample_accesion[x] if x in run_sample_accesion else x)
124 | else:
125 | if args.verbose:
126 | print("Warning: No mapping between accessions of samples and metadata", sep="\t", end="\t")
127 |
128 | if args.verbose:
129 | print("METADATA:" + str(md_df.shape[1]), sep="\t", end="\t")
130 | md_df.to_csv(md_file, compression="gzip" if gz else None, sep="\t")
131 |
132 | # Read and write tables
133 | for table_url in table_urls:
134 | try:
135 | t = pd.read_table(table_url)
136 | if args.verbose:
137 | print("OK:" + table_url, end=";")
138 | # Print original
139 | filename = prefix + "_" + os.path.basename(table_url)
140 | t.to_csv(filename if not gz else filename + ".gz", compression="gzip" if gz else None, sep="\t", index=False)
141 | except:
142 | if args.verbose:
143 | print("INVALID:" + table_url, end=";")
144 |
145 | if args.verbose:
146 | print()
147 |
148 |
149 | if __name__ == "__main__":
150 | main()
151 |
--------------------------------------------------------------------------------
/scripts/mgnify_extract.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pandas as pd
3 | import gzip
4 | import os
5 | import pickle
6 | import argparse
7 |
8 |
9 | def main():
10 | version = "1.0.0"
11 |
12 | parser = argparse.ArgumentParser(description='mgnify_extract')
13 | parser.add_argument('-f', '--input-folder', required=True, type=str, help="Folder with files generated by mgnify_download.py")
14 | parser.add_argument('-t', '--top-taxa', default=10, type=int, help="Top taxa to use for each study. 0 for everything. Default 10.")
15 | parser.add_argument('-o', '--output-taxa-counts', type=str, default="taxa_counts.tsv")
16 | parser.add_argument('-b', '--output-biome-counts', type=str, default="")
17 |
18 | parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version)
19 | args = parser.parse_args()
20 |
21 | acc_files = select_files(args.input_folder)
22 | print("Number of files found: ", len(acc_files))
23 |
24 | rank_id_name = {0: "superkingdom",
25 | 1: "kingdom",
26 | 2: "phylum",
27 | 3:"class",
28 | 4:"order",
29 | 5:"family",
30 | 6:"genus",
31 | 7:"species"}
32 | taxa_biome = {}
33 | no_biome = []
34 | biome_count = {}
35 |
36 | for study_accession, study_table in acc_files.items():
37 | print(study_accession)
38 |
39 | study_file = args.input_folder + "/" + study_accession + ".pkl.gz"
40 |
41 | if os.path.isfile(study_file):
42 | with gzip.open(study_file) as f:
43 | study = pickle.load(f)
44 | else:
45 | no_biome.append(study_accession)
46 | continue
47 |
48 | study_biomes = []
49 | if 'biomes' in study['relationships']:
50 | for b in study['relationships']['biomes']['data']:
51 | biome = b['id']
52 | if biome not in taxa_biome: taxa_biome[biome] = {}
53 | if biome not in biome_count: biome_count[biome] = 0
54 | study_biomes.append(biome)
55 | biome_count[biome]+=1
56 | else:
57 | no_biome.append(study_accession)
58 | continue
59 |
60 | pipeline_version = float(study_table[-10:-7])
61 |
62 | t = pd.read_table(study_table)
63 |
64 | sample_col = '#SampleID'
65 | if sample_col not in t.columns:
66 | # older files have a different header
67 | sample_col = 'taxonomy'
68 |
69 | # expand ranks in columns
70 | ranks = t[sample_col].str.split(';', expand=True)
71 |
72 | # replace empty for unclassified
73 | ranks.replace(regex={r'^.+__$': 'unclassified'}, inplace=True)
74 |
75 | # Replace "s__" at the beggining and _ for " "
76 | ranks.replace(regex={r'^.+__': '', '_': ' '}, inplace=True)
77 |
78 | # Pipeline <= 3.0 reports only species specific name, need to merge
79 | if pipeline_version <= 3.0:
80 | if 6 in ranks and 7 in ranks:
81 | #print(ranks)
82 | # Replace unclassified with None
83 | ranks[6] = ranks[6].replace("unclassified", None)
84 | ranks[7] = ranks[7].replace("unclassified", None)
85 | ranks["species"] = ranks[6] + " " + ranks[7]
86 | ranks.drop(columns=7, inplace=True)
87 | ranks.rename(columns={"species": 7}, inplace=True)
88 | #print(ranks)
89 |
90 | t = pd.concat([ranks, t], axis=1)
91 | t.drop(columns=sample_col, inplace=True)
92 | top_taxa_rank = {}
93 |
94 | for r in range(ranks.shape[1]):
95 | rank_table = t.groupby([r]).sum().T
96 | # Do not count for unclassified taxa
97 | if "unclassified" in rank_table.columns:
98 | rank_table.drop(columns="unclassified", inplace=True)
99 | if "Unclassified" in rank_table.columns:
100 | rank_table.drop(columns="Unclassified", inplace=True)
101 |
102 | max_count = rank_table.max().max()
103 | n_samples = rank_table.shape[0]
104 | avg_perc_taxa = ((rank_table/max_count).sum(axis=0) / n_samples).sort_values(ascending=False)
105 | if args.top_taxa:
106 | top_taxa_rank[r] = avg_perc_taxa.iloc[:args.top_taxa].index.to_list()
107 | else:
108 | top_taxa_rank[r] = avg_perc_taxa.index.to_list()
109 |
110 | # Study can have multiple biomes, for each count
111 | for biome in study_biomes:
112 | for rank, taxa in top_taxa_rank.items():
113 | r = rank_id_name[rank]
114 | for t in taxa:
115 | if (r,t) not in taxa_biome[biome]: taxa_biome[biome][(r,t)] = 0
116 | taxa_biome[biome][(r,t)]+=1
117 |
118 | biomes_df = pd.DataFrame.from_dict(taxa_biome).T
119 | stacked = pd.DataFrame(biomes_df.T.stack(), columns=["count"])
120 |
121 | stacked.to_csv(args.output_taxa_counts, sep="\t", header=None)
122 |
123 | if no_biome:
124 | print("Skipped " + str(len(no_biome)) + " files without defined file/biome")
125 |
126 | if args.output_biome_counts:
127 | with open(args.output_biome_counts, "w") as bf:
128 | for biome, cnt in biome_count.items():
129 | print(biome, cnt, sep="\t", file=bf)
130 |
131 |
132 | def select_files(input_folder):
133 | """
134 | Return a dict with {accession: taxonomy abundance file}
135 | One file per accession (biggest in size)
136 | """
137 | tax_ab_files = {}
138 | for file in os.listdir(input_folder):
139 | ffile = input_folder + "/" + file
140 | bname = os.path.basename(file)
141 |
142 | if "taxonomy_abundances" in file:
143 | accession = bname.split("_")[0]
144 | # Add file and filesize
145 | if accession not in tax_ab_files: tax_ab_files[accession] = tuple(["", 0])
146 | fsize = os.path.getsize(ffile)
147 | if fsize > tax_ab_files[accession][1]:
148 | tax_ab_files[accession] = (ffile, fsize)
149 |
150 | return {k: v[0] for k, v in tax_ab_files.items()}
151 |
152 |
153 | if __name__ == "__main__":
154 | main()
155 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import re
4 |
5 | from setuptools import setup
6 |
7 |
8 | def read(filename):
9 | filename = os.path.join(os.path.dirname(__file__), filename)
10 | text_type = type(u"")
11 | with io.open(filename, mode="r", encoding='utf-8') as fd:
12 | return re.sub(text_type(r':[a-z]+:`~?(.*?)`'), text_type(r'``\1``'), fd.read())
13 |
14 | setup(
15 | name="grimer",
16 | version="1.1.0",
17 | url="https://www.github.com/pirovc/grimer",
18 | license='MIT',
19 | author="Vitor C. Piro",
20 | author_email="pirovc@posteo.net",
21 | description="GRIMER: contamination detection and microbiome exploration",
22 | long_description=read("README.md"),
23 | packages=['grimer'],
24 | package_data={
25 | 'grimer': ['css/*', 'img/*', 'js/*', 'scripts/*']
26 | },
27 | entry_points={'console_scripts': ['grimer=grimer.grimer:main']},
28 | classifiers=[
29 | 'License :: OSI Approved :: MIT License',
30 | 'Programming Language :: Python :: 3.5',
31 | 'Programming Language :: Python :: 3.6',
32 | 'Programming Language :: Python :: 3.7',
33 | 'Programming Language :: Python :: 3.8',
34 | 'Programming Language :: Python :: 3.9',
35 | ],
36 | )
37 |
--------------------------------------------------------------------------------