├── tests ├── __init__.py ├── test_progressbar.py ├── test_load_genes.py ├── test_utils.py ├── init_tests.py ├── test_permtations.py ├── test_newick.py ├── test_load_traits.py ├── test_scoary_tree.py ├── test_upgma.py ├── test_analyze_trait.py ├── test_final_overview.py ├── test_scoary.py └── test_picking.py ├── media ├── scoary.ai └── scoary-2-logo.svg ├── data ├── vcf │ ├── ExampleVCFTrait.csv │ └── Example.vcf ├── tetracycline │ ├── Restrict_to.csv │ ├── ExampleTree.nwk │ ├── Tetracycline_resistance.csv │ └── expected_result.json └── generated │ └── Trait.csv ├── benchmarking ├── runtime │ ├── data │ │ ├── runtime.txt │ │ └── 100_traits.csv │ ├── README.md │ └── Optimization strategies.md ├── picking_performance │ ├── data │ │ ├── benchmark.png │ │ ├── benchmark_with_GLM.png │ │ └── benchmark_with_PySR.png │ └── README.md └── binarization │ ├── README.md │ └── benchmark_binarization.py ├── .gitignore ├── scoary ├── init_multiprocessing.py ├── __init__.py ├── KeyValueStore.py ├── progressbar.py ├── upgma.py ├── newick.py ├── permutations.py ├── final_overview.py ├── load_genes.py ├── vcf2scoary.py ├── ScoaryTree.py ├── utils.py ├── picking.py ├── analyze_trait.py └── scoary.py ├── Dockerfile ├── pyproject.toml ├── LICENCE.md ├── Notes.md └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /media/scoary.ai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MrTomRod/scoary-2/HEAD/media/scoary.ai -------------------------------------------------------------------------------- /data/vcf/ExampleVCFTrait.csv: -------------------------------------------------------------------------------- 1 | ,ExampleVCFtrait 2 | Reference,0 3 | Strain_A,1 4 | Strain_B,1 5 | Strain_C,0 6 | -------------------------------------------------------------------------------- 
def init():
    """Create the shared multiprocessing primitives.

    Returns:
        A 4-tuple ``(mgr, ns, counter, lock)``: the ``multiprocessing`` manager
        itself, a managed ``Namespace``, a managed integer ``Value`` that
        starts at 0, and a managed ``Lock``.
    """
    manager = mp.Manager()
    # The manager is handed back as well so the caller keeps a reference to it.
    shared_namespace = manager.Namespace()
    shared_counter = manager.Value('i', 0)
    shared_lock = manager.Lock()
    return manager, shared_namespace, shared_counter, shared_lock
-------------------------------------------------------------------------------- /scoary/__init__.py: -------------------------------------------------------------------------------- 1 | from .scoary import scoary 2 | from .ScoaryTree import ScoaryTree 3 | from .picking import pick, pick_single 4 | from .permutations import permute_picking 5 | from .utils import print_tree, get_version 6 | 7 | __version__ = get_version() 8 | __author__ = 'Thomas Roder' 9 | __credits__ = ['Thomas Roder', 'Ola Brynildsrud'] 10 | __license__ = 'MIT' 11 | __maintainer__ = 'Thomas Roder' 12 | __email__ = 'roder.thomas@gmail.com' 13 | -------------------------------------------------------------------------------- /data/tetracycline/Restrict_to.csv: -------------------------------------------------------------------------------- 1 | Isolate_1,Isolate_10,Isolate_11,Isolate_12,Isolate_13,Isolate_14,Isolate_15,Isolate_16,Isolate_17,Isolate_18,Isolate_19,Isolate_2,Isolate_20,Isolate_21,Isolate_22,Isolate_23,Isolate_24,Isolate_25,Isolate_26,Isolate_27,Isolate_28,Isolate_29,Isolate_3,Isolate_30,Isolate_31,Isolate_32,Isolate_33,Isolate_34,Isolate_35,Isolate_36,Isolate_37,Isolate_38,Isolate_39,Isolate_4,Isolate_40,Isolate_41,Isolate_42,Isolate_43,Isolate_44,Isolate_45,Isolate_46,Isolate_47,Isolate_48,Isolate_49,Isolate_5,Isolate_50 2 | 3 | -------------------------------------------------------------------------------- /data/vcf/Example.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##FORMAT= 3 | ##INFO= 4 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Reference Strain_A Strain_B Strain_C 5 | NC_000962 4013 0 T C 9999 0 TYPE=snp GT 0 1 1 1 6 | NC_000962 4705 0 T C,A 9999 0 TYPE=snp GT 0 0 1 2 7 | NC_000962 6575 0 C A,T,G 9999 0 TYPE=snp GT 0 3 1 2 8 | NC_000962 6750 0 C T 9999 0 TYPE=snp GT 0 0 1 0 9 | NC_000962 7362 0 G C 9999 0 TYPE=snp GT 0 1 1 1 10 | 
class Test(TestCase):
    """Smoke tests for ``print_progress``: they only check that rendering runs through."""

    def test_print_progress(self):
        """Render every step of a 20-step run with a message that grows each step."""
        started = datetime.now()
        total = 20
        for step in range(total + 1):
            # short pause so that the elapsed time differs between steps
            time.sleep(0.05)
            print_progress(
                step, total,
                message=f'{step}: {" a" * step}',
                start_time=started,
                message_width=30,
                default_width=120,
            )

    def test_print_edge(self):
        """Render the degenerate case: step 0 of 0."""
        started = datetime.now()
        print_progress(
            0, 0,
            message=f'{0}: {" a" * 0}',
            start_time=started,
            message_width=30,
            default_width=120,
        )
class Test(TestCase):
    """Smoke tests for ``load_genes``: load example tables and print both returned objects."""

    def test_count(self):
        """Load two gene-count tables; the second one with the ``roary_ignore`` columns ignored."""
        cases = (
            ('../data/generated/Gene_presence_absence.csv', {}),
            ('../data/tetracycline/Gene_presence_absence.csv', {'ignore': roary_ignore}),
        )
        for path, extra_kwargs in cases:
            orig_data, binary_data = load_genes(path, gene_data_type='gene-count', **extra_kwargs)
            print(orig_data, binary_data)

    def test_list(self):
        """Load two tab-separated gene-list tables; the second one with the ``orthofinder_ignore`` columns ignored."""
        cases = (
            ('../data/new_ds/Orthogroups.tsv', {}),
            ('../data/new_ds/N0.tsv', {'ignore': orthofinder_ignore}),
        )
        for path, extra_kwargs in cases:
            orig_data, binary_data = load_genes(path, gene_data_type='gene-list:\t', **extra_kwargs)
            print(orig_data, binary_data)
-------------------------------------------------------------------------------- 1 | # Pangenome Simulator 2 | 3 | 1) `generate_simulations()` 4 | 5 | Script for simulating a pan-genome. Outputs a Roary-like gene_presence_absence.csv and a Traits file. 6 | 7 | This script is based on Ola Brynildsrud's [Simulate_pan_genome](https://github.com/AdmiralenOla/Simulate_pan_genome/). 8 | 9 | > [!CAUTION] 10 | > Disclaimer: This script is intended for demonstrating the utility of Scoary2 and may or may not be a realistic 11 | implementation of how bacterial evolution works. 12 | 13 | 2) `run_scoary()` 14 | 15 | Run Scoary2 on the simulated data. 16 | 17 | 3) `analyze_scoary_results()` 18 | 19 | Parse the output of Scoary2 to find the rank of the true trait. 20 | 21 | Creates [results.tsv](out%2Fresults.tsv). 22 | 23 | If Scoary2 produces no output (no gene left after multiple testing correction) or if the true trait is not in 24 | the final list of traits, the rank is set to `nan`. 25 | 26 | 4) `plot_all()` 27 | 28 | Plot the results of the analysis. 29 | 30 | Creates `out/effect_sizes.png`. 31 | 32 | ![effect_sizes.svg](out%2Feffect_sizes.svg) -------------------------------------------------------------------------------- /LICENCE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Thomas Roder 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
class Test(TestCase):
    """Smoke tests for ``load_info_file``: load trait, gene and isolate info tables and print them."""

    def test_load_info_file_trait(self):
        """Load the trait info table, merged on the 'Trait' column."""
        trait_info_df = load_info_file(
            logger=logger, info_file='../data/new_ds/LC-meta.tsv', merge_col='Trait',
            # one ID is enough: repeated elements in a set literal collapse into one anyway
            expected_overlap_set={'Compound_287'}, reference_file='placeholder'
        )
        print(trait_info_df)

    def test_load_info_file_genes(self):
        """Load the gene info table, merged on the 'Gene' column."""
        gene_info_df = load_info_file(
            logger=logger, info_file='../data/new_ds/N0_best_names.tsv', merge_col='Gene',
            expected_overlap_set={'N0.HOG0000000', 'N0.HOG0000001'}, reference_file='placeholder'
        )
        print(gene_info_df)

    def test_load_info_file_isolate(self):
        """Load the isolate info table, merged on the 'Isolate' column."""
        isolate_info_df = load_info_file(
            logger=logger, info_file='../data/new_ds/isolate-meta.tsv', merge_col='Isolate',
            expected_overlap_set={'FAM23868-i1-1.1'}, reference_file='placeholder'
        )
        print(isolate_info_df)
Create new package and upload: 9 | 10 | ```bash 11 | SCOARY_VERSION="?.?.?" 12 | # build: will create files in dist/ 13 | poetry build 14 | # test: install .whl file 15 | pip install -U dist/scoary_2-$SCOARY_VERSION-py3-none-any.whl 16 | # upload 17 | poetry publish 18 | ``` 19 | 20 | ## Docker / Podman 21 | 22 | If you use docker, simply replace each `podman` with `docker`. 23 | 24 | ```shell 25 | podman build --build-arg SCOARY_VERSION=$SCOARY_VERSION --tag troder/scoary-2 . 26 | ``` 27 | 28 | Publish docker image: 29 | 30 | ```shell 31 | # podman login docker.io --get-login 32 | # podman login docker.io 33 | podman tag troder/scoary-2 troder/scoary-2:$SCOARY_VERSION 34 | podman push troder/scoary-2:$SCOARY_VERSION 35 | 36 | # update tag 'latest' 37 | podman tag troder/scoary-2 troder/scoary-2:latest 38 | podman push troder/scoary-2:latest 39 | ``` 40 | 41 | ## Docker / Zenodo links in Wiki 42 | 43 | Update Zenodo: 44 | 1) Create a new release on GitHub (Title: `scoary-2:$SCOARY_VERSION`) 45 | 2) Will automatically create a new DOI on Zenodo 46 | 3) Make sure links are updated 47 | -------------------------------------------------------------------------------- /benchmarking/picking_performance/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark of pair picking 2 | 3 | Goal: compare the performance of Scoary vs Scoary2 pair picking algorithms. 
4 | 5 | ## Output 6 | 7 | Raw data: [benchmark.tsv](data%2Fbenchmark.tsv) 8 | 9 | ![benchmark.png](data%2Fbenchmark.png) 10 | 11 | **GLM:** ` time ~ n_isolates + n_genes + n_isolates * n_genes` 12 | 13 | - `scoary = 0.0006532479935340877 + -8.266041425328844e-07 * n_isolates + -0.00010316416563699979 * n_genes + 2.8076161350353536e-05 * n_isolates * n_genes` 14 | - `scoary2 = 4.729632447019741e-05 + 1.387879503778183e-05 * n_isolates + -2.187177527361452e-06 * n_genes + 6.866970437111624e-07 * n_isolates * n_genes` 15 | 16 | Full output: see [benchmark_picking.py](benchmark_picking.py#L308-L352) 17 | 18 | ![benchmark_with_GLM.png](data%2Fbenchmark_with_GLM.png) 19 | 20 | **PySR:** symbolic regression 21 | 22 | Operators: `["+", "*", "exp", "inv(x)"]` 23 | 24 | "Best" scoring models are `constant * n_genes * n_isolates` for Scoary and Scoary2 with the following coefficients: 25 | 26 | - Scoary: `2.6693995e-5` 27 | - Scoary2: `8.678912e-7` 28 | 29 | Full output: see [benchmark_picking.py](benchmark_picking.py#L396-L426) 30 | 31 | ![benchmark_with_PySR.png](data%2Fbenchmark_with_PySR.png) 32 | 33 | -------------------------------------------------------------------------------- /benchmarking/runtime/README.md: -------------------------------------------------------------------------------- 1 | # Running Scoary and Scoary2 on the same data 2 | 3 | Dataset: 100 randomly picked and binarized traits from the Scoary2 dataset. 
4 | 5 | **1) Run Scoary** 6 | 7 | ```bash 8 | echo "s1 start: $(date +"%T")" >> runtime.txt 9 | 10 | podman run --user 0:0 --rm -it -v ./:/data:Z biocontainers/scoary:v1.6.16-1-deb_cv1 \ 11 | scoary -t 100_traits.csv -g N0_count.csv -s 2 -o s1_out --permute 1000 --correction I -p 0.1 12 | 13 | echo "s1 end: $(date +"%T")" >> runtime.txt 14 | ``` 15 | 16 | **2) Run Scoary2** 17 | 18 | ```bash 19 | echo "s2 start: $(date +"%T")" >> runtime.txt 20 | 21 | podman run --rm -v ./:/data:Z troder/scoary-2 \ 22 | scoary2 \ 23 | --genes N0_count.csv \ 24 | --gene-data-type 'gene-count:,' \ 25 | --traits 100_traits.csv \ 26 | --trait-data-type 'binary:,' \ 27 | --multiple_testing native:0.1 --n-permut 1000 \ 28 | --n-cpus 8 \ 29 | --random-state 42 \ 30 | --outdir s2_out \ 31 | --trait_wise_correction 32 | 33 | echo "s2 end: $(date +"%T")" >> runtime.txt 34 | ``` 35 | 36 | ## Results 37 | 38 | ```bash 39 | $ cat runtime.txt 40 | s2 start: 14:58:12 41 | s2 end: 14:58:35 42 | s1 start: 14:58:58 43 | s1 end: 15:21:31 44 | ``` 45 | 46 | - Scoary2 took 23 seconds 47 | - Scoary took 22 minutes and 33 seconds or 1353 seconds 48 | - Scoary2 is 1353 / 23 = **59 times** faster than Scoary on this dataset 49 | -------------------------------------------------------------------------------- /data/tetracycline/ExampleTree.nwk: -------------------------------------------------------------------------------- 1 | (((((('Isolate_1', 'Isolate_55'), ((('Isolate_36', 'Isolate_46'), 'Isolate_97'), 'Isolate_51')), (((((((('Isolate_10', ('Isolate_9', 'Isolate_91')), 'Isolate_31'), ('Isolate_38', ('Isolate_45', 'Isolate_5'))), 'Isolate_59'), (((('Isolate_15', 'Isolate_21'), 'Isolate_70'), 'Isolate_22'), 'Isolate_32')), (((((('Isolate_13', 'Isolate_80'), ('Isolate_50', 'Isolate_63')), 'Isolate_66'), ((('Isolate_23', 'Isolate_69'), 'Isolate_25'), 'Isolate_57')), 'Isolate_72'), ((('Isolate_19', 'Isolate_41'), 'Isolate_82'), 'Isolate_48'))), (('Isolate_52', 'Isolate_81'), ('Isolate_61', 'Isolate_79'))), 
(((('Isolate_12', 'Isolate_86'), 'Isolate_62'), (('Isolate_29', 'Isolate_84'), ('Isolate_64', 'Isolate_78'))), (((('Isolate_26', ('Isolate_4', 'Isolate_75')), 'Isolate_95'), (('Isolate_7', 'Isolate_74'), 'Isolate_85')), ('Isolate_68', 'Isolate_83'))))), (((('Isolate_11', 'Isolate_18'), 'Isolate_60'), ((('Isolate_14', 'Isolate_73'), (('Isolate_24', 'Isolate_6'), 'Isolate_33')), 'Isolate_54')), ('Isolate_35', 'Isolate_96'))), ((((('Isolate_16', 'Isolate_65'), 'Isolate_90'), 'Isolate_89'), (((('Isolate_17', 'Isolate_8'), 'Isolate_58'), 'Isolate_77'), (('Isolate_44', 'Isolate_100'), 'Isolate_56'))), (((((('Isolate_2', 'Isolate_88'), (('Isolate_28', 'Isolate_49'), 'Isolate_39')), ((((('Isolate_20', 'Isolate_47'), (('Isolate_3', 'Isolate_42'), 'Isolate_53')), 'Isolate_71'), ('Isolate_40', 'Isolate_67')), 'Isolate_92')), 'Isolate_87'), 'Isolate_94'), 'Isolate_99'))), (((('Isolate_27', 'Isolate_43'), ('Isolate_37', 'Isolate_76')), ('Isolate_34', 'Isolate_98')), ('Isolate_30', 'Isolate_93'))); -------------------------------------------------------------------------------- /data/generated/Trait.csv: -------------------------------------------------------------------------------- 1 | ,Trait 2 | root,0 3 | strain0001,0 4 | strain0002,0 5 | strain0003,0 6 | strain0004,0 7 | strain0005,0 8 | strain0006,0 9 | strain0007,1 10 | strain0008,1 11 | strain0009,0 12 | strain0010,0 13 | strain0011,0 14 | strain0012,0 15 | strain0013,0 16 | strain0014,1 17 | strain0015,1 18 | strain0016,0 19 | strain0017,1 20 | strain0018,1 21 | strain0019,0 22 | strain0020,0 23 | strain0021,0 24 | strain0022,0 25 | strain0023,0 26 | strain0024,1 27 | strain0025,0 28 | strain0026,1 29 | strain0027,0 30 | strain0028,0 31 | strain0029,0 32 | strain0030,0 33 | strain0031,1 34 | strain0032,0 35 | strain0033,1 36 | strain0034,1 37 | strain0035,0 38 | strain0036,1 39 | strain0037,1 40 | strain0038,1 41 | strain0039,0 42 | strain0040,1 43 | strain0041,1 44 | strain0042,0 45 | strain0043,0 46 | strain0044,0 47 
| strain0045,0 48 | strain0046,1 49 | strain0047,1 50 | strain0048,1 51 | strain0049,1 52 | strain0050,1 53 | strain0051,1 54 | strain0052,0 55 | strain0053,1 56 | strain0054,0 57 | strain0055,1 58 | strain0056,1 59 | strain0057,1 60 | strain0058,0 61 | strain0059,1 62 | strain0060,0 63 | strain0061,0 64 | strain0062,1 65 | strain0063,1 66 | strain0064,0 67 | strain0065,0 68 | strain0066,0 69 | strain0067,0 70 | strain0068,0 71 | strain0069,0 72 | strain0070,0 73 | strain0071,0 74 | strain0072,0 75 | strain0073,0 76 | strain0074,1 77 | strain0075,0 78 | strain0076,1 79 | strain0077,0 80 | strain0078,0 81 | strain0079,0 82 | strain0080,0 83 | strain0081,0 84 | strain0082,0 85 | strain0083,1 86 | strain0084,0 87 | strain0085,0 88 | strain0086,1 89 | strain0087,1 90 | strain0088,0 91 | strain0089,0 92 | strain0090,0 93 | strain0091,0 94 | strain0092,1 95 | strain0093,0 96 | strain0094,1 97 | strain0095,0 98 | strain0096,1 99 | strain0097,0 100 | strain0098,1 101 | strain0099,1 102 | -------------------------------------------------------------------------------- /tests/init_tests.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | from os.path import dirname, exists 5 | from scipy.spatial import distance 6 | from scipy.stats import fisher_exact, boschloo_exact 7 | 8 | from unittest import TestCase 9 | 10 | # set up logging 11 | import logging 12 | 13 | logging.basicConfig() 14 | # logging.getLogger().setLevel(logging.INFO) 15 | 16 | ROOT = dirname(dirname(__file__)) 17 | 18 | roary_ignore = ['Non-unique Gene name', 'Annotation', 'No. isolates', 'No. 
def get_json(path: str):
    """Parse the JSON file at *path* and return the decoded object."""
    # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


def is_equivalent(a, b) -> bool:
    """Return True if the two scalars are numerically equivalent.

    Finite values are compared with ``np.isclose`` (default tolerances).
    NaN is equivalent to NaN, and an infinity is equivalent only to an
    infinity of the same sign (``+inf`` and ``-inf`` are NOT equivalent).
    """
    return bool(np.isclose(a, b, equal_nan=True))


def is_equivalent_tree(a, b) -> bool:
    """Return True if two nested binary trees are equal up to swapping children.

    A tree is either a leaf label (``str``) or an indexable pair of subtrees;
    only elements 0 and 1 of a non-leaf node are inspected.
    """
    if isinstance(a, str) or isinstance(b, str):
        # At least one side is a leaf: equivalent only if both are the same label.
        return a == b
    same_order = is_equivalent_tree(a[0], b[0]) and is_equivalent_tree(a[1], b[1])
    # Child order carries no meaning, so also accept the mirrored pairing.
    return same_order or (
        is_equivalent_tree(a[0], b[1]) and is_equivalent_tree(a[1], b[0])
    )


def get_tempdir_path() -> str:
    """Return the path of the directory that tests write their output to.

    The path is ``<ROOT>/TEST_OUTPUT``, where ``ROOT`` is the module-level
    repository root, so it no longer depends on one developer's home directory.
    The directory is not created here.
    """
    tempdir_path = f'{ROOT}/TEST_OUTPUT'

    logging.warning(f'Using this tempdir: file://{tempdir_path}')

    return tempdir_path
7 | 8 | 9 | [![Publication](https://img.shields.io/badge/BMC%20Genome%20Biology-10.1186%2Fs13059--024--03233--7-blue)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03233-7) 10 | [![bioRxiv Preprint](https://img.shields.io/badge/bioRxiv-2023.04.19.537353-b31b1b.svg)](https://www.biorxiv.org/content/10.1101/2023.04.19.537353v1.full) 11 | [![Docker Image Version (latest semver)](https://img.shields.io/docker/v/troder/scoary-2?logo=docker&label=Docker&color=%231D63ED)](https://hub.docker.com/troder/scoary-2) 12 | [![DOI](https://zenodo.org/badge/445173674.svg)](https://zenodo.org/doi/10.5281/zenodo.10352170) 13 | 14 | 15 | # Wiki 16 | 17 | - [Home](https://github.com/MrTomRod/scoary-2/wiki/Home) 18 | - [Installation](https://github.com/MrTomRod/scoary-2/wiki/Installation) 19 | - [Usage](https://github.com/MrTomRod/scoary-2/wiki/Usage) 20 | - [Input](https://github.com/MrTomRod/scoary-2/wiki/Input) 21 | - [Output](https://github.com/MrTomRod/scoary-2/wiki/Output) 22 | - [Tutorial](https://github.com/MrTomRod/scoary-2/wiki/Tutorial) 23 | - [App](https://github.com/MrTomRod/scoary-2/wiki/App) 24 | - [Understanding the p values](https://github.com/MrTomRod/scoary-2/wiki/Understanding-the-p-values) 25 | - [Usage as Python library](https://github.com/MrTomRod/scoary-2/wiki/Usage-as-Python-library) 26 | 27 | 28 | [orthofinder]: https://github.com/davidemms/OrthoFinder/ 29 | [roary]: https://sanger-pathogens.github.io/Roary/ 30 | 31 | # Paper 32 | 33 | Please cite: 34 | 35 | > Roder, T. _et al._ _Scoary2_: rapid association of phenotypic multi-omics data with microbial pan-genomes. 36 | > _Genome Biol_ **25**, 93 (2024). 
https://doi.org/10.1186/s13059-024-03233-7 37 | -------------------------------------------------------------------------------- /data/tetracycline/Tetracycline_resistance.csv: -------------------------------------------------------------------------------- 1 | ,Tetracycline_resistance,Bogus_trait 2 | Isolate_1,0,0 3 | Isolate_10,0,1 4 | Isolate_11,0,1 5 | Isolate_12,1,0 6 | Isolate_13,0,NA 7 | Isolate_14,0,1 8 | Isolate_15,0,0 9 | Isolate_16,0,0 10 | Isolate_17,0,1 11 | Isolate_18,0,0 12 | Isolate_19,0,0 13 | Isolate_2,0,1 14 | Isolate_20,0,1 15 | Isolate_21,0,1 16 | Isolate_22,1,1 17 | Isolate_23,1,0 18 | Isolate_24,0,0 19 | Isolate_25,0,1 20 | Isolate_26,1,- 21 | Isolate_27,0,0 22 | Isolate_28,1,1 23 | Isolate_29,1,1 24 | Isolate_3,0,1 25 | Isolate_30,1,1 26 | Isolate_31,1,0 27 | Isolate_32,0,0 28 | Isolate_33,0,0 29 | Isolate_34,1,1 30 | Isolate_35,1,1 31 | Isolate_36,1,0 32 | Isolate_37,0,1 33 | Isolate_38,1,1 34 | Isolate_39,1,1 35 | Isolate_4,1,0 36 | Isolate_40,1,1 37 | Isolate_41,0,1 38 | Isolate_42,0,0 39 | Isolate_43,0,0 40 | Isolate_44,0,1 41 | Isolate_45,0,0 42 | Isolate_46,0,1 43 | Isolate_47,0,1 44 | Isolate_48,0,0 45 | Isolate_49,1,1 46 | Isolate_5,0,1 47 | Isolate_50,1,0 48 | Isolate_51,0,0 49 | Isolate_52,0,0 50 | Isolate_53,0,0 51 | Isolate_54,1,0 52 | Isolate_55,0,1 53 | Isolate_56,1,0 54 | Isolate_57,1,1 55 | Isolate_58,0,1 56 | Isolate_59,0,0 57 | Isolate_6,0,1 58 | Isolate_60,0,1 59 | Isolate_61,0,1 60 | Isolate_62,0,1 61 | Isolate_63,0,0 62 | Isolate_64,0,1 63 | Isolate_65,0,0 64 | Isolate_66,1,1 65 | Isolate_67,0,1 66 | Isolate_68,1,0 67 | Isolate_69,1,1 68 | Isolate_7,0,0 69 | Isolate_70,0,1 70 | Isolate_71,0,1 71 | Isolate_72,0,0 72 | Isolate_73,0,0 73 | Isolate_74,0,0 74 | Isolate_75,1,1 75 | Isolate_76,0,1 76 | Isolate_77,1,1 77 | Isolate_78,0,0 78 | Isolate_79,1,. 
79 | Isolate_8,0,1 80 | Isolate_80,0,0 81 | Isolate_81,0,0 82 | Isolate_82,0,1 83 | Isolate_83,0,1 84 | Isolate_84,0,0 85 | Isolate_85,1,0 86 | Isolate_86,1,1 87 | Isolate_87,1,1 88 | Isolate_88,0,1 89 | Isolate_89,0,1 90 | Isolate_9,0,1 91 | Isolate_90,1,0 92 | Isolate_91,0,0 93 | Isolate_92,0,1 94 | Isolate_93,0,1 95 | Isolate_94,0,0 96 | Isolate_95,1,1 97 | Isolate_96,0,1 98 | Isolate_97,0,0 99 | Isolate_98,0,0 100 | Isolate_99,1,0 101 | Isolate_100,0,0 102 | -------------------------------------------------------------------------------- /scoary/KeyValueStore.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import sqlite3 4 | 5 | 6 | class KeyValueStore: 7 | table_name: str 8 | 9 | def __init__(self, table_name, db_path: str = None): 10 | self.table_name = table_name 11 | 12 | if db_path is None: 13 | if 'KEY_VALUE_STORE_DB' in os.environ: 14 | db_path = os.environ['KEY_VALUE_STORE_DB'] 15 | else: 16 | db_path = os.path.expanduser('~/.cache/keyvaluestore.db') 17 | 18 | self._db_path = db_path 19 | self.con, self.cur = self.get_cur() 20 | self.create_db() 21 | 22 | def __str__(self): 23 | return f'KeyValueStore {self.table_name} ({self._db_path})' 24 | 25 | def get_cur(self): 26 | try: 27 | con = sqlite3.connect(self._db_path) 28 | cur = con.cursor() 29 | except Exception as e: 30 | logging.warning(f'Failed to connect to db: {self._db_path}') 31 | raise e 32 | return con, cur 33 | 34 | def __del__(self): 35 | try: 36 | self.cur.close() 37 | self.con.close() 38 | except Exception: 39 | pass 40 | 41 | def create_db(self): 42 | raise NotImplementedError(f'Users of the abstract class {self.__class__} must implement this function!') 43 | 44 | @staticmethod 45 | def list_to_string(l) -> str: 46 | return ', '.join(f"'{e}'" for e in l) 47 | 48 | @staticmethod 49 | def list_to_string_bracket(l): 50 | return ', '.join(f"('{e}')" for e in l) 51 | 52 | def _create_db(self, columns: {str: str}, pk_col: 
str): 53 | columns = ', '.join(f'{col_name} {col_type}' for col_name, col_type in columns.items()) 54 | sql = f''' 55 | CREATE TABLE IF NOT EXISTS {self.table_name} ( 56 | {columns}, 57 | PRIMARY KEY ({pk_col}) 58 | ); 59 | ''' 60 | try: 61 | self.cur.execute(sql) 62 | except sqlite3.OperationalError as e: 63 | logging.warning(f'Failed to run this SQL command on db {self._db_path}:\n{sql}') 64 | raise e 65 | 66 | def drop_db(self): 67 | self.cur.execute(f'''DROP TABLE {self.table_name}''') 68 | -------------------------------------------------------------------------------- /tests/test_permtations.py: -------------------------------------------------------------------------------- 1 | from .init_tests import * 2 | 3 | from scoary import ScoaryTree, pick_single, print_tree, pick 4 | 5 | from scoary.permutations import create_permuted_df, permute_picking 6 | 7 | 8 | def test_permutations(tree: list, label_to_trait_a, label_to_trait_b, n_permut): 9 | max_contr, max_suppo, max_oppos, best, worst = pick( 10 | tree=tree, 11 | label_to_trait_a=label_to_trait_a, 12 | trait_b_df=pd.DataFrame(label_to_trait_b, index=['gene']), 13 | calc_pvals=True 14 | ) 15 | print_tree( 16 | ScoaryTree.from_list(tree), 17 | label_to_trait_a, label_to_trait_b 18 | ) 19 | is_positively_correlated = max_suppo >= max_oppos 20 | n_pos, n_neg = sum(label_to_trait_b.values()), len(label_to_trait_b) 21 | n_positive = n_pos if is_positively_correlated else n_neg 22 | 23 | estimator = (max_suppo if is_positively_correlated else max_oppos) / max_contr 24 | print(f'{max_contr=}\n{max_suppo=}\n{max_oppos=}\n{best=}\n{worst=}\n{estimator=}') 25 | 26 | print('Calculating permutatons... 
class Test(TestCase):
    """Run ``test_permutations`` on a balanced 16-leaf tree with two different trait layouts."""

    def test_bad(self, n_permut=3000):
        """Odd-numbered isolates (i1, i3, ...) carry the trait; the gene copies the trait."""
        tree = [
            [[['i1', 'i2'], ['i3', 'i4']], [['i5', 'i6'], ['i7', 'i8']]],
            [[['i9', 'i10'], ['i11', 'i12']], [['i13', 'i14'], ['i15', 'i16']]],
        ]
        label_to_trait_a = {}
        for number in range(1, 17):
            label_to_trait_a[f'i{number}'] = number % 2 == 1
        label_to_trait_b = dict(label_to_trait_a)
        test_permutations(tree, label_to_trait_a, label_to_trait_b, n_permut)

    def test_good(self, n_permut=3000):
        """Isolates i1..i8 carry the trait, i9..i16 do not; the gene copies the trait."""
        tree = [
            [[['i1', 'i2'], ['i3', 'i4']], [['i5', 'i6'], ['i7', 'i8']]],
            [[['i9', 'i10'], ['i11', 'i12']], [['i13', 'i14'], ['i15', 'i16']]],
        ]
        label_to_trait_a = {}
        for number in range(1, 17):
            label_to_trait_a[f'i{number}'] = number < 9
        label_to_trait_b = dict(label_to_trait_a)
        test_permutations(tree, label_to_trait_a, label_to_trait_b, n_permut)
def stringify_timedelta(delta: timedelta) -> str:
    """
    Format a timedelta as a right-aligned string exactly 5 characters long.

    The two most significant non-zero-led units are shown (e.g. ``'3d 4h'``,
    ``'2h 5m'``); if both do not fit into 5 characters only the larger one is
    kept (e.g. ``'  12m'``). Durations whose day count alone does not fit
    (>= 10000 days) are rendered as the overflow marker ``'>999d'``.

    :param delta: duration to format
    :return: string of length 5
    """
    d = delta.days
    h, rem = divmod(delta.seconds, 3600)
    m, s = divmod(rem, 60)
    if d:
        major, minor = f'{d}d', f'{h}h'
    elif h:
        major, minor = f'{h}h', f'{m}m'
    elif m:
        major, minor = f'{m}m', f'{s}s'
    else:
        major, minor = f'{s}s', None

    # Fix: textwrap.shorten breaks over-long words instead of returning '',
    # so the overflow marker was never produced; detect overflow explicitly.
    if len(major) > 5:
        return '>999d'

    res = major
    if minor is not None and len(major) + 1 + len(minor) <= 5:
        res = f'{major} {minor}'
    return f'{res:>5s}'
logging.warning('Something went wrong with the progressbar!') 76 | 77 | print(res, end=end) 78 | -------------------------------------------------------------------------------- /tests/test_newick.py: -------------------------------------------------------------------------------- 1 | from init_tests import * 2 | 3 | from scoary.newick import parse_newick 4 | 5 | 6 | class Test(TestCase): 7 | def test_newick(self): 8 | test_data = [ 9 | ('(A,(C,D));', ['A', ['C', 'D']]), 10 | ('(A, (C,D));', ['A', ['C', 'D']]), 11 | ('(A(C,D));', ['A', ['C', 'D']]), 12 | ('(A(C, D));', ['A', ['C', 'D']]), 13 | ('A,(C,D);', ['A', ['C', 'D']]), 14 | ('((A,B),(C,D));', [['A', 'B'], ['C', 'D']]), 15 | ('(A,B),(C,D);', [['A', 'B'], ['C', 'D']]), 16 | ('(A,B),(C,D);', [['A', 'B'], ['C', 'D']]), 17 | ('(A,B)(C,D);', [['A', 'B'], ['C', 'D']]), 18 | ('(C,D)E;', [['C', 'D'], 'E']), 19 | ('(C,D),E;', [['C', 'D'], 'E']), 20 | ('(A,(C,D))F;', [['A', ['C', 'D']], 'F']), 21 | ('( A , ( C , D ) ) F ;', [['A', ['C', 'D']], 'F']), 22 | ('(A(C,D))F;', [['A', ['C', 'D']], 'F']), 23 | ('(A:0.1,(C:0.3,D:0.4):0.5);', ['A', ['C', 'D']]), 24 | ('(A:0.1,(C:0.3,D:0.4))F;', [['A', ['C', 'D']], 'F']), 25 | ('((B:0.2,(C:0.3,D:0.4))F:0.1)A;', [[['B', ['C', 'D']], 'F'], 'A']), 26 | ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);', ['A', [['B', [['D', 'G'], 'E']], 'C']]), 27 | ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);', ['A', [['B', [['D', 'G'], 'E']], 'C']]), 28 | ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)));', ['A', ['B', ['D', 'G']]]), 29 | ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737);', ['A', ['B', ['D', 'G']]]), 30 | ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C);', ['A', [['B', [['D', 'G'], 'E']], 'C']]), 31 | ('(A,(B,(D,G)E)C);', ['A', [['B', [['D', 'G'], 'E']], 'C']]), 32 | ('(A,(B,(D,G)));', ['A', ['B', ['D', 'G']]]), 33 | ('(hodor one,(hodor-two,(hodor_3,0hodor 4)));', ['hodor one', ['hodor-two', ['hodor_3', 
'0hodor 4']]]), 34 | ('(FAM18356-i1-1.1:0.289313,(FAM19471-i1-1.1:0.210374,(FAM23169-i1-1.1:0.0764835,FAM1079-i1-1.1:0.0778624)0.977444:0.115601));', 35 | ['FAM18356-i1-1.1', ['FAM19471-i1-1.1', ['FAM23169-i1-1.1', 'FAM1079-i1-1.1']]]) 36 | ] 37 | 38 | for n, expected_result in test_data: 39 | res = parse_newick(n) 40 | self.assertEqual(res, expected_result) 41 | print() 42 | 43 | def test_tetracycline(self): 44 | with open('../data/tetracycline/ExampleTree.nwk') as f: 45 | newick = f.read() 46 | expected_result = get_json('../data/tetracycline/expected_result.json')['as_list'] 47 | res = parse_newick(newick) 48 | self.assertEqual(res, expected_result) 49 | -------------------------------------------------------------------------------- /data/tetracycline/expected_result.json: -------------------------------------------------------------------------------- 1 | { 2 | "as_list": [[[[[["Isolate_1", "Isolate_55"], [[["Isolate_36", "Isolate_46"], "Isolate_97"], "Isolate_51"]], [[[[[[[["Isolate_10", ["Isolate_9", "Isolate_91"]], "Isolate_31"], ["Isolate_38", ["Isolate_45", "Isolate_5"]]], "Isolate_59"], [[[["Isolate_15", "Isolate_21"], "Isolate_70"], "Isolate_22"], "Isolate_32"]], [[[[[["Isolate_13", "Isolate_80"], ["Isolate_50", "Isolate_63"]], "Isolate_66"], [[["Isolate_23", "Isolate_69"], "Isolate_25"], "Isolate_57"]], "Isolate_72"], [[["Isolate_19", "Isolate_41"], "Isolate_82"], "Isolate_48"]]], [["Isolate_52", "Isolate_81"], ["Isolate_61", "Isolate_79"]]], [[[["Isolate_12", "Isolate_86"], "Isolate_62"], [["Isolate_29", "Isolate_84"], ["Isolate_64", "Isolate_78"]]], [[[["Isolate_26", ["Isolate_4", "Isolate_75"]], "Isolate_95"], [["Isolate_7", "Isolate_74"], "Isolate_85"]], ["Isolate_68", "Isolate_83"]]]]], [[[["Isolate_11", "Isolate_18"], "Isolate_60"], [[["Isolate_14", "Isolate_73"], [["Isolate_24", "Isolate_6"], "Isolate_33"]], "Isolate_54"]], ["Isolate_35", "Isolate_96"]]], [[[[["Isolate_16", "Isolate_65"], "Isolate_90"], "Isolate_89"], [[[["Isolate_17", 
"Isolate_8"], "Isolate_58"], "Isolate_77"], [["Isolate_44", "Isolate_100"], "Isolate_56"]]], [[[[[["Isolate_2", "Isolate_88"], [["Isolate_28", "Isolate_49"], "Isolate_39"]], [[[[["Isolate_20", "Isolate_47"], [["Isolate_3", "Isolate_42"], "Isolate_53"]], "Isolate_71"], ["Isolate_40", "Isolate_67"]], "Isolate_92"]], "Isolate_87"], "Isolate_94"], "Isolate_99"]]], [[[["Isolate_27", "Isolate_43"], ["Isolate_37", "Isolate_76"]], ["Isolate_34", "Isolate_98"]], ["Isolate_30", "Isolate_93"]]], 3 | "as_newick": "((((((Isolate_1,Isolate_55),(((Isolate_36,Isolate_46),Isolate_97),Isolate_51)),((((((((Isolate_10,(Isolate_9,Isolate_91)),Isolate_31),(Isolate_38,(Isolate_45,Isolate_5))),Isolate_59),((((Isolate_15,Isolate_21),Isolate_70),Isolate_22),Isolate_32)),((((((Isolate_13,Isolate_80),(Isolate_50,Isolate_63)),Isolate_66),(((Isolate_23,Isolate_69),Isolate_25),Isolate_57)),Isolate_72),(((Isolate_19,Isolate_41),Isolate_82),Isolate_48))),((Isolate_52,Isolate_81),(Isolate_61,Isolate_79))),((((Isolate_12,Isolate_86),Isolate_62),((Isolate_29,Isolate_84),(Isolate_64,Isolate_78))),((((Isolate_26,(Isolate_4,Isolate_75)),Isolate_95),((Isolate_7,Isolate_74),Isolate_85)),(Isolate_68,Isolate_83))))),((((Isolate_11,Isolate_18),Isolate_60),(((Isolate_14,Isolate_73),((Isolate_24,Isolate_6),Isolate_33)),Isolate_54)),(Isolate_35,Isolate_96))),(((((Isolate_16,Isolate_65),Isolate_90),Isolate_89),((((Isolate_17,Isolate_8),Isolate_58),Isolate_77),((Isolate_44,Isolate_100),Isolate_56))),((((((Isolate_2,Isolate_88),((Isolate_28,Isolate_49),Isolate_39)),(((((Isolate_20,Isolate_47),((Isolate_3,Isolate_42),Isolate_53)),Isolate_71),(Isolate_40,Isolate_67)),Isolate_92)),Isolate_87),Isolate_94),Isolate_99))),((((Isolate_27,Isolate_43),(Isolate_37,Isolate_76)),(Isolate_34,Isolate_98)),(Isolate_30,Isolate_93)));" 4 | } -------------------------------------------------------------------------------- /tests/test_load_traits.py: -------------------------------------------------------------------------------- 1 | 
from init_tests import * 2 | from scoary.load_traits import load_numeric, load_binary, apply_kmeans, apply_gm, binarize, load_traits 3 | 4 | traits_bin = '../data/tetracycline/Tetracycline_resistance.csv' 5 | traits_num = '../data/tetracycline/Tetracycline_resistance_numeric.csv' 6 | 7 | 8 | class Test(TestCase): 9 | def test_load_binary(self): 10 | binary_df = load_binary(traits=traits_bin, delimiter=',') 11 | print(binary_df) 12 | 13 | def test_load_numeric(self): 14 | numeric_df = load_numeric(traits=traits_num, delimiter=',') 15 | print(numeric_df) 16 | 17 | def test_binarize_kmeans(self): 18 | numeric_df = load_numeric(traits=traits_num, delimiter=',') 19 | for alternative in ['skip', 'kmeans']: 20 | for cutoff in [.5, .7, .9]: 21 | for covar_type in ['full', 'tied', 'diag', 'spherical']: 22 | binary_df = binarize( 23 | numeric_df, method='kmeans', random_state=42, n_cpus=1, 24 | cutoff=cutoff, covariance_type=covar_type, 25 | alternative=alternative, outdir=None 26 | ) 27 | 28 | def test_binarize_gaussian_nonconverging(self): 29 | # Tetracycline trait cannot be binarized with cutoff=0.999 30 | numeric_df = load_numeric(traits=traits_num, delimiter=',') 31 | for method, n_expected_columns in [('gaussian', 1), ('kmeans', 2)]: 32 | binary_df = binarize( 33 | numeric_df, method=method, random_state=42, n_cpus=1, 34 | cutoff=0.9998, covariance_type='full', 35 | alternative='skip', outdir=None 36 | ) 37 | self.assertEqual(n_expected_columns, len(binary_df.columns), 38 | f'{method=}; {n_expected_columns=}; {binary_df.columns=}') 39 | 40 | def test_illegal(self): 41 | with self.assertRaises(AssertionError): 42 | numeric_df, traits_df = load_traits(traits_num, trait_data_type=f'gaussian:0.4999', random_state=42) 43 | with self.assertRaises(AssertionError): 44 | numeric_df, traits_df = load_traits(traits_num, trait_data_type=f'gaussian:1', random_state=42) 45 | with self.assertRaises(AssertionError): 46 | # fails because no traits can be binarized. 
Certainty is never high enough. 47 | numeric_df, traits_df = load_traits(traits_num, trait_data_type=f'gaussian:.999999999999', random_state=42) 48 | 49 | def test_multiprocessing(self): 50 | for n_cpus in [1, 5]: 51 | numeric_df, traits_df = load_traits( 52 | '../data/new_ds/LC.tsv', 53 | trait_data_type='gaussian:skip:\t', 54 | ignore='Starter-only-5A,FAMIX,Starter-only-10,Starter-only-7,mixture', 55 | n_cpus=n_cpus, limit_traits=(0, 10), 56 | outdir=f'{ROOT}/TEST_OUTPUT' 57 | ) 58 | -------------------------------------------------------------------------------- /benchmarking/runtime/Optimization strategies.md: -------------------------------------------------------------------------------- 1 | # Comparing different optimization strategies 2 | 3 | I also tried original [Scoary's optimization](https://github.com/AdmiralenOla/Scoary/blob/b713e10fc1968488132f62652c6dba35636ca3e6/scoary/methods.py#L1360-L1363) 4 | (breaking the permutations) instead of my approach, caching confidence intervals. 5 | 6 | ## Results 7 | 8 | **Scoary:** 9 | 10 | - break disabled: 41 minutes 11 | - normal: 22 minutes 12 | 13 | **Scoary2 (1 CPU):** 14 | 15 | - cache disabled: 2:01 16 | - normal: 1:12 17 | - break instead of cache: 1:46 18 | 19 | **Scoary2 (8 CPUs):** 20 | 21 | - cache: 26 sec 22 | - break: 39 sec 23 | 24 | ## Summary 25 | 26 | My caching optimization appears to be better. 27 | 28 | ## Code 29 | 30 | The code below replaces the permute_picking function in [permutations.py](/scoary/permutations.py) 31 | 32 | Note: I have not thoroughly tested this code, so it may contain bugs. 
import scipy.stats as ss


def permute_picking(
        trait: str,
        tree: ScoaryTree,
        label_to_trait: pd.Series | dict,
        result_df: pd.DataFrame,
        genes_bool_df: pd.DataFrame,
        n_permut: int,
        random_state: int = None,
        batch_size: int = 50
) -> np.array:
    """
    Variant of permute_picking that aborts the permutations early ("break")
    instead of caching permuted estimators.

    For every gene in result_df, permutations are drawn in batches of
    ``batch_size``. After each batch, the number of permuted estimators that
    are at least as large as the observed one (``r``) is checked: once ``r``
    is so large that p > 0.1 is likely (binomial test, 95 %), the remaining
    permutations are skipped.

    :param trait: name of the trait (unused here; kept for interface parity)
    :param tree: phylogenetic tree of the isolates
    :param label_to_trait: isolate label -> bool
    :param result_df: rows with Gene, supporting, opposing and contrasting
    :param genes_bool_df: gene presence/absence, indexed by gene
    :param n_permut: maximum number of permutations per gene
    :param random_state: base seed; batch k is drawn with seed random_state + k * batch_size
    :param batch_size: number of permutations per batch
    :return: list of empirical p-values, one per row of result_df
    """
    if type(label_to_trait) is dict:
        label_to_trait = pd.Series(label_to_trait, dtype='boolean')
    n_pos = sum(label_to_trait)
    n_neg = len(label_to_trait) - n_pos
    labels = label_to_trait.keys()

    pvals = []
    for _, row in result_df.iterrows():
        label_to_gene = genes_bool_df.loc[row.Gene]

        is_positively_correlated = row.supporting >= row.opposing
        estimator = (row.supporting if is_positively_correlated else row.opposing) / row.contrasting
        n_pos_assoc = n_pos if is_positively_correlated else n_neg

        r = 0
        for batch_start in range(0, n_permut, batch_size):
            batch_end = min(batch_start + batch_size, n_permut)
            batch_size_current = batch_end - batch_start

            # Fix: derive a distinct seed per batch. Re-using random_state for
            # every batch re-seeds the generator identically each time, so all
            # batches would contain the very same permutations.
            batch_seed = None if random_state is None else random_state + batch_start

            permuted_df = create_permuted_df(
                labels=labels, n_positive=n_pos_assoc,
                n_permut=batch_size_current, random_state=batch_seed
            )
            max_contr, max_suppo, max_oppos = pick(
                tree=tree.to_list, label_to_trait_a=label_to_gene,
                trait_b_df=permuted_df, calc_pvals=False
            )

            # Check how many estimators are higher than the unpermuted
            r += int(((max_suppo / max_contr) >= estimator).sum())

            # If r indicates a p > 0.1 with a probability of 95%, abort
            # (sf(r) == 1 - cdf(r))
            if batch_end >= 30 and ss.binom.sf(r, batch_end, 0.1) < 0.05:
                pval = (r + 1) / (batch_end + 1)
                break

        else:
            # no early abort: all n_permut permutations were evaluated
            pval = (r + 1) / (n_permut + 1)

        pvals.append(pval)

    return pvals
def _find_min(arr: np.ndarray, n_cols: int) -> (int, int):
    """
    Locate the smallest non-nan entry of arr.

    :param arr: square array whose valid entries are in the strict lower triangle
    :param n_cols: number of columns of arr
    :return: (row, column) of the minimum; row is always greater than column
    """
    flat_position = int(np.nanargmin(arr))
    row, col = divmod(flat_position, n_cols)
    assert row > col
    return row, col


def _merge(arr: np.ndarray, node_list: [], cluster_sizes: [int], x: int, y: int):
    """
    Merge cluster x into cluster y (x > y).

    Distances of all other clusters to the merged cluster are the
    size-weighted mean of their distances to x and y. node_list and
    cluster_sizes are modified in place; a shrunken copy of arr is returned.

    :return: (arr without row/column x, node_list, cluster_sizes)
    """
    n_rows, n_cols = arr.shape
    assert n_rows == n_cols == len(node_list) == len(cluster_sizes)
    assert x > y

    size_x = cluster_sizes[x]
    size_y = cluster_sizes[y]

    # recompute the distance of every other cluster to the merged cluster
    for other in range(n_rows):
        if other == x or other == y:
            continue
        # only the lower triangle is populated: index as (larger, smaller)
        idx_x = (max(x, other), min(x, other))
        idx_y = (max(y, other), min(y, other))
        dist_x = arr[idx_x]
        dist_y = arr[idx_y]
        assert not (np.isnan(dist_x) or np.isnan(dist_y))

        # weighted mean, stored in the slot of the surviving cluster y
        arr[idx_y] = (dist_x * size_x + dist_y * size_y) / (size_x + size_y)

    # drop row and column of the absorbed cluster x
    arr = np.delete(np.delete(arr, x, 0), x, 1)

    # the merged node takes the place of y; x disappears
    merged_node = [node_list[y], node_list[x]]
    del node_list[x]
    node_list[y] = merged_node

    cluster_sizes[y] = size_x + size_y
    del cluster_sizes[x]

    assert arr.shape == (n_rows - 1, n_cols - 1)
    assert len(node_list) == n_rows - 1
    return arr, node_list, cluster_sizes


def _upgma(arr: np.ndarray, node_list: [str]) -> []:
    """
    Repeatedly merge the two closest clusters until a single tree remains.

    Note: arr is overwritten (upper triangle and diagonal are set to nan) and
    node_list is consumed.

    :param arr: square float distance matrix
    :param node_list: labels, in the order of the rows/columns of arr
    :return: tree as nested list (or the single label if there is only one)
    """
    # blank out diagonal and upper triangle so each pair is considered once
    arr[np.triu_indices(arr.shape[0], k=0)] = np.nan

    cluster_sizes = [1] * len(node_list)

    while len(node_list) > 1:
        n_rows, n_cols = arr.shape
        assert len(node_list) == n_rows == n_cols

        closest_x, closest_y = _find_min(arr, len(node_list))
        arr, node_list, cluster_sizes = _merge(arr, node_list, cluster_sizes, closest_x, closest_y)

    assert len(node_list) == 1
    return node_list[0]
arr:\n{distances.to_string()}' 85 | assert arr.shape[0] == arr.shape[1] 86 | assert not np.isnan(arr).any(), 'Distance matrix contains nan' 87 | assert not np.isinf(arr).any(), 'Distance matrix contains inf' 88 | assert not np.any(distances < 0), 'Distances must be positive' 89 | 90 | return _upgma(arr=arr, node_list=labels) 91 | -------------------------------------------------------------------------------- /scoary/newick.py: -------------------------------------------------------------------------------- 1 | from re import compile 2 | 3 | BRANCH_LENTGHS_COLON = compile(r':[0-9]+(\.[0-9]+)?(e-?[0-9]+)?') 4 | BRANCH_LENTGHS_BRACKET = compile(r'\)[0-9]+(.[0-9]+)?(e-?[0-9]+)?') 5 | 6 | class NewickParserException(Exception): 7 | pass 8 | 9 | 10 | def parse_newick(newick_string: str) -> []: 11 | """ 12 | A simple function to parse Newick strings to list tree format. 13 | 14 | Example: 15 | >>> parse_newick('(A,(B,C))D;') 16 | [['A', ['B', 'C']], 'D'] 17 | 18 | Limitations: 19 | - Only binary trees are supported 20 | - Distances are ignored 21 | - All labels must be named 22 | - NHX format not supported 23 | 24 | :param newick_string: Phylogenetic tree in newick format 25 | :return: Phylogenetic tree in list format 26 | """ 27 | 28 | # strip and remove branch lengths 29 | newick_string = newick_string.strip() 30 | newick_string = BRANCH_LENTGHS_COLON.sub(string=newick_string, repl='') 31 | newick_string = BRANCH_LENTGHS_BRACKET.sub(string=newick_string, repl=')') 32 | 33 | # sanity check 34 | if not newick_string.endswith(';'): 35 | raise NewickParserException(f'Newick string does not end in semicolon! 
{newick_string=}') 36 | 37 | def find_corresponding_closing(string: str) -> int: 38 | n_opening = 0 39 | n_closing = 0 40 | for i, char in enumerate(string): 41 | if char == '(': 42 | n_opening += 1 43 | continue 44 | if char == ')': 45 | n_closing += 1 46 | if n_closing == n_opening: 47 | return i 48 | 49 | raise NewickParserException(f'Could not find corresponding closing bracket in {string=}! {newick_string=}') 50 | 51 | def split_node(string: str) -> (str, str): 52 | if ',' in string: 53 | first_comma = string.index(',') 54 | if '(' in string: 55 | first_bracket = string.index('(') 56 | if first_bracket < first_comma: 57 | return string[0:first_bracket], string[first_bracket:] 58 | 59 | return string[0:first_comma], string[first_comma + 1:] 60 | else: 61 | if '(' not in string: 62 | raise NewickParserException(f'Could not find separators "," or "(" in {string=}! {newick_string=}') 63 | first_bracket = string.index('(') 64 | return string[0:first_bracket], string[first_bracket:] 65 | 66 | def parse_leaf(string: str) -> str: 67 | string = string.strip('"\' ') 68 | if len(string) == 0: 69 | raise NewickParserException(f'Leaf with no label: {string=}! 
{newick_string=}') 70 | return string 71 | 72 | def parse_recursive(string: str) -> list | str: 73 | string = string.strip() 74 | 75 | # leaf 76 | if not ',' in string and not '(' in string and not ')' in string: 77 | return parse_leaf(string) 78 | 79 | # remove enclosing brackets 80 | if string.startswith('(') and string.endswith(')') \ 81 | and find_corresponding_closing(string) == len(string) - 1: 82 | string = string[1:-1] 83 | 84 | # parse node 85 | if string.startswith('('): 86 | closing_idx = find_corresponding_closing(string) 87 | left = string[1:closing_idx] 88 | right = string[closing_idx + 1:].lstrip(',') 89 | else: 90 | left, right = split_node(string) 91 | 92 | left = parse_recursive(left) 93 | right = parse_recursive(right) 94 | 95 | return [left, right] 96 | 97 | return parse_recursive(newick_string[:-1]) # remove semicolon 98 | -------------------------------------------------------------------------------- /scoary/permutations.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import pandas as pd 4 | import numpy as np 5 | 6 | from .KeyValueStore import KeyValueStore 7 | from .picking import pick 8 | from .ScoaryTree import ScoaryTree 9 | 10 | logger = logging.getLogger('scoary.permutations') 11 | 12 | 13 | class ConfintStore(KeyValueStore): 14 | def create_db(self): 15 | self._create_db( 16 | columns={ 17 | 'tree': 'str', 18 | 'n_pos_assoc': 'int', 19 | 'n_permut': 'int', 20 | 'confidence_interval': 'str' 21 | }, 22 | pk_col='tree, n_pos_assoc, n_permut' 23 | ) 24 | 25 | def get(self, tree: str, n_pos_assoc: int, n_permut: int): 26 | sql = f'SELECT confidence_interval FROM {self.table_name} WHERE tree = ? AND n_pos_assoc = ? AND n_permut = ?' 
def create_permuted_df(labels: [str], n_positive: int, n_permut: int, random_state: int = None):
    """
    Create a DataFrame of randomly permuted binary traits.

    Every row contains exactly ``n_positive`` ones and
    ``len(labels) - n_positive`` zeros in random order.

    :param labels: column labels (isolate names)
    :param n_positive: number of ones per row
    :param n_permut: number of rows (permutations)
    :param random_state: seed for numpy's global random generator; None leaves the generator untouched
    :return: DataFrame of shape (n_permut, len(labels))
    :raises ValueError: if n_positive is not within 0..len(labels)
    """
    n_negative = len(labels) - n_positive
    if n_positive < 0 or n_negative < 0:
        raise ValueError(f'n_positive must be between 0 and {len(labels)}, got {n_positive}')

    # Fix: compare against None, otherwise the valid seed 0 is silently ignored
    if random_state is not None:
        np.random.seed(random_state)

    arr = np.repeat(np.array([[1] * n_positive + [0] * n_negative]), n_permut, axis=0)

    # apply_along_axis rejects arrays without rows, so skip it for n_permut == 0
    if n_permut > 0:
        # creates a copy -> slow
        arr = np.apply_along_axis(np.random.permutation, axis=1, arr=arr)

    return pd.DataFrame(arr, columns=labels)
permuted_estimators = CONFINT_CACHE.get(unique_topology, n_pos_assoc, n_permut) 86 | if permuted_estimators is None: 87 | permuted_df = create_permuted_df( 88 | labels=labels, n_positive=n_pos_assoc, 89 | n_permut=n_permut, random_state=random_state 90 | ) 91 | max_contr, max_suppo, max_oppos = pick( 92 | tree=tree.to_list, label_to_trait_a=label_to_gene, 93 | trait_b_df=permuted_df, calc_pvals=False 94 | ) 95 | 96 | permuted_estimators = max_suppo / max_contr 97 | CONFINT_CACHE.set(unique_topology, n_pos_assoc, n_permut, permuted_estimators) 98 | else: 99 | n_reused += 1 100 | 101 | pval = ((permuted_estimators >= estimator).sum() + 1) / (n_permut + 1) 102 | pvals.append(pval) 103 | 104 | logger.debug(f'{trait}: reused {n_reused} out of {len(result_df)}') 105 | 106 | return pvals 107 | -------------------------------------------------------------------------------- /tests/test_scoary_tree.py: -------------------------------------------------------------------------------- 1 | from init_tests import * 2 | 3 | from scoary.scoary import * 4 | 5 | from scoary.ScoaryTree import * 6 | 7 | 8 | class TestTreeFunctions(TestCase): 9 | def test_tree_from_list_to_list(self): 10 | expected_result = get_json('../data/tetracycline/expected_result.json')['as_list'] 11 | # convert to ScoaryTree 12 | scoary_tree = ScoaryTree.from_list(expected_result) 13 | # convert back to list 14 | list_tree = scoary_tree.to_list 15 | 16 | self.assertEqual(expected_result, list_tree) 17 | 18 | def test_tree_from_genes_df(self): 19 | """ 20 | Check if old scoary generates the equivalent tree based on genes presence/absence 21 | """ 22 | _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore) 23 | # convert to ScoaryTree 24 | scoary_tree = ScoaryTree.from_presence_absence(genes_df) 25 | # convert to list 26 | list_tree = scoary_tree.to_list 27 | # compare to Scoary 1 28 | expected_result = 
def test_copy(self):
    """Check that copying a ScoaryTree yields new node objects with the same structure and labels."""
    scoary_tree = ScoaryTree.from_list(
        [[[[[[['1', '2'], ['3', '4']], '5'], '6'], '7'], '8'],
         [[[[['9', [['10', '11'], '12']], '13'], '14'], '15'], [['16', '17'], [['18', ['19', '20']], '21']]]]
    )
    # NOTE(review): both copies are produced by copy_nonrecursive(); presumably the
    # first one was meant to exercise a different (recursive) copy method - confirm.
    copied_tree = scoary_tree.copy_nonrecursive()
    nonrec_copied_tree = scoary_tree.copy_nonrecursive()

    def confirm_copy(t1: ScoaryTree, t2: ScoaryTree):
        # every node must be a distinct object ...
        self.assertFalse(t1 is t2)
        if t1.is_leaf:
            # ... leaves must carry the same label ...
            self.assertTrue(t2.is_leaf)
            self.assertTrue(t1.label == t2.label)
        else:
            # ... and inner nodes must have matching subtrees
            self.assertFalse(t2.is_leaf)
            confirm_copy(t1.left, t2.left)
            confirm_copy(t1.right, t2.right)

    confirm_copy(scoary_tree, copied_tree)
    confirm_copy(scoary_tree, nonrec_copied_tree)
    # sanity check of the checker itself: a tree is not a copy of itself
    with self.assertRaises(AssertionError):
        confirm_copy(scoary_tree, scoary_tree)
'15'], [['16', '17'], [['18', ['19', '20']], '21']]]] 78 | ) 79 | prune_labels = ['1', '2', '3', '18', '19', '21'] 80 | pruned_tree = scoary_tree.prune_nonrecursive(labels=prune_labels) 81 | real_labels = pruned_tree.labels() 82 | self.assertEqual(real_labels, prune_labels) 83 | 84 | def test_uniquivy(self): 85 | label_to_trait = {'X': True, ' ': False} 86 | 87 | def apply(tree): 88 | return ScoaryTree.from_list(tree).uniquify(label_to_trait) 89 | 90 | expected_result = '(((01)1)(01))' 91 | for tree in ( 92 | [['X', ' '], ['X', [' ', 'X']]], 93 | [[' ', 'X'], ['X', [' ', 'X']]], 94 | [[' ', 'X'], [[' ', 'X'], 'X']], 95 | [['X', [' ', 'X']], ['X', ' ']], 96 | [['X', ['X', ' ']], ['X', ' ']], 97 | [['X', ['X', ' ']], [' ', 'X']], 98 | ): 99 | unique_string = apply(tree) 100 | self.assertEqual(expected_result, unique_string) 101 | 102 | self.assertNotEqual( 103 | expected_result, 104 | apply([['X', ' '], ['X', [' ', ' ']]]) 105 | ) 106 | -------------------------------------------------------------------------------- /tests/test_upgma.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from init_tests import * 4 | 5 | from unittest import TestCase 6 | import numpy as np 7 | from biotite.sequence.phylo import upgma as _biotite_upgma 8 | from biotite.sequence.phylo.tree import TreeNode as BiotiteTreeNode 9 | 10 | from scipy.cluster.hierarchy import linkage, to_tree, ClusterNode 11 | 12 | from scoary.upgma import upgma as scoary_upgma 13 | from scoary.ScoaryTree import ScoaryTree 14 | 15 | 16 | def biotite_upgma(tree: BiotiteTreeNode, labels: [str]) -> ScoaryTree: 17 | def convert(node: BiotiteTreeNode) -> ScoaryTree: 18 | """recursive function""" 19 | if node.is_leaf(): 20 | return ScoaryTree(label=str(node_to_label[node])) 21 | else: 22 | return ScoaryTree(left=convert(node.children[0]), right=convert(node.children[1])) 23 | 24 | node_to_label: {BiotiteTreeNode: str} = {node: label for node, label in 
def scipy_upgma(distances, labels: [str]):
    """
    Build a ScoaryTree with scipy's UPGMA implementation.

    scipy.cluster.hierarchy.linkage: method='average' is called the UPGMA algorithm!

    :param distances: square, symmetric distance matrix (diagonal is ignored)
    :param labels: leaf labels, in the order of the rows/columns of distances
    :return: ScoaryTree
    """
    from scipy.spatial.distance import squareform

    def convert(node: ClusterNode) -> ScoaryTree:
        """recursive function"""
        if node.count == 1:  # is_leaf
            return ScoaryTree(label=str(node_to_label[node.id]))
        else:
            return ScoaryTree(left=convert(node.left), right=convert(node.right))

    # leaf ids of the scipy tree are the row indices of the distance matrix
    node_to_label: {int: str} = dict(enumerate(labels))

    # Fix: linkage() interprets a 2-D array as observation vectors, not as a
    # distance matrix. Pairwise distances must be passed in condensed form.
    square = np.array(distances, dtype=float)
    np.fill_diagonal(square, 0.0)  # squareform requires a zero diagonal
    condensed = squareform(square)

    Z = linkage(condensed, method='average')
    tree = to_tree(Z, False)
    return convert(tree)
print('no match:') 89 | print(f' {biotite_tree=}') 90 | print(f' {scoary_tree=}') 91 | n_failures += 1 92 | 93 | print(f'{n_failures=} out of {n_tests} tests') 94 | print(n_failures / n_tests) 95 | self.assertLess(n_failures / n_tests, 0.05, f'Lots of failures, wtf?') 96 | 97 | def test_scipy(self, size=20, n_tests=1000): 98 | """ 99 | 0 % agreement! 100 | Evidently, scipy's upgma implementation works rather differently! 101 | """ 102 | labels = [f'l{i}' for i in range(size)] 103 | 104 | n_failures = 0 105 | for i in range(n_tests): 106 | matrix = np.random.randint(0, 2000, size=(size, size)) 107 | symmetrical_matrix = (matrix + matrix.T) / 2 108 | 109 | _biotite_tree = _biotite_upgma(symmetrical_matrix) 110 | biotite_tree = biotite_upgma(_biotite_tree, labels=labels).to_list 111 | 112 | scipy_tree = scipy_upgma(symmetrical_matrix, labels).to_list 113 | 114 | if not is_equivalent_tree(biotite_tree, scipy_tree): 115 | print('no match:') 116 | print(f' {biotite_tree=}') 117 | print(f' {scipy_tree=}') 118 | n_failures += 1 119 | 120 | print(f'{n_failures=} out of {n_tests} tests') 121 | print(n_failures / n_tests) 122 | self.assertLess(n_failures / n_tests, 0.05, f'Lots of failures, wtf?') 123 | -------------------------------------------------------------------------------- /scoary/final_overview.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os.path 3 | 4 | import pandas as pd 5 | import matplotlib as mpl 6 | 7 | mpl.use('SVG') 8 | # The SVG backend avoids this error message: 9 | # ValueError: Image size of 700x165660 pixels is too large. It must be less than 2^16 in each direction. 
def create_final_overview(
        summary_df: pd.DataFrame,
        traits_df: pd.DataFrame,
        numeric_df: pd.DataFrame,
        outdir: str,
        trait_info_df: pd.DataFrame = None,
        isolate_info_df: pd.DataFrame = None,
        force_binary_clustering: bool = False,
        symmetric: bool = True,
        distance_metric: str = 'jaccard',
        linkage_method: str = 'ward',
        optimal_ordering: bool = True,
        corr_method: str = 'pearson'
) -> None:
    """
    Copy the exploration app into outdir and write the summary tables
    (and, for more than one trait, the dendrogram plot) next to it.

    Files written to outdir:
      - isolate_info.tsv  (only if isolate_info_df is given)
      - summary_orig.tsv  (summary_df as passed in)
      - summary.tsv       (summary with trait info and/or in plot order;
                           falls back to a renamed summary_orig.tsv)

    :param summary_df: one row per trait; its index is named 'Trait' in place
    :param traits_df: binary trait data, used for clustering when numeric_df
        is None or force_binary_clustering is set
    :param numeric_df: numeric trait data for correlation-based clustering; may be None
    :param outdir: output directory
    :param trait_info_df: optional table that is left-merged onto summary_df by index
    :param isolate_info_df: optional table that is written out as isolate_info.tsv
    :param force_binary_clustering: cluster on traits_df even if numeric_df is available
    :param symmetric: forwarded to the exploration app's linkage calculation
    :param distance_metric: forwarded to the binary linkage calculation
    :param linkage_method: forwarded to the linkage calculation
    :param optimal_ordering: forwarded to the linkage calculation
    :param corr_method: forwarded to the numeric linkage calculation
    """
    # copy files from exploration app
    logger.info('Copying exploration app...')
    exploration_app.copy_app(outdir, config={'scores': SCORES_CONFIG})

    if isolate_info_df is not None:
        logger.info('Adding isolate_info.tsv...')
        isolate_info_df.to_csv(f'{outdir}/isolate_info.tsv', sep='\t')

    # the unmodified summary is always written; it becomes summary.tsv at the
    # very end if nothing else produced that file
    logger.debug('Adding preliminary summary.tsv...')
    summary_df.index.name = 'Trait'
    summary_df.to_csv(f'{outdir}/summary_orig.tsv', sep='\t')

    # append trait info
    if trait_info_df is not None:
        logger.debug('Adding trait_info_df to summary.tsv...')
        summary_df_index = list(summary_df.index)
        summary_df = summary_df \
            .merge(trait_info_df, left_index=True, right_index=True, how='left', copy=False) \
            .reindex(summary_df_index)  # merging destroys index order
        summary_df.index.name = 'Trait'
        summary_df.to_csv(f'{outdir}/summary.tsv', sep='\t')

    # a dendrogram is only calculated if there is more than one trait
    if len(summary_df) > 1:
        logger.info('Calculating dendrogram linkage matrix...')
        if numeric_df is None or force_binary_clustering:
            # NOTE(review): the message says "jaccard" although distance_metric is configurable
            logger.info(f'Calculating dendrogram based on binary data using jaccard distances...')
            linkage_matrix, labels = exploration_app.calculate_linkage_matrix_from_binary(
                summary_df=summary_df,
                traits_df=traits_df,
                symmetric=symmetric,
                distance_metric=distance_metric,
                linkage_method=linkage_method,
                optimal_ordering=optimal_ordering
            )
        else:
            logger.info(f'Calculating dendrogram based on correlation of numeric features...')
            linkage_matrix, labels = exploration_app.calculate_linkage_matrix_from_numeric(
                summary_df=summary_df,
                traits_df=numeric_df,
                symmetric=symmetric,
                scale=True,
                corr_method=corr_method,
                linkage_method=linkage_method,
                optimal_ordering=optimal_ordering,
            )

        # final_plot hands back a summary_df, which replaces the local one
        logger.info('Calculating dendrogram plot...')
        summary_df = exploration_app.final_plot(
            linkage_matrix=linkage_matrix,
            labels=labels,
            summary_df=summary_df,
            scores_config=SCORES_CONFIG,
            workdir=outdir,
            dendrogram_x_scale='linear',
            scores_x_scale='manhattan'
        )

        # save summary_df, ensure order matches plot
        logger.info('Saving sorted summary.tsv...')
        summary_df.index.name = 'Trait'
        summary_df.to_csv(f'{outdir}/summary.tsv', sep='\t')

    # neither trait info nor a plot produced summary.tsv: reuse the original
    if not os.path.isfile(f'{outdir}/summary.tsv'):
        logger.debug('Moving summary_orig.tsv to summary.tsv...')
        os.rename(f'{outdir}/summary_orig.tsv', f'{outdir}/summary.tsv')
def filter_df(df: pd.DataFrame, restrict_to: [str] = None, ignore: [str] = None) -> pd.DataFrame:
    """
    Drop and/or select columns (strains) of a DataFrame.

    :param df: DataFrame with columns=strains
    :param restrict_to: columns to keep; all other columns are dropped
    :param ignore: columns to drop; applied before restrict_to
    :return: DataFrame with the remaining columns in their original order
    :raises AssertionError: if a name in ignore or restrict_to is not a column
    """
    if ignore:
        ignore = set(ignore)
        missing = ignore.difference(df.columns)
        assert len(missing) == 0, f'Some strains in ignore were not found: {missing=}'
        df = df[[c for c in df.columns if c not in ignore]]

    if restrict_to is not None:
        restrict_to = set(restrict_to)
        have_cols = set(df.columns)
        cols_missing = restrict_to.difference(have_cols)
        assert len(cols_missing) == 0, f'Some strains in restrict_to were not found:' \
                                       f'\n{cols_missing=}' \
                                       f'\n{restrict_to=}' \
                                       f'\n{have_cols=}'
        cols_kept = [c for c in df.columns if c in restrict_to]
        # dropped = present but not requested (the reverse difference is
        # always empty once the assertion above has passed)
        cols_dropped = [c for c in df.columns if c not in restrict_to]
        logger.debug(f'Cols kept: {cols_kept}')
        logger.debug(f'Cols dropped: {cols_dropped}')
        df = df[cols_kept]

    return df


def _drop_invariant_genes(binary_df: pd.DataFrame) -> pd.DataFrame:
    """Remove genes (rows) that are present in no strain or in every strain."""
    row_sums = binary_df.sum(axis=1)
    return binary_df[(row_sums != 0) & (row_sums != len(binary_df.columns))]


def load_gene_count_file(
        path: str,
        delimiter: str,
        restrict_to: [str] = None,
        ignore: [str] = None
) -> (pd.DataFrame, pd.DataFrame):
    """
    Load Roary-style gene count file with columns=strains and rows=genes

    :param path: Path to file
    :param delimiter: delimiter of the file
    :param restrict_to: columns to keep, will drop all other columns
    :param ignore: columns to ignore
    :return: (count_df, binary_df); count_df holds the filtered counts as read
        from the file, binary_df (dtype: bool) is True where count >= 1 and
        lacks genes that are present in none or all of the strains;
        columns: strains; index: genes
    :raises AssertionError: if columns/index are not unique or the file contains NaN
    """
    count_df = pd.read_csv(path, delimiter=delimiter, index_col=0)

    # remove columns that are not in traits_df
    if restrict_to is not None or ignore is not None:
        count_df = filter_df(count_df, restrict_to, ignore)

    # sanity checks
    assert count_df.columns.is_unique, f'{path=}: columns not unique'
    assert count_df.index.is_unique, f'{path=}: index not unique'
    assert not count_df.isna().values.any(), f'{path=}: contains NaN'

    # add metadata
    count_df.attrs['content_type'] = 'gene-count'

    # convert to bool, then remove genes that carry no information
    binary_df = _drop_invariant_genes(count_df >= 1)

    logger.debug(f'Loaded gene-count-df:\n{binary_df}')
    return count_df, binary_df


def load_gene_list_file(
        path: str,
        delimiter: str,
        restrict_to: [str] = None,
        ignore: [str] = None
) -> (pd.DataFrame, pd.DataFrame):
    """
    Load Orthofinder-style gene list file with columns=strains and rows=genes

    :param path: Path to file
    :param delimiter: delimiter of the file
    :param restrict_to: columns to keep, will drop all other columns
    :param ignore: columns to ignore
    :return: (list_df, binary_df); list_df holds the filtered cells as strings,
        binary_df (dtype: bool) is True where a cell is not NaN and lacks genes
        that are present in none or all of the strains;
        columns: strains; index: genes
    :raises AssertionError: if columns/index are not unique
    """
    list_df = pd.read_csv(path, delimiter=delimiter, index_col=0, dtype=str)

    # remove columns that are not in traits_df
    if restrict_to is not None or ignore is not None:
        list_df = filter_df(list_df, restrict_to, ignore)

    # sanity checks
    assert list_df.columns.is_unique, f'{path=}: columns not unique'
    assert list_df.index.is_unique, f'{path=}: index not unique'

    # add metadata
    list_df.attrs['content_type'] = 'gene-list'

    # convert to bool (an empty cell means "gene absent"), then remove genes
    # that carry no information
    binary_df = _drop_invariant_genes(~list_df.isna())

    logger.debug(f'Loaded gene-list -df:\n{binary_df}')
    return list_df, binary_df


def parse_params(orig_params: str) -> (str, str):
    """
    Split a '<data_type>:<delimiter>' specification into its two parts.

    Only the data type is case-insensitive; the delimiter is returned exactly
    as given. ':' itself is accepted as delimiter ('gene-count::').

    :param orig_params: e.g. 'gene-count', 'gene-count:;' or 'gene-list:\\t'
    :return: (data_type, delimiter); the delimiter defaults to ','
    :raises AssertionError: if the specification is malformed or the data type unknown
    """
    error_message = (
        f"{orig_params=} is poorly formatted.\n"
        "Must be '<data_type>:<delimiter>'.\n"
        "Possible values for data_type: {'gene-count', 'gene-list'} (default: gene-count)\n"
        "Possible values for delimiter: any single character (default: ',')"
    )

    data_type, colon, delimiter = orig_params.partition(':')
    data_type = data_type.lower()

    if not colon:
        delimiter = ','
    elif ':' in delimiter and delimiter != ':':
        # more than one separator, and not the special case "delimiter is ':'"
        raise AssertionError(error_message)

    assert data_type in {'gene-count', 'gene-list'}, error_message

    return data_type, delimiter


def load_genes(
        genes: str,
        gene_data_type: str,
        restrict_to: [str] = None,
        ignore: [str] = None
) -> (pd.DataFrame, pd.DataFrame):
    """
    Load genes_df with columns=strains and rows=genes

    :param genes: Path to genes file
    :param gene_data_type: '<data_type>:<delimiter>', see parse_params
    :param restrict_to: columns to keep, will drop all other columns
    :param ignore: columns to ignore
    :return: (genes_orig_df, genes_bool_df); both have their index named 'Gene';
        genes_bool_df (dtype: bool) has columns: strains; index: genes
    """
    data_type, delimiter = parse_params(gene_data_type)

    if data_type == 'gene-count':
        genes_orig_df, genes_bool_df = load_gene_count_file(genes, delimiter, restrict_to, ignore)
    elif data_type == 'gene-list':
        genes_orig_df, genes_bool_df = load_gene_list_file(genes, delimiter, restrict_to, ignore)
    else:
        # parse_params only lets the two types above through
        raise AssertionError(f'Programming error: {data_type=} must be gene-count or gene-list!')

    # ensure the index name is always the same
    genes_orig_df.index.name = 'Gene'
    genes_bool_df.index.name = 'Gene'

    return genes_orig_df, genes_bool_df
def generate_fake_traits(genes_df: pd.DataFrame, n_positive: int = 11, negative_from: int = 89) -> pd.Series:
    """
    Create an artificial trait for testing, based on the column order of genes_df.

    :param genes_df: DataFrame with columns=strains
    :param n_positive: the first n_positive strains get the trait (True)
    :param negative_from: strains from this column position onwards lack the trait (False)
    :return: Series (dtype: boolean) indexed by strain label; strains between
        the two ranges are not part of the Series
    """
    columns = genes_df.columns
    label_to_trait = {label: True for label in columns[:n_positive]}
    label_to_trait.update({label: False for label in columns[negative_from:]})
    return pd.Series(label_to_trait, dtype='boolean')
row['g+t-']], [row['g-t+'], row['g-t-']]])[0], axis=1) 50 | 51 | # check if result is identical 52 | for manual_or, fisher_or in zip(test_df['odds_ratio'], fisher_ors): 53 | self.assertTrue(is_equivalent(manual_or, fisher_or)) 54 | 55 | def test_init_result_df_performance(self): 56 | _, genes_df = load_genes('../data/new_ds/N0.tsv', gene_data_type='gene-list:\t') 57 | ltt = generate_fake_traits(genes_df) 58 | start = datetime.now() 59 | result_df = init_result_df(genes_df, trait_series=ltt) 60 | end = datetime.now() 61 | print(result_df) 62 | print('took:', end - start) 63 | 64 | def test_tetracycline(self): 65 | _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore) 66 | _, traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,') 67 | trait_series = traits_df['Tetracycline_resistance'] 68 | 69 | # calculate sensitivity and specificity 70 | test_df = init_result_df( 71 | genes_df, 72 | trait_series=pd.Series( 73 | {l: bool(v) for l, v in trait_series.items() if v in (0, 1)}, 74 | dtype='boolean' 75 | ) 76 | ) 77 | # calculate odds_ratio 78 | test_df = add_odds_ratio(test_df) 79 | # calculate pairwise comparisons 80 | tree = ScoaryTree.from_list(get_json('../data/tetracycline/expected_result.json')['as_list']) 81 | assert set(tree.labels()) == set(genes_df.columns) 82 | test_df = pair_picking(test_df, genes_df, tree=tree, label_to_trait=trait_series) 83 | 84 | # load expected result from scoary 1 85 | expected_result = pd.read_csv('../data/tetracycline/fisher_permute100.results.csv') 86 | 87 | test_df.set_index('Gene', inplace=True) 88 | 89 | # check if result is identical 90 | for i, row in expected_result.iterrows(): 91 | table = (row.Number_pos_present_in, 92 | row.Number_neg_present_in, 93 | row.Number_pos_not_present_in, 94 | row.Number_neg_not_present_in) 95 | new_row = test_df.loc[row.Gene] 96 | new_table = tuple(int(new_row[c]) for c in 
('g+t+', 'g+t-', 'g-t+', 'g-t-')) 97 | 98 | self.assertEqual(table, new_table) 99 | self.assertAlmostEqual( 100 | row.Odds_ratio, new_row.odds_ratio, 101 | msg=f'Failed to calculate odds_ratio for {row.Gene}: {row.Odds_ratio} != {new_row.odds_ratio}' 102 | ) 103 | self.assertAlmostEqual( 104 | row.Sensitivity, new_row.sensitivity, 105 | msg=f'Failed to calculate sensitivity for {row.Gene}: {row.Odds_ratio} != {new_row.odds_ratio}' 106 | ) 107 | self.assertAlmostEqual( 108 | row.Specificity, new_row.specificity, 109 | msg=f'Failed to calculate specificity for {row.Gene}: {row.Odds_ratio} != {new_row.odds_ratio}' 110 | ) 111 | 112 | xx = [ 113 | (row.Max_Pairwise_comparisons, new_row.contrasting), 114 | (row.Max_supporting_pairs, new_row.supporting), 115 | (row.Max_opposing_pairs, new_row.opposing), 116 | (row.Best_pairwise_comp_p, new_row.best), 117 | (row.Worst_pairwise_comp_p, new_row.worst) 118 | ] 119 | try: 120 | self.assertEqual(row.Max_Pairwise_comparisons, new_row.contrasting) 121 | self.assertEqual(row.Max_supporting_pairs, new_row.supporting) 122 | self.assertEqual(row.Max_opposing_pairs, new_row.opposing) 123 | self.assertAlmostEqual(row.Best_pairwise_comp_p, new_row.best) 124 | self.assertAlmostEqual(row.Worst_pairwise_comp_p, new_row.worst) 125 | except Exception as e: 126 | print(i, row.Gene, xx) 127 | self.fail(msg=str(e)) 128 | -------------------------------------------------------------------------------- /tests/test_final_overview.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import call 3 | from init_tests import * 4 | from scoary.final_overview import create_final_overview 5 | from scoary.load_traits import load_binary 6 | from scoary.utils import pd, AnalyzeTraitNamespace 7 | 8 | REPLACE_COPIES_WITH_SYMLINKS = True 9 | 10 | 11 | def replace_copies_with_symlinks(): 12 | def repl(fn, relpath='../..', subdir='app'): 13 | src = f'{relpath}/scoary/templates/{fn}' 14 | target = 
class Test(TestCase):
    """
    Manual/visual tests for create_final_overview: every test writes an
    overview into a scratch directory that can be opened in a browser.
    """

    def setUp(self) -> None:
        # local import: shutil is not among the imports visible in this file
        import shutil

        self.temp_dir = get_tempdir_path()
        self.fake_ns = AnalyzeTraitNamespace()
        self.fake_ns.outdir = self.temp_dir
        self.fake_ns.trait_info_df = None

        os.makedirs(self.temp_dir, exist_ok=True)
        # Empty the directory without going through a shell: a path with
        # spaces or shell metacharacters cannot be misinterpreted this way.
        # (Unlike the former "rm -rf dir/*", this removes hidden entries too.)
        with os.scandir(self.temp_dir) as scan:
            entries = list(scan)
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                shutil.rmtree(entry.path)
            else:
                os.remove(entry.path)
        for dir_ in ['app', 'traits', 'logs']:
            os.makedirs(f'{self.temp_dir}/{dir_}', exist_ok=True)

    def tearDown(self) -> None:
        if REPLACE_COPIES_WITH_SYMLINKS:
            replace_copies_with_symlinks()
        print(f'Open file://{self.temp_dir} to see the result!')
        print(f'To clean up, run "rm -r {self.temp_dir}"')

    def _create_overview(self, summary_df) -> None:
        """Call create_final_overview with the traits and outdir of the fake namespace."""
        # create_final_overview takes explicit arguments, not a namespace;
        # numeric_df=None selects clustering on the binary traits
        create_final_overview(
            summary_df=summary_df,
            traits_df=self.fake_ns.traits_df,
            numeric_df=None,
            outdir=self.fake_ns.outdir
        )

    def _random_summary(self) -> pd.DataFrame:
        """Create a summary_df with random scores, one row per trait of the fake namespace."""
        traits = self.fake_ns.traits_df.columns
        summary_df = pd.DataFrame(index=traits)
        for col in ['best_fisher_p', 'best_fisher_q', 'best_empirical_p']:
            summary_df[col] = np.random.rand(len(traits))
        return summary_df

    def test_simple(self):
        summary_df = pd.DataFrame(
            index=['Compound_242', 'Compound_267', 'Compound_286'],
            columns=['best_fisher_p', 'best_fisher_q', 'best_empirical_p', 'best_fq*ep'],
            data=[[0.574065934065931, 0.438405797101457, 0.03596403596403, 1.576684e-02],
                  [0.432940190858691, 0.266793137470672, 0.13386613386613, 3.571457e-02],
                  [0.194418465932588, 7.98120572982e-08, 0.02097902097902, 1.674379e-09]]
        )
        self.fake_ns.traits_df = pd.DataFrame(
            index=['FAM10789-i1-1.1', 'FAM1079-i1-1.1', 'FAM10792-i1-1.1', 'FAM11142-i1-1.1', 'FAM11194-i1-1.1',
                   'FAM11199-i1-1.1', 'FAM11206-i1-1.1', 'FAM1233-i1-1.1', 'FAM1301-i1-1.1', 'FAM13493-i1-1.1'],
            columns=['Compound_242', 'Compound_267', 'Compound_286'],
            data=[[pd.NA, True, True], [pd.NA, pd.NA, True], [pd.NA, pd.NA, True], [pd.NA, False, False],
                  [pd.NA, True, True], [pd.NA, pd.NA, True], [True, pd.NA, True], [pd.NA, False, True],
                  [False, pd.NA, True], [pd.NA, pd.NA, True]],
            dtype='boolean'
        )
        self._create_overview(summary_df)

    def test_larger(self):
        self.fake_ns.traits_df = load_binary('../data/new_ds/LC-binary.tsv', '\t')
        self._create_overview(self._random_summary())

    def test_largest(self):
        # This function was used to determine the desired recursion limit in plot_dendrogram
        n_traits, n_isolates = 100, 44  # 10000, 44
        self.fake_ns.traits_df = pd.DataFrame(
            np.random.rand(n_isolates, n_traits) > 0.5,
            index=[f'I{i}' for i in range(n_isolates)],
            columns=[f'T{i}' for i in range(n_traits)],
        )
        self._create_overview(self._random_summary())

    def test_real(self):
        # Find the best way of plotting dendrogram
        summary_df = pd.read_csv(f'../TMP/TEST_OUTPUT_real_restricted/summary.tsv', sep='\t', index_col=0)
        self.fake_ns.traits_df = pd.read_csv('../data/traits_df.tsv', sep='\t', index_col=0, dtype='str') == 'True'
        self._create_overview(summary_df)

    def test_understand_jaccard(self):
        """
        Exploration only (no assertions): print two "sign-agnostic" distance
        matrices for a small -1/0/1 matrix, one calculated by hand and one
        with scipy's jaccard metric.
        """
        from scipy.spatial.distance import cdist

        a = np.array([[0, 0, 0, 0, 0, 0, 1, 0, -1, 0],
                      [1, 0, 0, -1, 1, 0, 0, -1, 0, 0],
                      [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ],
                      [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ],
                      [1, 1, 1, -1, 1, 1, 1, 1, 1, 1]], dtype=int)
        print('input:\n', a)

        def conf(arr):
            n_rows, n_cols = arr.shape
            res = np.zeros(shape=(n_rows, n_rows))

            for i in range(n_rows):
                for j in range(i):  # lower triangle only; mirrored below
                    # positions where both rows are 0 do not count
                    n = n_cols - sum(np.logical_and(arr[i] == 0, arr[j] == 0))

                    # mismatches against row j and against its sign-flipped version
                    x = arr[i] != arr[j]
                    y = arr[i] != (0 - arr[j])
                    r = min(x.sum(), y.sum()) / n
                    print(i, j, x, y, n, r)

                    res[i, j] = r
                    res[j, i] = r
            print('res')
            print(res)

        conf(a)

        def x(a):
            a = np.nan_to_num(a, nan=0.5)
            b = 0 - a

            d1 = cdist(a, a, metric='jaccard')
            d2 = cdist(a, b, metric='jaccard')
            d = np.minimum(d1, d2)

            print('d')
            print(d)

        x(a)
should be caught 16 | # example ##fileformat=VCFv4.3 - give warning if format is off 17 | # Columns 8 MANDATORY 18 | # CHROM POS ID REF ALT QUAL FILTER INFO 19 | # OPTIONAL COLUMNS 20 | # FORMAT SAMPLE1 SAMPLE2 etc 21 | # All data lines are tab-delimited 22 | # CHROM : string, no whitespace 23 | # POS : integer. Can have many lines with same pos. Pos=0 or N+1 for telomere positions 24 | # ID : semicolon-delimited list of strings 25 | # REF : string, ACGTN (can be multiple) 26 | # ALT : comma-separated list, ACGTN* (* = allele is missing due to overlapping deletion) 27 | # (NOTE: Suggest splitting ALT variants into different lines to preserve binarity) 28 | # QUAL : float 29 | # FILTER : PASS or semicolon-delimited list 30 | # INFO : semicolon-delimited list of key=value pairs or flags 31 | # FORMAT (optional) : colon-delimited list. 32 | # Genotype fields - Genotype always first field 33 | # GT encoded as allele values separated by | or /. 0 = reference. 1 = first ALT. 2 = second alt etc 34 | # NOTE: Haploid calls (bacteria) have only 1 value 35 | # NOTE: / means genotype unphased. 
import sys
import argparse
import os
import csv
import re
import traceback

__version__ = '0.1b'
__author__ = 'Ola Brynildsrud'
__credits = ['Ola Brynildsrud']
__email__ = 'olbb@fhi.no'


def _parse_args():
    """Parse the command line; --types is split into a list unless it is 'ALL'."""
    parser = argparse.ArgumentParser(
        description='This script takes in vcf files and creates a '
                    'presence/absence matrix of mutations in the '
                    'Roary/Scoary format',
        epilog='by Ola Brynildsrud (olbb@fhi.no)')
    parser.add_argument(
        '--out',
        action='store',
        default='./mutations_presence_absence.csv',
        help='The path to the output file')
    parser.add_argument(
        '--types',
        action='store',
        default='ALL',
        help='The types of variants to include in the output. NOTE: This '
             'works if TYPE=XX can be found in the INFO column of the vcf '
             'file. The special keyword ALL includes all types. This is '
             'the default setting. Common types are snp, mnp, ins, del '
             'and complex. Give as comma-separated list. '
             'Example: --types snp,ins,del')
    parser.add_argument(
        '--version',
        action='version',
        version=__version__)
    parser.add_argument(
        '--force',
        action='store_true',
        default=False,
        help='Force overwriting of output file. (If it already '
             'exists)')
    parser.add_argument(
        'vcf',
        action='store',
        metavar='<VCF_file>',  # an empty metavar leaves the argument nameless in the usage text
        help='The VCF file to convert to Roary/Scoary format')

    args = parser.parse_args()
    if args.types != "ALL":
        args.types = args.types.split(",")
    return args


def _read_metainfo(lines):
    """
    Consume the '##' metainformation lines of a VCF file.

    :param lines: csv reader over the tab-delimited VCF file
    :return: (metainfo, header); metainfo maps '##key' to its value (or, for
        structured keys such as ##FORMAT, to {ID: {field: value}}); header is
        the first line that does not start with '##'
    """
    metainfo = {"##INFO": {},
                "##FILTER": {},
                "##FORMAT": {},
                "##ALT": {},
                "##contig": {},
                "##META": {},
                "##SAMPLE": {},
                "##PEDIGREE": {}
                }
    # split on commas that are not inside double quotes
    field_separator = re.compile(r',(?=(?:[^"]*"[^"]*")*[^"]*$)')

    for line in lines:
        if not line:
            continue  # blank line
        if line[0][:2] != '##':
            # Have reached the data section of the file
            return metainfo, line

        infoline = line[0].split('=', 1)
        if len(infoline) != 2:
            continue  # not of the form ##key=value
        key, value = infoline

        if key not in metainfo:
            metainfo[key] = value
            continue

        # Capture list output for complex tags
        id_match = re.search(r'ID=(\w+)', value)
        if id_match is None:
            continue  # structured line without ID: nothing to file it under
        fields = metainfo[key][id_match.group(1)] = {}
        # Enter all elements into the appropriate dic
        for element in field_separator.split(value.strip("<>")):
            name, _, content = element.partition("=")
            fields[name] = content

    sys.exit("ERROR: There appears to be only metainformation "
             "(lines starting with ##) in your VCF file.")


def _write_csv_row(row, outfile):
    """Write one comma-separated line with every cell in double quotes."""
    csv.writer(outfile, quoting=csv.QUOTE_ALL, lineterminator='\n').writerow(row)


def main():
    """
    Converts VCF files (version 4.x) to Scoary format

    Reads the VCF file given on the command line and writes a Roary-like
    csv: the first nine VCF columns, a DUMMY column (True for lines that were
    split off a multi-ALT record) and one genotype column per sample.
    Always ends via sys.exit: code 0 on success, an error message otherwise.
    """
    args = _parse_args()

    if os.path.isfile(args.out) and not args.force:
        sys.exit("Outfile already exists. Change name of outfile or "
                 "run with --force")
    if not os.path.isfile(args.vcf):
        sys.exit("Unable to locate input file %s" % args.vcf)

    with open(args.vcf) as vcffile, open(args.out, 'w') as outfile:
        lines = csv.reader(vcffile, delimiter='\t', quotechar='"')
        metainfo, header = _read_metainfo(lines)

        try:
            vcfversion = metainfo["##fileformat"].split("v")[1]
            if int(vcfversion[0]) != 4:
                print("WARNING: A VCF format other than 4.x detected."
                      " File parsing may proceed with errors.")
            else:
                print("VCF version %s detected" % vcfversion)
        except (KeyError, IndexError, ValueError):
            print("WARNING: Could not detect VCF format. Expected "
                  "v4.x. File parsing may proceed with errors.")
            traceback.print_exc()

        # Check that genotype fields have a single allele
        try:
            gt_number = metainfo["##FORMAT"]["GT"]["Number"]
        except KeyError:
            sys.exit("ERROR: No FORMAT definition for GT (with a Number "
                     "field) found in the VCF header.")
        if gt_number != "1":
            sys.exit("ERROR: Expected a single allele per genotype. Scoary "
                     "only works for haploid organisms.")

        # Have now caught all metainformation. Now get column information
        _write_csv_row(header[:9] + ["DUMMY"] + header[9:], outfile)

        for line in lines:
            if not line:
                continue  # blank line

            # Check if line is allowed:
            if args.types != "ALL":
                type_match = re.search(r'TYPE=(\w+)', line[7])
                # lines without TYPE= cannot be matched against the filter
                if type_match is None or type_match.group(1) not in args.types:
                    continue

            # Split line if ALT contains more than one variant
            if "," in line[4]:
                # Only get GT
                genotypes = [cell.split(":")[0] for cell in line[9:]]
                for c, alt in enumerate(line[4].split(","), start=1):
                    newline = line[:9]
                    newline[4] = alt
                    # one output line per ALT allele: 1 where the sample carries allele c
                    writeLine(newline + ["True"] + fixdummy(genotypes, c), outfile)

            # Genotype fields need to be 0 or 1
            # GT is always first in colon-separated list
            else:
                writeLine(line[:9] + ["False"] + line[9:], outfile)

        print("Reached the end of the file")
        sys.exit(0)


def writeLine(line, outfile):
    """
    Write one output line; cells from position 9 on are reduced to the part
    before the first ':' (GT is always the first genotype field).
    """
    _write_csv_row(line[:9] + [cell.split(":")[0] for cell in line[9:]], outfile)


def fixdummy(line, c):
    """
    Binarize genotype calls for ALT allele number c.

    :param line: genotype calls (strings) of one VCF line; not modified
    :param c: 1-based number of the ALT allele
    :return: new list with "1" where the call equals c and "0" otherwise
    Exits with code -1 if a call is neither "." nor an integer.
    """
    newline = line[:]
    try:
        for x, call in enumerate(line):
            # Missing data (".") get entered as reference / no presence
            newline[x] = "1" if call != "." and int(call) == c else "0"
    except ValueError:
        print(newline, c)
        sys.exit(-1)
    return newline


########
# MAIN #
########
if __name__ == '__main__':
    main()
{left=} {right=}' 31 | self.left = left 32 | self.right = right 33 | 34 | def __str__(self) -> str: 35 | return self._newick() 36 | 37 | def __repr__(self): 38 | return str(self) 39 | 40 | def _newick(self): 41 | return self.label if self.is_leaf else f"({self.left._newick()},{self.right._newick()})" 42 | 43 | def to_newick(self) -> str: 44 | return f'{self._newick()};' 45 | 46 | def write_newick(self, path: str): 47 | with open(path, 'w') as f: 48 | f.write(self.to_newick()) 49 | 50 | def labels(self) -> [str]: 51 | if self.is_leaf: 52 | return [self.label] 53 | else: 54 | return self.left.labels() + self.right.labels() 55 | 56 | def uniquify(self, label_to_trait: {str: bool}): 57 | def uniquify(tree: ScoaryTree) -> str: 58 | if tree.is_leaf: 59 | return '1' if label_to_trait[tree.label] else '0' 60 | else: 61 | l, r = uniquify(tree.left), uniquify(tree.right) 62 | return f'({l}{r})' if l < r else f'({r}{l})' 63 | 64 | return uniquify(self) 65 | 66 | def copy(self): 67 | def copy(tree: ScoaryTree) -> ScoaryTree: 68 | if tree.is_leaf: 69 | return ScoaryTree(label=tree.label) 70 | else: 71 | return ScoaryTree(left=copy(tree.left), right=copy(tree.right)) 72 | 73 | return copy(self) 74 | 75 | def prune(self, labels: [str]) -> ScoaryTree: 76 | n_labels_found = 0 77 | 78 | def prune(tree: ScoaryTree) -> Optional[ScoaryTree]: 79 | if tree.is_leaf: 80 | if tree.label in labels: 81 | nonlocal n_labels_found 82 | n_labels_found += 1 83 | return ScoaryTree(label=tree.label) 84 | else: 85 | return None 86 | else: 87 | left, right = prune(tree.left), prune(tree.right) 88 | if left and right: 89 | return ScoaryTree(left=left, right=right) 90 | if left: 91 | return left 92 | if right: 93 | return right 94 | return None 95 | 96 | pruned_tree = prune(self) 97 | 98 | if n_labels_found != len(labels): 99 | missing = set(labels).difference(set(self.labels())) 100 | raise AssertionError(f'Pruning went wrong: did not find all labels in tree! 
' 101 | f'{n_labels_found=}; {missing=}; tree={self}') 102 | 103 | return pruned_tree 104 | 105 | def prune_nonrecursive(self, labels: [str]) -> ScoaryTree: 106 | if self.is_leaf: 107 | assert [self.label] == labels, f'Pruning went wrong. {[self.label]} != {labels}' 108 | return ScoaryTree(label=self.label) 109 | 110 | n_labels_found = 0 111 | 112 | root = ScoaryTree(left=self.left, right=self.right) 113 | 114 | stack = [(root, 'right'), (root, 'left')] 115 | 116 | while stack: 117 | current_parent, current_direction = stack[-1] 118 | current_node: ScoaryTree = getattr(current_parent, current_direction) 119 | 120 | if current_node.is_leaf: 121 | # current node is leaf 122 | 123 | this = ScoaryTree(label=current_node.label) 124 | 125 | if current_node.label in labels: 126 | n_labels_found += 1 127 | else: 128 | this._prune = True # mark for pruning 129 | 130 | # append self to parent 131 | setattr(current_parent, current_direction, this) 132 | 133 | if current_direction == 'right': 134 | # prune 135 | current_parent.__prune() 136 | stack.pop() 137 | 138 | # found terminal node 139 | # # GO UP UNTIL CAN GO RIGHT 140 | while stack: 141 | ancestor_node, ancestor_direction = stack[-1] 142 | if ancestor_direction == 'right': 143 | ancestor_node.__prune() 144 | stack.pop() 145 | else: 146 | break 147 | 148 | if not stack: 149 | print(f'done\n{self}\n{root}') 150 | break 151 | 152 | # pop left node -> go right next 153 | stack.pop() 154 | 155 | else: 156 | this = ScoaryTree(left=current_node.left, right=current_node.right) 157 | stack.extend([(this, 'right'), (this, 'left')]) 158 | 159 | # append self to parent 160 | setattr(current_parent, current_direction, this) 161 | 162 | return root 163 | 164 | def copy_nonrecursive(self) -> ScoaryTree: 165 | return self.rename_nonrecursive(func=lambda label: label) 166 | 167 | def rename_nonrecursive(self, func: Callable): 168 | if self.is_leaf: 169 | return ScoaryTree(label=func(self.label)) 170 | 171 | root = 
ScoaryTree(left=self.left, right=self.right) 172 | 173 | stack = [(root, 'right'), (root, 'left')] 174 | 175 | while stack: 176 | current_parent, current_direction = stack[-1] 177 | current_node: ScoaryTree = getattr(current_parent, current_direction) 178 | 179 | if current_node.is_leaf: 180 | # current node is leaf 181 | this = ScoaryTree(label=func(current_node.label)) 182 | 183 | # append self to parent 184 | setattr(current_parent, current_direction, this) 185 | 186 | if current_direction == 'right': 187 | # found terminal node 188 | # # GO UP UNTIL CAN GO RIGHT 189 | while stack and stack[-1][1] == 'right': 190 | stack.pop() 191 | 192 | if not stack: 193 | print(f'done\n{self}\n{root}') 194 | break 195 | 196 | # pop left node -> go right next 197 | stack.pop() 198 | 199 | else: 200 | this = ScoaryTree(left=current_node.left, right=current_node.right) 201 | stack.extend([(this, 'right'), (this, 'left')]) 202 | 203 | # append self to parent 204 | setattr(current_parent, current_direction, this) 205 | 206 | return root 207 | 208 | def rename(self, func: Callable): 209 | """ 210 | Apply a function to each leaf label. 211 | 212 | Only used for debugging. This recursive function could cause RecursionError for big trees. 
213 | """ 214 | 215 | def convert(tree: ScoaryTree) -> ScoaryTree: 216 | """recursive function""" 217 | if tree.is_leaf: 218 | return ScoaryTree(label=func(tree.label)) 219 | else: 220 | return ScoaryTree(left=convert(tree.left), right=convert(tree.right)) 221 | 222 | return convert(self) 223 | 224 | @classmethod 225 | def from_newick(cls, newick: str) -> ScoaryTree: 226 | list_tree = parse_newick(newick) 227 | return cls.from_list(list_tree) 228 | 229 | @classmethod 230 | def from_list(cls, tree: []) -> ScoaryTree: 231 | def convert(list_tree): 232 | """recursive function""" 233 | if type(list_tree) is str: 234 | return cls(label=list_tree) 235 | else: 236 | return cls(left=convert(list_tree[0]), right=convert(list_tree[1])) 237 | 238 | return convert(tree) 239 | 240 | @cached_property 241 | def to_list(self) -> []: 242 | def to_list(tree: ScoaryTree) -> str | []: 243 | if tree.is_leaf: 244 | return tree.label 245 | else: 246 | return [to_list(tree.left), to_list(tree.right)] 247 | 248 | return to_list(self) 249 | 250 | @classmethod 251 | def from_presence_absence(cls, genes_df: pd.DataFrame) -> ScoaryTree: 252 | distance_matrix = pd.DataFrame(distance.squareform(distance.pdist(genes_df.T, 'hamming')), 253 | columns=genes_df.columns) 254 | tree_as_list = upgma(distance_matrix) 255 | return cls.from_list(tree_as_list) 256 | 257 | def __prune(self): 258 | if self.left._prune and self.right._prune: 259 | self._prune = True 260 | elif self.left._prune: 261 | # become right 262 | self.label = self.right.label 263 | self.is_leaf = self.right.is_leaf 264 | self.left = self.right.left 265 | self.right = self.right.right 266 | elif self.right._prune: 267 | # become left 268 | self.label = self.left.label 269 | self.is_leaf = self.left.is_leaf 270 | self.right = self.left.right 271 | self.left = self.left.left 272 | -------------------------------------------------------------------------------- /scoary/utils.py: 
-------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | import os 3 | import sys 4 | import json 5 | import logging 6 | from copy import deepcopy 7 | import warnings 8 | from functools import cache 9 | from typing import Type, Any 10 | from datetime import datetime 11 | import numpy as np 12 | import pandas as pd 13 | from importlib.metadata import version 14 | from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning 15 | 16 | warnings.simplefilter('ignore', category=NumbaDeprecationWarning) 17 | warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) 18 | 19 | ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) 20 | ALLOWED_CORRECTIONS = {'native', 'bonferroni', 'sidak', 'holm-sidak', 'holm', 'simes-hochberg', 'hommel', 'fdr_bh', 21 | 'fdr_by', 'fdr_tsbh', 'fdr_tsbky'} 22 | 23 | logger = logging.getLogger('scoary.utils') 24 | 25 | try: 26 | from ete3 import Tree as EteTree 27 | 28 | 29 | def print_tree(scoary_tree, label_to_gene: {str: bool}, label_to_trait: {str: bool}, show_label=True): 30 | if show_label: 31 | label_fn = lambda label: f'{int(label_to_gene[label])}{int(label_to_trait[label])}_{label}' 32 | else: 33 | label_fn = lambda label: f'{int(label_to_gene[label])}{int(label_to_trait[label])}' 34 | 35 | renamed_tree = scoary_tree.rename(label_fn) 36 | ete_tree = EteTree(renamed_tree.to_newick()) 37 | print(ete_tree) 38 | 39 | except ImportError as e: 40 | def print_tree(scoary_tree, label_to_gene: {str: bool}, label_to_trait: {str: bool}): 41 | raise ImportError('This function requires the ete3 library. 
Please install via "pip install ete3"') 42 | 43 | 44 | @cache 45 | def get_version() -> str: 46 | try: 47 | return version('scoary-2') 48 | except importlib.metadata.PackageNotFoundError: 49 | return 'development' 50 | 51 | 52 | class NotSplittableError(Exception): 53 | pass 54 | 55 | 56 | class NoTraitsLeftException(Exception): 57 | pass 58 | 59 | 60 | def decode_unicode(string: str) -> str: 61 | return string.encode('utf-8').decode('unicode-escape') 62 | 63 | 64 | def setup_outdir(outdir: str, input: dict) -> str: 65 | outdir = outdir.rstrip('/') 66 | assert not os.path.exists(outdir), f'ERROR: {outdir=} already exists!' 67 | os.makedirs(f'{outdir}/traits') 68 | os.makedirs(f'{outdir}/logs') 69 | os.makedirs(f'{outdir}/app') 70 | with open(f'{outdir}/logs/input.json', 'w') as f: 71 | json.dump(input, f, indent=4) 72 | return outdir 73 | 74 | 75 | def setup_logging(logger: logging.Logger, path: str = None, print_info: bool = True, reset: bool = False): 76 | """ 77 | Setup logging for Scoary 78 | 79 | :param logger: logging.logging.Logger 80 | :param path: if set, DEBUG and higher goes to log files 81 | :param print_info: if True, INFO and higher goes to stdout 82 | :param reset: if True: close and remove all file handlers. (Important for multiprocessing: removes locks!) 
83 | :return: 84 | """ 85 | if reset or os.environ.get('SCOARY_RESET_LOGGERS', 'FALSE').upper() == 'TRUE': 86 | while logger.handlers: 87 | handler = logger.handlers[0] 88 | handler.close() 89 | logger.removeHandler(handler) 90 | 91 | logger.setLevel(logging.DEBUG) 92 | 93 | if path is not None: 94 | # create logfile 95 | logfile = logging.FileHandler(path) 96 | logfile.setLevel(logging.DEBUG) 97 | logfile.setFormatter(logging.Formatter("%(asctime)s [%(name)s: %(levelname)s] %(message)s")) 98 | logger.addHandler(logfile) 99 | 100 | if print_info: 101 | # create streamhandler 102 | stdout = logging.StreamHandler() 103 | stdout.setLevel(getattr(logging, os.environ.get('SCOARY_LOGLEVEL_STDOUT', 'INFO').upper())) 104 | logger.addHandler(stdout) 105 | 106 | return logger 107 | 108 | 109 | def ignore_warnings(warning: Type[Warning]): 110 | """ 111 | Decorator to suppress warnings. 112 | 113 | Example: 114 | 115 | @ignore_warnings(warning=ConvergenceWarning) 116 | def some_function(): 117 | # any produced ConvergenceWarnings will be suppressed 118 | 119 | :param warning: class of warning to be suppressed 120 | """ 121 | 122 | def decorator(function): 123 | def wrapper(*args, **kwargs): 124 | with warnings.catch_warnings(): 125 | warnings.simplefilter('ignore', warning) 126 | return function(*args, **kwargs) 127 | 128 | return wrapper 129 | 130 | return decorator 131 | 132 | 133 | def parse_correction(correction_str: str, param_name: str) -> (str, float): 134 | if ':' in correction_str: 135 | method, cutoff = correction_str.split(':', 1) 136 | else: 137 | method, cutoff = correction_str, 'inf' 138 | 139 | assert method in ALLOWED_CORRECTIONS, f'{param_name}={correction_str} must be in {ALLOWED_CORRECTIONS}' 140 | 141 | try: 142 | cutoff = float(cutoff) 143 | except ValueError: 144 | raise AssertionError(f'Error in {correction_str=}: {cutoff=} could not be converted to float') 145 | 146 | return method, cutoff 147 | 148 | 149 | def is_int(string: str) -> bool: 150 | try: 
151 | int(string) 152 | return True 153 | except ValueError: 154 | return False 155 | 156 | 157 | def is_float(string: str) -> bool: 158 | try: 159 | float(string) 160 | return True 161 | except ValueError: 162 | return False 163 | 164 | 165 | def split_into_parts(list_: list, n_parts: int) -> [list]: 166 | quotient, reminder = divmod(len(list_), n_parts) 167 | return [ 168 | list_[i * quotient + min(i, reminder):(i + 1) * quotient + min(i + 1, reminder)] 169 | for i in range(n_parts) 170 | ] 171 | 172 | 173 | def fisher_id(a, b, c, d): 174 | """ 175 | Eight contingency tables always give the same pvalue: ['abcd', 'acbd', 'badc', 'bdac', 'cadb', 'cdab', 'dbca', 'dcba'] 176 | 177 | Compute and save only one version. 178 | """ 179 | return min(( 180 | (a, b, c, d), 181 | (a, c, b, d), 182 | (b, a, d, c), 183 | (b, d, a, c), 184 | (c, a, d, b), 185 | (c, d, a, b), 186 | (d, b, c, a), 187 | (d, c, b, a) 188 | )) 189 | 190 | 191 | def load_info_file( 192 | logger: logging.Logger, 193 | info_file: str, 194 | merge_col: str, 195 | expected_overlap_set: set = None, 196 | reference_file: str = None 197 | ) -> pd.DataFrame: 198 | """ 199 | Load an info_file into pd.DataFrame: 200 | - Separator: tab ('\t') 201 | - Must have this header: {merge_col}\t{colname1}\t{colname2}... 202 | 203 | :param logger: instance of logging.Logger 204 | :param info_file: path to file 205 | :param merge_col: name of first column 206 | :param expected_overlap_set: a set of strings, some of which must occur in the index of info_file 207 | :param reference_file: path to reference file, just used for error messages 208 | :return: pd.DataFrame with merge_col as index 209 | """ 210 | info_df = pd.read_csv(info_file, index_col=0, delimiter='\t') 211 | 212 | assert info_df.index.name == merge_col, \ 213 | f'The file {info_file} is improperly formatted: The first column must be named "{merge_col}". ' \ 214 | f'Current name: {info_df.index.name}. 
Remaining columns: {info_df.columns.tolist()}' 215 | 216 | if expected_overlap_set is not None: 217 | overlap_size = len(set.intersection(set(info_df.index), expected_overlap_set)) 218 | if overlap_size == 0: 219 | logger.warning(f'The {merge_col}s in {info_file} do not match any {merge_col}s in {reference_file}') 220 | logger.debug(f'Loaded descriptions for {overlap_size} {merge_col}s') 221 | 222 | logger.debug(f'Loaded {merge_col} descriptions. columns={info_df.columns.tolist()}') 223 | assert not info_df.index.has_duplicates, \ 224 | f'{info_file} contains duplicates: {info_df.index[info_df.index.duplicated()]}' 225 | return info_df 226 | 227 | 228 | class MockCounter: 229 | """ 230 | Imitate multiprocessing.Manager.Value / multiprocessing.managers.ValueProxy 231 | """ 232 | 233 | def __init__(self): 234 | self._value: int = 0 235 | 236 | @property 237 | def value(self): 238 | return self._value 239 | 240 | @value.setter 241 | def value(self, value): 242 | self._value = value 243 | 244 | 245 | class MockLock: 246 | """ 247 | Imitate multiprocessing.Manager.Lock / multiprocessing.managers.AcquirerProxy 248 | """ 249 | 250 | def __init__(self): 251 | self._value = 0 252 | 253 | def __enter__(self): 254 | return self 255 | 256 | def __exit__(self, *exc): 257 | return False 258 | 259 | 260 | class AbstractNamespace: 261 | @classmethod 262 | def create_namespace(cls, ns, properties: {str: Any}): 263 | for name in cls.__dict__['__annotations__'].keys(): 264 | setattr(ns, name, properties[name]) 265 | return ns 266 | 267 | 268 | def grasp_namespace(cls, ns): 269 | """ 270 | This will copy the elements of the multiprocessing namespace into the "private" memory of the current process 271 | 272 | :param ns: multiprocessing.managers.Namespace 273 | :return: MockNameSpace 274 | """ 275 | new_ns = cls() 276 | for name in cls.__dict__['__annotations__'].keys(): 277 | value = getattr(ns, name) 278 | if name in ['lock', 'counter']: 279 | setattr(new_ns, name, value) 280 | else: 
281 | setattr(new_ns, name, deepcopy(value)) 282 | return new_ns 283 | 284 | 285 | class AnalyzeTraitNamespace(AbstractNamespace): 286 | counter: MockCounter 287 | queue_size: int 288 | lock: MockLock 289 | outdir: str 290 | start_time: datetime 291 | genes_orig_df: pd.DataFrame 292 | genes_bool_df: pd.DataFrame 293 | gene_info_df: pd.DataFrame | None 294 | numeric_df: pd.DataFrame 295 | traits_df: pd.DataFrame 296 | trait_info_df: pd.DataFrame | None 297 | duplicates: pd.DataFrame 298 | tree: object #: ScoaryTree 299 | all_labels: set 300 | mt_f_method: str 301 | mt_f_cutoff: float 302 | trait_wise_correction: bool 303 | max_genes: int 304 | worst_cutoff: None | float 305 | n_permut: int 306 | random_state: int 307 | pairwise: bool 308 | multiple_testing_df: pd.DataFrame 309 | 310 | 311 | class BinarizeTraitNamespace(AbstractNamespace): 312 | counter: MockCounter 313 | lock: MockLock 314 | outdir: str 315 | start_time: datetime 316 | numeric_df: pd.DataFrame 317 | random_state: int 318 | method: str 319 | alternative: str 320 | covariance_type: str 321 | cutoff: float 322 | random_state: int 323 | -------------------------------------------------------------------------------- /benchmarking/runtime/data/100_traits.csv: -------------------------------------------------------------------------------- 1 | 
Name,lc:Compound_8069,lc:Compound_7747,lc:Compound_8286,lc:Compound_15820,vol:28.29B325,lc:Compound_15534,vol:28.34A521,vol:22.07B128,lc:Compound_7542,vol:29.32A619,lc:Compound_6322,vol:51.57B3119,lc:Compound_2501,vol:30.72B720,vol:23.12B149,vol:10.59A34,vol:43.43B1738,lc:Compound_7931,lc:Compound_10098,vol:50.04A1824,vol:39.67A1311,vol:31.09B751,lc:Compound_10065,lc:Compound_8550,lc:Compound_13565,vol:13.51A88,vol:33.59A924,lc:Compound_13311,vol:32.87B883,vol:28.63B419,lc:Compound_8016,lc:Compound_16708,lc:Compound_6217,lc:Compound_10163,lc:Compound_4586,lc:Compound_6238,lc:Compound_5186,vol:45.07B2035,lc:Compound_14592,lc:Compound_5252,lc:Compound_8405,vol:45.07B2053,vol:30.71A725,vol:45.07B2066,vol:40.16B1508,lc:Compound_12131,lc:Compound_11605,lc:Compound_12094,vol:41.79B1620,lc:Compound_7022,lc:Compound_2760,vol:23.55A329,lc:Compound_3387,lc:Compound_11587,lc:Compound_3197,vol:33.98A973,lc:Compound_2600,vol:26.27A428,lc:Compound_12011,vol:24.1A345,vol:31.72B789,vol:44.93B1925,lc:Compound_16370,vol:28.3B337,lc:Compound_12738,vol:49.37B2901,lc:Compound_6721,lc:Compound_6526,vol:47.94B2623,lc:Compound_4023,vol:50.74B2985,vol:47.94B2542,vol:47.94B2598,lc:Compound_12790,vol:34.12A980,lc:Compound_10040,lc:Compound_16135,lc:Compound_13119,lc:Compound_3793,vol:33.59A923,vol:28.29B330,lc:Compound_8627,lc:Compound_14527,lc:Compound_14036,lc:Compound_8062,lc:Compound_12717,lc:Compound_7448,lc:Compound_10182,lc:Compound_15808,lc:Compound_8001,lc:Compound_6351,lc:Compound_5738,vol:33.67B947,lc:Compound_15057,lc:Compound_15661,vol:28.49B400,lc:Compound_7986,lc:Compound_1737,lc:Compound_15484,lc:Compound_13098 2 | FAM14177-p1-1.1,-,0,-,0,0,1,0,-,-,-,0,0,1,1,0,0,0,1,-,0,-,0,0,0,0,-,-,0,-,0,-,0,0,-,-,0,-,-,0,0,-,0,0,-,0,-,0,-,0,0,1,0,1,1,0,0,1,1,1,1,1,-,-,0,0,1,0,0,-,0,0,-,1,0,0,0,-,0,0,0,0,0,0,-,-,0,0,0,0,0,-,0,-,0,0,0,-,-,-,- 3 | 
FAM14184-i1-1.1,1,0,-,0,1,1,-,-,1,1,-,-,1,1,0,0,-,1,1,0,-,0,-,0,0,-,0,0,1,0,1,0,0,1,1,0,-,-,0,0,1,-,0,-,1,1,0,-,0,0,1,0,1,1,0,0,-,1,1,0,1,-,-,0,0,0,0,0,-,-,0,-,0,0,-,0,1,0,-,0,0,0,0,-,-,0,0,0,0,0,-,-,0,0,0,0,-,-,-,- 4 | FAM14193-i1-1.1,1,0,0,0,0,1,-,-,0,1,-,0,1,1,0,0,0,1,0,0,-,0,1,0,0,-,-,0,-,0,1,-,0,1,0,0,-,-,1,0,-,1,0,-,0,0,0,-,0,0,1,0,1,-,0,0,-,1,-,1,-,-,-,0,0,-,0,0,0,0,0,0,0,-,0,0,1,0,-,0,0,0,0,1,0,0,0,-,0,0,-,0,1,0,0,0,-,-,-,- 5 | FAM14197-i1-1.1,-,0,-,0,0,0,0,0,-,0,1,1,0,1,1,0,1,0,-,0,-,0,0,0,0,-,-,0,0,0,1,0,1,1,0,1,1,-,0,1,0,-,0,-,0,0,0,0,-,-,1,0,1,1,0,1,0,1,1,1,0,-,1,0,0,0,1,0,1,-,0,1,1,-,0,0,-,0,0,0,0,0,0,0,-,0,0,0,0,-,1,0,0,0,0,0,-,1,-,- 6 | FAM14217-p1-1.1,-,0,0,0,0,0,-,-,0,1,0,-,-,1,0,1,1,1,0,-,-,1,0,1,0,-,-,0,1,0,-,0,0,0,0,0,-,1,0,0,0,1,0,1,0,-,0,0,1,0,1,1,1,-,1,0,1,1,1,0,-,1,-,0,0,0,0,0,1,0,1,1,1,0,-,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-,0,-,0,0,1,-,1,-,- 7 | FAM14221-p1-1.1,-,0,0,0,1,0,1,1,0,1,0,1,-,0,0,0,-,0,-,0,1,0,-,0,0,-,1,0,1,0,-,0,0,-,1,0,-,1,0,0,0,1,1,1,0,-,0,-,-,0,1,0,1,0,0,0,0,1,-,1,1,1,-,1,0,0,0,-,1,-,0,1,0,-,1,0,-,0,0,1,1,0,0,-,-,0,0,0,0,1,1,0,1,0,0,0,-,-,-,- 8 | FAM14222-p1-1.1,1,0,-,0,1,0,-,-,1,0,1,1,0,1,0,0,-,0,1,-,-,0,1,0,0,-,-,0,0,0,1,1,0,1,0,1,1,-,0,-,0,0,0,0,1,-,0,-,1,-,1,0,1,-,0,1,1,1,1,-,0,0,-,0,0,0,1,1,1,-,1,-,1,1,0,-,-,0,1,1,0,-,0,0,1,0,0,0,0,0,-,0,0,0,0,0,-,-,1,- 9 | FAM1414-i1-1.1,-,1,0,1,0,-,0,0,0,1,0,0,0,-,0,0,0,0,0,0,-,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,-,-,0,1,0,1,0,1,0,-,0,0,0,-,1,0,-,-,1,0,0,1,0,1,1,1,-,0,1,0,0,0,0,-,0,0,0,0,0,1,0,0,0,0,0,0,1,0,-,0,0,0,1,0,-,1,1,0,0,0,-,-,-,- 10 | FAM15061-i1-1.1,1,0,0,0,0,-,-,-,-,1,0,0,-,1,0,0,0,1,0,0,-,0,1,0,0,-,-,0,-,0,-,0,-,-,0,0,-,-,0,0,0,-,0,1,0,-,0,-,0,1,0,0,1,1,1,0,-,1,1,1,-,-,1,0,0,1,0,0,-,-,0,-,-,0,0,-,1,0,0,0,0,0,0,0,-,0,0,0,0,0,-,0,0,0,0,0,-,-,-,- 11 | FAM15078-i1-1.1,1,0,1,0,0,0,-,0,1,0,1,1,-,1,0,0,0,0,1,0,0,0,-,0,0,-,-,0,0,0,1,-,-,1,0,1,1,0,0,1,0,0,0,0,1,-,0,-,0,1,1,0,1,1,1,0,-,1,1,0,0,0,-,0,0,0,1,-,-,-,0,-,-,-,0,0,-,0,0,1,0,0,0,0,1,0,0,0,-,-,-,0,0,0,0,0,-,-,1,- 12 | 
FAM15113-i1-1.1,-,0,1,0,0,0,-,0,0,0,1,-,1,1,0,0,1,0,1,1,-,0,0,-,1,-,-,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,-,0,0,0,-,0,0,-,-,1,1,0,1,-,-,0,0,-,0,1,0,-,-,1,0,1,1,1,1,0,-,0,0,0,1,0,1,0,0,0,0,0,0,0,0,-,0,0,0,0,0,-,-,-,- 13 | FAM15170-i1-1.1,1,0,0,1,1,-,-,-,0,1,-,-,0,1,0,-,0,0,0,1,-,0,-,0,1,-,-,1,1,0,-,0,-,1,0,0,-,1,0,-,0,1,0,1,1,0,0,-,-,-,1,0,1,-,0,0,0,1,1,1,1,1,-,0,0,0,0,-,0,-,1,0,0,0,0,-,-,0,-,0,0,0,1,0,-,1,0,0,1,0,-,1,1,0,0,0,0,-,-,- 14 | FAM15190-i1-1.1,-,0,1,0,0,1,1,-,-,1,-,0,1,1,0,1,1,1,1,0,-,0,-,0,0,-,1,0,1,0,1,0,0,-,1,0,-,-,0,0,-,1,0,1,0,-,0,-,-,0,1,1,1,-,1,0,-,1,1,1,-,-,-,0,0,0,0,0,1,-,0,1,1,0,0,0,-,1,0,0,0,0,0,0,0,0,1,0,0,0,-,0,1,0,0,-,-,1,-,- 15 | FAM15192-i1-1.1,-,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,-,0,-,0,1,0,0,0,0,0,1,-,0,-,0,1,1,0,0,-,0,0,0,0,0,-,0,-,-,1,1,0,1,-,0,0,0,1,1,1,0,0,-,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,-,0,0,0,0,0,-,-,-,- 16 | FAM15300-i1-1.1,-,0,0,1,0,-,0,0,0,1,0,0,-,1,0,0,-,-,1,0,-,0,1,0,0,-,0,0,0,0,0,0,1,0,0,0,-,-,0,1,0,1,0,1,0,-,0,0,0,1,0,0,-,-,1,0,0,1,-,1,0,1,-,0,0,0,0,0,-,0,0,-,-,0,0,0,-,0,1,-,0,0,1,0,0,0,0,0,1,0,-,1,0,0,0,0,-,-,-,- 17 | FAM15333-i1-1.1,1,0,0,1,0,-,-,0,1,1,1,0,-,1,0,0,0,0,0,0,-,0,-,0,0,0,0,0,-,0,1,-,-,1,0,0,1,-,0,1,0,1,0,1,0,-,0,-,0,-,1,0,1,-,0,0,1,1,1,1,1,1,-,0,0,0,0,1,1,1,0,-,1,0,0,0,1,0,-,0,0,0,1,0,0,0,0,0,1,-,-,1,1,1,0,0,0,-,-,- 18 | FAM15346-i1-1.1,0,0,1,0,0,-,-,-,0,1,0,-,1,1,0,-,-,1,0,0,-,0,-,0,0,-,-,0,0,0,0,0,0,0,1,0,-,-,0,0,1,1,0,1,0,1,0,0,0,0,0,0,1,-,1,0,0,1,-,1,0,1,-,0,0,0,0,0,1,-,0,-,-,1,0,1,0,0,-,-,0,0,0,1,0,0,0,0,0,0,-,0,0,0,0,0,-,0,0,- 19 | FAM15347-i1-1.1,-,0,0,0,0,1,0,-,-,0,-,0,1,1,0,1,0,1,-,0,0,0,0,0,1,-,-,0,0,0,-,0,1,-,0,0,-,0,0,0,1,0,0,0,0,-,0,0,0,0,1,0,1,-,0,0,0,1,-,1,0,0,-,0,0,0,0,0,0,-,0,0,0,-,0,0,-,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,-,-,0,- 20 | FAM15381-i1-1.1,-,0,0,0,0,-,0,-,0,1,0,-,1,1,0,0,0,0,0,1,-,0,0,0,0,-,0,0,-,0,-,0,-,-,0,0,1,1,0,0,0,1,0,1,0,-,0,0,-,0,1,0,1,-,0,0,0,1,1,0,1,1,-,0,0,0,0,0,1,1,-,-,1,-,0,1,-,0,0,0,0,1,0,0,0,0,0,0,0,0,-,0,1,0,0,0,-,1,-,- 21 | 
FAM15407-i1-1.1,0,0,0,0,0,0,-,-,0,1,0,0,1,1,0,-,0,1,0,0,-,0,0,-,0,-,0,0,1,0,0,0,1,0,1,0,1,1,0,0,1,1,0,-,1,-,0,0,0,0,0,0,0,0,1,-,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,-,1,1,-,1,0,0,-,0,0,-,0,1,-,0,0,0,0,0,-,0,0,0,0,0,1,1,0,- 22 | FAM19015-i1-1.1,-,0,0,0,1,0,1,1,0,1,1,1,-,1,-,0,1,0,0,1,-,0,-,0,0,1,-,0,1,-,-,0,1,1,0,1,1,1,0,-,0,1,0,1,0,-,0,-,1,-,1,1,-,-,1,0,0,1,1,-,1,1,-,1,0,0,1,-,-,-,1,1,1,1,1,0,0,0,0,1,-,1,0,0,-,0,0,0,0,0,-,0,0,0,0,0,-,-,-,- 23 | FAM19016-i1-1.1,-,0,0,1,0,0,0,0,0,1,0,0,1,0,0,-,0,1,0,0,-,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,-,1,0,-,1,1,0,1,-,1,0,0,-,1,0,0,-,0,1,0,-,1,0,0,-,1,-,0,0,0,1,-,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,-,0,0,0,-,0,-,1,1,0,0,0,1,-,0,- 24 | FAM19020-i1-1.1,1,0,0,1,0,0,-,0,0,0,0,0,0,1,0,0,-,0,0,0,-,0,-,0,1,0,0,1,-,0,-,0,0,-,0,0,-,-,0,1,0,0,0,0,0,0,0,-,0,-,1,0,-,-,0,0,-,1,1,1,0,0,-,0,-,0,0,-,0,-,0,-,0,0,0,0,0,0,0,-,0,0,1,0,0,0,0,0,1,-,-,1,0,0,0,0,-,-,1,- 25 | FAM19022-i1-1.1,1,0,-,0,0,0,-,0,1,0,-,0,-,1,0,0,-,0,-,0,-,0,1,0,0,-,-,0,0,0,1,1,0,1,0,1,-,-,0,1,0,-,0,0,0,-,0,-,0,0,1,0,1,-,0,1,0,1,1,1,0,-,-,0,0,0,1,-,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-,-,1,- 26 | FAM19023-i1-1.1,-,0,-,0,0,0,-,0,0,1,-,0,-,1,0,0,0,0,0,0,-,0,-,0,0,-,0,0,0,0,-,0,0,-,0,1,1,1,0,0,0,1,0,1,1,0,0,-,0,-,1,0,-,-,0,1,0,1,1,0,-,1,-,0,0,0,1,-,1,-,-,-,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-,-,0,1,0,0,0,-,-,-,- 27 | FAM19024-p1-1.1,1,0,0,0,0,-,-,-,0,1,0,0,-,1,0,0,-,-,-,0,-,1,0,0,1,-,0,0,1,-,1,-,0,-,0,0,-,1,1,0,-,1,0,1,0,-,1,1,0,0,1,0,1,-,1,0,1,1,1,1,1,1,-,0,-,0,0,0,1,-,0,1,1,0,-,0,-,1,0,1,0,0,0,-,0,0,0,0,0,-,1,0,1,0,0,1,0,-,-,- 28 | FAM19025-p1-1.1,-,0,1,0,1,0,-,-,-,1,-,0,-,1,0,0,0,0,0,1,-,0,1,0,0,-,-,0,-,0,-,0,0,1,-,1,1,1,0,-,0,1,0,1,0,0,0,-,-,1,1,0,-,0,0,1,1,1,1,0,-,1,-,0,0,0,1,1,-,-,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,-,0,1,0,0,0,1,-,-,- 29 | FAM19030-i2-1.1,-,0,0,0,0,0,-,1,0,1,-,1,0,1,0,0,0,0,0,1,-,0,-,-,0,-,0,0,-,0,0,0,0,-,0,1,-,1,0,0,0,1,0,1,0,0,0,0,-,-,0,0,0,-,1,1,0,1,-,0,0,1,-,0,0,0,1,0,1,0,1,1,1,1,0,-,0,0,0,0,0,0,0,0,1,0,-,0,0,0,-,0,1,0,0,0,-,0,0,- 30 | 
FAM19031-i2-1.1,1,0,1,0,0,1,0,0,1,1,0,0,0,-,0,0,0,-,0,0,-,0,-,0,0,0,0,0,0,0,1,1,0,-,0,0,0,-,0,0,-,1,0,1,0,-,0,-,0,0,1,0,1,1,0,0,-,1,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,-,0,0,-,0,0,-,0,-,0,0,0,0,0,1,1 31 | FAM19034-i1-1.1,1,0,0,1,1,-,-,-,0,1,1,1,-,1,0,-,-,-,1,0,-,0,1,0,-,-,0,1,1,1,1,1,0,1,0,0,-,1,1,1,0,1,0,1,1,-,0,-,1,-,1,0,1,-,0,0,0,1,1,0,1,1,1,0,0,0,0,1,1,-,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,1,0,0,1,0,-,1,- 32 | FAM22019-i1-1.1,1,0,1,0,1,0,0,0,1,-,1,-,0,1,0,0,-,0,1,-,-,0,-,0,1,-,0,0,-,0,1,1,0,1,0,1,-,1,1,-,0,1,0,1,0,0,0,1,0,1,1,0,1,1,0,0,0,1,1,1,-,1,1,0,1,0,0,-,0,1,0,0,-,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,1 33 | FAM22020-i1-1.1,1,0,0,1,0,0,1,-,0,1,0,-,-,1,0,1,1,-,0,1,-,1,-,0,0,-,0,1,1,1,0,0,0,0,-,0,0,1,0,1,-,1,-,1,0,0,1,0,1,1,0,-,1,-,1,0,0,1,0,1,1,1,-,0,0,0,0,-,1,-,1,1,-,-,0,0,0,0,1,1,1,0,1,0,0,0,0,0,1,0,-,1,1,0,0,1,-,1,0,- 34 | FAM22021-p1-1.1,0,0,0,0,0,-,-,-,0,1,0,0,1,1,0,0,-,1,-,0,-,0,-,0,0,-,-,0,0,0,-,0,1,-,1,0,1,-,0,0,1,1,0,1,0,-,0,0,0,0,0,0,1,0,1,-,0,1,0,1,1,-,-,0,0,0,-,-,0,-,0,0,0,-,0,0,0,0,-,0,0,1,0,-,0,0,0,0,0,0,-,0,1,0,0,0,1,1,0,- 35 | FAM23848-i1-1.1,-,0,-,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,-,1,0,0,0,0,0,0,0,0,-,0,1,-,0,1,1,-,0,0,0,-,0,1,0,-,0,0,0,0,0,0,1,-,1,0,0,0,-,0,0,-,-,0,0,0,1,-,-,0,1,-,-,1,0,0,0,0,0,-,0,0,0,0,-,1,0,0,0,0,1,0,1,0,0,0,-,0,-,- 36 | FAM23852-i1-1.1,1,1,1,0,0,1,-,1,1,1,0,1,1,0,0,0,-,1,1,1,-,0,-,0,0,0,-,0,-,0,-,0,0,1,0,0,0,-,0,0,1,-,0,-,0,1,1,0,0,-,0,0,1,1,1,1,-,1,0,-,1,-,-,0,0,1,0,0,1,0,0,1,-,0,0,0,0,0,-,0,0,0,0,-,0,0,0,0,0,-,1,0,0,0,0,0,1,1,-,- 37 | FAM23853-i1-1.1,-,0,0,0,0,0,-,-,0,1,1,-,0,1,0,0,1,0,0,0,-,0,-,0,0,-,-,0,-,0,-,-,1,1,0,1,1,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,-,1,0,0,1,1,1,0,1,-,0,0,0,1,-,1,-,0,-,1,0,0,0,0,0,0,1,0,-,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,-,1,- 38 | FAM23855-i1-1.1,-,0,0,1,0,0,0,0,-,1,0,0,1,1,0,0,0,0,-,0,0,-,0,0,1,0,0,-,0,0,-,0,0,0,0,0,1,-,0,1,-,1,0,-,0,-,0,0,0,-,0,0,-,-,1,0,1,1,-,1,0,1,-,0,1,0,0,-,-,0,0,0,1,1,0,-,0,0,0,0,0,0,1,0,-,0,0,0,1,0,-,1,1,0,0,0,-,1,0,- 39 | 
FAM23864-i1-1.1,0,0,0,1,0,0,-,1,0,1,0,0,-,1,0,0,1,1,0,1,-,0,0,1,0,-,-,-,1,0,0,0,0,0,1,0,-,1,0,1,-,1,0,1,0,-,0,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,0,0,0,0,-,-,1,0,-,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,- 40 | FAM23867-i1-1.1,-,0,0,1,0,-,-,-,0,1,0,0,1,1,0,-,0,-,-,1,-,0,-,0,0,-,0,1,1,0,0,0,1,-,0,0,1,1,0,1,-,1,0,1,0,-,0,0,0,-,0,0,-,0,1,0,0,1,-,1,1,1,-,0,0,-,0,0,-,0,0,0,-,1,0,0,0,0,0,-,0,0,1,-,-,0,0,0,1,0,-,1,1,0,0,0,1,-,0,- 41 | FAM23868-i1-1.1,-,0,1,1,0,0,0,-,0,1,0,0,-,0,0,1,0,1,-,1,-,1,-,-,1,-,0,1,1,-,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,-,0,0,-,1,0,0,0,-,1,0,1,1,-,-,1,1,-,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,-,0,0,0,1,1,-,0,0,0,-,0,0,1,1,1,0,0,-,-,0,- 42 | FAM23869-i1-1.1,-,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,-,0,-,0,-,0,0,0,-,0,0,0,1,0,1,-,-,1,1,0,0,-,-,0,0,0,0,0,0,0,0,1,0,0,-,-,1,0,-,1,0,1,0,0,-,0,0,-,-,-,-,0,0,-,-,1,0,0,0,0,-,0,0,1,0,0,1,0,0,0,0,0,-,0,0,0,0,0,-,-,-,- 43 | FAM23870-i1-1.1,1,0,0,0,0,-,0,-,1,1,1,0,1,1,0,0,0,1,1,0,-,0,-,0,0,0,0,0,-,0,-,0,0,1,0,0,1,-,0,0,1,1,0,1,0,-,0,-,0,-,1,0,1,-,0,0,0,1,1,-,-,1,-,0,0,0,0,1,-,1,0,0,-,0,0,0,1,0,-,0,0,0,0,1,0,0,0,0,0,0,-,0,1,0,0,0,0,1,-,- 44 | FAM23877-p1-1.1,0,0,1,0,0,-,1,-,1,1,0,0,-,1,1,1,1,1,1,1,-,1,-,1,0,1,0,0,-,0,0,0,0,-,0,0,0,-,0,0,1,1,0,1,1,1,0,0,-,0,0,1,-,-,1,0,0,1,0,-,0,1,-,0,0,0,0,0,1,0,0,1,1,0,-,-,0,0,1,0,0,0,0,1,1,0,1,0,0,0,-,0,0,0,0,1,-,0,0,0 45 | FAM24252-i1-1.1,-,0,-,0,0,0,-,0,0,0,-,-,-,1,1,0,0,0,-,-,-,0,-,0,0,-,-,0,0,0,-,0,-,-,0,1,1,-,0,1,0,0,0,-,0,-,0,0,0,-,1,0,1,-,1,1,0,1,1,1,0,-,-,0,0,0,1,-,1,-,1,-,1,1,0,0,0,0,-,0,0,-,0,0,0,0,0,0,0,1,1,0,0,0,0,0,-,-,-,- 46 | -------------------------------------------------------------------------------- /tests/test_scoary.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from init_tests import * 4 | 5 | from scoary.scoary import * 6 | 7 | os.environ['MGWAS_LINK_ONLY'] = 'true' 8 | 9 | RESTRICT_TO = 'FAM14177-p1-1.1,FAM14184-i1-1.1,FAM14193-i1-1.1,FAM14197-i1-1.1,FAM14217-p1-1.1,FAM14221-p1-1.1,' \ 10 | 
'FAM14222-p1-1.1,FAM1414-i1-1.1,FAM15061-i1-1.1,FAM15078-i1-1.1,FAM15113-i1-1.1,FAM15170-i1-1.1,' \ 11 | 'FAM15190-i1-1.1,FAM15192-i1-1.1,FAM15300-i1-1.1,FAM15333-i1-1.1,FAM15346-i1-1.1,FAM15347-i1-1.1,' \ 12 | 'FAM15381-i1-1.1,FAM15407-i1-1.1,FAM19015-i1-1.1,FAM19016-i1-1.1,FAM19020-i1-1.1,FAM19022-i1-1.1,' \ 13 | 'FAM19023-i1-1.1,FAM19024-p1-1.1,FAM19025-p1-1.1,FAM19030-i2-1.1,FAM19031-i2-1.1,FAM19034-i1-1.1,' \ 14 | 'FAM22019-i1-1.1,FAM22020-i1-1.1,FAM22021-p1-1.1,FAM23848-i1-1.1,FAM23852-i1-1.1,FAM23853-i1-1.1,' \ 15 | 'FAM23855-i1-1.1,FAM23864-i1-1.1,FAM23867-i1-1.1,FAM23868-i1-1.1,FAM23869-i1-1.1,FAM23870-i1-1.1,' \ 16 | 'FAM23877-p1-1.1,FAM24252-i1-1.1' 17 | 18 | 19 | class TestScoary(TestCase): 20 | def setUp(self) -> None: 21 | self.tempdir = get_tempdir_path() 22 | if os.path.isdir(self.tempdir): 23 | shutil.rmtree(self.tempdir) 24 | 25 | def test_scoary_single_threaded(self): 26 | scoary( 27 | trait_wise_correction=False, 28 | genes='../data/tetracycline/Gene_presence_absence.csv', 29 | traits='../data/tetracycline/Tetracycline_resistance.csv', 30 | n_permut=1000, 31 | multiple_testing='fdr_bh:0.5', 32 | n_cpus=1, 33 | outdir=self.tempdir 34 | ) 35 | 36 | def test_scoary_multi_threaded(self): 37 | scoary( 38 | trait_wise_correction=True, 39 | genes='../data/tetracycline/Gene_presence_absence.csv', 40 | traits='../data/tetracycline/Tetracycline_resistance.csv', 41 | n_permut=200, 42 | n_cpus=4, 43 | outdir=self.tempdir, 44 | multiple_testing='native:0.05' 45 | ) 46 | 47 | def test_scoary_gene_info(self): 48 | scoary( 49 | genes='../data/tetracycline/Gene_presence_absence.csv', 50 | gene_info='../data/tetracycline/gene-info.tsv', 51 | traits='../data/tetracycline/Tetracycline_resistance.csv', 52 | n_permut=10000, 53 | n_cpus=1, 54 | outdir=self.tempdir 55 | ) 56 | 57 | def test_scoary_long_binary(self): 58 | scoary( 59 | trait_wise_correction=True, 60 | newicktree='../data/new_ds/SpeciesTree_rooted.txt', 61 | multiple_testing='fdr_bh:0.6', 62 | 
linkage_method='average', 63 | genes='../data/new_ds/N0.tsv', 64 | gene_data_type='gene-list:\t', 65 | traits='../data/new_ds/LC-binary.tsv', 66 | trait_data_type='binary:\t', 67 | n_permut=200, 68 | # ignore='Starter-only-5A,FAMIX,Starter-only-10,Starter-only-7,mixture', 69 | restrict_to=RESTRICT_TO, 70 | random_state=42, 71 | n_cpus=7, 72 | outdir=self.tempdir, 73 | # limit_traits=(0, 20), 74 | limit_traits=(320, 340), 75 | max_genes=100 76 | ) 77 | 78 | def test_scoary_long_numeric(self): 79 | scoary( 80 | multiple_testing='fdr_bh:0.3', 81 | genes='../data/new_ds/N0.tsv', 82 | gene_info='../data/new_ds/N0_best_names.tsv', 83 | gene_data_type='gene-list:\t', 84 | traits='../data/new_ds/LC.tsv', 85 | trait_data_type='gaussian:skip:\t:tied', 86 | trait_info='../data/new_ds/LC-meta.tsv', 87 | isolate_info='../data/new_ds/isolate-meta.tsv', 88 | n_permut=200, 89 | # ignore='Starter-only-5A,FAMIX,Starter-only-10,Starter-only-7,mixture', 90 | restrict_to='FAM14177-p1-1.1,FAM14184-i1-1.1,FAM14193-i1-1.1,FAM14197-i1-1.1,FAM14217-p1-1.1,FAM14221-p1-1.1,FAM14222-p1-1.1,FAM1414-i1-1.1,FAM15061-i1-1.1,FAM15078-i1-1.1,FAM15113-i1-1.1,FAM15170-i1-1.1,FAM15190-i1-1.1,FAM15192-i1-1.1,FAM15300-i1-1.1,FAM15333-i1-1.1,FAM15346-i1-1.1,FAM15347-i1-1.1,FAM15381-i1-1.1,FAM15407-i1-1.1,FAM19015-i1-1.1,FAM19016-i1-1.1,FAM19020-i1-1.1,FAM19022-i1-1.1,FAM19023-i1-1.1,FAM19024-p1-1.1,FAM19025-p1-1.1,FAM19030-i2-1.1,FAM19031-i2-1.1,FAM19034-i1-1.1,FAM22019-i1-1.1,FAM22020-i1-1.1,FAM22021-p1-1.1,FAM23848-i1-1.1,FAM23852-i1-1.1,FAM23853-i1-1.1,FAM23855-i1-1.1,FAM23864-i1-1.1,FAM23867-i1-1.1,FAM23868-i1-1.1,FAM23869-i1-1.1,FAM23870-i1-1.1,FAM23877-p1-1.1,FAM24252-i1-1.1', 91 | random_state=42, 92 | n_cpus=7, 93 | outdir=self.tempdir, 94 | limit_traits=(0, 200), 95 | pairwise=True 96 | ) 97 | 98 | def test_scoary_gauss_kmeans(self): 99 | scoary( 100 | genes='../data/new_ds/N0.tsv', 101 | gene_info='../data/new_ds/N0_best_names.tsv', 102 | gene_data_type='gene-list:\t', 103 | 
traits='../data/new_ds/LC.tsv', 104 | trait_data_type=f'gaussian:kmeans:\t', 105 | trait_info='../data/new_ds/LC-meta.tsv', 106 | isolate_info='../data/new_ds/isolate-meta.tsv', 107 | n_permut=200, 108 | restrict_to=RESTRICT_TO, 109 | random_state=42, 110 | n_cpus=7, 111 | outdir=self.tempdir, 112 | # limit_traits=(0, 100), 113 | # pairwise=False 114 | ) 115 | 116 | def test_scoary_full(self): 117 | scoary( 118 | multiple_testing='bonferroni:0.1', 119 | genes='../data/full_ds/N0.tsv', 120 | gene_info='../data/full_ds/N0_best_names.tsv', 121 | gene_data_type='gene-list:\t', 122 | traits='../data/full_ds/traits.tsv', 123 | trait_data_type=f'gaussian:skip:\t:tied', # {'tied', 'full', 'diag', 'spherical'} 124 | trait_info='../data/full_ds/trait_info.tsv', 125 | isolate_info='../data/full_ds/isolate_info.tsv', 126 | n_permut=600, 127 | random_state=42, 128 | n_cpus=8, 129 | n_cpus_binarization=1, 130 | restrict_to=RESTRICT_TO, 131 | max_genes=50, 132 | # limit_traits=(12377, 12378), 133 | limit_traits=(3750, 3760), 134 | trait_wise_correction=True, 135 | # limit_traits=(2330, 2340), 136 | worst_cutoff=0.1, 137 | outdir=self.tempdir, 138 | ) 139 | 140 | def test_scoary_marco(self): 141 | scoary( 142 | genes='../data/marco/Orthogroups.tsv', 143 | gene_data_type='gene-list:\t', 144 | traits='../data/marco/traits.tsv', 145 | trait_data_type='binary: ', # {'tied', 'full', 'diag', 'spherical'} 146 | n_permut=1000, 147 | random_state=42, 148 | n_cpus=1, 149 | outdir=self.tempdir, 150 | multiple_testing='native:0.05', 151 | ) 152 | 153 | def test_scoary_jacordova(self): 154 | scoary( 155 | genes='../data/jacordova/GeneCount_Scoary_Ecoli.txt', 156 | gene_data_type='gene-count:\t', 157 | traits='../data/jacordova/Ecoli_traits.txt', 158 | trait_data_type='gaussian:kmeans:\t', # {'tied', 'full', 'diag', 'spherical'} 159 | n_permut=1000, 160 | random_state=42, 161 | n_cpus=1, 162 | outdir=self.tempdir, 163 | multiple_testing='native:0.05', 164 | ) 165 | 166 | def 
def test_recursion_depth(self):
    """
    Run Scoary on a synthetic dataset with 13000 strains and a maximally
    unbalanced (caterpillar) Newick tree, i.e. a tree that is as deep as
    it can possibly be.

    The input files are generated on the fly below ``../data/huge_ds``.
    """
    import os  # local import: only this test needs to create directories

    data_dir = '../data/huge_ds'
    # The directory is not part of the repository, so create it on demand;
    # otherwise DataFrame.to_csv() fails on a fresh checkout.
    os.makedirs(data_dir, exist_ok=True)

    strains = [f'strain_{i}' for i in range(13000)]
    genes = [f'gene_{i}' for i in range(100)]
    traits = [f'trait_{i}' for i in range(4)]
    genes_df = pd.DataFrame(
        np.random.randint(
            low=0, high=2, size=(len(genes), len(strains))
        ), index=genes, columns=strains
    )
    traits_df = pd.DataFrame(
        np.random.randint(
            low=0, high=2, size=(len(strains), len(traits))
        ), index=strains, columns=traits
    )
    genes_df.to_csv(f'{data_dir}/genes.tsv', sep='\t')
    traits_df.to_csv(f'{data_dir}/traits.tsv', sep='\t')

    # Calculating tree is very slow, but it works.
    # Write a caterpillar tree by hand instead: (((s0,s1),s2),s3);
    with open(f'{data_dir}/tree.nwk', 'w') as f:
        f.write('(' * (len(strains) - 1))
        f.write(strains[0])
        f.write(',')
        f.write('),'.join(strains[1:]))
        f.write(');')

    scoary(
        genes=f'{data_dir}/genes.tsv',
        traits=f'{data_dir}/traits.tsv',
        trait_data_type='binary:\t',
        gene_data_type='gene-count:\t',
        newicktree=f'{data_dir}/tree.nwk',
        n_permut=1000,
        n_cpus=4,
        outdir=self.tempdir
    )
def mutate_genomes(genomes: pd.DataFrame, mut_change_series: np.array):
    """
    Return a mutated copy of ``genomes``; the input is not modified.

    :param genomes: boolean DataFrame (rows: genes, columns: genomes)
    :param mut_change_series: per-gene flip probability, broadcastable against ``genomes``
        (``create_mut_chance_series`` produces an ``(n_genes, 1)`` array)
    :return: new DataFrame with the same index/columns in which every cell whose random
        draw is <= its mutation chance has been flipped
    """
    random_values = np.random.rand(*genomes.shape)
    mutated = random_values <= mut_change_series
    # genomes and mutated have the same shape. Wherever mutated is True, flip the bit.
    return pd.DataFrame(
        np.logical_xor(genomes.values, mutated),
        index=genomes.index,
        columns=genomes.columns
    )


def branch_genomes(genomes: pd.DataFrame, mut_change_series: np.array, branch_probability: float):
    """
    Let each genome spawn a mutated copy with probability ``branch_probability``.

    :param genomes: boolean DataFrame (rows: genes, columns: genomes)
    :param mut_change_series: per-gene flip probability, passed on to ``mutate_genomes``
    :param branch_probability: chance of each existing genome to branch
    :return: ``genomes`` itself if nothing branched, otherwise a new DataFrame with the
        freshly created columns ``genome_<i>`` appended on the right
    """
    random_values = np.random.rand(len(genomes.columns))
    branch = random_values < branch_probability
    if not branch.any():  # no new genomes
        return genomes

    new_genomes = genomes.loc[:, branch].copy()
    first_new_id = len(genomes.columns)
    new_genomes.columns = [
        f'genome_{i}' for i in range(first_new_id, first_new_id + len(new_genomes.columns))
    ]
    # mutate_genomes returns a NEW DataFrame; the result has to be kept,
    # otherwise the branched genomes stay exact copies of their parents.
    new_genomes = mutate_genomes(new_genomes, mut_change_series=mut_change_series)
    return pd.concat([genomes, new_genomes], axis=1)
def _simulate(n_replicates, n, e, r):
    """
    Create one simulated dataset (genomes + phenotype) unless it already exists.

    :param n_replicates: total number of replicates (only used for the progress message)
    :param n: number of genomes to simulate
    :param e: effect size of the causal gene
    :param r: replicate number
    """
    sim_dir = f'simulations/{n=}_{e=}_{r=}'
    if os.path.isdir(sim_dir):
        # this replicate has been generated by a previous run
        return

    print(f'Running replicate {r} of {n_replicates} for {n} genomes and {e} effect size')
    # dirty hack: each replicate gets a predictable seed
    # NOTE(review): the seed is not unique per (n, e, r): int(e * 2) is 1 for both
    # e=0.5 and e=0.75, and e.g. (e=0.5, r=1) collides with (e=1.0, r=0).
    np.random.seed(n + int(e * 2) + r)

    genomes = create_genomes(n)
    phenotype = calculate_phenotype(genomes, e)

    os.makedirs(sim_dir, exist_ok=True)
    write_files(
        genomes, f'{sim_dir}/genomes.tsv',
        phenotype, f'{sim_dir}/phenotype.tsv'
    )
121 | 122 | os.makedirs('simulations', exist_ok=True) 123 | 124 | from multiprocessing import Pool 125 | with Pool() as pool: 126 | pool.starmap( 127 | _simulate, 128 | [(n_replicates, n, e, r) for n in n_genomes for e in effect_size for r in range(n_replicates)] 129 | ) 130 | 131 | 132 | def _scoary(msg: str, simulation: str): 133 | os.environ['SCOARY_RESET_LOGGERS'] = 'TRUE' 134 | os.environ['SCOARY_LOGLEVEL_STDOUT'] = 'WARNING' 135 | os.environ['SCOARY_PRINT_CITATION'] = 'FALSE' 136 | os.environ['SCOARY_PRINT_PROGRESS'] = 'FALSE' 137 | 138 | from scoary import scoary 139 | 140 | print(f'{msg}: Analyzing {simulation}') 141 | 142 | genes = f'simulations/{simulation}/genomes.tsv' 143 | traits = f'simulations/{simulation}/phenotype.tsv' 144 | outdir = f'simulations/{simulation}/scoary' 145 | 146 | if os.path.isdir(outdir): 147 | return 148 | # import shutil 149 | # shutil.rmtree(outdir) 150 | 151 | for file in [genes, traits]: 152 | assert os.path.exists(file), f'{file} does not exist' 153 | 154 | scoary( 155 | genes, 156 | traits, 157 | outdir, 158 | trait_data_type='gaussian:kmeans:\t', 159 | gene_data_type='gene-count:\t', 160 | multiple_testing='native:0.05', 161 | n_permut=1000, 162 | n_cpus=1, 163 | random_state=42, 164 | ) 165 | 166 | assert os.path.isdir(outdir), f'{outdir} does not exist' 167 | 168 | if not os.listdir(f'{outdir}/traits'): 169 | print(f'{simulation=}: No traits found') 170 | 171 | 172 | def analyze_scoary_results(): 173 | datapoints = [] 174 | for simulation in os.listdir('simulations'): 175 | datapoint = {key: float(value) if '.' in value else int(value) 176 | for key, value in (pair.split('=') for pair in simulation.split('_'))} 177 | try: 178 | df = pd.read_csv(f'simulations/{simulation}/scoary/traits/phenotype/result.tsv', sep='\t', index_col=0) 179 | assert 'causal_00000' in df.index, f'{simulation=}: causal_00000 not in index. 
def plot_all(df: pd.DataFrame, effect_sizes: [float] = [0.5, 1., 1.5, 2, 3.]):
    """
    Plot, for each effect size, the rank of the causal gene against the number of
    genomes and save the figure to ``out/effect_sizes_horizontal.svg``.

    The figure has one column per effect size: the top row sketches the two unit-variance
    normal distributions separated by the effect size, the bottom row shows the line plot.

    :param df: results table with the columns 'Number of genomes', 'Effect size' and
        'Rank of causal gene' (as produced by analyze_scoary_results)
    :param effect_sizes: effect sizes to plot, one figure column each (the default list
        is never modified)
    """
    # NOTE(review): 'backend_interagg' is presumably the backend shipped with PyCharm;
    # outside of it this call may fail - confirm before running elsewhere.
    mpl.use('module://backend_interagg')

    # fill missing values (causal gene not reported) with max_rank + 100
    max_rank = df['Rank of causal gene'].max()
    df = df.fillna(max_rank + 100)

    n_cols = len(effect_sizes)  # one column per effect size instead of a hard-coded 5
    fig = plt.figure(figsize=(15, 5))
    axs = fig.subplots(2, n_cols, height_ratios=[1, 2], sharey='row')
    for ax in axs.flat:
        ax.label_outer()

    def add_normal(ax, mean, sd, x, line_color='black', fill_color='red', alpha: float = 0.5):
        # draw the probability density of N(mean, sd) as a filled curve
        y = norm.pdf(x, mean, sd)
        ax.plot(x, y, color=line_color)
        ax.fill_between(x, y, color=fill_color, alpha=alpha)

    for i, effect_size in enumerate(effect_sizes):
        effect_size_str = str(effect_size).removesuffix('.0')

        ax_effect_size = axs.flat[i]  # top row
        ax_lineplot = axs.flat[i + n_cols]  # bottom row

        # plot effect size
        _center = effect_size / 2
        x = np.arange(_center - 5, _center + 5, 0.01)
        ax_effect_size.grid(False)
        ax_effect_size.set_xticks([])
        ax_effect_size.set_yticks([])
        add_normal(ax_effect_size, 0, 1, x, fill_color='#a6cee3')
        add_normal(ax_effect_size, effect_size, 1, x, fill_color='#b2df8a')
        # add a dotted line from [0, 0.42] to [effect_size, 0.42]
        ax_effect_size.plot([0, effect_size], [0.42, 0.42], color='black', linestyle='dotted')
        # label the dotted line; raw string so that the backslashes reach mathtext
        # unchanged without relying on invalid escape sequences
        ax_effect_size.text(
            effect_size / 2, 0.445, rf'$ d \equal {effect_size_str} \sigma $',
            horizontalalignment='center', verticalalignment='center'
        )
        ax_effect_size.set_ylim(0, 0.5)

        # set title
        ax_effect_size.set_title(f'Effect size: {effect_size_str}')

        # plot lineplot
        sns.lineplot(
            x='Number of genomes', y='Rank of causal gene', hue='Effect size',
            palette=sns.color_palette(['black'], 1),
            data=df[df['Effect size'] == effect_size],
            ax=ax_lineplot,
            legend=False
        )
        ax_lineplot.set_ylim(1, 250)
        ax_lineplot.set_xlim(25, 200)
        # make y axis logarithmic
        ax_lineplot.set_yscale('symlog')
        ax_lineplot.set_yticks([1, 2, 5, 10, 20, 50, 100])
        ax_lineplot.get_yaxis().set_label_position("left")
        ax_lineplot.get_yaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
        ax_lineplot.set_xticks(df['Number of genomes'].unique())

    plt.tight_layout()
    plt.savefig('out/effect_sizes_horizontal.svg')
def pick(
        tree: [],
        label_to_trait_a: {str: bool},
        trait_b_df: pd.DataFrame,
        calc_pvals: bool = True
) -> (np.array,):
    """
    Walk the tree recursively and perform pair picking for every row of trait_b_df.

    :param tree: Tree in (nested) list form; leaves are label strings
    :param label_to_trait_a: maps each label of the tree to whether it has trait a
    :param trait_b_df: DataFrame (dtype:bool); columns: labels of the tree; rows: whether trait b is present
    :param calc_pvals: If False, the binomial test is skipped and only three arrays are returned
    :return: (max_contr, max_suppo, max_oppos, best, worst) if calc_pvals else (max_contr, max_suppo, max_oppos)
    """

    assert not trait_b_df.isna().values.any()

    def _leaf(label):
        # build the start values of a terminal node
        return init_leaf(
            trait_a=label_to_trait_a[label],
            trait_b_list=trait_b_df[label].to_numpy(dtype='bool')
        )

    def _pick(left_label, right_label):
        left_is_leaf = type(left_label) is str
        right_is_leaf = type(right_label) is str

        # inner nodes first: follow the tree until a terminal node is reached
        if not left_is_leaf:
            left = _pick(left_label[0], left_label[1])
        if not right_is_leaf:
            right = _pick(right_label[0], right_label[1])

        # leaves are only loaded right before they are combined (saves RAM)
        if left_is_leaf:
            left = _leaf(left_label)
        if right_is_leaf:
            right = _leaf(right_label)

        return combine_branches(left, right)

    values = _pick(tree[0], tree[1])

    # axis 1 of values: 0 = contrasting, 1 = supporting, 2 = opposing
    max_contr, max_suppo, max_oppos = (values[:, kind, :].max(axis=1) for kind in range(3))

    if calc_pvals:
        best, worst = apply_binomtest(max_contr, max_suppo, max_oppos)
        return max_contr, max_suppo, max_oppos, best, worst

    return max_contr, max_suppo, max_oppos
# selecting: values[<trait>, <3 TYPES OF PAIRINGS>, <5 COMBINATIONS>]
# selecting: values[<trait>, <0: max; 1: supporting; 2: opposing>, <0: 11; 1: 10; 2: 01; 3: 00; 4: nf>]

# values[n, 0, :] -> all max contrasting pairs for trait n
# values[n, 1, :] -> all max supporting pairs for trait n
# values[n, 2, :] -> all max opposing pairs for trait n

# values[n, 0, 0] -> max contrasting pairs for trait n if condition '11' is added
# values[n, 0, 1] -> max contrasting pairs for trait n if condition '10' is added
# values[n, 0, 2] -> max contrasting pairs for trait n if condition '01' is added
# values[n, 0, 3] -> max contrasting pairs for trait n if condition '00' is added
# values[n, 0, 4] -> max contrasting pairs for trait n if condition 'nf' is added
@njit('int64, int64[::3, ::5], int64[::3, ::5]',
      cache=True, nogil=True, boundscheck=False, parallel=False)
def calculate_max_given_condition(condition: int, left: np.array, right: np.array):  # parallel kills performance
    """
    Combine the per-trait tables of two branches while one of them carries `condition`.

    :param condition: column index into the last axis of left/right
        (combine_branches calls this with 0..3, i.e. 11, 10, 01, 00)
    :param left: (3, 5) table of the left branch; row 0: contrasting, row 1: supporting,
        row 2: opposing; -1 in row 0 marks a combination that is not available
    :param right: (3, 5) table of the right branch, same layout as left
    :return: (max_contr, max_suppo, max_oppos): the largest row-0 sum over all candidates
        and, among the candidates reaching that maximum, the largest row-1 and row-2 sums
    """
    # 9 candidate columns: 5 for "left has condition" + 4 for "right has condition"
    values = np.full(shape=(3, 9), fill_value=-1, dtype='int')

    if left[0][condition] > -1:
        # compare condition with all conditions
        # NOTE(review): right[0][i] is not checked against -1 here; presumably such sums
        # can never reach the maximum - confirm before changing anything.
        for i in range(5):
            values[0][i] = left[0][condition] + right[0][i]
            values[1][i] = left[1][condition] + right[1][i]
            values[2][i] = left[2][condition] + right[2][i]

    if right[0][condition] > -1:
        col_id = 5  # continue behind the five columns reserved for the block above
        # compare all conditions with condition
        for i in range(5):
            if i == condition:  # this comparison has already been made above
                continue

            values[0][col_id] = left[0][i] + right[0][condition]
            values[1][col_id] = left[1][i] + right[1][condition]
            values[2][col_id] = left[2][i] + right[2][condition]

            col_id += 1

    max_contr = values[0].max()

    # best supporting count among the candidates with the maximal number of contrasting pairs
    max_suppo = -1
    for i in range(9):
        if values[0][i] == max_contr and values[1][i] > max_suppo:
            max_suppo = values[1][i]

    # best opposing count among the candidates with the maximal number of contrasting pairs
    max_oppos = -1
    for i in range(9):
        if values[0][i] == max_contr and values[2][i] > max_oppos:
            max_oppos = values[2][i]

    return max_contr, max_suppo, max_oppos
::5], int64[:, ::3, ::5]', 289 | cache=True, nogil=True, boundscheck=False, parallel=False) 290 | def combine_branches(left: np.array, right: np.array): 291 | assert left.shape == right.shape 292 | n_traits = left.shape[0] 293 | 294 | values = np.full(shape=left.shape, fill_value=-1, dtype='int') 295 | 296 | # selecting:values[, <0: max; 1: supporting; 2: opposing>, <0: 11; 1: 10; 2: 01; 3: 00; 4: nf>] 297 | for trait_id in range(n_traits): # prange kills performance 298 | for cond in range(4): # prange kills performance 299 | # {"11": 0, "10": 1, "01": 2, "00": 3, "nf": 4} 300 | max_contr, max_suppo, max_oppos = calculate_max_given_condition( 301 | cond, 302 | left[trait_id, :, :], 303 | right[trait_id, :, :] 304 | ) 305 | values[trait_id, 0, cond] = max_contr 306 | values[trait_id, 1, cond] = max_suppo 307 | values[trait_id, 2, cond] = max_oppos 308 | max_contr, max_suppo, max_oppos = calculate_max_nofree( 309 | left[trait_id, :, :], 310 | right[trait_id, :, :] 311 | ) 312 | values[trait_id, 0, 4] = max_contr 313 | values[trait_id, 1, 4] = max_suppo 314 | values[trait_id, 2, 4] = max_oppos 315 | 316 | return values 317 | -------------------------------------------------------------------------------- /tests/test_picking.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable 2 | 3 | from init_tests import * 4 | 5 | from scoary.utils import print_tree 6 | from scoary.scoary import * 7 | from scoary.analyze_trait import init_result_df, pair_picking 8 | from scoary.ScoaryTree import ScoaryTree 9 | from scoary.picking import pick, pick_nonrecursive, pick_single 10 | 11 | from scoary.scoary_1_picking import * 12 | 13 | from timeit import default_timer as timer 14 | 15 | boolify = lambda t1, t2: f"{'A' if t1 else 'a'}{'B' if t2 else 'b'}" 16 | 17 | dummy_tree = [['isolate1', 'isolate2'], ['isolate3', 'isolate4']] 18 | 19 | dummy_trait_a = { 20 | 'isolate1': True, 21 | 'isolate2': False, 22 | 'isolate3': 
def time_fn(fn: Callable, args=None, kwargs=None, n_times: int = 1) -> (float, Any):
    """
    Call ``fn(*args, **kwargs)`` n_times and measure the wall-clock time of each call.

    :param fn: the callable to benchmark
    :param args: positional arguments for fn (default: none)
    :param kwargs: keyword arguments for fn (default: none)
    :param n_times: number of repetitions
    :return: (mean duration in seconds, return value of the last call)
    """
    call_args = [] if args is None else args
    call_kwargs = {} if kwargs is None else kwargs

    durations = []
    for _ in range(n_times):
        t0 = timer()
        res = fn(*call_args, **call_kwargs)
        durations.append(timer() - t0)  # Time in seconds, e.g. 5.38091952400282

    return np.mean(durations), res
87 | 'isolate3': True, 88 | 'isolate4': False, 89 | 'isolate5': True, 90 | 'isolate6': False, 91 | } 92 | 93 | print_tree( 94 | ScoaryTree.from_list(tree), 95 | label_to_trait_a, label_to_trait_b 96 | ) 97 | result = pick_single(tree, label_to_trait_a, label_to_trait_b, calc_pvals=True) 98 | print(result) 99 | 100 | def test_simple(self): 101 | mc_1, ms_1, mo_1 = scoary_1_pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df) 102 | mc_2, ms_2, mo_2 = pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df, 103 | calc_pvals=False) 104 | 105 | self.assertTrue(all(np.equal(mc_1, mc_2)), msg='contrasting') 106 | self.assertTrue(all(np.equal(ms_1, ms_2)), msg='supporting') 107 | self.assertTrue(all(np.equal(mo_1, mo_2)), msg='opposing') 108 | 109 | def test_benchmark_tetracycline(self, run_scoary_1=True): 110 | # HP Spectre x360 15-df0709nz (i7 8Gen 8565U) 111 | # Scoary1 took 23.241052357999614 sec 112 | # Scoary2 took 0.49214521629996855 sec 113 | # Scoary1 vs Scoary2: 47.22397290118921x improvement 114 | 115 | # HP Spectre x360 14-ef2759nz (i7 13Gen 1355U) 116 | # Scoary1 took 9.602997977599989 sec 117 | # Scoary2 took 0.31401244160000485 sec 118 | # Scoary1 vs Scoary2: 30.58158437503083x improvement 119 | 120 | tetr_tree = get_json('../data/tetracycline/expected_result.json')['as_list'] 121 | _, tetr_genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', 122 | ignore=roary_ignore) 123 | _, tetr_traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,') 124 | 125 | tetr_label_to_gene = tetr_traits_df['Tetracycline_resistance'].to_dict() 126 | 127 | # jit compile 128 | pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df, calc_pvals=False) 129 | 130 | if run_scoary_1: 131 | print('Scoary1') 132 | time_1, res = time_fn( 133 | scoary_1_pick, 134 | kwargs=dict(tree=tetr_tree, 
label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df), 135 | n_times=5 136 | ) 137 | mc_1, ms_1, mo_1 = res 138 | else: 139 | time_1 = 19. 140 | 141 | print('Scoary2') 142 | time_2, res = time_fn( 143 | pick, 144 | kwargs=dict(tree=tetr_tree, label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df, 145 | calc_pvals=False), 146 | n_times=20 147 | ) 148 | mc_2, ms_2, mo_2 = res 149 | 150 | print(f'Scoary1 took {time_1} sec') 151 | print(f'Scoary2 took {time_2} sec') 152 | print(f'Scoary1 vs Scoary2: {time_1 / time_2}x improvement') # 33.88 x 153 | 154 | if run_scoary_1: 155 | self.assertTrue(all(np.equal(mc_1, mc_2)), msg='contrasting') 156 | self.assertTrue(all(np.equal(ms_1, ms_2)), msg='supporting') 157 | self.assertTrue(all(np.equal(mo_1, mo_2)), msg='opposing') 158 | 159 | def test_tetracycline_norecursive(self, run_scoary_1=True): 160 | # Scoary1 took 23.021255266200022 sec 161 | # Scoary2nonrec took 0.5782416850000118 sec 162 | # Scoary1 vs Scoary2nonrec: 39.81251415002976x improvement 163 | tetr_tree = get_json('../data/tetracycline/expected_result.json')['as_list'] 164 | _, tetr_genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', 165 | ignore=roary_ignore) 166 | _, tetr_traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,') 167 | 168 | tetr_label_to_gene = tetr_traits_df['Tetracycline_resistance'].to_dict() 169 | 170 | # jit compile 171 | pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df, calc_pvals=False) 172 | 173 | if run_scoary_1: 174 | print('Scoary1') 175 | time_1, res = time_fn( 176 | scoary_1_pick, 177 | kwargs=dict(tree=tetr_tree, label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df), 178 | n_times=5 179 | ) 180 | mc_1, ms_1, mo_1 = res 181 | else: 182 | time_1 = 19. 
183 | 184 | print('Scoary2') 185 | tetr_scoary_tree = ScoaryTree.from_list(tetr_tree) 186 | time_2, res = time_fn( 187 | pick_nonrecursive, 188 | kwargs=dict(tree=tetr_scoary_tree, label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df, 189 | calc_pvals=False), 190 | n_times=20 191 | ) 192 | mc_2, ms_2, mo_2 = res 193 | 194 | print(f'Scoary1 took {time_1} sec') 195 | print(f'Scoary2nonrec took {time_2} sec') 196 | print(f'Scoary1 vs Scoary2nonrec: {time_1 / time_2}x improvement') 197 | 198 | if run_scoary_1: 199 | self.assertTrue(all(np.equal(mc_1, mc_2)), msg='contrasting') 200 | self.assertTrue(all(np.equal(ms_1, ms_2)), msg='supporting') 201 | self.assertTrue(all(np.equal(mo_1, mo_2)), msg='opposing') 202 | 203 | def test_pairs_paper(self): 204 | scoary_tree = ScoaryTree.from_list( 205 | [[[[[[['1', '2'], ['3', '4']], '5'], '6'], '7'], '8'], 206 | [[[[['9', [['10', '11'], '12']], '13'], '14'], '15'], [['16', '17'], [['18', ['19', '20']], '21']]]] 207 | ) 208 | print(scoary_tree) 209 | labels = scoary_tree.labels() 210 | assert labels == [str(v) for v in list(range(1, 22))] 211 | 212 | seq = [(0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (1, 0), (0, 1), (0, 0), 213 | (1, 1), (1, 1), (0, 0), (0, 0), 214 | (1, 1), (1, 1), (0, 0), (1, 1), ] 215 | 216 | label_to_gene = {lab: bool(tup[0]) for tup, lab in zip(seq, labels)} 217 | label_to_trait = {lab: bool(tup[1]) for tup, lab in zip(seq, labels)} 218 | 219 | print_tree(scoary_tree, label_to_gene, label_to_trait) 220 | 221 | res = pick( 222 | scoary_tree.to_list, 223 | label_to_trait_a=label_to_trait, 224 | trait_b_df=pd.DataFrame(label_to_gene, index=['fakegene']), 225 | calc_pvals=False 226 | ) 227 | 228 | max_comparisons = res[0][0] 229 | max_supporting = res[1][0] 230 | max_opposing = res[2][0] 231 | 232 | print_tree(scoary_tree, label_to_gene, label_to_trait) 233 | 234 | self.assertEqual(7, max_comparisons, msg='max_comparisons of pairs failed') 235 | self.assertEqual(7, 
max_supporting, msg='max_supporting of pairs failed') 236 | self.assertEqual(1, max_opposing, msg='max_opposing of pairs failed') 237 | 238 | def test_pairs_scoary1(self): 239 | _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore) 240 | _, traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,') 241 | expected_result = pd.read_csv('../data/tetracycline/fisher_permute100.results.csv') 242 | 243 | scoary_tree = ScoaryTree.from_presence_absence(genes_df) 244 | label_to_trait = traits_df.Tetracycline_resistance.apply(bool).to_dict() 245 | 246 | assert set(scoary_tree.labels()) == set(traits_df.index) 247 | assert not traits_df.Tetracycline_resistance.hasnans 248 | 249 | for i, row in expected_result.iterrows(): 250 | gene = row.Gene 251 | print(gene) 252 | old_max_comparisons = row.Max_Pairwise_comparisons 253 | old_max_supporting = row.Max_supporting_pairs 254 | old_max_opposing = row.Max_opposing_pairs 255 | old_best = row.Best_pairwise_comp_p 256 | old_worst = row.Worst_pairwise_comp_p 257 | 258 | label_to_gene = genes_df.loc[gene].apply(bool).to_dict() 259 | 260 | res = pick( 261 | scoary_tree.to_list, 262 | label_to_trait_a=label_to_trait, 263 | trait_b_df=pd.DataFrame(label_to_gene, index=['fakegene']), 264 | calc_pvals=True 265 | ) 266 | 267 | comparisons = { 268 | 'max_comparisons': (old_max_comparisons, res[0][0]), 269 | 'max_supporting': (old_max_supporting, res[1][0]), 270 | 'max_opposing': (old_max_opposing, res[2][0]), 271 | 'best': (old_best, res[3][0]), 272 | 'worst': (old_worst, res[4][0]), 273 | } 274 | 275 | for comparison, (old, new) in comparisons.items(): 276 | if not np.isclose(old, new): 277 | print(gene, comparison, old, new, scoary_tree) 278 | print_tree(scoary_tree, label_to_gene, label_to_trait) 279 | self.fail(msg=f'Disagreement between Scoary1 and Scoary2') 280 | 281 | def test_scoary1_generated(self): 282 | _, genes_df = 
load_genes('../data/bigger_ds/pres_abs.csv', 'gene-count:,') 283 | _, traits_df = load_traits('../data/bigger_ds/trait_trees.csv', trait_data_type='binary:,') 284 | 285 | for trait_name in ['t1', 't2']: 286 | label_to_trait = traits_df[trait_name].apply(bool).to_dict() 287 | expected_result = pd.read_csv(f'../data/bigger_ds/{trait_name}.results.csv') 288 | 289 | with open('../data/bigger_ds/newick.nwk') as f: 290 | newick = f.read() 291 | scoary_tree = ScoaryTree.from_newick(newick) 292 | # scoary_tree = ScoaryTree.from_presence_absence(genes_df) 293 | 294 | result_df = init_result_df(genes_df, pd.Series(label_to_trait, dtype='boolean')) 295 | result_df = pair_picking(result_df, genes_df, scoary_tree, label_to_trait) 296 | 297 | assert set(scoary_tree.labels()) == set(traits_df.index) 298 | 299 | for i, row in expected_result.sample(frac=1, random_state=42).iterrows(): 300 | gene = row.Gene 301 | old_max_comparisons = row.Max_Pairwise_comparisons 302 | old_max_supporting = row.Max_supporting_pairs 303 | old_max_opposing = row.Max_opposing_pairs 304 | old_best = row.Best_pairwise_comp_p 305 | old_worst = row.Worst_pairwise_comp_p 306 | 307 | new_row = result_df[result_df['Gene'] == gene].iloc[0] 308 | 309 | comparisons = { 310 | 'max_comparisons': (old_max_comparisons, new_row.contrasting), 311 | 'max_supporting': (old_max_supporting, new_row.supporting), 312 | 'max_opposing': (old_max_opposing, new_row.opposing), 313 | 'best': (old_best, new_row.best), 314 | 'worst': (old_worst, new_row.worst), 315 | } 316 | 317 | for comparison, (old, new) in comparisons.items(): 318 | if not np.isclose(old, new): 319 | print(f'Error on {gene=} / {comparison=}') 320 | print(comparisons) 321 | label_to_gene = genes_df.loc[gene].apply(bool).to_dict() 322 | print_tree(scoary_tree, label_to_gene, label_to_trait) 323 | self.fail(msg=f'Disagreement between Scoary1 and Scoary2') 324 | -------------------------------------------------------------------------------- 
def worker(
        q,
        ns: AnalyzeTraitNamespace,
        step: int,
        result_container: dict,
        proc_id: int
) -> None:
    """
    Entry point of a worker process: analyze traits taken from a queue until the queue is empty.

    :param q: queue of trait names; items are fetched with get_nowait() and acknowledged with task_done()
    :param ns: shared namespace; converted with grasp_namespace() before use
     (presumably into a process-local copy - see utils.grasp_namespace)
    :param step: 1 runs analyze_trait_step_1_fisher, any other value runs analyze_trait_step_2_pairpicking
    :param result_container: dict-like object that is updated with {trait: result} once the queue is exhausted
    :param proc_id: number of this worker; used in the log file name and in progress messages
    """
    proc_logger = setup_logging(
        logger=logging.getLogger('scoary'),
        path=f'{ns.outdir}/logs/scoary-2_proc{proc_id}.log',
        print_info=False,
        reset=True
    )
    proc_logger.info(f'Setting up trait analysis worker {proc_id}')

    new_ns = grasp_namespace(AnalyzeTraitNamespace, ns)
    del ns

    analyze_trait_fn = analyze_trait_step_1_fisher if step == 1 else analyze_trait_step_2_pairpicking

    # collect locally and publish once at the end: a single update of the shared container
    local_result_container = {}

    while True:
        try:
            trait = q.get_nowait()
        except Empty:
            break  # completely done

        local_result_container[trait] = analyze_trait_fn(trait, new_ns, proc_id)
        q.task_done()

    result_container.update(local_result_container)


def analyze_trait_step_1_fisher(
        trait: str,
        ns: AnalyzeTraitNamespace,
        proc_id: int | None = None
) -> pd.DataFrame | bool | str:
    """
    Step 1 of the analysis of one trait: Fisher's exact test for every gene.

    Writes {outdir}/traits/{trait}/result.tsv unless the trait is a duplicate or no gene is left.

    :param trait: name of the trait (column of ns.traits_df)
    :param ns: namespace with data and settings
    :param proc_id: number of the worker process, only used for the progress message
    :return: one of
     - str: the trait is a duplicate; the value is the name of the trait it duplicates
     - False: no gene is left for this trait
     - True: trait-wise correction was applied and at least one gene passed
     - DataFrame with columns __pattern_id__ and fisher_p: input for the correction across all traits
    """
    logger.debug(f"Analyzing {trait=}, step 1: Fisher's test")
    with ns.lock:
        ns.counter.value += 1
        message = trait if proc_id is None else f'P{proc_id} | {trait}'
        print_progress(
            ns.counter.value, ns.queue_size,
            message=message, start_time=ns.start_time, message_width=25
        )

    if trait in ns.duplicates:
        logger.debug(f'Duplicated trait: {trait} -> {ns.duplicates[trait]}')
        save_duplicated_result(trait, ns)
        return ns.duplicates[trait]

    # Prepare results.tsv
    isolate_trait_series = ns.traits_df[trait].dropna()
    result_df = init_result_df(ns.genes_bool_df, isolate_trait_series)

    # Sometimes, binarization gives extreme results and no genes are left
    if len(result_df) == 0:
        logger.info(f'Found 0 genes for {trait=}!')
        return False

    # Compute Fisher's test efficiently: once per unique contingency table
    test_df = create_test_df(result_df)
    test_df = add_odds_ratio(test_df)
    result_df = pd.merge(test_df, result_df, how="left", on='__contingency_table__', copy=False)

    # Perform multiple testing correction: once per unique presence/absence pattern
    multiple_testing_df = result_df[['__pattern_id__', 'fisher_p']].drop_duplicates('__pattern_id__')
    if ns.trait_wise_correction:
        multiple_testing_df = multiple_testing_correction(
            multiple_testing_df, 'fisher_p', 'fisher_q',
            ns.mt_f_method, ns.mt_f_cutoff, True
        )
        if len(multiple_testing_df) == 0:
            logger.info(f'Found 0 genes for {trait=} after multiple testing correction!')
            return False

        # fisher_p is already part of result_df; a new frame is created instead of modifying a slice in place
        multiple_testing_df = multiple_testing_df.drop(columns='fisher_p')
        result_df = pd.merge(multiple_testing_df, result_df, how="left", on='__pattern_id__', copy=False)
        result = True
    else:
        result = multiple_testing_df

    os.makedirs(f'{ns.outdir}/traits/{trait}')
    result_df.to_csv(f'{ns.outdir}/traits/{trait}/result.tsv', sep='\t', index=False)

    return result
def analyze_trait_step_2_pairpicking(
        trait: str,
        ns: AnalyzeTraitNamespace,
        proc_id: int | None = None
) -> dict | None:
    """
    Step 2 of the analysis of one trait: pair picking and, optionally, the permutation test.

    Reads the result.tsv written in step 1, adds the pair picking columns (if ns.pairwise) and writes the
    final output files through save_result_df().

    :param trait: name of the trait
    :param ns: namespace with data and settings
    :param proc_id: number of the worker process, only used for the progress message
    :return: dictionary with the best p-values of this trait, or None if no gene passed the worst_cutoff filter
    """
    logger.debug(f'Analyzing {trait=}, step 2: Pair picking')
    with ns.lock:
        ns.counter.value += 1
        message = trait if proc_id is None else f'P{proc_id} | {trait}'
        print_progress(
            ns.counter.value, ns.queue_size,
            message=message, start_time=ns.start_time, message_width=25
        )
    summary_data = {}

    result_df = pd.read_csv(f'{ns.outdir}/traits/{trait}/result.tsv', sep='\t')

    if ns.trait_wise_correction:
        assert 'fisher_q' in result_df.columns, f'{result_df.columns=} must contain "fisher_q"!'
    else:
        # q-values were computed across all traits; keep only the patterns of this trait that passed
        multiple_testing_df = ns.multiple_testing_df.loc[trait, :]
        result_df = pd.merge(multiple_testing_df, result_df, how="left", on='__pattern_id__', copy=False)

    assert 'fisher_p' in result_df.columns, f'{result_df.columns=} must contain "fisher_p"!'
    assert 'fisher_q' in result_df.columns, f'{result_df.columns=} must contain "fisher_q"!'

    if not ns.pairwise:
        min_row = result_df.loc[result_df['fisher_p'].idxmin()]
        summary_data['best_fisher_p'] = min_row['fisher_p']
        summary_data['best_fisher_q'] = min_row['fisher_q']
    else:
        trait_series = ns.traits_df[trait].dropna()
        isolates = set(trait_series.index)
        # the tree only needs pruning if the trait is undefined for some isolates
        if ns.all_labels == isolates:
            pruned_tree = ns.tree
        else:
            pruned_tree = ns.tree.prune(labels=isolates)

        result_df = pair_picking(
            result_df,
            significant_genes_df=ns.genes_bool_df.loc[result_df.Gene],
            tree=pruned_tree,
            label_to_trait=trait_series
        )

        if ns.worst_cutoff:
            keep = result_df['worst'] <= ns.worst_cutoff
            if not keep.any():
                logger.info(f'Found 0 genes for {trait=} '
                            f'after worst_cutoff={ns.worst_cutoff} filtration')
                return None
            # copy: columns are added below, which must not happen on a slice of the unfiltered frame
            result_df = result_df[keep].copy()

        assert result_df.fisher_p.is_monotonic_increasing, f'{result_df.fisher_p=} must be monotonic increasing!'

        if ns.max_genes:
            if len(result_df) > ns.max_genes:
                logger.info(f'Found too many genes ({len(result_df)}) for {trait=}, '
                            f'keeping only {ns.max_genes} with best Fisher\'s test.')
                summary_data['max_genes'] = f'Trimmed {len(result_df)} genes to {ns.max_genes}.'
                result_df = result_df.iloc[:ns.max_genes].copy()

        if ns.n_permut:
            result_df['empirical_p'] = permute_picking(
                trait=trait,
                result_df=result_df,
                tree=pruned_tree,
                label_to_trait=trait_series,
                n_permut=ns.n_permut,
                random_state=ns.random_state,
                genes_bool_df=ns.genes_bool_df
            )

            result_df['fq*ep'] = result_df['fisher_q'] * result_df['empirical_p']
            result_df.sort_values(by='fq*ep', inplace=True)

        # With permutations, the frame is sorted by fq*ep; without, it is still sorted by fisher_p (asserted
        # above). Either way the first row is the best one, so the summary always gets best_fisher_p/q.
        best_row = result_df.iloc[0]
        summary_data['best_fisher_p'] = best_row['fisher_p']
        summary_data['best_fisher_q'] = best_row['fisher_q']
        if ns.n_permut:
            summary_data['best_empirical_p'] = best_row['empirical_p']
            summary_data['best_fq*ep'] = best_row['fq*ep']

    save_result_df(trait, ns, result_df)

    # return minimal pvalues
    return summary_data


def _save_trait(trait: str, ns: AnalyzeTraitNamespace):
    """
    Write {outdir}/traits/{trait}/values.tsv: one row per isolate with the binary trait value and, if numeric
    data is available (ns.numeric_df is not None), the numeric value.
    """
    trait_df = pd.DataFrame(index=ns.traits_df.index)
    trait_df['binary'] = ns.traits_df[trait]
    if ns.numeric_df is not None:
        trait_df['numeric'] = ns.numeric_df[trait]
    trait_df.index.name = 'isolate'
    trait_df.to_csv(f'{ns.outdir}/traits/{trait}/values.tsv', sep='\t')
def save_result_df(trait: str, ns: AnalyzeTraitNamespace, result_df: pd.DataFrame):
    """
    Write the final output files of a trait into {outdir}/traits/{trait}/:
    result.tsv, meta.json, coverage-matrix.tsv and (through _save_trait) values.tsv.
    """
    trait_dir = f'{ns.outdir}/traits/{trait}'

    # optional gene annotations become extra columns right after 'Gene'
    additional_columns = []
    if ns.gene_info_df is not None:
        additional_columns = ns.gene_info_df.columns.to_list()
        result_df = result_df.merge(ns.gene_info_df, left_on='Gene', right_index=True, how='left', copy=False)

    # fixed column order; columns that do not exist in result_df are skipped
    wanted_columns = [
        'Gene', *additional_columns,
        'g+t+', 'g+t-', 'g-t+', 'g-t-',
        'sensitivity', 'specificity', 'odds_ratio',
        'fisher_p', 'fisher_q', 'empirical_p', 'fq*ep',
        'contrasting', 'supporting', 'opposing', 'best', 'worst',
    ]
    present_columns = [name for name in wanted_columns if name in result_df.columns]
    result_df = result_df[present_columns]

    result_df.to_csv(f'{trait_dir}/result.tsv', sep='\t', index=False)

    binarization_info = ns.traits_df.attrs['binarization_info']
    if type(binarization_info) is str:
        # a plain string means there is no per-trait information: every trait gets 'none'
        binarization_info = defaultdict(lambda: 'none')

    with open(f'{trait_dir}/meta.json', 'w') as meta_file:
        meta_data = {
            'genes-content-type': ns.genes_orig_df.attrs['content_type'],
            'binarization-method': ns.traits_df.attrs['binarization_method'],
            'binarization-info': binarization_info[trait]
        }
        # add trait info; traits without an entry in trait_info_df simply get none
        if ns.trait_info_df is not None:
            try:
                trait_info = ns.trait_info_df.loc[trait].to_dict()
                meta_data['info'] = {key: value for key, value in trait_info.items() if not pd.isna(value)}
            except KeyError:
                pass

        json.dump(meta_data, meta_file, indent=4, allow_nan=False)

    # original gene data, restricted to the genes in the result, with isolates as rows
    is_result_gene = ns.genes_orig_df.index.isin(result_df.Gene)
    coverage_matrix = ns.genes_orig_df[is_result_gene].T
    coverage_matrix.index.name = 'Isolate'
    coverage_matrix.to_csv(f'{trait_dir}/coverage-matrix.tsv', sep='\t')
    _save_trait(trait, ns)


def save_duplicated_result(trait: str, ns: AnalyzeTraitNamespace):
    """
    Create the output directory of a duplicated trait and fill it with symlinks to the files of the trait
    it duplicates (ns.duplicates[trait]).
    """
    os.makedirs(f'{ns.outdir}/traits/{trait}')

    # use data from previous duplicate
    ref_trait = ns.duplicates[trait]
    linked_files = ['result.tsv', 'meta.json', 'coverage-matrix.tsv']

    # values.tsv can only be shared if there is no numeric data; otherwise it is written for this trait
    write_own_values = ns.numeric_df is not None
    if not write_own_values:
        linked_files.append('values.tsv')

    for file_name in linked_files:
        os.symlink(src=f'../{ref_trait}/{file_name}', dst=f'{ns.outdir}/traits/{trait}/{file_name}')

    if write_own_values:
        _save_trait(trait, ns)
def init_result_df(genes_bool_df: pd.DataFrame, trait_series: pd.Series) -> pd.DataFrame:
    """
    Create result_df: one row per gene with its contingency table against the trait.

    Genes that are present in none or in all of the isolates of trait_series are removed.

    :param genes_bool_df: DataFrame (dtype: bool); columns: strains; rows: genes
    :param trait_series: Boolean Series (pandas 'boolean' dtype, no NANs) that indicates which isolates have
     the trait; its index must be a subset of the columns of genes_bool_df
    :return: result_df (DataFrame) with a default integer index and the columns
     [<index name of genes_bool_df>, 'g+t+', 'g+t-', 'g-t+', 'g-t-', '__pattern_id__', '__contingency_table__',
     'sensitivity', 'specificity']
    """
    assert trait_series.dtype == 'boolean', f'trait_series must be boolean pandas.Series!'
    assert not trait_series.hasnans, f'trait_series may not contain NANs!'
    # Preparation
    trait_pos = trait_series.index[trait_series]
    trait_neg = trait_series.index[~trait_series]
    n_pos = len(trait_pos)
    n_neg = len(trait_neg)
    n_tot = n_pos + n_neg
    assert n_tot == len(trait_series)

    # Create result_df
    result_df = pd.DataFrame(index=genes_bool_df.index)
    result_df['g+t+'] = genes_bool_df[trait_pos].sum(axis=1)  # trait positive gene positive
    result_df['g+t-'] = genes_bool_df[trait_neg].sum(axis=1)  # trait negative gene positive
    result_df['g-t+'] = n_pos - result_df['g+t+']  # trait positive gene negative
    result_df['g-t-'] = n_neg - result_df['g+t-']  # trait negative gene negative

    # Remove genes that are shared by none or all
    gene_sum = result_df['g+t+'] + result_df['g+t-']
    to_keep = (gene_sum != 0) & (gene_sum != n_tot)
    # copy: columns are added below, which must not happen on a slice of the unfiltered frame
    result_df = result_df[to_keep].copy()

    # Add unique pattern ID: genes with the same presence/absence pattern across the isolates share an ID
    genes_bool_df_reduced = genes_bool_df.loc[to_keep, trait_pos.to_list() + trait_neg.to_list()]
    pattern_id = genes_bool_df_reduced.groupby(by=genes_bool_df_reduced.columns.to_list()).ngroup()
    result_df['__pattern_id__'] = pattern_id

    # Add contingency table, sensitivity and specificity
    result_df['__contingency_table__'] = [tuple(x) for x in result_df[['g+t+', 'g+t-', 'g-t+', 'g-t-']].to_numpy()]
    if n_pos:
        pos_sensitivity = (result_df['g+t+'] / n_pos * 100)  # use if positive g/t correlation
        neg_sensitivity = (result_df['g-t+'] / n_pos * 100)  # use if negative g/t correlation
    else:
        pos_sensitivity = neg_sensitivity = pd.Series(0, index=result_df.index)

    if n_neg:
        pos_specificity = (result_df['g-t-'] / n_neg * 100)  # use if positive g/t correlation
        neg_specificity = (result_df['g+t-'] / n_neg * 100)  # use if negative g/t correlation
    else:
        pos_specificity = neg_specificity = pd.Series(0, index=result_df.index)

    # report the values of whichever direction (positive/negative correlation) gives the larger sum
    keep_pos = (pos_sensitivity + pos_specificity) > (neg_sensitivity + neg_specificity)
    result_df["sensitivity"] = pos_sensitivity.where(keep_pos, neg_sensitivity)
    result_df["specificity"] = pos_specificity.where(keep_pos, neg_specificity)

    # Reset index so that Gene is its own column
    result_df.reset_index(inplace=True)

    return result_df


def create_test_df(result_df: pd.DataFrame, sort=True) -> pd.DataFrame:
    """
    Create test_df: one row per unique contingency table with the p-value of Fisher's exact test.

    The test is computed only once for all tables that share the same fisher_id().

    :param result_df: DataFrame with column '__contingency_table__'
    :param sort: whether to sort the DataFrame by pvalue
    :return: test_df (DataFrame); columns: ['__contingency_table__', 'fisher_p']
    """
    test_df = pd.DataFrame(result_df['__contingency_table__'].unique(), columns=['__contingency_table__'])

    # add __fisher_unique_table__
    test_df['__fisher_unique_table__'] = test_df['__contingency_table__'].apply(lambda table: fisher_id(*table))

    # calculate Fisher's exact test
    table_to_pval = {
        table: fisher_exact_two_tailed(*table)
        for table in test_df['__fisher_unique_table__'].unique()
    }

    # add Fisher's exact test
    test_df['fisher_p'] = test_df['__fisher_unique_table__'].apply(lambda table: table_to_pval[table])

    # remove fisher_identifier
    test_df.drop('__fisher_unique_table__', axis=1, inplace=True)

    if sort:
        # sort test_df by pvalue
        test_df.sort_values(by='fisher_p', inplace=True)

    return test_df
def add_odds_ratio(test_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add the column 'odds_ratio', computed from '__contingency_table__', to test_df.

    :param test_df: DataFrame with column '__contingency_table__'; modified in place
    :return: the same DataFrame
    """
    test_df['odds_ratio'] = test_df['__contingency_table__'].apply(lambda table: odds_ratio(*table))
    return test_df


def multiple_testing_correction(
        df: pd.DataFrame,
        pval_column: str,
        qval_column: str,
        method: str,
        cutoff: float,
        is_sorted: bool = False
) -> pd.DataFrame:
    """
    Add corrected p-values to a DataFrame and keep only the rows that pass the cutoff.

    The input DataFrame is not modified.

    :param df: DataFrame with the column pval_column
    :param pval_column: name of the column with the uncorrected p-values
    :param qval_column: name of the column to be added (overwritten, with a warning, if it already exists)
    :param method: 'native' or a method name understood by statsmodels' multipletests. With 'native', rows are
     filtered by the uncorrected p-value and the Bonferroni-corrected value is reported in qval_column
    :param cutoff: threshold; applies to the uncorrected p-value if method is 'native', otherwise it is passed
     to multipletests as alpha
    :param is_sorted: passed on to multipletests; set to True if the p-values are sorted in ascending order
    :return: new DataFrame: the rows of df that pass, with the additional column qval_column
    """
    assert pval_column in df.columns, f'{pval_column=} must be in {df.columns=}!'
    if qval_column in df.columns:
        logger.warning(f'Overwriting {qval_column=} in {df.columns=}!')

    pvals = df[pval_column]

    # Apply multiple testing correction for each orthogene
    if method == 'native':
        reject = pvals <= cutoff
        _, qval, _, _ = multipletests(pvals=pvals, alpha=1, method='bonferroni', is_sorted=is_sorted)
    else:
        reject, qval, _, _ = multipletests(
            pvals=pvals, alpha=cutoff, method=method, is_sorted=is_sorted,
        )

    # assign() returns a new frame: the caller's frame (often itself a slice) stays untouched
    corrected = df.assign(**{qval_column: qval})
    # copy: callers drop columns from the result, which must not happen on a slice
    return corrected[reject].copy()


def pair_picking(result_df: pd.DataFrame, significant_genes_df: pd.DataFrame, tree: ScoaryTree,
                 label_to_trait: pd.Series | dict) -> pd.DataFrame:
    """
    Run the pair picking algorithm for every gene in result_df.

    Required columns:
      - Gene: must list the same genes, in the same order, as the index of significant_genes_df

    Add columns:
      - contrasting
      - supporting
      - opposing
      - best
      - worst

    :param result_df: DataFrame with column 'Gene'; modified in place
    :param significant_genes_df: presence/absence of the genes in result_df; index: genes
    :param tree: phylogeny of the isolates
    :param label_to_trait: maps isolate -> trait value
    :return: result_df with the additional columns
    """
    assert result_df.Gene.to_list() == list(significant_genes_df.index)

    max_contr, max_suppo, max_oppos, best, worst = pick(
        tree=tree.to_list, label_to_trait_a=label_to_trait,
        trait_b_df=significant_genes_df, calc_pvals=True
    )

    result_df['contrasting'] = max_contr
    result_df['supporting'] = max_suppo
    result_df['opposing'] = max_oppos
    result_df['best'] = best
    result_df['worst'] = worst

    return result_df
def scoary(
        genes: str,
        traits: str,
        outdir: str,
        multiple_testing: str = 'bonferroni:0.999',
        trait_wise_correction: bool = False,
        worst_cutoff: float = None,
        max_genes: int = None,
        gene_info: str = None,
        trait_info: str = None,
        isolate_info: str = None,
        newicktree: str = None,
        pairwise: bool = True,
        n_permut: int = 500,
        restrict_to: str = None,
        ignore: str = None,
        n_cpus: int = 1,
        n_cpus_binarization: int = None,
        trait_data_type: str = 'binary:,',
        gene_data_type: str = 'gene-count:,',
        force_binary_clustering: bool = False,
        symmetric: bool = True,
        distance_metric: str = 'jaccard',
        linkage_method: str = 'ward',
        optimal_ordering: bool = True,
        corr_method: str = 'pearson',
        random_state: int = None,
        limit_traits: (int, int) = None,
        version: bool = False  # Dummy variable, only used to create docstring (see main function)
) -> None:
    """
    Scoary2: Associate genes with traits!

    :param genes: Path to gene presence/absence table: columns=isolates, rows=genes
    :param traits: Path to trait presence/absence table: columns=traits, rows=isolates
    :param outdir: Directory to place output files
    :param multiple_testing: Apply multiple testing to the p-values of Fisher's test to account for the many
     genes/traits tested. Format: "method:cutoff".
     Cutoff is a number that specifies the FWER and method is one of [native, bonferroni, sidak, holm-sidak, holm,
     simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky].
     If method is 'native': then, the cutoff targets the uncorrected p-value from Fisher's test.
    :param trait_wise_correction: Apply multiple testing correction to each trait separately. Not recommended as
     this can lead to many false positives!
    :param worst_cutoff: Drop traits if no gene with "worst" p-value lower than threshold. Recommended if
     dataset contains multiple species
    :param max_genes: Keep only n highest-scoring genes in Fisher's test. Recommended if dataset is big and contains
     multiple species; avoids wasting computational resources on traits that simply correlate with phylogeny
    :param gene_info: Path to file that describes genes: columns=arbitrary properties, rows=genes
    :param trait_info: Path to file that describes traits: columns=arbitrary properties, rows=traits
    :param isolate_info: Path to file that describes isolates: columns=arbitrary properties, rows=isolates
    :param newicktree: Path to a custom tree in Newick format
    :param pairwise: If False, only perform Fisher's test. If True, also perform pairwise comparisons
     algorithm.
    :param n_permut: Post-hoc label-switching test: perform N permutations of the phenotype by random label switching.
     Low p-values suggest that the effect is not merely lineage-specific.
    :param restrict_to: Comma-separated list of isolates to which to restrict this analysis
    :param ignore: Comma-separated list of isolates to be ignored for this analysis
    :param n_cpus: Number of CPUs that should be used. There is overhead in multiprocessing, so if the dataset is
     small, use n_cpus=1
    :param n_cpus_binarization: Number of CPUs that should be used for binarization. Default: one tenth of n_cpus
    :param trait_data_type: "::::" How to read the traits
     table. Example: "gene-list:\\t" for OrthoFinder N0.tsv table
    :param gene_data_type: ":" How to read the genes table. Example: "gene-list:\\t" for
     OrthoFinder N0.tsv table
    :param force_binary_clustering: Force clustering of binary data even if numeric data is available
    :param symmetric: if True, correlated and anti-correlated traits will cluster together
    :param distance_metric: distance metric (binary data only); See metric in https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
    :param linkage_method: linkage method for clustering [single, complete, average, weighted, ward, centroid, median]
    :param optimal_ordering: whether to use optimal ordering; See scipy.cluster.hierarchy.linkage.
    :param corr_method: correlation method (numeric data only) [pearson, kendall, spearman]
    :param random_state: Set a fixed seed for the random number generator
    :param limit_traits: Limit the analysis to traits n to m. Useful for debugging. Example: "(0, 10)"
    :param version: Print software version of Scoary2 and exit.
    """
    SCOARY_PRINT_CITATION = os.environ.get('SCOARY_PRINT_CITATION', 'TRUE') == 'TRUE'
    if SCOARY_PRINT_CITATION:
        print(f'Welcome to Scoary2! ({get_version()})')

    # parse input, create outdir, setup logging
    trait_data_type = decode_unicode(trait_data_type)
    gene_data_type = decode_unicode(gene_data_type)
    if n_cpus_binarization is None:
        n_cpus_binarization = 1 + n_cpus // 10
    # NOTE: locals() is handed to setup_outdir, so do not introduce new local variables above this line
    outdir = setup_outdir(outdir, input=locals())

    setup_logging(logger, f'{outdir}/logs/scoary-2.log')

    logger.debug(f'Scoary2 Version: {get_version()}')
    mt_f_method, mt_f_cutoff = parse_correction(multiple_testing, 'multiple_testing')
    assert n_permut == 0 or n_permut >= 100, f'{n_permut=} must be at least 100.'

    # start
    start_time = datetime.now()

    # load traits data (numeric_df may be None)
    logger.info('Loading traits...')
    numeric_df, traits_df = load_traits(
        traits=traits,
        trait_data_type=trait_data_type,
        restrict_to=restrict_to,
        ignore=ignore,
        n_cpus=n_cpus_binarization,
        random_state=random_state,
        outdir=outdir,
        limit_traits=limit_traits
    )

    # dynamically set recursion limit, should work for ~ 13'000 isolates
    _recursion_limit = max(1000, 100 + len(traits_df.index) ** 2)
    logger.debug(f'Setting recursion limit to {_recursion_limit}')
    sys.setrecursionlimit(_recursion_limit)

    if trait_info:
        logger.info('Loading trait info...')
        trait_info = load_info_file(
            logger=logger, info_file=trait_info, merge_col='Trait',
            expected_overlap_set=set(traits_df.columns), reference_file=traits
        )

    logger.info('Loading genes...')
    genes_orig_df, genes_bool_df = load_genes(
        genes,
        gene_data_type=gene_data_type,
        restrict_to=traits_df.index,
    )

    if gene_info:
        logger.info('Loading gene info...')
        gene_info = load_info_file(
            logger=logger, info_file=gene_info, merge_col='Gene',
            expected_overlap_set=set(genes_bool_df.index), reference_file=genes
        )

    if isolate_info:
        logger.info('Loading isolate info...')
        isolate_info = load_info_file(
            logger=logger, info_file=isolate_info, merge_col='Isolate',
            expected_overlap_set=set(genes_bool_df.columns), reference_file='placeholder'
        )

    # load phylogeny
    if newicktree is None:
        logger.info('Generating phylogenetic tree from gene presence-absence-matrix...')
        tree = ScoaryTree.from_presence_absence(genes_bool_df)
    else:
        logger.info('Loading phylogenetic tree from newick file...')
        with open(newicktree) as f:
            tree = ScoaryTree.from_newick(f.read())
        tree = tree.prune(genes_bool_df.columns)
    tree.write_newick(f'{outdir}/tree.nwk')

    all_labels = set(tree.labels())

    # from here on, 'traits' is the list of trait names (no longer the path to the traits file)
    traits = traits_df.columns.to_list()
    duplicates = find_duplicates(traits_df)

    logger.info('Finalizing setup...')
    if n_cpus == 1:
        ns, counter, lock = AnalyzeTraitNamespace(), MockCounter(), MockLock()
    else:
        from .init_multiprocessing import init, mp
        mgr, ns, counter, lock = init()

    ns = AnalyzeTraitNamespace.create_namespace(ns, {
        'start_time': datetime.now(),
        'counter': counter,
        'queue_size': len(traits),
        'lock': lock,
        'outdir': outdir,
        'genes_orig_df': genes_orig_df,
        'genes_bool_df': genes_bool_df,
        'gene_info_df': gene_info,
        'numeric_df': numeric_df,
        'traits_df': traits_df,
        'trait_info_df': trait_info,
        'duplicates': duplicates,
        'tree': tree,
        'all_labels': all_labels,
        'mt_f_method': mt_f_method,
        'mt_f_cutoff': mt_f_cutoff,
        'trait_wise_correction': trait_wise_correction,
        'max_genes': max_genes,
        'worst_cutoff': worst_cutoff,
        'n_permut': n_permut,
        'random_state': random_state,
        'pairwise': pairwise,
        'multiple_testing_df': None,
    })

    logger.info('Starting step 1: Fisher\'s test...')
    if n_cpus == 1:
        step_1_start = datetime.now()
        trait_to_result = {trait: analyze_trait_step_1_fisher(trait, ns) for trait in traits}
    else:
        mp.freeze_support()
        queue = mgr.JoinableQueue()
        trait_to_result = mgr.dict()
        for trait in traits:
            queue.put(trait)
        procs = [mp.Process(target=worker, args=(queue, ns, 1, trait_to_result, i)) for i in range(n_cpus)]
        step_1_start = datetime.now()
        for proc in procs:
            proc.start()
        for proc in procs:
            proc.join()

    step_1_end = datetime.now()
    print_progress(
        len(traits), len(traits),
        message='Step 1 complete!', start_time=step_1_start, message_width=25,
        end='\n'
    )
    logger.info(f'Step 1 took {step_1_end - step_1_start}')

    # step 1 returns the name of the reference trait (a string) for duplicated traits
    duplicated_traits = {trait: res for trait, res in trait_to_result.items() if isinstance(res, str)}
    logger.info(f'Number of duplicated traits: {len(duplicated_traits)}')
    logger.info(f'Number of non-duplicated traits: {len(trait_to_result) - len(duplicated_traits)}')

    # multiple testing correction
    if trait_wise_correction:
        traits_left = {trait for trait, res in trait_to_result.items() if res is True}
        ns.multiple_testing_df = 'Not used'
    else:
        # Keep only the p-value tables: besides strings (duplicates), step 1 returns False for traits without
        # any gene, and pd.concat cannot handle such values.
        fisher_results = {
            trait: res for trait, res in trait_to_result.items()
            if isinstance(res, pd.DataFrame)
        }
        if not fisher_results:
            logger.info('No traits left after Fisher\'s test')
            logger.debug(f'Took {datetime.now() - start_time}')
            return
        multiple_testing_df = multiple_testing_correction(
            pd.concat(fisher_results), 'fisher_p', 'fisher_q',
            ns.mt_f_method, ns.mt_f_cutoff, False
        )
        multiple_testing_df = multiple_testing_df.drop(columns='fisher_p')
        # pd.concat(dict) puts the trait names into the first index level
        traits_left = multiple_testing_df.index.get_level_values(0).unique().to_list()
        ns.multiple_testing_df = multiple_testing_df
        del fisher_results
    del trait_to_result

    # Step 2: Pairpicking
    ns.queue_size = len(traits_left)
    ns.counter.value = 0
    logger.info(f'Number of traits left after multiple testing correction: {len(traits_left)}')

    logger.info('Starting step 2: Pair picking...')
    if n_cpus == 1:
        step_2_start = datetime.now()
        trait_to_result = {trait: analyze_trait_step_2_pairpicking(trait, ns) for trait in traits_left}
    else:
        mp.freeze_support()
        queue = mgr.JoinableQueue()
        trait_to_result = mgr.dict()
        for trait in traits_left:
            queue.put(trait)
        procs = [mp.Process(target=worker, args=(queue, ns, 2, trait_to_result, i)) for i in range(n_cpus)]
        step_2_start = datetime.now()
        for proc in procs:
            proc.start()
        for proc in procs:
            proc.join()

    step_2_end = datetime.now()
    print_progress(
        len(traits_left), len(traits_left),
        message='Step 2 complete!', start_time=step_2_start, message_width=25,
        end='\n'
    )
    logger.info(f'Step 2 took {step_2_end - step_2_start}')

    try:
        summary_df = create_summary_df(trait_to_result, duplicated_traits)
    except NoTraitsLeftException as e:
        logger.info(str(e))
        logger.debug(f'Took {datetime.now() - start_time}')
        return
    del trait_to_result

    summary_df = summary_df.sort_values(
        by='best_fq*ep' if 'best_fq*ep' in summary_df.columns else 'best_fisher_q',
        ascending=False
    )

    create_final_overview(summary_df, ns.traits_df, ns.numeric_df, ns.outdir, ns.trait_info_df, isolate_info,
                          force_binary_clustering, symmetric, distance_metric, linkage_method, optimal_ordering,
                          corr_method)

    logger.info('Cleaning up...')
    clean_up(outdir, summary_df.index.to_list())

    logger.info('Complete success!')

    logger.info(f'Took {datetime.now() - start_time}')

    if SCOARY_PRINT_CITATION:
        print(CITATION)
Example: 299 | 300 | best_fisher_p best_fisher_q best_empirical_p best_fq*ep 301 | Trait_1 0.574066 4.384058e-01 0.035964 0.035964 302 | Trait_2 0.432940 2.667931e-01 0.133866 0.133866 303 | Trait_3 0.194418 7.981206e-08 0.020979 0.691309 304 | 305 | :param trait_to_result: dictionary where keys are trait names and values are either dict|str|None 306 | :return: pandas.DataFrame 307 | """ 308 | # res may contain: dict or None: 309 | # - dict: data to be added to summary_df as a row 310 | # - None: no gene was significant 311 | 312 | # remove Nones 313 | trait_to_result = {t: r for t, r in trait_to_result.items() if r is not None} 314 | 315 | # remove traits with no significant genes 316 | trait_to_result.update({t: trait_to_result[r] for t, r in duplicated_traits.items() if r in trait_to_result}) 317 | 318 | if len(trait_to_result) == 0: 319 | raise NoTraitsLeftException('No traits left after filtering') 320 | 321 | summary_df = pd.DataFrame(trait_to_result).T 322 | summary_df = summary_df.infer_objects() # harmonize dtypes 323 | 324 | logger.debug(f'Created summary_df:\n{summary_df}') 325 | 326 | return summary_df 327 | 328 | 329 | def find_duplicates(traits_df: pd.DataFrame) -> pd.Series: 330 | """ 331 | Returns a pd.Series that maps duplicated traits to the first occurrence 332 | """ 333 | hash_df = pd.DataFrame(index=traits_df.columns) 334 | hash_df['hash'] = traits_df.apply(lambda x: hash(tuple(x)), axis=0) 335 | hash_df['is_duplicated'] = hash_df['hash'].duplicated(keep=False) 336 | hash_df['use_cache'] = hash_df['hash'].duplicated(keep='first') 337 | lookup_df = hash_df[hash_df['is_duplicated'] & ~hash_df['use_cache']].sort_values(by='hash') 338 | duplicates = hash_df[hash_df['use_cache']] 339 | duplicates = duplicates['hash'].apply( 340 | func=lambda h: lookup_df.iloc[lookup_df.hash.searchsorted(h)].name 341 | ) 342 | return duplicates 343 | 344 | 345 | def clean_up(outdir: str, traits_left: list[str]) -> None: 346 | import shutil 347 | for trait in 
os.listdir(f'{outdir}/traits'): 348 | if trait not in traits_left: 349 | shutil.rmtree(f'{outdir}/traits/{trait}') 350 | 351 | 352 | CITATION = f''' 353 | ██████ ▄████▄ ▒█████ ▄▄▄ ██▀███ ▓██ ██▓ ░▒█████▒░ 354 | ▒██ ▒ ▒██▀ ▀█ ▒██▒ ██ ▒████▄ ▓██ ██ ▒██ ██▒ ▒█▒ ██▒░ 355 | ░ ▓██▄ ▒▓█ ▄ ▒██░ ██ ▒██ ▀█▄ ▓██ ░▄█ ▒██ ██░ ░█▀ 356 | ▒ ██ ▒▓▓▄ ▄██▒▒██ ██ ░██▄▄▄▄██ ▒██▀▀█▄ ░ ▐██▓░ ▄█ 357 | ▒██████▒ ▓███▀ ░░ ████▓▒░ ▓█ ▓██▒░██▓ ▒██▒ ██▒▓░ ░███████▒ 358 | ▒ ▒▓▒ ▒ ░ ░▒ ▒ ░░ ▒░▒░▒░ ▒▒ ▓▒█░░ ▒▓ ░▒▓░ ██▒▒▒ ░▒▒ ░▒░ 359 | ░ ░▒ ░ ░ ░ ▒ ░ ▒ ▒░ ▒ ▒▒ ░ ░▒ ░ ▒░▓██ ░▒░ ░░ ▒░ 360 | ░ ░ ░ ░ ░ ░ ░ ▒ ░ ▒ ░░ ░ ▒ ▒ ░░ ░ 361 | ░ ░ ░ ░ ░ ░ ░ ░ ░ ░ 362 | ░ ░ ░ 363 | Microbial Pan-GWAS 364 | 365 | 366 | If you use Scoary2 ({get_version()}), please cite: 367 | Roder, T. et al. Scoary2: Rapid association of phenotypic multi-omics 368 | data with microbial pan-genomes. 369 | BioRxiv (2023) doi:10.1101/2023.04.19.537353. 370 | '''.strip('\n') 371 | 372 | 373 | def main(): 374 | import sys, fire 375 | 376 | if '--version' in sys.argv: 377 | print(f'{get_version()}') 378 | exit(0) 379 | 380 | fire.Fire(scoary) 381 | 382 | 383 | if __name__ == '__main__': 384 | main() 385 | --------------------------------------------------------------------------------