├── tests
    ├── __init__.py
    ├── test_progressbar.py
    ├── test_load_genes.py
    ├── test_utils.py
    ├── init_tests.py
    ├── test_permtations.py
    ├── test_newick.py
    ├── test_load_traits.py
    ├── test_scoary_tree.py
    ├── test_upgma.py
    ├── test_analyze_trait.py
    ├── test_final_overview.py
    ├── test_scoary.py
    └── test_picking.py
├── media
    ├── scoary.ai
    └── scoary-2-logo.svg
├── data
    ├── vcf
    │   ├── ExampleVCFTrait.csv
    │   └── Example.vcf
    ├── tetracycline
    │   ├── Restrict_to.csv
    │   ├── ExampleTree.nwk
    │   ├── Tetracycline_resistance.csv
    │   └── expected_result.json
    └── generated
    │   └── Trait.csv
├── benchmarking
    ├── runtime
    │   ├── data
    │   │   ├── runtime.txt
    │   │   └── 100_traits.csv
    │   ├── README.md
    │   └── Optimization strategies.md
    ├── picking_performance
    │   ├── data
    │   │   ├── benchmark.png
    │   │   ├── benchmark_with_GLM.png
    │   │   └── benchmark_with_PySR.png
    │   └── README.md
    └── binarization
    │   ├── README.md
    │   └── benchmark_binarization.py
├── .gitignore
├── scoary
    ├── init_multiprocessing.py
    ├── __init__.py
    ├── KeyValueStore.py
    ├── progressbar.py
    ├── upgma.py
    ├── newick.py
    ├── permutations.py
    ├── final_overview.py
    ├── load_genes.py
    ├── vcf2scoary.py
    ├── ScoaryTree.py
    ├── utils.py
    ├── picking.py
    ├── analyze_trait.py
    └── scoary.py
├── Dockerfile
├── pyproject.toml
├── LICENCE.md
├── Notes.md
└── README.md


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/media/scoary.ai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MrTomRod/scoary-2/HEAD/media/scoary.ai


--------------------------------------------------------------------------------
/data/vcf/ExampleVCFTrait.csv:
--------------------------------------------------------------------------------
1 | ,ExampleVCFtrait
2 | Reference,0
3 | Strain_A,1
4 | Strain_B,1
5 | Strain_C,0
6 | 


--------------------------------------------------------------------------------
/benchmarking/runtime/data/runtime.txt:
--------------------------------------------------------------------------------
1 | s2 start: 14:58:12
2 | s2 end: 14:58:35
3 | s1 start: 14:58:58
4 | s1 end: 15:21:31
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | dist/
3 | data/
4 | TEST_OUTPUT/
5 | TMP/
6 | *__pycache__*
7 | /benchmarking/binarization/simulations/
8 | /benchmarking/*/.old
9 | 


--------------------------------------------------------------------------------
/benchmarking/picking_performance/data/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MrTomRod/scoary-2/HEAD/benchmarking/picking_performance/data/benchmark.png


--------------------------------------------------------------------------------
/benchmarking/picking_performance/data/benchmark_with_GLM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MrTomRod/scoary-2/HEAD/benchmarking/picking_performance/data/benchmark_with_GLM.png


--------------------------------------------------------------------------------
/benchmarking/picking_performance/data/benchmark_with_PySR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MrTomRod/scoary-2/HEAD/benchmarking/picking_performance/data/benchmark_with_PySR.png


--------------------------------------------------------------------------------
/scoary/init_multiprocessing.py:
--------------------------------------------------------------------------------
 1 | import multiprocessing as mp
 2 | 
 3 | mp.set_start_method('spawn')  # module-level call: executed when this module is first imported
 4 | 
 5 | 
 6 | def init():
 7 |     mgr = mp.Manager()
 8 |     ns = mgr.Namespace()
 9 |     counter = mgr.Value('i', 0)
10 |     lock = mgr.Lock()
11 |     return mgr, ns, counter, lock
12 | 


--------------------------------------------------------------------------------
/scoary/__init__.py:
--------------------------------------------------------------------------------
 1 | from .scoary import scoary
 2 | from .ScoaryTree import ScoaryTree
 3 | from .picking import pick, pick_single
 4 | from .permutations import permute_picking
 5 | from .utils import print_tree, get_version
 6 | 
 7 | __version__ = get_version()
 8 | __author__ = 'Thomas Roder'
 9 | __credits__ = ['Thomas Roder', 'Ola Brynildsrud']
10 | __license__ = 'MIT'
11 | __maintainer__ = 'Thomas Roder'
12 | __email__ = 'roder.thomas@gmail.com'
13 | 


--------------------------------------------------------------------------------
/data/tetracycline/Restrict_to.csv:
--------------------------------------------------------------------------------
1 | Isolate_1,Isolate_10,Isolate_11,Isolate_12,Isolate_13,Isolate_14,Isolate_15,Isolate_16,Isolate_17,Isolate_18,Isolate_19,Isolate_2,Isolate_20,Isolate_21,Isolate_22,Isolate_23,Isolate_24,Isolate_25,Isolate_26,Isolate_27,Isolate_28,Isolate_29,Isolate_3,Isolate_30,Isolate_31,Isolate_32,Isolate_33,Isolate_34,Isolate_35,Isolate_36,Isolate_37,Isolate_38,Isolate_39,Isolate_4,Isolate_40,Isolate_41,Isolate_42,Isolate_43,Isolate_44,Isolate_45,Isolate_46,Isolate_47,Isolate_48,Isolate_49,Isolate_5,Isolate_50
2 | 
3 | 


--------------------------------------------------------------------------------
/data/vcf/Example.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 3 | ##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele.">
 4 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Reference	Strain_A	Strain_B	Strain_C
 5 | NC_000962	4013	0	T	C	9999	0	TYPE=snp	GT	0	1	1	1
 6 | NC_000962	4705	0	T	C,A	9999	0	TYPE=snp	GT	0	0	1	2
 7 | NC_000962	6575	0	C	A,T,G	9999	0	TYPE=snp	GT	0	3	1	2
 8 | NC_000962	6750	0	C	T	9999	0	TYPE=snp	GT	0	0	1	0
 9 | NC_000962	7362	0	G	C	9999	0	TYPE=snp	GT	0	1	1	1
10 | 


--------------------------------------------------------------------------------
/tests/test_progressbar.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | import time
 3 | from scoary.progressbar import *
 4 | 
 5 | 
 6 | class Test(TestCase):
 7 |     def test_print_progress(self):
 8 |         start_time = datetime.now()
 9 |         n_tot = 20
10 |         for i in range(n_tot + 1):
11 |             time.sleep(0.05)
12 |             msg = f'{i}: {" a" * i}'
13 |             print_progress(i, n_tot, message=msg, start_time=start_time, message_width=30, default_width=120)
14 | 
15 |     def test_print_edge(self):
16 |         start_time = datetime.now()
17 |         msg = f'{0}: {" a" * 0}'
18 |         print_progress(0, 0, message=msg, start_time=start_time, message_width=30, default_width=120)
19 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10-slim-bullseye
 2 | 
 3 | 
 4 | RUN apt-get update && \
 5 |     apt-get install -y build-essential && \
 6 |     apt-get clean
 7 | 
 8 | ARG SCOARY_VERSION
 9 | 
10 | # to build from local sources, use the lines below:
11 | COPY dist/*$SCOARY_VERSION* /tmp/scoary/
12 | RUN pip install -U /tmp/scoary/scoary_2-$SCOARY_VERSION-py3-none-any.whl && \
13 |     pip cache purge && \
14 |     rm -rf /tmp/scoary
15 | 
16 | # to build from pip, use this:
17 | # RUN pip install scoary-2==$SCOARY_VERSION && \
18 | #     pip cache purge
19 | 
20 | # set these environment variables to directories where non-root is allowed to write
21 | ENV NUMBA_CACHE_DIR=/tmp/NUMBA_CACHE_DIR
22 | ENV CONFINT_DB=/tmp/CONFINT_DB
23 | ENV MPLCONFIGDIR=/tmp/MPLCONFIGDIR
24 | 
25 | WORKDIR /data
26 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "scoary-2"
 3 | version = "0.0.15"
 4 | description = "Scoary2: Associate genes with traits!"
 5 | authors = ["MrTomRod <roder.thomas@gmail.com>"]
 6 | readme = "README.md"
 7 | license = "GPL3"
 8 | packages = [
 9 |     { include = "scoary" }
10 | ]
11 | 
12 | 
13 | [tool.poetry.scripts]
14 | scoary2 = "scoary.scoary:main"
15 | vcf2scoary = "scoary.vcf2scoary:main"
16 | 
17 | 
18 | [tool.poetry.dependencies]
19 | python = ">=3.10,<3.11"
20 | numba = "^0.58.0"
21 | pandas = "^2"
22 | scipy = "^1.7.3"
23 | scikit-learn = "^1.0.2"
24 | fast-fisher = "^0.0.4"
25 | matplotlib = "^3.5.2"
26 | statsmodels = "^0.14.0"
27 | fire = "^0.5.0"
28 | mgwas-data-exploration-app = "^0.1.0"
29 | 
30 | 
31 | [tool.poetry.dev-dependencies]
32 | ete3 = "^3.1.2"
33 | biotite = "^0.38.0"
34 | 
35 | 
36 | [build-system]
37 | requires = ["poetry-core>=1.0.0"]
38 | build-backend = "poetry.core.masonry.api"
39 | 


--------------------------------------------------------------------------------
/tests/test_load_genes.py:
--------------------------------------------------------------------------------
 1 | from init_tests import *
 2 | 
 3 | from scoary.load_genes import load_genes
 4 | 
 5 | 
 6 | class Test(TestCase):
 7 |     def test_count(self):
 8 |         orig_data, binary_data = load_genes('../data/generated/Gene_presence_absence.csv', gene_data_type='gene-count')
 9 |         print(orig_data, binary_data)
10 |         orig_data, binary_data = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count',
11 |                                             ignore=roary_ignore)
12 |         print(orig_data, binary_data)
13 | 
14 |     def test_list(self):
15 |         orig_data, binary_data = load_genes(
16 |             '../data/new_ds/Orthogroups.tsv',
17 |             gene_data_type='gene-list:\t'
18 |         )
19 |         print(orig_data, binary_data)
20 |         orig_data, binary_data = load_genes(
21 |             '../data/new_ds/N0.tsv',
22 |             gene_data_type='gene-list:\t',
23 |             ignore=orthofinder_ignore
24 |         )
25 |         print(orig_data, binary_data)
26 | 


--------------------------------------------------------------------------------
/benchmarking/binarization/README.md:
--------------------------------------------------------------------------------
 1 | # Pangenome Simulator
 2 | 
 3 | 1) `generate_simulations()`
 4 | 
 5 | Script for simulating a pan-genome. Outputs a Roary-like gene_presence_absence.csv and a Traits file.
 6 | 
 7 | This script is based on Ola Brynildsrud's [Simulate_pan_genome](https://github.com/AdmiralenOla/Simulate_pan_genome/).
 8 | 
 9 | > [!CAUTION]
10 | > Disclaimer: This script is intended for demonstrating the utility of Scoary2 and may or may not be a realistic 
11 | implementation of how bacterial evolution works.
12 | 
13 | 2) `run_scoary()`
14 | 
15 | Run Scoary2 on the simulated data.
16 | 
17 | 3) `analyze_scoary_results()`
18 | 
19 | Parse the output of Scoary2 to find the rank of the true trait.
20 | 
21 | Creates [results.tsv](out%2Fresults.tsv).
22 | 
23 | If Scoary2 produces no output (no gene left after multiple testing correction) or if the true trait is not in 
24 | the final list of traits, the rank is set to `nan`.
25 | 
26 | 4) `plot_all()`
27 | 
28 | Plot the results of the analysis.
29 | 
30 | Creates `out/effect_sizes.png`.
31 | 
32 | ![effect_sizes.svg](out%2Feffect_sizes.svg)


--------------------------------------------------------------------------------
/LICENCE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Thomas Roder
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | from init_tests import *
 2 | from scoary.utils import *
 3 | 
 4 | logger = logging.getLogger('TEST_LOGGER')
 5 | 
 6 | 
 7 | class Test(TestCase):
 8 |     def test_load_info_file_trait(self):
 9 |         trait_info_df = load_info_file(
10 |             logger=logger, info_file='../data/new_ds/LC-meta.tsv', merge_col='Trait',
11 |             expected_overlap_set={'Compound_287', 'Compound_287'}, reference_file='placeholder'
12 |         )
13 |         print(trait_info_df)
14 | 
15 |     def test_load_info_file_genes(self):
16 |         gene_info_df = load_info_file(
17 |             logger=logger, info_file='../data/new_ds/N0_best_names.tsv', merge_col='Gene',
18 |             expected_overlap_set={'N0.HOG0000000', 'N0.HOG0000001'}, reference_file='placeholder'
19 |         )
20 |         print(gene_info_df)
21 | 
22 |     def test_load_info_file_isolate(self):
23 |         isolate_info_df = load_info_file(
24 |             logger=logger, info_file='../data/new_ds/isolate-meta.tsv', merge_col='Isolate',
25 |             expected_overlap_set={'FAM23868-i1-1.1'}, reference_file='placeholder'
26 |         )
27 |         print(isolate_info_df)
28 | 


--------------------------------------------------------------------------------
/Notes.md:
--------------------------------------------------------------------------------
 1 | # How to publish
 2 | 
 3 | ## PyPI
 4 | 
 5 | Create update:
 6 | 
 7 | 1) Change version number in [pyproject.toml](pyproject.toml)
 8 | 2) Create new package and upload:
 9 | 
10 | ```bash
11 | SCOARY_VERSION="?.?.?"
12 | # build: will create files in dist/
13 | poetry build
14 | # test: install .whl file
15 | pip install -U dist/scoary_2-$SCOARY_VERSION-py3-none-any.whl
16 | # upload
17 | poetry publish
18 | ```
19 | 
20 | ## Docker / Podman
21 | 
22 | If you use docker, simply replace each `podman` with `docker`.
23 | 
24 | ```shell
25 | podman build --build-arg SCOARY_VERSION=$SCOARY_VERSION --tag troder/scoary-2 .
26 | ```
27 | 
28 | Publish docker image:
29 | 
30 | ```shell
31 | # podman login docker.io --get-login
32 | # podman login docker.io
33 | podman tag troder/scoary-2 troder/scoary-2:$SCOARY_VERSION
34 | podman push troder/scoary-2:$SCOARY_VERSION
35 | 
36 | # update tag 'latest'
37 | podman tag troder/scoary-2 troder/scoary-2:latest
38 | podman push troder/scoary-2:latest
39 | ```
40 | 
41 | ## Docker / Zenodo links in Wiki
42 | 
43 | Update Zenodo:
44 | 1) Create a new release on GitHub (Title: `scoary-2:$SCOARY_VERSION`)
45 | 2) Will automatically create a new DOI on Zenodo
46 | 3) Make sure links are updated
47 | 


--------------------------------------------------------------------------------
/benchmarking/picking_performance/README.md:
--------------------------------------------------------------------------------
 1 | # Benchmark of pair picking
 2 | 
 3 | Goal: compare the performance of Scoary vs Scoary2 pair picking algorithms.
 4 | 
 5 | ## Output
 6 | 
 7 | Raw data: [benchmark.tsv](data%2Fbenchmark.tsv)
 8 | 
 9 | ![benchmark.png](data%2Fbenchmark.png)
10 | 
11 | **GLM:** ` time ~ n_isolates + n_genes + n_isolates * n_genes`
12 | 
13 | - `scoary = 0.0006532479935340877 + -8.266041425328844e-07 * n_isolates + -0.00010316416563699979 * n_genes + 2.8076161350353536e-05 * n_isolates * n_genes`
14 | - `scoary2 = 4.729632447019741e-05 + 1.387879503778183e-05 * n_isolates + -2.187177527361452e-06 * n_genes + 6.866970437111624e-07 * n_isolates * n_genes`
15 | 
16 | Full output: see [benchmark_picking.py](benchmark_picking.py#L308-L352)
17 | 
18 | ![benchmark_with_GLM.png](data%2Fbenchmark_with_GLM.png)
19 | 
20 | **PySR:** symbolic regression
21 | 
22 | Operators: `["+", "*", "exp", "inv(x)"]`
23 | 
24 | "Best" scoring models are `constant * n_genes * n_isolates` for Scoary and Scoary2 with the following coefficients:
25 | 
26 | - Scoary: `2.6693995e-5`
27 | - Scoary2: `8.678912e-7`
28 | 
29 | Full output: see [benchmark_picking.py](benchmark_picking.py#L396-L426)
30 | 
31 | ![benchmark_with_PySR.png](data%2Fbenchmark_with_PySR.png)
32 | 
33 | 


--------------------------------------------------------------------------------
/benchmarking/runtime/README.md:
--------------------------------------------------------------------------------
 1 | # Running Scoary and Scoary2 on the same data
 2 | 
 3 | Dataset: 100 randomly picked and binarized traits from the Scoary2 dataset.
 4 | 
 5 | **1) Run Scoary**
 6 | 
 7 | ```bash
 8 | echo "s1 start: $(date +"%T")" >> runtime.txt
 9 | 
10 | podman run --user 0:0 --rm -it -v ./:/data:Z biocontainers/scoary:v1.6.16-1-deb_cv1 \
11 | scoary -t 100_traits.csv -g N0_count.csv -s 2 -o s1_out --permute 1000 --correction I -p 0.1
12 | 
13 | echo "s1 end: $(date +"%T")" >> runtime.txt
14 | ```
15 | 
16 | **2) Run Scoary2**
17 | 
18 | ```bash
19 | echo "s2 start: $(date +"%T")" >> runtime.txt
20 | 
21 | podman run --rm -v ./:/data:Z troder/scoary-2 \
22 | scoary2 \
23 | --genes N0_count.csv \
24 | --gene-data-type 'gene-count:,' \
25 | --traits 100_traits.csv \
26 | --trait-data-type 'binary:,' \
27 | --multiple_testing native:0.1 --n-permut 1000 \
28 | --n-cpus 8 \
29 | --random-state 42 \
30 | --outdir s2_out \
31 | --trait_wise_correction
32 | 
33 | echo "s2 end: $(date +"%T")" >> runtime.txt
34 | ```
35 | 
36 | ## Results
37 | 
38 | ```bash
39 | $ cat runtime.txt
40 | s2 start: 14:58:12
41 | s2 end: 14:58:35
42 | s1 start: 14:58:58
43 | s1 end: 15:21:31
44 | ```
45 | 
46 | - Scoary2 took 23 seconds
47 | - Scoary took 22 minutes and 33 seconds or 1353 seconds
48 | - Scoary2 is 1353 / 23 = **59 times** faster than Scoary on this dataset
49 | 


--------------------------------------------------------------------------------
/data/tetracycline/ExampleTree.nwk:
--------------------------------------------------------------------------------
1 | (((((('Isolate_1', 'Isolate_55'), ((('Isolate_36', 'Isolate_46'), 'Isolate_97'), 'Isolate_51')), (((((((('Isolate_10', ('Isolate_9', 'Isolate_91')), 'Isolate_31'), ('Isolate_38', ('Isolate_45', 'Isolate_5'))), 'Isolate_59'), (((('Isolate_15', 'Isolate_21'), 'Isolate_70'), 'Isolate_22'), 'Isolate_32')), (((((('Isolate_13', 'Isolate_80'), ('Isolate_50', 'Isolate_63')), 'Isolate_66'), ((('Isolate_23', 'Isolate_69'), 'Isolate_25'), 'Isolate_57')), 'Isolate_72'), ((('Isolate_19', 'Isolate_41'), 'Isolate_82'), 'Isolate_48'))), (('Isolate_52', 'Isolate_81'), ('Isolate_61', 'Isolate_79'))), (((('Isolate_12', 'Isolate_86'), 'Isolate_62'), (('Isolate_29', 'Isolate_84'), ('Isolate_64', 'Isolate_78'))), (((('Isolate_26', ('Isolate_4', 'Isolate_75')), 'Isolate_95'), (('Isolate_7', 'Isolate_74'), 'Isolate_85')), ('Isolate_68', 'Isolate_83'))))), (((('Isolate_11', 'Isolate_18'), 'Isolate_60'), ((('Isolate_14', 'Isolate_73'), (('Isolate_24', 'Isolate_6'), 'Isolate_33')), 'Isolate_54')), ('Isolate_35', 'Isolate_96'))), ((((('Isolate_16', 'Isolate_65'), 'Isolate_90'), 'Isolate_89'), (((('Isolate_17', 'Isolate_8'), 'Isolate_58'), 'Isolate_77'), (('Isolate_44', 'Isolate_100'), 'Isolate_56'))), (((((('Isolate_2', 'Isolate_88'), (('Isolate_28', 'Isolate_49'), 'Isolate_39')), ((((('Isolate_20', 'Isolate_47'), (('Isolate_3', 'Isolate_42'), 'Isolate_53')), 'Isolate_71'), ('Isolate_40', 'Isolate_67')), 'Isolate_92')), 'Isolate_87'), 'Isolate_94'), 'Isolate_99'))), (((('Isolate_27', 'Isolate_43'), ('Isolate_37', 'Isolate_76')), ('Isolate_34', 'Isolate_98')), ('Isolate_30', 'Isolate_93')));


--------------------------------------------------------------------------------
/data/generated/Trait.csv:
--------------------------------------------------------------------------------
  1 | ,Trait
  2 | root,0
  3 | strain0001,0
  4 | strain0002,0
  5 | strain0003,0
  6 | strain0004,0
  7 | strain0005,0
  8 | strain0006,0
  9 | strain0007,1
 10 | strain0008,1
 11 | strain0009,0
 12 | strain0010,0
 13 | strain0011,0
 14 | strain0012,0
 15 | strain0013,0
 16 | strain0014,1
 17 | strain0015,1
 18 | strain0016,0
 19 | strain0017,1
 20 | strain0018,1
 21 | strain0019,0
 22 | strain0020,0
 23 | strain0021,0
 24 | strain0022,0
 25 | strain0023,0
 26 | strain0024,1
 27 | strain0025,0
 28 | strain0026,1
 29 | strain0027,0
 30 | strain0028,0
 31 | strain0029,0
 32 | strain0030,0
 33 | strain0031,1
 34 | strain0032,0
 35 | strain0033,1
 36 | strain0034,1
 37 | strain0035,0
 38 | strain0036,1
 39 | strain0037,1
 40 | strain0038,1
 41 | strain0039,0
 42 | strain0040,1
 43 | strain0041,1
 44 | strain0042,0
 45 | strain0043,0
 46 | strain0044,0
 47 | strain0045,0
 48 | strain0046,1
 49 | strain0047,1
 50 | strain0048,1
 51 | strain0049,1
 52 | strain0050,1
 53 | strain0051,1
 54 | strain0052,0
 55 | strain0053,1
 56 | strain0054,0
 57 | strain0055,1
 58 | strain0056,1
 59 | strain0057,1
 60 | strain0058,0
 61 | strain0059,1
 62 | strain0060,0
 63 | strain0061,0
 64 | strain0062,1
 65 | strain0063,1
 66 | strain0064,0
 67 | strain0065,0
 68 | strain0066,0
 69 | strain0067,0
 70 | strain0068,0
 71 | strain0069,0
 72 | strain0070,0
 73 | strain0071,0
 74 | strain0072,0
 75 | strain0073,0
 76 | strain0074,1
 77 | strain0075,0
 78 | strain0076,1
 79 | strain0077,0
 80 | strain0078,0
 81 | strain0079,0
 82 | strain0080,0
 83 | strain0081,0
 84 | strain0082,0
 85 | strain0083,1
 86 | strain0084,0
 87 | strain0085,0
 88 | strain0086,1
 89 | strain0087,1
 90 | strain0088,0
 91 | strain0089,0
 92 | strain0090,0
 93 | strain0091,0
 94 | strain0092,1
 95 | strain0093,0
 96 | strain0094,1
 97 | strain0095,0
 98 | strain0096,1
 99 | strain0097,0
100 | strain0098,1
101 | strain0099,1
102 | 


--------------------------------------------------------------------------------
/tests/init_tests.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import numpy as np
 3 | import pandas as pd
 4 | from os.path import dirname, exists
 5 | from scipy.spatial import distance
 6 | from scipy.stats import fisher_exact, boschloo_exact
 7 | 
 8 | from unittest import TestCase
 9 | 
10 | # set up logging
11 | import logging
12 | 
13 | logging.basicConfig()
14 | # logging.getLogger().setLevel(logging.INFO)
15 | 
16 | ROOT = dirname(dirname(__file__))  # parent of the directory that contains this file
17 | # lists passed as `ignore=` to load_genes in the tests; presumably the non-isolate columns of Roary / OrthoFinder tables
18 | roary_ignore = ['Non-unique Gene name', 'Annotation', 'No. isolates', 'No. sequences', 'Avg sequences per isolate',
19 |                 'Genome fragment', 'Order within fragment', 'Accessory Fragment', 'Accessory Order with Fragment', 'QC',
20 |                 'Min group size nuc', 'Max group size nuc', 'Avg group size nuc']
21 | orthofinder_ignore = ['OG', 'Gene Tree Parent Clade']
22 | 
23 | 
24 | def get_json(path: str):
25 |     with open(path) as f:
26 |         return json.load(f)
27 | 
28 | 
29 | def is_equivalent(a, b):
30 |     if np.isinf(a) and np.isinf(b):
31 |         return True
32 |     if np.isnan(a) and np.isnan(b):
33 |         return True
34 |     return np.isclose(a, b)
35 | 
36 | 
37 | def is_equivalent_tree(a, b) -> bool:
38 |     if type(a) is str or type(b) is str:
39 |         return a == b
40 |     else:
41 |         return (
42 |                 is_equivalent_tree(a[0], b[0]) and is_equivalent_tree(a[1], b[1])
43 |         ) or (
44 |                 is_equivalent_tree(a[0], b[1]) and is_equivalent_tree(a[1], b[0])
45 |         )
46 | 
47 | 
48 | def get_tempdir_path() -> str:
49 |     # template = '/tmp/scoary-test-outdir-{i}'
50 |     # i = 0
51 |     # while exists(template.format(i=i)):
52 |     #     i += 1
53 |     #
54 |     # tempdir_path = template.format(i=i)
55 | 
56 |     tempdir_path = '/home/thomas/PycharmProjects/scoary-2/TEST_OUTPUT'
57 | 
58 |     logging.warning(f'Using this tempdir: file://{tempdir_path}')
59 | 
60 |     return tempdir_path
61 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![Scoary2 logo (light mode)](media/scoary-2-logo-full.svg#gh-light-mode-only)
 2 | ![Scoary2 logo (dark mode)](media/scoary-2-logo-full-dark.svg#gh-dark-mode-only)
 3 | 
 4 | Scoary2 associates orthogenes (e.g. generated using [OrthoFinder][orthofinder]
 5 | or [Roary][roary]) with traits. It reports a list of genes sorted by strength of
 6 | association per trait. The results can be explored interactively with a simple, static HTML/JS app.
 7 | 
 8 | 
 9 | [![Publication](https://img.shields.io/badge/BMC%20Genome%20Biology-10.1186%2Fs13059--024--03233--7-blue)](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-024-03233-7)
10 | [![bioRxiv Preprint](https://img.shields.io/badge/bioRxiv-2023.04.19.537353-b31b1b.svg)](https://www.biorxiv.org/content/10.1101/2023.04.19.537353v1.full)
11 | [![Docker Image Version (latest semver)](https://img.shields.io/docker/v/troder/scoary-2?logo=docker&label=Docker&color=%231D63ED)](https://hub.docker.com/r/troder/scoary-2)
12 | [![DOI](https://zenodo.org/badge/445173674.svg)](https://zenodo.org/doi/10.5281/zenodo.10352170)
13 | 
14 | 
15 | # Wiki
16 | 
17 | - [Home](https://github.com/MrTomRod/scoary-2/wiki/Home)
18 | - [Installation](https://github.com/MrTomRod/scoary-2/wiki/Installation)
19 | - [Usage](https://github.com/MrTomRod/scoary-2/wiki/Usage)
20 | - [Input](https://github.com/MrTomRod/scoary-2/wiki/Input)
21 | - [Output](https://github.com/MrTomRod/scoary-2/wiki/Output)
22 | - [Tutorial](https://github.com/MrTomRod/scoary-2/wiki/Tutorial)
23 | - [App](https://github.com/MrTomRod/scoary-2/wiki/App)
24 | - [Understanding the p values](https://github.com/MrTomRod/scoary-2/wiki/Understanding-the-p-values)
25 | - [Usage as Python library](https://github.com/MrTomRod/scoary-2/wiki/Usage-as-Python-library)
26 | 
27 | 
28 | [orthofinder]: https://github.com/davidemms/OrthoFinder/
29 | [roary]: https://sanger-pathogens.github.io/Roary/
30 | 
31 | # Paper
32 | 
33 | Please cite:
34 | 
35 | > Roder, T. _et al._ _Scoary2_: rapid association of phenotypic multi-omics data with microbial pan-genomes.
36 | > _Genome Biol_ **25**, 93 (2024). https://doi.org/10.1186/s13059-024-03233-7
37 | 


--------------------------------------------------------------------------------
/data/tetracycline/Tetracycline_resistance.csv:
--------------------------------------------------------------------------------
  1 | ,Tetracycline_resistance,Bogus_trait
  2 | Isolate_1,0,0
  3 | Isolate_10,0,1
  4 | Isolate_11,0,1
  5 | Isolate_12,1,0
  6 | Isolate_13,0,NA
  7 | Isolate_14,0,1
  8 | Isolate_15,0,0
  9 | Isolate_16,0,0
 10 | Isolate_17,0,1
 11 | Isolate_18,0,0
 12 | Isolate_19,0,0
 13 | Isolate_2,0,1
 14 | Isolate_20,0,1
 15 | Isolate_21,0,1
 16 | Isolate_22,1,1
 17 | Isolate_23,1,0
 18 | Isolate_24,0,0
 19 | Isolate_25,0,1
 20 | Isolate_26,1,-
 21 | Isolate_27,0,0
 22 | Isolate_28,1,1
 23 | Isolate_29,1,1
 24 | Isolate_3,0,1
 25 | Isolate_30,1,1
 26 | Isolate_31,1,0
 27 | Isolate_32,0,0
 28 | Isolate_33,0,0
 29 | Isolate_34,1,1
 30 | Isolate_35,1,1
 31 | Isolate_36,1,0
 32 | Isolate_37,0,1
 33 | Isolate_38,1,1
 34 | Isolate_39,1,1
 35 | Isolate_4,1,0
 36 | Isolate_40,1,1
 37 | Isolate_41,0,1
 38 | Isolate_42,0,0
 39 | Isolate_43,0,0
 40 | Isolate_44,0,1
 41 | Isolate_45,0,0
 42 | Isolate_46,0,1
 43 | Isolate_47,0,1
 44 | Isolate_48,0,0
 45 | Isolate_49,1,1
 46 | Isolate_5,0,1
 47 | Isolate_50,1,0
 48 | Isolate_51,0,0
 49 | Isolate_52,0,0
 50 | Isolate_53,0,0
 51 | Isolate_54,1,0
 52 | Isolate_55,0,1
 53 | Isolate_56,1,0
 54 | Isolate_57,1,1
 55 | Isolate_58,0,1
 56 | Isolate_59,0,0
 57 | Isolate_6,0,1
 58 | Isolate_60,0,1
 59 | Isolate_61,0,1
 60 | Isolate_62,0,1
 61 | Isolate_63,0,0
 62 | Isolate_64,0,1
 63 | Isolate_65,0,0
 64 | Isolate_66,1,1
 65 | Isolate_67,0,1
 66 | Isolate_68,1,0
 67 | Isolate_69,1,1
 68 | Isolate_7,0,0
 69 | Isolate_70,0,1
 70 | Isolate_71,0,1
 71 | Isolate_72,0,0
 72 | Isolate_73,0,0
 73 | Isolate_74,0,0
 74 | Isolate_75,1,1
 75 | Isolate_76,0,1
 76 | Isolate_77,1,1
 77 | Isolate_78,0,0
 78 | Isolate_79,1,.
 79 | Isolate_8,0,1
 80 | Isolate_80,0,0
 81 | Isolate_81,0,0
 82 | Isolate_82,0,1
 83 | Isolate_83,0,1
 84 | Isolate_84,0,0
 85 | Isolate_85,1,0
 86 | Isolate_86,1,1
 87 | Isolate_87,1,1
 88 | Isolate_88,0,1
 89 | Isolate_89,0,1
 90 | Isolate_9,0,1
 91 | Isolate_90,1,0
 92 | Isolate_91,0,0
 93 | Isolate_92,0,1
 94 | Isolate_93,0,1
 95 | Isolate_94,0,0
 96 | Isolate_95,1,1
 97 | Isolate_96,0,1
 98 | Isolate_97,0,0
 99 | Isolate_98,0,0
100 | Isolate_99,1,0
101 | Isolate_100,0,0
102 | 


--------------------------------------------------------------------------------
/scoary/KeyValueStore.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import logging
 3 | import sqlite3
 4 | 
 5 | 
 6 | class KeyValueStore:
 7 |     table_name: str
 8 | 
 9 |     def __init__(self, table_name, db_path: str = None):
10 |         self.table_name = table_name
11 | 
12 |         if db_path is None:
13 |             if 'KEY_VALUE_STORE_DB' in os.environ:
14 |                 db_path = os.environ['KEY_VALUE_STORE_DB']
15 |             else:
16 |                 db_path = os.path.expanduser('~/.cache/keyvaluestore.db')
17 | 
18 |         self._db_path = db_path
19 |         self.con, self.cur = self.get_cur()
20 |         self.create_db()
21 | 
22 |     def __str__(self):
23 |         return f'KeyValueStore {self.table_name} ({self._db_path})'
24 | 
25 |     def get_cur(self):
26 |         try:
27 |             con = sqlite3.connect(self._db_path)
28 |             cur = con.cursor()
29 |         except Exception as e:
30 |             logging.warning(f'Failed to connect to db: {self._db_path}')
31 |             raise e
32 |         return con, cur
33 | 
34 |     def __del__(self):
35 |         try:
36 |             self.cur.close()
37 |             self.con.close()
38 |         except Exception:
39 |             pass
40 | 
41 |     def create_db(self):
42 |         raise NotImplementedError(f'Users of the abstract class {self.__class__} must implement this function!')
43 | 
44 |     @staticmethod
45 |     def list_to_string(l) -> str:
46 |         return ', '.join(f"'{e}'" for e in l)
47 | 
48 |     @staticmethod
49 |     def list_to_string_bracket(l):
50 |         return ', '.join(f"('{e}')" for e in l)
51 | 
52 |     def _create_db(self, columns: {str: str}, pk_col: str):
53 |         columns = ', '.join(f'{col_name} {col_type}' for col_name, col_type in columns.items())
54 |         sql = f'''
55 |             CREATE TABLE IF NOT EXISTS {self.table_name} (
56 |                 {columns},
57 |                 PRIMARY KEY ({pk_col})
58 |             );
59 |         '''
60 |         try:
61 |             self.cur.execute(sql)
62 |         except sqlite3.OperationalError as e:
63 |             logging.warning(f'Failed to run this SQL command on db {self._db_path}:\n{sql}')
64 |             raise e
65 | 
66 |     def drop_db(self):
67 |         self.cur.execute(f'''DROP TABLE {self.table_name}''')
68 | 


--------------------------------------------------------------------------------
/tests/test_permtations.py:
--------------------------------------------------------------------------------
 1 | from .init_tests import *
 2 | 
 3 | from scoary import ScoaryTree, pick_single, print_tree, pick
 4 | 
 5 | from scoary.permutations import create_permuted_df, permute_picking
 6 | 
 7 | 
 8 | def test_permutations(tree: list, label_to_trait_a, label_to_trait_b, n_permut):
 9 |     max_contr, max_suppo, max_oppos, best, worst = pick(
10 |         tree=tree,
11 |         label_to_trait_a=label_to_trait_a,
12 |         trait_b_df=pd.DataFrame(label_to_trait_b, index=['gene']),
13 |         calc_pvals=True
14 |     )
15 |     print_tree(
16 |         ScoaryTree.from_list(tree),
17 |         label_to_trait_a, label_to_trait_b
18 |     )
19 |     is_positively_correlated = max_suppo >= max_oppos
20 |     n_pos, n_neg = sum(label_to_trait_b.values()), len(label_to_trait_b)
21 |     n_positive = n_pos if is_positively_correlated else n_neg
22 | 
23 |     estimator = (max_suppo if is_positively_correlated else max_oppos) / max_contr
24 |     print(f'{max_contr=}\n{max_suppo=}\n{max_oppos=}\n{best=}\n{worst=}\n{estimator=}')
25 | 
26 |     print('Calculating permutatons... p-value=', end='')
27 |     permuted_df = create_permuted_df(
28 |         labels=[f'i{i}' for i in range(1, 17)], n_positive=n_positive,
29 |         n_permut=n_permut, random_state=42
30 |     )
31 |     max_contr, max_suppo, max_oppos = pick(
32 |         tree=tree, label_to_trait_a=label_to_trait_a,
33 |         trait_b_df=permuted_df, calc_pvals=False
34 |     )
35 | 
36 |     permuted_estimators = max_suppo / max_contr
37 | 
38 |     pval = ((permuted_estimators >= estimator).sum() + 1) / (n_permut + 1)
39 | 
40 |     print(pval)
41 | 
42 | 
class Test(TestCase):
    """Runs the permutation test on a balanced 16-leaf tree with two trait layouts."""

    @staticmethod
    def _balanced_tree() -> list:
        # a fresh nested list on every call, leaves i1..i16
        return [[[['i1', 'i2'], ['i3', 'i4']], [['i5', 'i6'], ['i7', 'i8']]],
                [[['i9', 'i10'], ['i11', 'i12']], [['i13', 'i14'], ['i15', 'i16']]]]

    def _run(self, trait_of_index, n_permut):
        # trait b is an identical copy of trait a
        trait_a = {f'i{i}': trait_of_index(i) for i in range(1, 17)}
        trait_b = trait_a.copy()
        test_permutations(self._balanced_tree(), trait_a, trait_b, n_permut)

    def test_bad(self, n_permut=3000):
        # trait alternates between neighbouring leaves
        self._run(lambda i: bool(i % 2), n_permut)

    def test_good(self, n_permut=3000):
        # trait is present in exactly one half of the tree
        self._run(lambda i: bool(i < 9), n_permut)
57 | 


--------------------------------------------------------------------------------
/scoary/progressbar.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from datetime import datetime, timedelta
 3 | from textwrap import shorten
 4 | import logging
 5 | 
 6 | SCOARY_PRINT_PROGRESS = os.environ.get('SCOARY_PRINT_PROGRESS', 'TRUE').upper() == 'TRUE'
 7 | 
 8 | # can os determine the terminal size?
 9 | try:
10 |     n_cols = os.get_terminal_size().columns
11 |     DYNAMIC_TERMINAL_WIDTH = True
12 |     LINEBREAK_CHAR = '\r'
13 | except Exception:
14 |     DYNAMIC_TERMINAL_WIDTH = False
15 |     LINEBREAK_CHAR = '\n'
16 | 
17 | # set function get_terminal_width depending on DYNAMIC_TERMINAL_WIDTH
18 | if DYNAMIC_TERMINAL_WIDTH:
19 |     def get_terminal_width(min_: int, default: int) -> int:
20 |         return max(min_, os.get_terminal_size().columns)
21 | else:
22 |     def get_terminal_width(min_: int, default: int) -> int:
23 |         return default
24 | 
25 | 
def stringify_timedelta(delta: timedelta) -> str:
    """
    Format a timedelta as a right-aligned string exactly 5 characters long.

    The two most significant units are shown when they fit (e.g. ``'3d 4h'``),
    otherwise only the most significant one (e.g. ``'  10m'``). If even that
    does not fit, ``'>999d'`` is returned.
    """
    d = delta.days
    h, rem = divmod(delta.seconds, 3600)
    m, s = divmod(rem, 60)
    if d:
        parts = [f'{d}d', f'{h}h']
    elif h:
        parts = [f'{h}h', f'{m}m']
    elif m:
        parts = [f'{m}m', f'{s}s']
    else:
        parts = [f'{s}s']

    both_units = ' '.join(parts)
    if len(both_units) <= 5:
        res = both_units
    elif len(parts[0]) <= 5:
        res = parts[0]
    else:
        # never truncate inside a number: '10000d' must not be shown as '10000'
        return '>999d'
    return f'{res:>5s}'
43 | 
44 | 
def print_progress(
        i: int,
        n: int,
        message: str,
        start_time: datetime,
        message_width: int = 40,
        default_width: int = 100,
        sep: str = ' | ',
        end: str = LINEBREAK_CHAR
) -> None:
    """
    Print a one-line progress bar: ``[====    ] 42% | time left | message``.

    Does nothing if SCOARY_PRINT_PROGRESS is false.

    :param i: number of items processed so far; values outside ``[0, n]`` are clamped for display
    :param n: total number of items; values below 1 are treated as 1
    :param message: text shown at the end of the line, shortened/padded to ``message_width``
    :param start_time: when processing started; used to extrapolate the remaining time
    :param message_width: fixed width of the message column
    :param default_width: line width used when the terminal size cannot be determined
    :param sep: separator between the columns
    :param end: line terminator passed to ``print``
    """
    if not SCOARY_PRINT_PROGRESS:
        return

    message = f"{shorten(message, width=message_width, placeholder='...'):{message_width}s}"
    assert len(message) == message_width

    n = max(1, n)
    i_safe = min(max(1, i), n)  # avoids division by zero in the extrapolation below
    # clamp so that i > n cannot overflow the percentage column or the bar
    fraction = min(max(i, 0), n) / n
    time_left = stringify_timedelta((datetime.now() - start_time) / i_safe * (n - i_safe))  # 5 chars
    percentage = f"{f'{fraction:.0%}':>4s}"  # 4 chars

    width_total = get_terminal_width(min_=message_width + len(sep) * 2 + 20, default=default_width)

    text = f'{percentage}{sep}{time_left}{sep}{message}'
    len_progressbar = width_total - len(text)
    n_bars = len_progressbar - 3  # because of '[] '

    res = f"[{'=' * round(fraction * n_bars):{n_bars}}] {text}"

    if not len(res) == width_total:
        logging.warning('Something went wrong with the progressbar!')

    print(res, end=end)
78 | 


--------------------------------------------------------------------------------
/tests/test_newick.py:
--------------------------------------------------------------------------------
 1 | from init_tests import *
 2 | 
 3 | from scoary.newick import parse_newick
 4 | 
 5 | 
 6 | class Test(TestCase):
 7 |     def test_newick(self):
 8 |         test_data = [
 9 |             ('(A,(C,D));', ['A', ['C', 'D']]),
10 |             ('(A, (C,D));', ['A', ['C', 'D']]),
11 |             ('(A(C,D));', ['A', ['C', 'D']]),
12 |             ('(A(C, D));', ['A', ['C', 'D']]),
13 |             ('A,(C,D);', ['A', ['C', 'D']]),
14 |             ('((A,B),(C,D));', [['A', 'B'], ['C', 'D']]),
15 |             ('(A,B),(C,D);', [['A', 'B'], ['C', 'D']]),
16 |             ('(A,B),(C,D);', [['A', 'B'], ['C', 'D']]),
17 |             ('(A,B)(C,D);', [['A', 'B'], ['C', 'D']]),
18 |             ('(C,D)E;', [['C', 'D'], 'E']),
19 |             ('(C,D),E;', [['C', 'D'], 'E']),
20 |             ('(A,(C,D))F;', [['A', ['C', 'D']], 'F']),
21 |             ('(  A  , (  C  ,  D ) ) F ;', [['A', ['C', 'D']], 'F']),
22 |             ('(A(C,D))F;', [['A', ['C', 'D']], 'F']),
23 |             ('(A:0.1,(C:0.3,D:0.4):0.5);', ['A', ['C', 'D']]),
24 |             ('(A:0.1,(C:0.3,D:0.4))F;', [['A', ['C', 'D']], 'F']),
25 |             ('((B:0.2,(C:0.3,D:0.4))F:0.1)A;', [[['B', ['C', 'D']], 'F'], 'A']),
26 |             ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);', ['A', [['B', [['D', 'G'], 'E']], 'C']]),
27 |             ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);', ['A', [['B', [['D', 'G'], 'E']], 'C']]),
28 |             ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)));', ['A', ['B', ['D', 'G']]]),
29 |             ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737);', ['A', ['B', ['D', 'G']]]),
30 |             ('(A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C);', ['A', [['B', [['D', 'G'], 'E']], 'C']]),
31 |             ('(A,(B,(D,G)E)C);', ['A', [['B', [['D', 'G'], 'E']], 'C']]),
32 |             ('(A,(B,(D,G)));', ['A', ['B', ['D', 'G']]]),
33 |             ('(hodor one,(hodor-two,(hodor_3,0hodor 4)));', ['hodor one', ['hodor-two', ['hodor_3', '0hodor 4']]]),
34 |             ('(FAM18356-i1-1.1:0.289313,(FAM19471-i1-1.1:0.210374,(FAM23169-i1-1.1:0.0764835,FAM1079-i1-1.1:0.0778624)0.977444:0.115601));',
35 |              ['FAM18356-i1-1.1', ['FAM19471-i1-1.1', ['FAM23169-i1-1.1', 'FAM1079-i1-1.1']]])
36 |         ]
37 | 
38 |         for n, expected_result in test_data:
39 |             res = parse_newick(n)
40 |             self.assertEqual(res, expected_result)
41 |             print()
42 | 
43 |     def test_tetracycline(self):
44 |         with open('../data/tetracycline/ExampleTree.nwk') as f:
45 |             newick = f.read()
46 |         expected_result = get_json('../data/tetracycline/expected_result.json')['as_list']
47 |         res = parse_newick(newick)
48 |         self.assertEqual(res, expected_result)
49 | 


--------------------------------------------------------------------------------
/data/tetracycline/expected_result.json:
--------------------------------------------------------------------------------
1 | {
2 |   "as_list": [[[[[["Isolate_1", "Isolate_55"], [[["Isolate_36", "Isolate_46"], "Isolate_97"], "Isolate_51"]], [[[[[[[["Isolate_10", ["Isolate_9", "Isolate_91"]], "Isolate_31"], ["Isolate_38", ["Isolate_45", "Isolate_5"]]], "Isolate_59"], [[[["Isolate_15", "Isolate_21"], "Isolate_70"], "Isolate_22"], "Isolate_32"]], [[[[[["Isolate_13", "Isolate_80"], ["Isolate_50", "Isolate_63"]], "Isolate_66"], [[["Isolate_23", "Isolate_69"], "Isolate_25"], "Isolate_57"]], "Isolate_72"], [[["Isolate_19", "Isolate_41"], "Isolate_82"], "Isolate_48"]]], [["Isolate_52", "Isolate_81"], ["Isolate_61", "Isolate_79"]]], [[[["Isolate_12", "Isolate_86"], "Isolate_62"], [["Isolate_29", "Isolate_84"], ["Isolate_64", "Isolate_78"]]], [[[["Isolate_26", ["Isolate_4", "Isolate_75"]], "Isolate_95"], [["Isolate_7", "Isolate_74"], "Isolate_85"]], ["Isolate_68", "Isolate_83"]]]]], [[[["Isolate_11", "Isolate_18"], "Isolate_60"], [[["Isolate_14", "Isolate_73"], [["Isolate_24", "Isolate_6"], "Isolate_33"]], "Isolate_54"]], ["Isolate_35", "Isolate_96"]]], [[[[["Isolate_16", "Isolate_65"], "Isolate_90"], "Isolate_89"], [[[["Isolate_17", "Isolate_8"], "Isolate_58"], "Isolate_77"], [["Isolate_44", "Isolate_100"], "Isolate_56"]]], [[[[[["Isolate_2", "Isolate_88"], [["Isolate_28", "Isolate_49"], "Isolate_39"]], [[[[["Isolate_20", "Isolate_47"], [["Isolate_3", "Isolate_42"], "Isolate_53"]], "Isolate_71"], ["Isolate_40", "Isolate_67"]], "Isolate_92"]], "Isolate_87"], "Isolate_94"], "Isolate_99"]]], [[[["Isolate_27", "Isolate_43"], ["Isolate_37", "Isolate_76"]], ["Isolate_34", "Isolate_98"]], ["Isolate_30", "Isolate_93"]]],
3 |   "as_newick": "((((((Isolate_1,Isolate_55),(((Isolate_36,Isolate_46),Isolate_97),Isolate_51)),((((((((Isolate_10,(Isolate_9,Isolate_91)),Isolate_31),(Isolate_38,(Isolate_45,Isolate_5))),Isolate_59),((((Isolate_15,Isolate_21),Isolate_70),Isolate_22),Isolate_32)),((((((Isolate_13,Isolate_80),(Isolate_50,Isolate_63)),Isolate_66),(((Isolate_23,Isolate_69),Isolate_25),Isolate_57)),Isolate_72),(((Isolate_19,Isolate_41),Isolate_82),Isolate_48))),((Isolate_52,Isolate_81),(Isolate_61,Isolate_79))),((((Isolate_12,Isolate_86),Isolate_62),((Isolate_29,Isolate_84),(Isolate_64,Isolate_78))),((((Isolate_26,(Isolate_4,Isolate_75)),Isolate_95),((Isolate_7,Isolate_74),Isolate_85)),(Isolate_68,Isolate_83))))),((((Isolate_11,Isolate_18),Isolate_60),(((Isolate_14,Isolate_73),((Isolate_24,Isolate_6),Isolate_33)),Isolate_54)),(Isolate_35,Isolate_96))),(((((Isolate_16,Isolate_65),Isolate_90),Isolate_89),((((Isolate_17,Isolate_8),Isolate_58),Isolate_77),((Isolate_44,Isolate_100),Isolate_56))),((((((Isolate_2,Isolate_88),((Isolate_28,Isolate_49),Isolate_39)),(((((Isolate_20,Isolate_47),((Isolate_3,Isolate_42),Isolate_53)),Isolate_71),(Isolate_40,Isolate_67)),Isolate_92)),Isolate_87),Isolate_94),Isolate_99))),((((Isolate_27,Isolate_43),(Isolate_37,Isolate_76)),(Isolate_34,Isolate_98)),(Isolate_30,Isolate_93)));"
4 | }


--------------------------------------------------------------------------------
/tests/test_load_traits.py:
--------------------------------------------------------------------------------
 1 | from init_tests import *
 2 | from scoary.load_traits import load_numeric, load_binary, apply_kmeans, apply_gm, binarize, load_traits
 3 | 
 4 | traits_bin = '../data/tetracycline/Tetracycline_resistance.csv'
 5 | traits_num = '../data/tetracycline/Tetracycline_resistance_numeric.csv'
 6 | 
 7 | 
 8 | class Test(TestCase):
 9 |     def test_load_binary(self):
10 |         binary_df = load_binary(traits=traits_bin, delimiter=',')
11 |         print(binary_df)
12 | 
13 |     def test_load_numeric(self):
14 |         numeric_df = load_numeric(traits=traits_num, delimiter=',')
15 |         print(numeric_df)
16 | 
17 |     def test_binarize_kmeans(self):
18 |         numeric_df = load_numeric(traits=traits_num, delimiter=',')
19 |         for alternative in ['skip', 'kmeans']:
20 |             for cutoff in [.5, .7, .9]:
21 |                 for covar_type in ['full', 'tied', 'diag', 'spherical']:
22 |                     binary_df = binarize(
23 |                         numeric_df, method='kmeans', random_state=42, n_cpus=1,
24 |                         cutoff=cutoff, covariance_type=covar_type,
25 |                         alternative=alternative, outdir=None
26 |                     )
27 | 
28 |     def test_binarize_gaussian_nonconverging(self):
29 |         # Tetracycline trait cannot be binarized with cutoff=0.999
30 |         numeric_df = load_numeric(traits=traits_num, delimiter=',')
31 |         for method, n_expected_columns in [('gaussian', 1), ('kmeans', 2)]:
32 |             binary_df = binarize(
33 |                 numeric_df, method=method, random_state=42, n_cpus=1,
34 |                 cutoff=0.9998, covariance_type='full',
35 |                 alternative='skip', outdir=None
36 |             )
37 |             self.assertEqual(n_expected_columns, len(binary_df.columns),
38 |                              f'{method=}; {n_expected_columns=}; {binary_df.columns=}')
39 | 
40 |     def test_illegal(self):
41 |         with self.assertRaises(AssertionError):
42 |             numeric_df, traits_df = load_traits(traits_num, trait_data_type=f'gaussian:0.4999', random_state=42)
43 |         with self.assertRaises(AssertionError):
44 |             numeric_df, traits_df = load_traits(traits_num, trait_data_type=f'gaussian:1', random_state=42)
45 |         with self.assertRaises(AssertionError):
46 |             # fails because no traits can be binarized. Certainty is never high enough.
47 |             numeric_df, traits_df = load_traits(traits_num, trait_data_type=f'gaussian:.999999999999', random_state=42)
48 | 
49 |     def test_multiprocessing(self):
50 |         for n_cpus in [1, 5]:
51 |             numeric_df, traits_df = load_traits(
52 |                 '../data/new_ds/LC.tsv',
53 |                 trait_data_type='gaussian:skip:\t',
54 |                 ignore='Starter-only-5A,FAMIX,Starter-only-10,Starter-only-7,mixture',
55 |                 n_cpus=n_cpus, limit_traits=(0, 10),
56 |                 outdir=f'{ROOT}/TEST_OUTPUT'
57 |             )
58 | 


--------------------------------------------------------------------------------
/benchmarking/runtime/Optimization strategies.md:
--------------------------------------------------------------------------------
 1 | # Comparing different optimization strategies
 2 | 
 3 | I also tried the [optimization used by the original Scoary](https://github.com/AdmiralenOla/Scoary/blob/b713e10fc1968488132f62652c6dba35636ca3e6/scoary/methods.py#L1360-L1363)
 4 | (breaking out of the permutation loop early) instead of my approach of caching confidence intervals.
 5 | 
 6 | ## Results
 7 | 
 8 | **Scoary:**
 9 | 
10 | - break disabled: 41 minutes
11 | - normal: 22 minutes
12 | 
13 | **Scoary2 (1 CPU):**
14 | 
15 | - cache disabled: 2:01
16 | - normal: 1:12
17 | - break instead of cache: 1:46
18 | 
19 | **Scoary2 (8 CPUs):**
20 | 
21 | - cache: 26 sec
22 | - break: 39 sec
23 | 
24 | ## Summary
25 | 
26 | My caching optimization appears to be better.
27 | 
28 | ## Code
29 | 
30 | The code below replaces the permute_picking function in [permutations.py](/scoary/permutations.py)
31 | 
32 | Note: I have not thoroughly tested this code, so it may contain bugs.
33 | 
34 | ```python
35 | import scipy.stats as ss
36 | 
37 | 
def permute_picking(
        trait: str,
        tree: ScoaryTree,
        label_to_trait: pd.Series | dict,
        result_df: pd.DataFrame,
        genes_bool_df: pd.DataFrame,
        n_permut: int,
        random_state: int = None,
        batch_size: int = 50
) -> np.array:
    """
    Compute empirical p-values by permutation, aborting early for clearly non-significant genes.

    Permutations are generated in batches of ``batch_size``. After each batch,
    the loop stops if the number of permuted estimators that reach the observed
    one already indicates p > 0.1 with 95% probability.

    :return: list with one p-value per row of ``result_df``
    """
    if type(label_to_trait) is dict:
        label_to_trait = pd.Series(label_to_trait, dtype='boolean')
    n_pos = sum(label_to_trait)
    n_neg = len(label_to_trait) - n_pos
    labels = label_to_trait.keys()

    pvals = []
    for _, row in result_df.iterrows():
        label_to_gene = genes_bool_df.loc[row.Gene]

        is_positively_correlated = row.supporting >= row.opposing
        estimator = (row.supporting if is_positively_correlated else row.opposing) / row.contrasting
        n_pos_assoc = n_pos if is_positively_correlated else n_neg

        # Seed once per gene. Re-seeding inside the batch loop would make every
        # batch draw exactly the same permutations.
        if random_state is not None:
            np.random.seed(random_state)

        r = 0
        for batch_start in range(0, n_permut, batch_size):
            batch_end = min(batch_start + batch_size, n_permut)
            batch_size_current = batch_end - batch_start

            permuted_df = create_permuted_df(
                labels=labels, n_positive=n_pos_assoc,
                n_permut=batch_size_current, random_state=None
            )
            max_contr, max_suppo, max_oppos = pick(
                tree=tree.to_list, label_to_trait_a=label_to_gene,
                trait_b_df=permuted_df, calc_pvals=False
            )

            # Check how many estimators are higher than the unpermuted
            r += sum((max_suppo / max_contr) >= estimator)

            # If r indicates a p > 0.1 with a probability of 95%, abort
            if batch_end >= 30 and (1 - ss.binom.cdf(r, batch_end, 0.1)) < 0.05:
                pval = (r + 1) / (batch_end + 1)
                break

        else:
            # all batches were evaluated without an early abort
            pval = (r + 1) / (n_permut + 1)

        pvals.append(pval)

    return pvals
94 | ```


--------------------------------------------------------------------------------
/media/scoary-2-logo.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <!-- Generator: Adobe Illustrator 26.3.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 4 | 	 viewBox="0 0 133.09 133.09" style="enable-background:new 0 0 133.09 133.09;" xml:space="preserve">
 5 | <style type="text/css">
 6 | 	.st0{fill:#010202;}
 7 | </style>
 8 | <g>
 9 | 	<path class="st0" d="M22.81,62.13c-2.16,4.73-3.35,10-3.35,15.53c0,1.8,0.12,3.57,0.37,5.3c0.19,0.44,0.45,0.98,0.77,1.63
10 | 		c-0.21-2.78-0.12-5.55,0.29-8.22c0.39-2.5,1.05-4.92,1.97-7.24C22.69,67.18,22.57,64.61,22.81,62.13z"/>
11 | 	<path class="st0" d="M46.25,80.12L33.17,66.6c-1.99,0.62-5.59,1.97-8.31,4.26c-0.69,1.88-1.2,3.83-1.51,5.83
12 | 		c-0.41,2.62-0.49,5.35-0.24,8.11c0.22,2.44,0.67,4.47,1.05,5.87c2.02,2.97,4.68,6.14,7.94,8.53l12.79-4.27L46.25,80.12z"/>
13 | 	<path class="st0" d="M55.23,47.62c-2.65,0.25-5.31,0.83-7.92,1.75c-4.48,1.56-7.27,3.58-7.3,3.59c0,0-4.28,5.54-5.18,11.78
14 | 		l13.33,13.77l16.14-3.1l6.73-18.36l-9.34-9.35C59.55,47.45,57.39,47.42,55.23,47.62z"/>
15 | 	<path class="st0" d="M46.39,47.06c2.81-0.99,5.68-1.63,8.54-1.9c2.19-0.21,4.37-0.2,6.53,0.03c2.59-2.21,4.14-3.19,5.4-3.7
16 | 		c-3.2-0.88-6.59-1.32-9.91-1.32c-4.97,0-9.95,1.01-14.52,2.92c-2.15,0.9-4.26,1.98-6.17,3.32c-0.36,0.26-0.71,0.55-1.07,0.82
17 | 		c-0.14,0.11-0.57,0.53-0.78,0.46c1.32,0.43,4,1.31,6.07,2.05C41.85,48.96,43.86,47.95,46.39,47.06z"/>
18 | 	<path class="st0" d="M46.07,97.14l-12.92,4.31c-0.02,1.44,0.37,5.38,0.33,5.45c2.23,1.8,4.68,3.34,7.25,4.58
19 | 		c2.95,1.42,6.08,2.45,9.3,3.06c2.92-0.11,10.13-0.55,12.98-2.42c0,0,0.01-0.01,0.02-0.02l-2.06-8.5L46.07,97.14z"/>
20 | 	<path class="st0" d="M73.86,91.97l-10.5,10.93l2.06,8.51c0.02,0.12,3.44-0.45,3.71-0.49c3.57-0.59,7.58-1.31,10.58-3.43
21 | 		c3.2-2.26,5.86-5.25,8.08-8.45c0.21-0.31,0.42-0.62,0.64-0.93c-0.17-1.97-0.45-5.21-0.66-7.42L73.86,91.97z"/>
22 | 	<path class="st0" d="M80.12,48.16c-0.19-0.14-0.37-0.29-0.56-0.43c-0.06-0.04-0.12-0.09-0.17-0.13c3.11,3.45,4.26,6.74,4.68,8.83
23 | 		c2.31,1.88,4.34,3.98,6.05,6.27c1.08,1.45,2.04,2.97,2.86,4.54c0-0.02,0-0.05,0.01-0.08c-1.42-4.87-3.84-9.4-7.05-13.31
24 | 		C84.21,51.76,82.26,49.85,80.12,48.16z"/>
25 | 	<path class="st0" d="M67.48,0h-0.94H65.6C29.37,0,0,29.37,0,65.6v0.94v0.94v65.6h65.6h0.94h0.94c36.23,0,65.6-29.37,65.6-65.6
26 | 		v-0.94V65.6V0H67.48z M104.55,31.36c4.71,8.28,5,16.01-3.24,17.54c-8.24,1.53,0.29,18.72,7.94,5.9c0,0,1.43,12.86-5,17.78
27 | 		c-6.43,4.92-5.7,13.12-6.09,17.19c-0.12,1.22-2.53,3.33-6.07,5.6c-1.88,3.74-4.34,7.14-7.33,10.13c-3.61,3.61-7.82,6.44-12.5,8.43
28 | 		c-4.85,2.05-10.01,3.09-15.31,3.09c-5.31,0-10.46-1.04-15.31-3.09c-4.68-1.98-8.89-4.82-12.5-8.43c-3.61-3.61-6.44-7.82-8.43-12.5
29 | 		c-2.05-4.85-3.09-10.01-3.09-15.31c0-5.31,1.04-10.46,3.09-15.31c1.98-4.68,4.82-8.89,8.43-12.5c3.61-3.61,7.82-6.44,12.5-8.43
30 | 		c0.73-0.31,1.47-0.59,2.21-0.86c3.89-7.13,11.42-15.13,25.97-17.17c0,0-8.45,9.34,0,7.95c6.8-1.12,12.65-9.21,12.65-9.21
31 | 		s-4.86,12.69,0,14.77c7.9,3.38,10.3-4.97,12.36-13.51c2.06-8.55,19.72-7.52,20.6-7.01C115.45,16.4,99.85,23.09,104.55,31.36z"/>
32 | 	<path class="st0" d="M82.82,58.63l-9.52-0.6l-6.74,18.4l7.45,13.04l13.85-1.27c0.05-0.1,0.1-0.19,0.15-0.27
33 | 		c1.04-1.59,3.48-6.17,4.54-15.36c-1.01-2.97-2.49-5.77-4.39-8.32C86.64,62.21,84.84,60.32,82.82,58.63z"/>
34 | </g>
35 | </svg>
36 | 


--------------------------------------------------------------------------------
/scoary/upgma.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | 
 5 | def _find_min(arr: np.ndarray, n_cols: int) -> (int, int):
 6 |     min_index = int(np.nanargmin(arr))
 7 |     x, y = (min_index // n_cols, min_index % n_cols)
 8 |     assert x > y
 9 |     return x, y
10 | 
11 | 
12 | def _merge(arr: np.ndarray, node_list: [], cluster_sizes: [int], x: int, y: int):
13 |     n_rows, n_cols = arr.shape
14 |     assert n_rows == n_cols == len(node_list) == len(cluster_sizes)
15 |     assert x > y
16 | 
17 |     # update arr
18 |     for p in range(len(arr)):
19 |         if p in (x, y):
20 |             continue
21 |         px1, px2 = (x, p) if p < x else (p, x)
22 |         py1, py2 = (y, p) if p < y else (p, y)
23 |         assert not (np.isnan(arr[px1, px2]) or np.isnan(arr[py1, py2]))
24 | 
25 |         # calculate mean difference
26 |         arr[py1, py2] = (arr[px1, px2] * cluster_sizes[x] + arr[py1, py2] * cluster_sizes[y]) / (cluster_sizes[x] + cluster_sizes[y])  # row, col
27 | 
28 |     # remove row and col x
29 |     arr = np.delete(arr, x, 0)
30 |     arr = np.delete(arr, x, 1)
31 | 
32 |     # update labels
33 |     new_label = [node_list[y], node_list[x]]
34 |     del node_list[x]
35 |     node_list[y] = new_label
36 | 
37 |     # update cluster_sizes
38 |     cluster_sizes[y] = cluster_sizes[x] + cluster_sizes[y]
39 |     del cluster_sizes[x]
40 | 
41 |     assert arr.shape == (n_rows - 1, n_cols - 1)
42 |     assert len(node_list) == n_rows - 1
43 |     return arr, node_list, cluster_sizes
44 | 
45 | 
def _upgma(arr: np.ndarray, node_list: [str]) -> []:
    """
    Cluster the labels by repeatedly merging the two closest clusters.

    :param arr: square distance matrix; its upper triangle and diagonal are overwritten with nan in place
    :param node_list: labels, one per row/column of arr
    :return: tree as nested list
    """
    # only the lower triangle is used from here on
    arr[np.triu_indices(arr.shape[0], k=0)] = np.nan

    # every label starts out as a cluster of size one
    cluster_sizes = [1] * len(node_list)

    while len(node_list) > 1:
        n_rows, n_cols = arr.shape
        assert len(node_list) == n_rows == n_cols

        # closest pair of clusters
        x, y = _find_min(arr, len(node_list))

        # collapse the pair into one cluster
        arr, node_list, cluster_sizes = _merge(arr, node_list, cluster_sizes, x, y)

    assert len(node_list) == 1
    return node_list[0]
66 | 
67 | 
def upgma(distances: pd.DataFrame) -> []:
    """
    Apply UPGMA (unweighted pair group method with arithmetic mean) algorithm.
    Returns unweighted tree in nested list form.

    Inspired by 'Creating a Phylogenetic Tree' by Oxford Academic (https://www.youtube.com/watch?v=09eD4A_HxVQ)

    :param distances: pandas.DataFrame: values: symmetric array; columns: tree labels
    :return: tree: nested list of strings
    :raises AssertionError: if the matrix is not square, contains nan/inf or negative values,
        is not symmetric, or if the labels are not unique
    """
    # split pandas.DataFrame into numpy.ndarray and labels
    labels: [] = [str(c) for c in distances.columns]
    arr: np.ndarray = distances.values.astype(float)

    # sanity checks
    # Shape first: comparing a non-square matrix with its transpose would raise a broadcasting error.
    assert arr.ndim == 2 and arr.shape[0] == arr.shape[1], f'Distance matrix is not square! {arr.shape=}'
    assert len(set(labels)) == len(labels), f'labels are not unique! {labels=}'
    # nan/inf before symmetry: nan never compares equal and would be misreported as asymmetry
    assert not np.isnan(arr).any(), 'Distance matrix contains nan'
    assert not np.isinf(arr).any(), 'Distance matrix contains inf'
    assert np.allclose(arr, arr.T, rtol=1e-05, atol=1e-08), f'arr is not symmetric! arr:\n{distances.to_string()}'
    assert not np.any(distances < 0), 'Distances must be positive'

    return _upgma(arr=arr, node_list=labels)
91 | 


--------------------------------------------------------------------------------
/scoary/newick.py:
--------------------------------------------------------------------------------
 1 | from re import compile
 2 | 
 3 | BRANCH_LENTGHS_COLON = compile(r':[0-9]+(\.[0-9]+)?(e-?[0-9]+)?')
 4 | BRANCH_LENTGHS_BRACKET = compile(r'\)[0-9]+(.[0-9]+)?(e-?[0-9]+)?')
 5 | 
 6 | class NewickParserException(Exception):
 7 |     pass
 8 | 
 9 | 
10 | def parse_newick(newick_string: str) -> []:
11 |     """
12 |     A simple function to parse Newick strings to list tree format.
13 | 
14 |     Example:
15 |         >>> parse_newick('(A,(B,C))D;')
16 |         [['A', ['B', 'C']], 'D']
17 | 
18 |     Limitations:
19 |         - Only binary trees are supported
20 |         - Distances are ignored
21 |         - All labels must be named
22 |         - NHX format not supported
23 | 
24 |     :param newick_string: Phylogenetic tree in newick format
25 |     :return: Phylogenetic tree in list format
26 |     """
27 | 
28 |     # strip and remove branch lengths
29 |     newick_string = newick_string.strip()
30 |     newick_string = BRANCH_LENTGHS_COLON.sub(string=newick_string, repl='')
31 |     newick_string = BRANCH_LENTGHS_BRACKET.sub(string=newick_string, repl=')')
32 | 
33 |     # sanity check
34 |     if not newick_string.endswith(';'):
35 |         raise NewickParserException(f'Newick string does not end in semicolon! {newick_string=}')
36 | 
37 |     def find_corresponding_closing(string: str) -> int:
38 |         n_opening = 0
39 |         n_closing = 0
40 |         for i, char in enumerate(string):
41 |             if char == '(':
42 |                 n_opening += 1
43 |                 continue
44 |             if char == ')':
45 |                 n_closing += 1
46 |                 if n_closing == n_opening:
47 |                     return i
48 | 
49 |         raise NewickParserException(f'Could not find corresponding closing bracket in {string=}! {newick_string=}')
50 | 
51 |     def split_node(string: str) -> (str, str):
52 |         if ',' in string:
53 |             first_comma = string.index(',')
54 |             if '(' in string:
55 |                 first_bracket = string.index('(')
56 |                 if first_bracket < first_comma:
57 |                     return string[0:first_bracket], string[first_bracket:]
58 | 
59 |             return string[0:first_comma], string[first_comma + 1:]
60 |         else:
61 |             if '(' not in string:
62 |                 raise NewickParserException(f'Could not find separators "," or "(" in {string=}! {newick_string=}')
63 |             first_bracket = string.index('(')
64 |             return string[0:first_bracket], string[first_bracket:]
65 | 
66 |     def parse_leaf(string: str) -> str:
67 |         string = string.strip('"\' ')
68 |         if len(string) == 0:
69 |             raise NewickParserException(f'Leaf with no label: {string=}! {newick_string=}')
70 |         return string
71 | 
72 |     def parse_recursive(string: str) -> list | str:
73 |         string = string.strip()
74 | 
75 |         # leaf
76 |         if not ',' in string and not '(' in string and not ')' in string:
77 |             return parse_leaf(string)
78 | 
79 |         # remove enclosing brackets
80 |         if string.startswith('(') and string.endswith(')') \
81 |                 and find_corresponding_closing(string) == len(string) - 1:
82 |             string = string[1:-1]
83 | 
84 |         # parse node
85 |         if string.startswith('('):
86 |             closing_idx = find_corresponding_closing(string)
87 |             left = string[1:closing_idx]
88 |             right = string[closing_idx + 1:].lstrip(',')
89 |         else:
90 |             left, right = split_node(string)
91 | 
92 |         left = parse_recursive(left)
93 |         right = parse_recursive(right)
94 | 
95 |         return [left, right]
96 | 
97 |     return parse_recursive(newick_string[:-1])  # remove semicolon
98 | 


--------------------------------------------------------------------------------
/scoary/permutations.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import logging
  3 | import pandas as pd
  4 | import numpy as np
  5 | 
  6 | from .KeyValueStore import KeyValueStore
  7 | from .picking import pick
  8 | from .ScoaryTree import ScoaryTree
  9 | 
 10 | logger = logging.getLogger('scoary.permutations')
 11 | 
 12 | 
 13 | class ConfintStore(KeyValueStore):
 14 |     def create_db(self):
 15 |         self._create_db(
 16 |             columns={
 17 |                 'tree': 'str',
 18 |                 'n_pos_assoc': 'int',
 19 |                 'n_permut': 'int',
 20 |                 'confidence_interval': 'str'
 21 |             },
 22 |             pk_col='tree, n_pos_assoc, n_permut'
 23 |         )
 24 | 
 25 |     def get(self, tree: str, n_pos_assoc: int, n_permut: int):
 26 |         sql = f'SELECT confidence_interval FROM {self.table_name} WHERE tree = ? AND n_pos_assoc = ? AND n_permut = ?'
 27 |         res = self.cur.execute(
 28 |             sql,
 29 |             (tree, n_pos_assoc, n_permut,)
 30 |         ).fetchone()
 31 |         return np.frombuffer(res[0], dtype=float) if res is not None else None
 32 | 
 33 |     def set(self, tree: str, n_pos_assoc: int, n_permut: int, confidence_interval: [float]):
 34 |         confidence_interval = confidence_interval.tobytes()
 35 |         sql = f'INSERT OR IGNORE INTO {self.table_name} VALUES (?, ?, ?, ?)'
 36 |         self.cur.execute(
 37 |             sql, (tree, n_pos_assoc, n_permut, confidence_interval)
 38 |         )
 39 |         self.con.commit()
 40 | 
 41 | 
 42 | CONFINT_CACHE = ConfintStore(table_name='confint_cache', db_path=os.environ.get('CONFINT_DB', None))
 43 | 
 44 | 
 45 | def create_permuted_df(labels: [str], n_positive: int, n_permut: int, random_state: int = None):
 46 |     if random_state:
 47 |         np.random.seed(random_state)
 48 | 
 49 |     n_negative = len(labels) - n_positive
 50 |     arr = np.repeat(np.array([[1] * n_positive + [0] * n_negative]), n_permut, axis=0)
 51 | 
 52 |     # creates a copy -> slow
 53 |     arr = np.apply_along_axis(np.random.permutation, axis=1, arr=arr)
 54 | 
 55 |     return pd.DataFrame(arr, columns=labels)
 56 | 
 57 | 
 58 | def permute_picking(
 59 |         trait: str,
 60 |         tree: ScoaryTree,
 61 |         label_to_trait: pd.Series | dict,
 62 |         result_df: pd.DataFrame,
 63 |         genes_bool_df: pd.DataFrame,
 64 |         n_permut: int,
 65 |         random_state: int = None,
 66 | ) -> np.array:
 67 |     if type(label_to_trait) is dict:
 68 |         label_to_trait = pd.Series(label_to_trait, dtype='boolean')
 69 |     n_tot = len(label_to_trait)
 70 |     n_pos = sum(label_to_trait)
 71 |     n_neg = n_tot - n_pos
 72 |     labels = label_to_trait.keys()
 73 | 
 74 |     n_reused = 0
 75 | 
 76 |     pvals = []
 77 |     for _, row in result_df.iterrows():
 78 |         label_to_gene = genes_bool_df.loc[row.Gene]
 79 |         unique_topology = tree.uniquify(label_to_gene)
 80 | 
 81 |         is_positively_correlated = row.supporting >= row.opposing
 82 |         estimator = (row.supporting if is_positively_correlated else row.opposing) / row.contrasting
 83 |         n_pos_assoc = n_pos if is_positively_correlated else n_neg
 84 | 
 85 |         permuted_estimators = CONFINT_CACHE.get(unique_topology, n_pos_assoc, n_permut)
 86 |         if permuted_estimators is None:
 87 |             permuted_df = create_permuted_df(
 88 |                 labels=labels, n_positive=n_pos_assoc,
 89 |                 n_permut=n_permut, random_state=random_state
 90 |             )
 91 |             max_contr, max_suppo, max_oppos = pick(
 92 |                 tree=tree.to_list, label_to_trait_a=label_to_gene,
 93 |                 trait_b_df=permuted_df, calc_pvals=False
 94 |             )
 95 | 
 96 |             permuted_estimators = max_suppo / max_contr
 97 |             CONFINT_CACHE.set(unique_topology, n_pos_assoc, n_permut, permuted_estimators)
 98 |         else:
 99 |             n_reused += 1
100 | 
101 |         pval = ((permuted_estimators >= estimator).sum() + 1) / (n_permut + 1)
102 |         pvals.append(pval)
103 | 
104 |     logger.debug(f'{trait}: reused {n_reused} out of {len(result_df)}')
105 | 
106 |     return pvals
107 | 


--------------------------------------------------------------------------------
/tests/test_scoary_tree.py:
--------------------------------------------------------------------------------
  1 | from init_tests import *
  2 | 
  3 | from scoary.scoary import *
  4 | 
  5 | from scoary.ScoaryTree import *
  6 | 
  7 | 
class TestTreeFunctions(TestCase):
    """Tests for ScoaryTree: construction, conversion, pruning, copying and uniquify."""

    def test_tree_from_list_to_list(self):
        """Round trip: nested list -> ScoaryTree -> nested list must return the input."""
        expected_result = get_json('../data/tetracycline/expected_result.json')['as_list']
        # convert to ScoaryTree
        scoary_tree = ScoaryTree.from_list(expected_result)
        # convert back to list
        list_tree = scoary_tree.to_list

        self.assertEqual(expected_result, list_tree)

    def test_tree_from_genes_df(self):
        """
        Check if old scoary generates the equivalent tree based on genes presence/absence
        """
        _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
        # convert to ScoaryTree
        scoary_tree = ScoaryTree.from_presence_absence(genes_df)
        # convert to list
        list_tree = scoary_tree.to_list
        # compare to Scoary 1
        expected_result = get_json('../data/tetracycline/expected_result.json')['as_list']

        self.assertTrue(is_equivalent_tree(expected_result, list_tree))

    def test_tree_from_newick_to_newick(self):
        """
        Check if newick tree is imported correctly
        """
        expected_result = get_json('../data/tetracycline/expected_result.json')['as_newick']
        scoary_tree = ScoaryTree.from_newick(newick=expected_result)
        newick = scoary_tree.to_newick()
        self.assertEqual(expected_result, newick)

    def test_prune(self):
        """After pruning, the tree's labels must equal the requested labels, in the same order."""
        scoary_tree = ScoaryTree.from_list(
            [[[[[[['1', '2'], ['3', '4']], '5'], '6'], '7'], '8'],
             [[[[['9', [['10', '11'], '12']], '13'], '14'], '15'], [['16', '17'], [['18', ['19', '20']], '21']]]]
        )
        prune_labels = ['1', '2', '3', '18', '19', '21']
        pruned_tree = scoary_tree.prune(labels=prune_labels)
        real_labels = pruned_tree.labels()
        self.assertEqual(real_labels, prune_labels)

    def test_copy(self):
        """A copied tree must have the same structure and labels but consist of distinct node objects."""
        scoary_tree = ScoaryTree.from_list(
            [[[[[[['1', '2'], ['3', '4']], '5'], '6'], '7'], '8'],
             [[[[['9', [['10', '11'], '12']], '13'], '14'], '15'], [['16', '17'], [['18', ['19', '20']], '21']]]]
        )
        # NOTE(review): both copies below are made with copy_nonrecursive(); judging by the variable
        # names, the first one was presumably meant to exercise a different (recursive) copy
        # method - confirm against ScoaryTree.
        copied_tree = scoary_tree.copy_nonrecursive()
        nonrec_copied_tree = scoary_tree.copy_nonrecursive()

        def confirm_copy(t1: ScoaryTree, t2: ScoaryTree):
            """Recursively assert that t2 mirrors t1 without sharing any node object."""
            self.assertFalse(t1 is t2)
            if t1.is_leaf:
                self.assertTrue(t2.is_leaf)
                self.assertTrue(t1.label == t2.label)
            else:
                self.assertFalse(t2.is_leaf)
                confirm_copy(t1.left, t2.left)
                confirm_copy(t1.right, t2.right)

        confirm_copy(scoary_tree, copied_tree)
        confirm_copy(scoary_tree, nonrec_copied_tree)
        # sanity check of the helper itself: comparing a tree with itself must fail
        with self.assertRaises(AssertionError):
            confirm_copy(scoary_tree, scoary_tree)

    def test_prune_nonrecursive(self):
        """Same expectation as test_prune, but for prune_nonrecursive()."""
        scoary_tree = ScoaryTree.from_list(
            [[[[[[['1', '2'], ['3', '4']], '5'], '6'], '7'], '8'],
             [[[[['9', [['10', '11'], '12']], '13'], '14'], '15'], [['16', '17'], [['18', ['19', '20']], '21']]]]
        )
        prune_labels = ['1', '2', '3', '18', '19', '21']
        pruned_tree = scoary_tree.prune_nonrecursive(labels=prune_labels)
        real_labels = pruned_tree.labels()
        self.assertEqual(real_labels, prune_labels)

    def test_uniquivy(self):
        """Trees that differ only in the order of siblings must produce the same uniquify() string."""
        label_to_trait = {'X': True, ' ': False}

        def apply(tree):
            """Build a ScoaryTree from a nested list and return its uniquify() string."""
            return ScoaryTree.from_list(tree).uniquify(label_to_trait)

        expected_result = '(((01)1)(01))'
        # all of these are the same tree with children swapped at various nodes
        for tree in (
                [['X', ' '], ['X', [' ', 'X']]],
                [[' ', 'X'], ['X', [' ', 'X']]],
                [[' ', 'X'], [[' ', 'X'], 'X']],
                [['X', [' ', 'X']], ['X', ' ']],
                [['X', ['X', ' ']], ['X', ' ']],
                [['X', ['X', ' ']], [' ', 'X']],
        ):
            unique_string = apply(tree)
            self.assertEqual(expected_result, unique_string)

        # a tree with different trait values at the leaves must give a different string
        self.assertNotEqual(
            expected_result,
            apply([['X', ' '], ['X', [' ', ' ']]])
        )
106 | 


--------------------------------------------------------------------------------
/tests/test_upgma.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | from init_tests import *
  4 | 
  5 | from unittest import TestCase
  6 | import numpy as np
  7 | from biotite.sequence.phylo import upgma as _biotite_upgma
  8 | from biotite.sequence.phylo.tree import TreeNode as BiotiteTreeNode
  9 | 
 10 | from scipy.cluster.hierarchy import linkage, to_tree, ClusterNode
 11 | 
 12 | from scoary.upgma import upgma as scoary_upgma
 13 | from scoary.ScoaryTree import ScoaryTree
 14 | 
 15 | 
 16 | def biotite_upgma(tree: BiotiteTreeNode, labels: [str]) -> ScoaryTree:
 17 |     def convert(node: BiotiteTreeNode) -> ScoaryTree:
 18 |         """recursive function"""
 19 |         if node.is_leaf():
 20 |             return ScoaryTree(label=str(node_to_label[node]))
 21 |         else:
 22 |             return ScoaryTree(left=convert(node.children[0]), right=convert(node.children[1]))
 23 | 
 24 |     node_to_label: {BiotiteTreeNode: str} = {node: label for node, label in zip(tree.leaves, labels)}
 25 |     return convert(tree.root)
 26 | 
 27 | 
 28 | def scipy_upgma(distances, labels: [str]):
 29 |     """
 30 |     scipy.cluster.hierarchy.linkage: method=’average’ is called the UPGMA algorithm!
 31 |     """
 32 | 
 33 |     def convert(node: ClusterNode) -> ScoaryTree:
 34 |         """recursive function"""
 35 |         if node.count == 1:  # is_leaf
 36 |             return ScoaryTree(label=str(node_to_label[node.id]))
 37 |         else:
 38 |             return ScoaryTree(left=convert(node.left), right=convert(node.right))
 39 | 
 40 |     node_to_label: {BiotiteTreeNode: str} = dict(enumerate(labels))
 41 | 
 42 |     Z = linkage(distances, method='average')
 43 |     tree = to_tree(Z, False)
 44 |     return convert(tree)
 45 | 
 46 | 
class Test(TestCase):
    """Compare scoary's UPGMA implementation with the ones from biotite and scipy."""

    def test_upgma(self):
        """All three implementations must produce equivalent trees for a small hand-made matrix."""
        distances = np.array([
            [0, 1, 7, 7, 9],
            [1, 0, 7, 6, 8],
            [7, 7, 0, 2, 4],
            [7, 6, 2, 0, 3],
            [9, 8, 4, 3, 0],
        ])
        labels = [f'l{i}' for i in range(5)]

        _biotite_tree = _biotite_upgma(distances)
        biotite_tree = biotite_upgma(_biotite_tree, labels=labels).to_list

        scipy_tree = scipy_upgma(distances, labels).to_list

        distances_df = pd.DataFrame(distances, columns=labels)
        scoary_tree = scoary_upgma(distances_df)

        print(biotite_tree)
        print(scoary_tree)
        print(scipy_tree)

        assert is_equivalent_tree(biotite_tree, scipy_tree)
        assert is_equivalent_tree(biotite_tree, scoary_tree)

    def test_scoary(self, size=20, n_tests=1000):
        """
        Compare scoary's and biotite's UPGMA on random symmetric matrices.

        The test tolerates disagreement in fewer than 5 % of the n_tests random matrices.
        No seed is set, so the matrices differ between runs.
        """
        labels = [f'l{i}' for i in range(size)]

        n_failures = 0
        for i in range(n_tests):
            matrix = np.random.randint(0, 2000, size=(size, size))
            # averaging with the transpose makes the matrix symmetric (the diagonal is not zeroed)
            symmetrical_matrix = (matrix + matrix.T) / 2

            distances_df = pd.DataFrame(symmetrical_matrix, columns=labels)
            scoary_tree = scoary_upgma(distances_df)

            _biotite_tree = _biotite_upgma(symmetrical_matrix)
            biotite_tree = biotite_upgma(_biotite_tree, labels=labels).to_list

            if not is_equivalent_tree(biotite_tree, scoary_tree):
                print('no match:')
                print(f'  {biotite_tree=}')
                print(f'   {scoary_tree=}')
                n_failures += 1

        print(f'{n_failures=} out of {n_tests} tests')
        print(n_failures / n_tests)
        self.assertLess(n_failures / n_tests, 0.05, f'Lots of failures, wtf?')

    def test_scipy(self, size=20, n_tests=1000):
        """
        Compare the scipy-based helper with biotite's UPGMA on random symmetric matrices.

        Original author's observation: 0 % agreement!
        Evidently, scipy's upgma implementation works rather differently!

        NOTE(review): scipy_upgma() hands the square matrix directly to linkage(). According to
        the scipy documentation, linkage() treats a 2-D array as observation vectors and expects
        distances in condensed 1-D form, which likely explains the disagreement - confirm.
        """
        labels = [f'l{i}' for i in range(size)]

        n_failures = 0
        for i in range(n_tests):
            matrix = np.random.randint(0, 2000, size=(size, size))
            # averaging with the transpose makes the matrix symmetric (the diagonal is not zeroed)
            symmetrical_matrix = (matrix + matrix.T) / 2

            _biotite_tree = _biotite_upgma(symmetrical_matrix)
            biotite_tree = biotite_upgma(_biotite_tree, labels=labels).to_list

            scipy_tree = scipy_upgma(symmetrical_matrix, labels).to_list

            if not is_equivalent_tree(biotite_tree, scipy_tree):
                print('no match:')
                print(f'  {biotite_tree=}')
                print(f'    {scipy_tree=}')
                n_failures += 1

        print(f'{n_failures=} out of {n_tests} tests')
        print(n_failures / n_tests)
        self.assertLess(n_failures / n_tests, 0.05, f'Lots of failures, wtf?')
123 | 


--------------------------------------------------------------------------------
/scoary/final_overview.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os.path
  3 | 
  4 | import pandas as pd
  5 | import matplotlib as mpl
  6 | 
  7 | mpl.use('SVG')
  8 | # The SVG backend avoids this error message:
  9 | # ValueError: Image size of 700x165660 pixels is too large. It must be less than 2^16 in each direction.
 10 | # This allows for dendrograms with at least 20'000 traits
 11 | 
 12 | import mgwas_data_exploration_app.main as exploration_app
 13 | 
 14 | logger = logging.getLogger('scoary.final_overview')
 15 | 
# Presentation settings for the three summary scores: legend text, matplotlib marker,
# HTML marker and colour. create_final_overview() passes this dict to the exploration app
# (as config={'scores': ...} to copy_app and as scores_config to final_plot).
# NOTE(review): the keys presumably correspond to column names of summary_df - confirm.
SCORES_CONFIG = {
    "best_fisher_q": {
        "legend": "Fisher's <i>q</i>-value",
        "marker-matplotlib": "$f$",
        "marker-html": "<i>f</i>",
        "color": "forestgreen"
    },
    "best_empirical_p": {
        "legend": "Empirical <i>p</i>-value",
        "marker-matplotlib": "$e$",
        "marker-html": "<i>e</i>",
        "color": "mediumpurple"
    },
    "best_fq*ep": {
        "legend": "<i>fq*ep</i> score",
        "marker-matplotlib": "*",
        "marker-html": "*",
        "color": "crimson"
    }
}
 36 | 
 37 | 
 38 | def create_final_overview(
 39 |         summary_df: pd.DataFrame,
 40 |         traits_df: pd.DataFrame,
 41 |         numeric_df: pd.DataFrame,
 42 |         outdir: str,
 43 |         trait_info_df: pd.DataFrame = None,
 44 |         isolate_info_df: pd.DataFrame = None,
 45 |         force_binary_clustering: bool = False,
 46 |         symmetric: bool = True,
 47 |         distance_metric: str = 'jaccard',
 48 |         linkage_method: str = 'ward',
 49 |         optimal_ordering: bool = True,
 50 |         corr_method: str = 'pearson'
 51 | ):
 52 |     # copy files from exploration app
 53 |     logger.info('Copying exploration app...')
 54 |     exploration_app.copy_app(outdir, config={'scores': SCORES_CONFIG})
 55 | 
 56 |     if isolate_info_df is not None:
 57 |         logger.info('Adding isolate_info.tsv...')
 58 |         isolate_info_df.to_csv(f'{outdir}/isolate_info.tsv', sep='\t')
 59 | 
 60 |     logger.debug('Adding preliminary summary.tsv...')
 61 |     summary_df.index.name = 'Trait'
 62 |     summary_df.to_csv(f'{outdir}/summary_orig.tsv', sep='\t')
 63 | 
 64 |     # append trait info
 65 |     if trait_info_df is not None:
 66 |         logger.debug('Adding trait_info_df to summary.tsv...')
 67 |         summary_df_index = list(summary_df.index)
 68 |         summary_df = summary_df \
 69 |             .merge(trait_info_df, left_index=True, right_index=True, how='left', copy=False) \
 70 |             .reindex(summary_df_index)  # merging destroys index order
 71 |         summary_df.index.name = 'Trait'
 72 |         summary_df.to_csv(f'{outdir}/summary.tsv', sep='\t')
 73 | 
 74 |     if len(summary_df) > 1:
 75 |         logger.info('Calculating dendrogram linkage matrix...')
 76 |         if numeric_df is None or force_binary_clustering:
 77 |             logger.info(f'Calculating dendrogram based on binary data using jaccard distances...')
 78 |             linkage_matrix, labels = exploration_app.calculate_linkage_matrix_from_binary(
 79 |                 summary_df=summary_df,
 80 |                 traits_df=traits_df,
 81 |                 symmetric=symmetric,
 82 |                 distance_metric=distance_metric,
 83 |                 linkage_method=linkage_method,
 84 |                 optimal_ordering=optimal_ordering
 85 |             )
 86 |         else:
 87 |             logger.info(f'Calculating dendrogram based on correlation of numeric features...')
 88 |             linkage_matrix, labels = exploration_app.calculate_linkage_matrix_from_numeric(
 89 |                 summary_df=summary_df,
 90 |                 traits_df=numeric_df,
 91 |                 symmetric=symmetric,
 92 |                 scale=True,
 93 |                 corr_method=corr_method,
 94 |                 linkage_method=linkage_method,
 95 |                 optimal_ordering=optimal_ordering,
 96 |             )
 97 | 
 98 |         logger.info('Calculating dendrogram plot...')
 99 |         summary_df = exploration_app.final_plot(
100 |             linkage_matrix=linkage_matrix,
101 |             labels=labels,
102 |             summary_df=summary_df,
103 |             scores_config=SCORES_CONFIG,
104 |             workdir=outdir,
105 |             dendrogram_x_scale='linear',
106 |             scores_x_scale='manhattan'
107 |         )
108 | 
109 |         # save summary_df, ensure order matches plot
110 |         logger.info('Saving sorted summary.tsv...')
111 |         summary_df.index.name = 'Trait'
112 |         summary_df.to_csv(f'{outdir}/summary.tsv', sep='\t')
113 | 
114 |     if not os.path.isfile(f'{outdir}/summary.tsv'):
115 |         logger.debug('Moving summary_orig.tsv to summary.tsv...')
116 |         os.rename(f'{outdir}/summary_orig.tsv', f'{outdir}/summary.tsv')
117 | 


--------------------------------------------------------------------------------
/scoary/load_genes.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import pandas as pd
  3 | 
  4 | logger = logging.getLogger('scoary.load_genes')
  5 | 
  6 | 
  7 | def filter_df(df: pd.DataFrame, restrict_to: [str] = None, ignore: [str] = None) -> pd.DataFrame:
  8 |     if ignore:
  9 |         ignore = set(ignore)
 10 |         missing = ignore.difference(set(df.columns))
 11 |         assert len(missing) == 0, f'Some strains in ignore were not found: {missing=}'
 12 |         df = df[[c for c in df.columns if c not in ignore]]
 13 | 
 14 |     if restrict_to is not None:
 15 |         restrict_to = set(restrict_to)
 16 |         have_cols = set(df.columns)
 17 |         cols_missing = restrict_to.difference(have_cols)
 18 |         assert len(cols_missing) == 0, f'Some strains in restrict_to were not found:' \
 19 |                                        f'\n{cols_missing=}' \
 20 |                                        f'\n{restrict_to=}' \
 21 |                                        f'\n{have_cols=}'
 22 |         cols_dropped = restrict_to.difference(set(df.columns))
 23 |         logger.debug(f'Cols kept: {list(restrict_to)}')
 24 |         logger.debug(f'Cols dropped: {list(cols_dropped)}')
 25 |         df = df[[c for c in df.columns if c in restrict_to]]
 26 | 
 27 |     return df
 28 | 
 29 | 
 30 | def load_gene_count_file(
 31 |         path: str,
 32 |         delimiter: str,
 33 |         restrict_to: [str] = None,
 34 |         ignore: [str] = None
 35 | ) -> (pd.DataFrame, pd.DataFrame):
 36 |     """
 37 |     Load Roary-style gene count file with columns=strains and rows=genes
 38 | 
 39 |     :param path: Path to file
 40 |     :param delimiter: delimiter of the file
 41 |     :param restrict_to: columns to keep, will drop all other columns
 42 |     :param ignore: columns to ignore
 43 |     :return: genes_df (DataFrame, dtype: bool); columns: strains; index: genes
 44 |     """
 45 |     count_df = pd.read_csv(path, delimiter=delimiter, index_col=0)
 46 | 
 47 |     # remove columns that are not in traits_df
 48 |     if restrict_to is not None or ignore is not None:
 49 |         count_df = filter_df(count_df, restrict_to, ignore)
 50 | 
 51 |     # sanity checks
 52 |     assert count_df.columns.is_unique, f'{path=}: columns not unique'
 53 |     assert count_df.index.is_unique, f'{path=}: index not unique'
 54 |     assert not count_df.isna().values.any(), f'{path=}: contains NaN'
 55 | 
 56 |     # add metadata
 57 |     count_df.attrs['content_type'] = 'gene-count'
 58 | 
 59 |     # convert to bool
 60 |     binary_df = count_df >= 1
 61 | 
 62 |     # remove core- and unique genes
 63 |     row_sums = binary_df.sum(axis=1)
 64 |     binary_df = binary_df[(row_sums != 0) & (row_sums != len(binary_df.columns))]
 65 | 
 66 |     logger.debug(f'Loaded gene-count-df:\n{binary_df}')
 67 |     return count_df, binary_df
 68 | 
 69 | 
 70 | def load_gene_list_file(
 71 |         path: str,
 72 |         delimiter: str,
 73 |         restrict_to: [str] = None,
 74 |         ignore: [str] = None
 75 | ) -> (pd.DataFrame, pd.DataFrame):
 76 |     """
 77 |     Load Orthofinder-style gene list file with columns=strains and rows=genes
 78 | 
 79 |     :param path: Path to file
 80 |     :param delimiter: delimiter of the file
 81 |     :param restrict_to: columns to keep, will drop all other columns
 82 |     :param ignore: columns to ignore
 83 |     :return: genes_df (DataFrame, dtype: bool); columns: strains; index: genes
 84 |     """
 85 |     list_df = pd.read_csv(path, delimiter=delimiter, index_col=0, dtype=str)
 86 | 
 87 |     # remove columns that are not in traits_df
 88 |     if restrict_to is not None or ignore is not None:
 89 |         list_df = filter_df(list_df, restrict_to, ignore)
 90 | 
 91 |     # sanity checks
 92 |     assert list_df.columns.is_unique, f'{path=}: columns not unique'
 93 |     assert list_df.index.is_unique, f'{path=}: index not unique'
 94 | 
 95 |     # add metadata
 96 |     list_df.attrs['content_type'] = 'gene-list'
 97 | 
 98 |     # convert to bool
 99 |     binary_df = ~list_df.isna()
100 | 
101 |     # remove core- and unique genes
102 |     row_sums = binary_df.sum(axis=1)
103 |     binary_df = binary_df[(row_sums != 0) & (row_sums != len(binary_df.columns))]
104 | 
105 |     logger.debug(f'Loaded gene-list -df:\n{binary_df}')
106 |     return list_df, binary_df
107 | 
108 | 
def parse_params(orig_params: str) -> (str, str):
    """
    Parse a '<data_type>:<?delimiter>' specification.

    Only the first ':' separates data type and delimiter, so ':' itself can be used as
    delimiter ('gene-count::'). The data type is case-insensitive; the delimiter is
    returned exactly as given.

    :param orig_params: specification string, e.g. 'gene-count', 'gene-count:;' or 'gene-list:\\t'
    :return: tuple (data_type, delimiter); delimiter defaults to ',' if no ':' is present
    :raises AssertionError: if data_type is neither 'gene-count' nor 'gene-list'
    """
    error_message = f"""
{orig_params=} is poorly formatted.
Must be '<data_type>:<?delimiter>'.
  Possible values for data_type:  {{'gene-count', 'gene-list'}}  (default: gene-count)
  Possible values for delimiter:  any single character  (default: ',')
""".strip()

    # split at the first colon only; everything after it is the delimiter
    data_type, colon, delimiter = orig_params.partition(':')

    # lowercase the data type only: lowercasing the delimiter would change its meaning
    data_type = data_type.lower()

    if not colon:
        delimiter = ','

    assert data_type in {'gene-count', 'gene-list'}, error_message

    return data_type, delimiter
129 | 
130 | 
def load_genes(
        genes: str,
        gene_data_type: str,
        restrict_to: [str] = None,
        ignore: [str] = None
) -> (pd.DataFrame, pd.DataFrame):
    """
    Load a genes table with columns=strains and rows=genes.

    :param genes: Path to genes file
    :param gene_data_type: '<data_type>:<?delimiter>' specification, see parse_params
    :param restrict_to: columns to keep, will drop all other columns
    :param ignore: columns to ignore
    :return: tuple (genes_orig_df, genes_bool_df): the table as loaded and its boolean
        presence/absence version; the index of both is named 'Gene'
    """
    data_type, delimiter = parse_params(gene_data_type)

    # pick the loader that matches the data type
    loaders = {
        'gene-count': load_gene_count_file,
        'gene-list': load_gene_list_file,
    }
    if data_type not in loaders:
        raise AssertionError(f'Programming error: {data_type=} must be gene-count or gene-list!')
    genes_orig_df, genes_bool_df = loaders[data_type](genes, delimiter, restrict_to, ignore)

    # ensure the index name is always the same
    for loaded_df in (genes_orig_df, genes_bool_df):
        loaded_df.index.name = 'Gene'

    return genes_orig_df, genes_bool_df
157 | 


--------------------------------------------------------------------------------
/tests/test_analyze_trait.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | from init_tests import *
  4 | from datetime import datetime
  5 | from scoary.ScoaryTree import ScoaryTree
  6 | from scoary.load_genes import load_genes
  7 | from scoary.load_traits import load_traits
  8 | from scoary.analyze_trait import init_result_df, create_test_df, add_odds_ratio, pair_picking
  9 | 
 10 | 
 11 | def generate_fake_traits(genes_df: pd.DataFrame) -> {str: bool}:
 12 |     label_to_trait = {}
 13 |     label_to_trait.update({l: True for l in genes_df.columns[:11]})
 14 |     label_to_trait.update({l: False for l in genes_df.columns[89:]})
 15 |     return pd.Series(label_to_trait, dtype='boolean')
 16 | 
 17 | 
 18 | class TestScoary(TestCase):
 19 |     def test_create_result_df(self):
 20 |         _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
 21 |         result_df = init_result_df(genes_df, trait_series=generate_fake_traits(genes_df))
 22 |         self.assertEqual(
 23 |             result_df.columns.tolist(),
 24 |             ['Gene', 'g+t+', 'g+t-', 'g-t+', 'g-t-', '__contingency_table__', 'sensitivity', 'specificity']
 25 |         )
 26 | 
 27 |     def test_contingency_test(self):
 28 |         _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
 29 |         result_df = init_result_df(genes_df, trait_series=generate_fake_traits(genes_df))
 30 |         test_df = create_test_df(result_df=result_df)
 31 |         self.assertEqual(['__contingency_table__', 'fisher_p'], test_df.columns.tolist())
 32 |         print(f"Done: minpval={test_df.fisher_p.min()}")
 33 | 
 34 |     def test_odds_ratio(self):
 35 |         _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
 36 |         genes_df = genes_df[:100]  # only first 100 rows
 37 |         test_df = init_result_df(genes_df, generate_fake_traits(genes_df))
 38 | 
 39 |         # apply function
 40 |         test_df = add_odds_ratio(test_df)
 41 |         self.assertEqual(
 42 |             test_df.columns.tolist(),
 43 |             ['Gene', 'g+t+', 'g+t-', 'g-t+', 'g-t-', '__contingency_table__', 'sensitivity', 'specificity',
 44 |              'odds_ratio']
 45 |         )
 46 | 
 47 |         # calculate odds_ratio with fisher_exact
 48 |         fisher_ors = test_df.apply(
 49 |             lambda row: fisher_exact([[row['g+t+'], row['g+t-']], [row['g-t+'], row['g-t-']]])[0], axis=1)
 50 | 
 51 |         # check if result is identical
 52 |         for manual_or, fisher_or in zip(test_df['odds_ratio'], fisher_ors):
 53 |             self.assertTrue(is_equivalent(manual_or, fisher_or))
 54 | 
 55 |     def test_init_result_df_performance(self):
 56 |         _, genes_df = load_genes('../data/new_ds/N0.tsv', gene_data_type='gene-list:\t')
 57 |         ltt = generate_fake_traits(genes_df)
 58 |         start = datetime.now()
 59 |         result_df = init_result_df(genes_df, trait_series=ltt)
 60 |         end = datetime.now()
 61 |         print(result_df)
 62 |         print('took:', end - start)
 63 | 
 64 |     def test_tetracycline(self):
 65 |         _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
 66 |         _, traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,')
 67 |         trait_series = traits_df['Tetracycline_resistance']
 68 | 
 69 |         # calculate sensitivity and specificity
 70 |         test_df = init_result_df(
 71 |             genes_df,
 72 |             trait_series=pd.Series(
 73 |                 {l: bool(v) for l, v in trait_series.items() if v in (0, 1)},
 74 |                 dtype='boolean'
 75 |             )
 76 |         )
 77 |         # calculate odds_ratio
 78 |         test_df = add_odds_ratio(test_df)
 79 |         # calculate pairwise comparisons
 80 |         tree = ScoaryTree.from_list(get_json('../data/tetracycline/expected_result.json')['as_list'])
 81 |         assert set(tree.labels()) == set(genes_df.columns)
 82 |         test_df = pair_picking(test_df, genes_df, tree=tree, label_to_trait=trait_series)
 83 | 
 84 |         # load expected result from scoary 1
 85 |         expected_result = pd.read_csv('../data/tetracycline/fisher_permute100.results.csv')
 86 | 
 87 |         test_df.set_index('Gene', inplace=True)
 88 | 
 89 |         # check if result is identical
 90 |         for i, row in expected_result.iterrows():
 91 |             table = (row.Number_pos_present_in,
 92 |                      row.Number_neg_present_in,
 93 |                      row.Number_pos_not_present_in,
 94 |                      row.Number_neg_not_present_in)
 95 |             new_row = test_df.loc[row.Gene]
 96 |             new_table = tuple(int(new_row[c]) for c in ('g+t+', 'g+t-', 'g-t+', 'g-t-'))
 97 | 
 98 |             self.assertEqual(table, new_table)
 99 |             self.assertAlmostEqual(
100 |                 row.Odds_ratio, new_row.odds_ratio,
101 |                 msg=f'Failed to calculate odds_ratio for {row.Gene}: {row.Odds_ratio} != {new_row.odds_ratio}'
102 |             )
103 |             self.assertAlmostEqual(
104 |                 row.Sensitivity, new_row.sensitivity,
105 |                 msg=f'Failed to calculate sensitivity for {row.Gene}: {row.Odds_ratio} != {new_row.odds_ratio}'
106 |             )
107 |             self.assertAlmostEqual(
108 |                 row.Specificity, new_row.specificity,
109 |                 msg=f'Failed to calculate specificity for {row.Gene}: {row.Odds_ratio} != {new_row.odds_ratio}'
110 |             )
111 | 
112 |             xx = [
113 |                 (row.Max_Pairwise_comparisons, new_row.contrasting),
114 |                 (row.Max_supporting_pairs, new_row.supporting),
115 |                 (row.Max_opposing_pairs, new_row.opposing),
116 |                 (row.Best_pairwise_comp_p, new_row.best),
117 |                 (row.Worst_pairwise_comp_p, new_row.worst)
118 |             ]
119 |             try:
120 |                 self.assertEqual(row.Max_Pairwise_comparisons, new_row.contrasting)
121 |                 self.assertEqual(row.Max_supporting_pairs, new_row.supporting)
122 |                 self.assertEqual(row.Max_opposing_pairs, new_row.opposing)
123 |                 self.assertAlmostEqual(row.Best_pairwise_comp_p, new_row.best)
124 |                 self.assertAlmostEqual(row.Worst_pairwise_comp_p, new_row.worst)
125 |             except Exception as e:
126 |                 print(i, row.Gene, xx)
127 |                 self.fail(msg=str(e))
128 | 


--------------------------------------------------------------------------------
/tests/test_final_overview.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from subprocess import call
  3 | from init_tests import *
  4 | from scoary.final_overview import create_final_overview
  5 | from scoary.load_traits import load_binary
  6 | from scoary.utils import pd, AnalyzeTraitNamespace
  7 | 
# If True, tearDown() calls replace_copies_with_symlinks() after every test.
REPLACE_COPIES_WITH_SYMLINKS = True
  9 | 
 10 | 
 11 | def replace_copies_with_symlinks():
 12 |     def repl(fn, relpath='../..', subdir='app'):
 13 |         src = f'{relpath}/scoary/templates/{fn}'
 14 |         target = f'../TEST_OUTPUT/{subdir}/{fn}'
 15 |         if os.path.isfile(target):
 16 |             os.remove(target)
 17 |         os.symlink(src=src, dst=target)
 18 | 
 19 |     for file in ['trait.html', 'overview.html']:
 20 |         repl(file, relpath='..', subdir='')
 21 |     for file in ['config.json', 'favicon.svg', 'overview.css', 'overview.js', 'trait.css', 'trait.js']:
 22 |         repl(file)
 23 | 
 24 | 
 25 | class Test(TestCase):
 26 |     def setUp(self) -> None:
 27 |         self.temp_dir = get_tempdir_path()
 28 |         self.fake_ns = AnalyzeTraitNamespace()
 29 |         self.fake_ns.outdir = self.temp_dir
 30 |         self.fake_ns.trait_info_df = None
 31 | 
 32 |         os.makedirs(self.temp_dir, exist_ok=True)
 33 |         call(f'rm -rf {self.temp_dir}/*', shell=True)
 34 |         for dir_ in ['app', 'traits', 'logs']:
 35 |             os.makedirs(f'{self.temp_dir}/{dir_}', exist_ok=True)
 36 | 
 37 |     def tearDown(self) -> None:
 38 |         if REPLACE_COPIES_WITH_SYMLINKS:
 39 |             replace_copies_with_symlinks()
 40 |         print(f'Open file://{self.temp_dir} to see the result!')
 41 |         print(f'To clean up, run "rm -r {self.temp_dir}"')
 42 | 
 43 |     def test_simple(self):
 44 |         summary_df = pd.DataFrame(**{'index': ['Compound_242', 'Compound_267', 'Compound_286'],
 45 |                                      'columns': ['best_fisher_p', 'best_fisher_q', 'best_empirical_p', 'best_fq*ep'],
 46 |                                      'data': [[0.574065934065931, 0.438405797101457, 0.03596403596403, 1.576684e-02],
 47 |                                               [0.432940190858691, 0.266793137470672, 0.13386613386613, 3.571457e-02],
 48 |                                               [0.194418465932588, 7.98120572982e-08, 0.02097902097902, 1.674379e-09]]})
 49 |         traits_df = pd.DataFrame(**{
 50 |             'index': ['FAM10789-i1-1.1', 'FAM1079-i1-1.1', 'FAM10792-i1-1.1', 'FAM11142-i1-1.1', 'FAM11194-i1-1.1',
 51 |                       'FAM11199-i1-1.1', 'FAM11206-i1-1.1', 'FAM1233-i1-1.1', 'FAM1301-i1-1.1', 'FAM13493-i1-1.1'],
 52 |             'columns': ['Compound_242', 'Compound_267', 'Compound_286'],
 53 |             'data': [[pd.NA, True, True], [pd.NA, pd.NA, True], [pd.NA, pd.NA, True], [pd.NA, False, False],
 54 |                      [pd.NA, True, True], [pd.NA, pd.NA, True], [True, pd.NA, True], [pd.NA, False, True],
 55 |                      [False, pd.NA, True], [pd.NA, pd.NA, True]]},
 56 |                                  dtype='boolean')
 57 |         self.fake_ns.traits_df = traits_df  # load_binary('../data/new_ds/LC-binary.tsv', '\t')
 58 |         create_final_overview(summary_df=summary_df, ns=self.fake_ns)
 59 | 
 60 |     def test_larger(self):
 61 |         self.fake_ns.traits_df = load_binary('../data/new_ds/LC-binary.tsv', '\t')
 62 |         summary_df = pd.DataFrame(index=self.fake_ns.traits_df.columns)
 63 |         for col in ['best_fisher_p', 'best_fisher_q', 'best_empirical_p']:
 64 |             summary_df[col] = np.random.rand(1, len(self.fake_ns.traits_df.columns))[0]
 65 |         create_final_overview(summary_df=summary_df, ns=self.fake_ns)
 66 | 
 67 |     def test_largest(self):
 68 |         # This function was used to determine the desired recursion limit in plot_dendrogram
 69 |         n_traits, n_isolates = 100, 44  # 10000, 44
 70 |         self.fake_ns.traits_df = pd.DataFrame(
 71 |             np.random.rand(n_isolates, n_traits) > 0.5,
 72 |             index=[f'I{i}' for i in range(n_isolates)],
 73 |             columns=[f'T{i}' for i in range(n_traits)],
 74 |         )
 75 |         summary_df = pd.DataFrame(index=self.fake_ns.traits_df.columns)
 76 |         for col in ['best_fisher_p', 'best_fisher_q', 'best_empirical_p']:
 77 |             summary_df[col] = np.random.rand(1, len(self.fake_ns.traits_df.columns))[0]
 78 | 
 79 |         create_final_overview(summary_df=summary_df, ns=self.fake_ns)
 80 | 
 81 |     def test_real(self):
 82 |         # Find the best way of plotting dendrogram
 83 |         # summary_df = pd.read_csv(f'../data/summary__.tsv', sep='\t', index_col=0)
 84 |         summary_df = pd.read_csv(f'../TMP/TEST_OUTPUT_real_restricted/summary.tsv', sep='\t', index_col=0)
 85 |         self.fake_ns.traits_df = pd.read_csv('../data/traits_df.tsv', sep='\t', index_col=0, dtype='str') == 'True'
 86 |         create_final_overview(
 87 |             summary_df=summary_df,
 88 |             traits_df=self.fake_ns.traits_df,
 89 |             outdir=self.fake_ns.outdir
 90 |         )
 91 | 
 92 | 
 93 |     def test_understand_jaccard(self):
 94 |         from scipy.spatial.distance import cdist
 95 | 
 96 |         a = np.array([[0, 0, 0, 0, 0, 0, 1, 0, -1, 0],
 97 |                       [1, 0, 0, -1, 1, 0, 0, -1, 0, 0],
 98 |                       [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ],
 99 |                       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ],
100 |                       [1, 1, 1, -1, 1, 1, 1, 1, 1, 1]], dtype=int)
101 |         print('input:\n', a)
102 | 
103 |         def conf(arr):
104 |             aa, nn = arr.shape
105 |             res = np.zeros(shape=(aa, aa))
106 | 
107 |             for a in range(aa):
108 |                 for b in range(aa):
109 |                     if a == b:
110 |                         break
111 | 
112 |                     n = nn - sum(np.logical_and(arr[a] == 0, arr[b] == 0))
113 | 
114 |                     # x = np.abs(arr[a] - arr[b]) / 2
115 |                     # y = np.abs(arr[a] - (0 - arr[b])) / 2
116 |                     # r = min(x.sum(), y.sum()) / n
117 |                     # print(a, b, x, y, n, r)
118 | 
119 |                     x = arr[a] != arr[b]
120 |                     y = arr[a] != (0 - arr[b])
121 |                     r = min(x.sum(), y.sum()) / n
122 |                     print(a, b, x, y, n, r)
123 | 
124 |                     res[a, b] = r
125 |                     res[b, a] = r
126 |             print('res')
127 |             print(res)
128 | 
129 |         conf(a)
130 | 
131 |         def x(a):
132 |             a = np.nan_to_num(a, nan=0.5)
133 |             b = 0 - a
134 | 
135 |             d1 = cdist(a, a, metric='jaccard')
136 |             d2 = cdist(a, b, metric='jaccard')
137 |             d = np.minimum(d1, d2)
138 | 
139 |             # print('a')
140 |             # print(a)
141 |             # print('b')
142 |             # print(b)
143 |             # print('d1')
144 |             # print(d1)
145 |             # print('d2')
146 |             # print(d2)
147 |             print('d')
148 |             print(d)
149 | 
150 |         x(a)
151 | 
152 | 
class ReplaceCopiesWithSymlinks(TestCase):
    """Allows running replace_copies_with_symlinks() on its own through the test runner."""

    def test_replace_copies_with_symlinks(self):
        """Call replace_copies_with_symlinks() directly; nothing is asserted."""
        replace_copies_with_symlinks()
156 | 


--------------------------------------------------------------------------------
/scoary/vcf2scoary.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Script to search vcf files for mutations within specific coordinates
  5 | # Input: 
  6 | # -A vcf file
  7 | #
  8 | # Output:
  9 | # -A Roary-like file with mutations sorted in rows, strains as columns and presence/absence in cells
 10 | # -Columns: Chromosome, Position, variant (eg C->T), type (eg missense, synonymous, frameshift etc)
 11 | 
 12 | 
 13 | # Reading VCF
 14 | # File metainfo starts as ##key=value
 15 | # These are always formed and should be caught
 16 | # example ##fileformat=VCFv4.3 - give warning if format is off
 17 | # Columns 8 MANDATORY
 18 | # CHROM POS ID REF ALT QUAL FILTER INFO
 19 | # OPTIONAL COLUMNS
 20 | # FORMAT SAMPLE1 SAMPLE2 etc
 21 | # All data lines are tab-delimited
 22 | # CHROM : string, no whitespace
 23 | # POS : integer. Can have many lines with same pos. Pos=0 or N+1 for telomere positions
 24 | # ID : semicolon-delimited list of strings
 25 | # REF : string, ACGTN (can be multiple)
 26 | # ALT : comma-separated list, ACGTN* (* = allele is missing due to overlapping deletion)
 27 | # (NOTE: Suggest splitting ALT variants into different lines to preserve binarity)
 28 | # QUAL : float
 29 | # FILTER : PASS or semicolon-delimited list
 30 | # INFO : semicolon-delimited list of key=value pairs or flags
 31 | # FORMAT (optional) : colon-delimited list.
 32 | # Genotype fields - Genotype always first field
 33 | # GT encoded as allele values separated by | or /. 0 = reference. 1 = first ALT. 2 = second alt etc
 34 | # NOTE: Haploid calls (bacteria) have only 1 value
 35 | # NOTE: / means genotype unphased. | means genotype phased
 36 | # INFO field SVtypes : DELetion, INSertion, DUPlication, INVersion, CNV
 37 | 
 38 | import sys
 39 | import argparse
 40 | import os
 41 | import csv
 42 | import re
 43 | import traceback
 44 | 
# Module metadata; __version__ is what the --version command line flag prints.
__version__ = '0.1b'
__author__ = 'Ola Brynildsrud'
# NOTE(review): the conventional spelling is `__credits__`; the name is left unchanged here.
__credits = ['Ola Brynildsrud']
__email__ = 'olbb@fhi.no'
 49 | 
 50 | def main():
 51 |     """
 52 |     Converts VCF files (version 4.x) to Scoary format
 53 |     """
 54 |     ##########################################################################
 55 |     # Parse command line arguments
 56 | 
 57 |     parser = argparse.ArgumentParser(
 58 |         description='This script takes in vcf files and creates a '
 59 |                     'presence/absence matrix of mutations in the '
 60 |                     'Roary/Scoary format',
 61 |         epilog='by Ola Brynildsrud (olbb@fhi.no)')
 62 |     parser.add_argument(
 63 |         '--out',
 64 |         action='store',
 65 |         default='./mutations_presence_absence.csv',
 66 |         help='The path to the output file')
 67 |     parser.add_argument(
 68 |         '--types',
 69 |         action='store',
 70 |         default='ALL',
 71 |         help='The types of variants to include in the output. NOTE: This '
 72 |              'works if TYPE=XX can be found in the INFO column of the vcf '
 73 |              'file. The special keyword ALL includes all types. This is '
 74 |              'the default setting. Common types are snp, mnp, ins, del '
 75 |              'and complex. Give as comma-separated list. '
 76 |              'Example: --types snp,ins,del')
 77 |     parser.add_argument(
 78 |         '--version',
 79 |         action='version',
 80 |         version=__version__)
 81 |     parser.add_argument(
 82 |         '--force',
 83 |         action='store_true',
 84 |         default=False,
 85 |         help='Force overwriting of output file. (If it already '
 86 |              'exists)')
 87 |     parser.add_argument(
 88 |         'vcf',
 89 |         action='store',
 90 |         metavar='<VCF_file>',
 91 |         help='The VCF file to convert to Roary/Scoary format')
 92 | 
 93 |     args = parser.parse_args()
 94 |     if args.types != "ALL":
 95 |         args.types = args.types.split(",")
 96 | 
 97 |     if os.path.isfile(args.out) and not args.force:
 98 |         sys.exit("Outfile already exists. Change name of outfile or "
 99 |                  "run with --force")
100 |     if not os.path.isfile(args.vcf):
101 |         sys.exit("Unable to locate input file %s" % args.vcf)
102 | 
103 |     with open(args.vcf) as vcffile, open(args.out,'w') as outfile:
104 |         lines = csv.reader(vcffile, delimiter='\t', quotechar='"')
105 |         metainfo = {"##INFO" : {},
106 |                     "##FILTER" : {},
107 |                     "##FORMAT" : {},
108 |                     "##ALT" : {},
109 |                     "##contig" : {},
110 |                     "##META" : {},
111 |                     "##SAMPLE" : {},
112 |                     "##PEDIGREE" : {}
113 |         }
114 |         #for line in lines:
115 |         while True:
116 |             try:
117 |                 line = next(lines)
118 |             except StopIteration:
119 |                 print(traceback.print_exc())
120 |                 sys.exit("ERROR: There appears to be only metainformation "
121 |                          "(lines starting with ##) in your VCF file.")
122 |             # Get metainfo from file
123 |             if line[0][:2] == '##':
124 |                 infoline = re.split('=',line[0], maxsplit=1)
125 |                 # Capture list output for complex tags
126 |                 if infoline[0] in metainfo:
127 |                     ID=re.search(r'ID=(\w+)',infoline[1]).group(1)
128 |                     infolist = re.split(',(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)',infoline[1].strip("<>"))
129 |                     metainfo[infoline[0]][ID] = {}
130 |                     # Enter all elements in infolist into appropriate dic
131 |                     for e in infolist:
132 |                         esplit = e.split("=")
133 |                         metainfo[infoline[0]][ID][esplit[0]] = esplit[1]
134 | 
135 |                 else:
136 |                     metainfo[infoline[0]] = infoline[1]
137 |             else:
138 |                 # Have reached the data section of the file
139 |                 data = {"header": line}
140 |                 break
141 | 
142 |         try:
143 |             vcfversion = metainfo["##fileformat"].split("v")[1]
144 |             if int(vcfversion[0]) != 4:
145 |                 print("WARNING: A VCF format other than 4.x detected."
146 |                       " File parsing may proceed with errors.")
147 |             else:
148 |                 print("VCF version %s detected" % vcfversion)
149 |         except:
150 |             print("WARNING: Could not detect VCF format. Expected "
151 |                   "v4.x. File parsing may proceed with errors.")
152 |             print(traceback.print_exc())
153 | 
154 |         # Check that genotype fields have a single allele
155 |         if metainfo["##FORMAT"]["GT"]["Number"] != "1":
156 |             sys.exit("ERROR: Expected a single allele per genotype. Scoary "
157 |                      "only works for haploid organisms.")
158 | 
159 |         # Have now caught all metainformation. Now get column information       
160 |         #header = next(line)
161 |         #print header
162 |         data["header"] = data["header"][:9] + ["DUMMY"] + data["header"][9:]
163 |         outfile.write(','.join('"' + c + '"' for c in data["header"]) + '\n')
164 | 
165 |         while True:
166 |             try:
167 |                 line = next(lines)
168 |             except StopIteration:
169 |                 print("Reached the end of the file")
170 |                 sys.exit(0)
171 |             # Check if line is allowed:
172 |             if args.types != "ALL":
173 |                 vartype = re.search(r'TYPE=(\w+)',line[7]).group(1)
174 |                 if vartype not in args.types:
175 |                     continue
176 |             
177 |             # Split line if ALT contains more than one variant
178 |             if "," in line[4]:
179 |                 orgline = line[:]
180 |                 alts = line[4].split(",")
181 |                 c = 1
182 |                 for a in alts:
183 |                      newline = orgline[:]
184 |                      newline[4] = a
185 |                      # Only get GT
186 |                      newline[9:] = \
187 |                          [cell.split(":")[0] for cell in orgline[9:]]
188 |                      # Fix dummy comparisons
189 |                      newline[9:] = fixdummy(newline[9:], c)
190 |                      newline = newline[:9] + ["True"] + newline[9:]
191 |                      c += 1
192 |                      writeLine(newline, outfile)
193 | 
194 |             # Genotype fields need to be 0 or 1
195 |             # GT is always first in colon-separated list
196 |             else:
197 |                 newline = line[:9] + ["False"] + line[9:]
198 |                 writeLine(newline, outfile)
199 | 
def writeLine(line, outfile):
    """
    Write one record as a fully quoted, comma-separated line.

    The first nine cells are written unchanged; from every later cell only the part before the
    first colon is kept. Double quotes inside a cell are escaped by doubling them, so the output
    stays valid CSV.

    :param line: list of strings, one per column
    :param outfile: writable text file object
    """
    writeline = line[:9] + [cell.split(":")[0] for cell in line[9:]]
    # QUOTE_ALL + '\n' gives "a","b",...\n and lets the csv module handle the escaping
    csv.writer(outfile, quoting=csv.QUOTE_ALL, lineterminator='\n').writerow(writeline)
203 | 
def fixdummy(line, c):
    """
    Turn genotype calls into presence/absence of allele number ``c``.

    :param line: list of genotype strings (allele numbers, or "." for missing data)
    :param c: allele number that counts as present
    :return: new list with "1" where the genotype equals ``c`` and "0" everywhere else
             (missing data is treated as absent). The input list is not modified.

    If a genotype is neither "." nor an integer, the partially converted list and ``c`` are
    printed and the process exits with status -1.
    """
    converted = list(line)
    try:
        for position, genotype in enumerate(line):
            # Missing data get entered as reference / no presence
            present = genotype != "." and int(genotype) == c
            converted[position] = "1" if present else "0"
    except ValueError:
        print(converted, c)
        sys.exit(-1)
    return converted
219 | 
########
# MAIN #
########
# Run the converter only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
225 | 


--------------------------------------------------------------------------------
/scoary/ScoaryTree.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from functools import cached_property
  4 | from typing import Optional, Callable
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | from scipy.spatial import distance
  9 | 
 10 | from .newick import parse_newick
 11 | from .upgma import upgma
 12 | 
 13 | 
 14 | class ScoaryTree:
 15 |     left: Optional[ScoaryTree] = None
 16 |     right: Optional[ScoaryTree] = None
 17 |     label: Optional[str] = None
 18 |     is_leaf: bool = False
 19 |     _values: Optional[np.ndarray] = None
 20 |     _prune = False
 21 | 
 22 |     def __init__(self, left: ScoaryTree = None, right: ScoaryTree = None, label: str = None):
 23 |         if left is None and right is None:
 24 |             self.is_leaf = True
 25 |             assert type(label) is str, f'A valid node has a label! {label=}'
 26 |             self.label = label
 27 |         else:
 28 |             self.is_leaf = False
 29 |             assert type(left) is ScoaryTree and type(
 30 |                 right) is ScoaryTree, f'A valid tree has 0 or 2 children! {left=} {right=}'
 31 |             self.left = left
 32 |             self.right = right
 33 | 
 34 |     def __str__(self) -> str:
 35 |         return self._newick()
 36 | 
 37 |     def __repr__(self):
 38 |         return str(self)
 39 | 
 40 |     def _newick(self):
 41 |         return self.label if self.is_leaf else f"({self.left._newick()},{self.right._newick()})"
 42 | 
 43 |     def to_newick(self) -> str:
 44 |         return f'{self._newick()};'
 45 | 
 46 |     def write_newick(self, path: str):
 47 |         with open(path, 'w') as f:
 48 |             f.write(self.to_newick())
 49 | 
 50 |     def labels(self) -> [str]:
 51 |         if self.is_leaf:
 52 |             return [self.label]
 53 |         else:
 54 |             return self.left.labels() + self.right.labels()
 55 | 
 56 |     def uniquify(self, label_to_trait: {str: bool}):
 57 |         def uniquify(tree: ScoaryTree) -> str:
 58 |             if tree.is_leaf:
 59 |                 return '1' if label_to_trait[tree.label] else '0'
 60 |             else:
 61 |                 l, r = uniquify(tree.left), uniquify(tree.right)
 62 |                 return f'({l}{r})' if l < r else f'({r}{l})'
 63 | 
 64 |         return uniquify(self)
 65 | 
 66 |     def copy(self):
 67 |         def copy(tree: ScoaryTree) -> ScoaryTree:
 68 |             if tree.is_leaf:
 69 |                 return ScoaryTree(label=tree.label)
 70 |             else:
 71 |                 return ScoaryTree(left=copy(tree.left), right=copy(tree.right))
 72 | 
 73 |         return copy(self)
 74 | 
 75 |     def prune(self, labels: [str]) -> ScoaryTree:
 76 |         n_labels_found = 0
 77 | 
 78 |         def prune(tree: ScoaryTree) -> Optional[ScoaryTree]:
 79 |             if tree.is_leaf:
 80 |                 if tree.label in labels:
 81 |                     nonlocal n_labels_found
 82 |                     n_labels_found += 1
 83 |                     return ScoaryTree(label=tree.label)
 84 |                 else:
 85 |                     return None
 86 |             else:
 87 |                 left, right = prune(tree.left), prune(tree.right)
 88 |                 if left and right:
 89 |                     return ScoaryTree(left=left, right=right)
 90 |                 if left:
 91 |                     return left
 92 |                 if right:
 93 |                     return right
 94 |                 return None
 95 | 
 96 |         pruned_tree = prune(self)
 97 | 
 98 |         if n_labels_found != len(labels):
 99 |             missing = set(labels).difference(set(self.labels()))
100 |             raise AssertionError(f'Pruning went wrong: did not find all labels in tree! '
101 |                                  f'{n_labels_found=}; {missing=}; tree={self}')
102 | 
103 |         return pruned_tree
104 | 
105 |     def prune_nonrecursive(self, labels: [str]) -> ScoaryTree:
106 |         if self.is_leaf:
107 |             assert [self.label] == labels, f'Pruning went wrong. {[self.label]} != {labels}'
108 |             return ScoaryTree(label=self.label)
109 | 
110 |         n_labels_found = 0
111 | 
112 |         root = ScoaryTree(left=self.left, right=self.right)
113 | 
114 |         stack = [(root, 'right'), (root, 'left')]
115 | 
116 |         while stack:
117 |             current_parent, current_direction = stack[-1]
118 |             current_node: ScoaryTree = getattr(current_parent, current_direction)
119 | 
120 |             if current_node.is_leaf:
121 |                 # current node is leaf
122 | 
123 |                 this = ScoaryTree(label=current_node.label)
124 | 
125 |                 if current_node.label in labels:
126 |                     n_labels_found += 1
127 |                 else:
128 |                     this._prune = True  # mark for pruning
129 | 
130 |                 # append self to parent
131 |                 setattr(current_parent, current_direction, this)
132 | 
133 |                 if current_direction == 'right':
134 |                     # prune
135 |                     current_parent.__prune()
136 |                     stack.pop()
137 | 
138 |                     # found terminal node
139 |                     # # GO UP UNTIL CAN GO RIGHT
140 |                     while stack:
141 |                         ancestor_node, ancestor_direction = stack[-1]
142 |                         if ancestor_direction == 'right':
143 |                             ancestor_node.__prune()
144 |                             stack.pop()
145 |                         else:
146 |                             break
147 | 
148 |                     if not stack:
149 |                         print(f'done\n{self}\n{root}')
150 |                         break
151 | 
152 |                 # pop left node -> go right next
153 |                 stack.pop()
154 | 
155 |             else:
156 |                 this = ScoaryTree(left=current_node.left, right=current_node.right)
157 |                 stack.extend([(this, 'right'), (this, 'left')])
158 | 
159 |                 # append self to parent
160 |                 setattr(current_parent, current_direction, this)
161 | 
162 |         return root
163 | 
164 |     def copy_nonrecursive(self) -> ScoaryTree:
165 |         return self.rename_nonrecursive(func=lambda label: label)
166 | 
167 |     def rename_nonrecursive(self, func: Callable):
168 |         if self.is_leaf:
169 |             return ScoaryTree(label=func(self.label))
170 | 
171 |         root = ScoaryTree(left=self.left, right=self.right)
172 | 
173 |         stack = [(root, 'right'), (root, 'left')]
174 | 
175 |         while stack:
176 |             current_parent, current_direction = stack[-1]
177 |             current_node: ScoaryTree = getattr(current_parent, current_direction)
178 | 
179 |             if current_node.is_leaf:
180 |                 # current node is leaf
181 |                 this = ScoaryTree(label=func(current_node.label))
182 | 
183 |                 # append self to parent
184 |                 setattr(current_parent, current_direction, this)
185 | 
186 |                 if current_direction == 'right':
187 |                     # found terminal node
188 |                     # # GO UP UNTIL CAN GO RIGHT
189 |                     while stack and stack[-1][1] == 'right':
190 |                         stack.pop()
191 | 
192 |                     if not stack:
193 |                         print(f'done\n{self}\n{root}')
194 |                         break
195 | 
196 |                 # pop left node -> go right next
197 |                 stack.pop()
198 | 
199 |             else:
200 |                 this = ScoaryTree(left=current_node.left, right=current_node.right)
201 |                 stack.extend([(this, 'right'), (this, 'left')])
202 | 
203 |                 # append self to parent
204 |                 setattr(current_parent, current_direction, this)
205 | 
206 |         return root
207 | 
208 |     def rename(self, func: Callable):
209 |         """
210 |         Apply a function to each leaf label.
211 | 
212 |         Only used for debugging. This recursive function could cause RecursionError for big trees.
213 |         """
214 | 
215 |         def convert(tree: ScoaryTree) -> ScoaryTree:
216 |             """recursive function"""
217 |             if tree.is_leaf:
218 |                 return ScoaryTree(label=func(tree.label))
219 |             else:
220 |                 return ScoaryTree(left=convert(tree.left), right=convert(tree.right))
221 | 
222 |         return convert(self)
223 | 
224 |     @classmethod
225 |     def from_newick(cls, newick: str) -> ScoaryTree:
226 |         list_tree = parse_newick(newick)
227 |         return cls.from_list(list_tree)
228 | 
229 |     @classmethod
230 |     def from_list(cls, tree: []) -> ScoaryTree:
231 |         def convert(list_tree):
232 |             """recursive function"""
233 |             if type(list_tree) is str:
234 |                 return cls(label=list_tree)
235 |             else:
236 |                 return cls(left=convert(list_tree[0]), right=convert(list_tree[1]))
237 | 
238 |         return convert(tree)
239 | 
240 |     @cached_property
241 |     def to_list(self) -> []:
242 |         def to_list(tree: ScoaryTree) -> str | []:
243 |             if tree.is_leaf:
244 |                 return tree.label
245 |             else:
246 |                 return [to_list(tree.left), to_list(tree.right)]
247 | 
248 |         return to_list(self)
249 | 
250 |     @classmethod
251 |     def from_presence_absence(cls, genes_df: pd.DataFrame) -> ScoaryTree:
252 |         distance_matrix = pd.DataFrame(distance.squareform(distance.pdist(genes_df.T, 'hamming')),
253 |                                        columns=genes_df.columns)
254 |         tree_as_list = upgma(distance_matrix)
255 |         return cls.from_list(tree_as_list)
256 | 
257 |     def __prune(self):
258 |         if self.left._prune and self.right._prune:
259 |             self._prune = True
260 |         elif self.left._prune:
261 |             # become right
262 |             self.label = self.right.label
263 |             self.is_leaf = self.right.is_leaf
264 |             self.left = self.right.left
265 |             self.right = self.right.right
266 |         elif self.right._prune:
267 |             # become left
268 |             self.label = self.left.label
269 |             self.is_leaf = self.left.is_leaf
270 |             self.right = self.left.right
271 |             self.left = self.left.left
272 | 


--------------------------------------------------------------------------------
/scoary/utils.py:
--------------------------------------------------------------------------------
  1 | import importlib.metadata
  2 | import os
  3 | import sys
  4 | import json
  5 | import logging
  6 | from copy import deepcopy
  7 | import warnings
  8 | from functools import cache
  9 | from typing import Type, Any
 10 | from datetime import datetime
 11 | import numpy as np
 12 | import pandas as pd
 13 | from importlib.metadata import version
 14 | from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
 15 | 
# Ignore numba's deprecation warnings; the filters are installed when this module is imported.
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

# Absolute path of the directory that contains this module.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
# Method names that parse_correction() accepts.
ALLOWED_CORRECTIONS = {'native', 'bonferroni', 'sidak', 'holm-sidak', 'holm', 'simes-hochberg', 'hommel', 'fdr_bh',
                       'fdr_by', 'fdr_tsbh', 'fdr_tsbky'}

logger = logging.getLogger('scoary.utils')
 24 | 
 25 | try:
 26 |     from ete3 import Tree as EteTree
 27 | 
 28 | 
 29 |     def print_tree(scoary_tree, label_to_gene: {str: bool}, label_to_trait: {str: bool}, show_label=True):
 30 |         if show_label:
 31 |             label_fn = lambda label: f'{int(label_to_gene[label])}{int(label_to_trait[label])}_{label}'
 32 |         else:
 33 |             label_fn = lambda label: f'{int(label_to_gene[label])}{int(label_to_trait[label])}'
 34 | 
 35 |         renamed_tree = scoary_tree.rename(label_fn)
 36 |         ete_tree = EteTree(renamed_tree.to_newick())
 37 |         print(ete_tree)
 38 | 
 39 | except ImportError as e:
 40 |     def print_tree(scoary_tree, label_to_gene: {str: bool}, label_to_trait: {str: bool}):
 41 |         raise ImportError('This function requires the ete3 library. Please install via "pip install ete3"')
 42 | 
 43 | 
 44 | @cache
 45 | def get_version() -> str:
 46 |     try:
 47 |         return version('scoary-2')
 48 |     except importlib.metadata.PackageNotFoundError:
 49 |         return 'development'
 50 | 
 51 | 
 52 | class NotSplittableError(Exception):
 53 |     pass
 54 | 
 55 | 
 56 | class NoTraitsLeftException(Exception):
 57 |     pass
 58 | 
 59 | 
 60 | def decode_unicode(string: str) -> str:
 61 |     return string.encode('utf-8').decode('unicode-escape')
 62 | 
 63 | 
 64 | def setup_outdir(outdir: str, input: dict) -> str:
 65 |     outdir = outdir.rstrip('/')
 66 |     assert not os.path.exists(outdir), f'ERROR: {outdir=} already exists!'
 67 |     os.makedirs(f'{outdir}/traits')
 68 |     os.makedirs(f'{outdir}/logs')
 69 |     os.makedirs(f'{outdir}/app')
 70 |     with open(f'{outdir}/logs/input.json', 'w') as f:
 71 |         json.dump(input, f, indent=4)
 72 |     return outdir
 73 | 
 74 | 
 75 | def setup_logging(logger: logging.Logger, path: str = None, print_info: bool = True, reset: bool = False):
 76 |     """
 77 |     Setup logging for Scoary
 78 | 
 79 |     :param logger: logging.logging.Logger
 80 |     :param path: if set, DEBUG and higher goes to log files
 81 |     :param print_info: if True, INFO and higher goes to stdout
 82 |     :param reset: if True: close and remove all file handlers. (Important for multiprocessing: removes locks!)
 83 |     :return:
 84 |     """
 85 |     if reset or os.environ.get('SCOARY_RESET_LOGGERS', 'FALSE').upper() == 'TRUE':
 86 |         while logger.handlers:
 87 |             handler = logger.handlers[0]
 88 |             handler.close()
 89 |             logger.removeHandler(handler)
 90 | 
 91 |     logger.setLevel(logging.DEBUG)
 92 | 
 93 |     if path is not None:
 94 |         # create logfile
 95 |         logfile = logging.FileHandler(path)
 96 |         logfile.setLevel(logging.DEBUG)
 97 |         logfile.setFormatter(logging.Formatter("%(asctime)s [%(name)s: %(levelname)s] %(message)s"))
 98 |         logger.addHandler(logfile)
 99 | 
100 |     if print_info:
101 |         # create streamhandler
102 |         stdout = logging.StreamHandler()
103 |         stdout.setLevel(getattr(logging, os.environ.get('SCOARY_LOGLEVEL_STDOUT', 'INFO').upper()))
104 |         logger.addHandler(stdout)
105 | 
106 |     return logger
107 | 
108 | 
def ignore_warnings(warning: Type[Warning]):
    """
    Decorator to suppress warnings.

    Example:

    @ignore_warnings(warning=ConvergenceWarning)
    def some_function():
        # any produced ConvergenceWarnings will be suppressed

    The decorated function keeps its name, docstring and other metadata.

    :param warning: class of warning to be suppressed
    :return: decorator that runs the wrapped function with the warning class ignored
    """
    from functools import wraps  # local import: only `cache` is imported from functools at module level

    def decorator(function):
        @wraps(function)
        def wrapper(*args, **kwargs):
            # the filter is only active during the call; catch_warnings() restores the previous state
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', warning)
                return function(*args, **kwargs)

        return wrapper

    return decorator
131 | 
132 | 
def parse_correction(correction_str: str, param_name: str) -> (str, float):
    """
    Parse a correction argument of the form 'method' or 'method:cutoff'.

    :param correction_str: the raw argument; without a ':' the cutoff defaults to infinity
    :param param_name: name of the parameter being parsed, only used in the error message
    :return: tuple (method, cutoff) with cutoff converted to float
    :raises AssertionError: if the method is not in ALLOWED_CORRECTIONS or the cutoff is not a valid float
    """
    method, separator, cutoff = correction_str.partition(':')
    if not separator:
        # no explicit cutoff given
        cutoff = 'inf'

    assert method in ALLOWED_CORRECTIONS, f'{param_name}={correction_str} must be in {ALLOWED_CORRECTIONS}'

    try:
        cutoff = float(cutoff)
    except ValueError:
        raise AssertionError(f'Error in {correction_str=}: {cutoff=} could not be converted to float')
    else:
        return method, cutoff
147 | 
148 | 
def is_int(string: str) -> bool:
    """
    Tell whether int() accepts the given string.

    :param string: text to check
    :return: True if int(string) succeeds, False if it raises ValueError
    """
    try:
        int(string)
    except ValueError:
        return False
    return True
155 | 
156 | 
def is_float(string: str) -> bool:
    """
    Tell whether float() accepts the given string.

    :param string: text to check
    :return: True if float(string) succeeds, False if it raises ValueError
    """
    try:
        float(string)
    except ValueError:
        return False
    return True
163 | 
164 | 
def split_into_parts(list_: list, n_parts: int) -> [list]:
    """
    Cut a list into n_parts consecutive slices whose lengths differ by at most one.

    The longer slices come first; if there are fewer elements than parts, the trailing slices are empty.

    :param list_: sequence to split
    :param n_parts: number of slices to produce
    :return: list of n_parts slices of list_
    """
    base_size, n_longer = divmod(len(list_), n_parts)
    parts = []
    start = 0
    for part_idx in range(n_parts):
        # the first n_longer parts each take one extra element
        stop = start + base_size + (1 if part_idx < n_longer else 0)
        parts.append(list_[start:stop])
        start = stop
    return parts
171 | 
172 | 
def fisher_id(a, b, c, d):
    """
    Return a canonical representative of a 2x2 contingency table.

    Eight contingency tables always give the same pvalue:
    ['abcd', 'acbd', 'badc', 'bdac', 'cadb', 'cdab', 'dbca', 'dcba']

    The smallest of these eight tuples is returned, so only one version has to be computed and saved.
    """
    cells = (a, b, c, d)
    # index orders corresponding to the eight equivalent arrangements listed in the docstring
    equivalent_orders = (
        (0, 1, 2, 3),
        (0, 2, 1, 3),
        (1, 0, 3, 2),
        (1, 3, 0, 2),
        (2, 0, 3, 1),
        (2, 3, 0, 1),
        (3, 1, 2, 0),
        (3, 2, 1, 0),
    )
    return min(tuple(cells[i] for i in order) for order in equivalent_orders)
189 | 
190 | 
def load_info_file(
        logger: logging.Logger,
        info_file: str,
        merge_col: str,
        expected_overlap_set: set = None,
        reference_file: str = None
) -> pd.DataFrame:
    """
    Read a tab-separated info file into a pd.DataFrame indexed by its first column.

    The header of the first column must be exactly merge_col, and the values in that column must be unique.

    :param logger: instance of logging.Logger
    :param info_file: path to file
    :param merge_col: required name of the first column
    :param expected_overlap_set: optional set of identifiers; a warning is logged if none of them occur
        in the index of info_file
    :param reference_file: path to reference file, just used for the warning message
    :return: pd.DataFrame with merge_col as index
    :raises AssertionError: if the first column has a different name or the index contains duplicates
    """
    info_df = pd.read_csv(info_file, index_col=0, delimiter='\t')

    assert info_df.index.name == merge_col, (
        f'The file {info_file} is improperly formatted: The first column must be named "{merge_col}". '
        f'Current name: {info_df.index.name}. Remaining columns: {info_df.columns.tolist()}'
    )

    if expected_overlap_set is not None:
        shared_ids = set(info_df.index).intersection(expected_overlap_set)
        overlap_size = len(shared_ids)
        if not shared_ids:
            logger.warning(f'The {merge_col}s in {info_file} do not match any {merge_col}s in {reference_file}')
        logger.debug(f'Loaded descriptions for {overlap_size} {merge_col}s')

    logger.debug(f'Loaded {merge_col} descriptions. columns={info_df.columns.tolist()}')

    index = info_df.index
    assert not index.has_duplicates, \
        f'{info_file} contains duplicates: {info_df.index[info_df.index.duplicated()]}'
    return info_df
226 | 
227 | 
class MockCounter:
    """
    Single-process stand-in that imitates multiprocessing.Manager.Value / multiprocessing.managers.ValueProxy.

    Offers a readable and writable `value` attribute that starts at 0.
    """

    def __init__(self):
        self._value: int = 0

    @property
    def value(self):
        """Current counter value."""
        return self._value

    @value.setter
    def value(self, new_value):
        self._value = new_value
243 | 
244 | 
class MockLock:
    """
    Stand-in that imitates multiprocessing.Manager.Lock / multiprocessing.managers.AcquirerProxy.

    Only the context-manager protocol is provided; entering and leaving the block does nothing.
    """

    def __init__(self):
        self._value = 0

    def __enter__(self):
        # hand back the lock itself, so `with lock as l` works
        return self

    def __exit__(self, *exc_details):
        # False: exceptions raised inside the with-block are not suppressed
        return False
258 | 
259 | 
260 | class AbstractNamespace:
261 |     @classmethod
262 |     def create_namespace(cls, ns, properties: {str: Any}):
263 |         for name in cls.__dict__['__annotations__'].keys():
264 |             setattr(ns, name, properties[name])
265 |         return ns
266 | 
267 | 
268 | def grasp_namespace(cls, ns):
269 |     """
270 |     This will copy the elements of the multiprocessing namespace into the "private" memory of the current process
271 | 
272 |     :param ns: multiprocessing.managers.Namespace
273 |     :return: MockNameSpace
274 |     """
275 |     new_ns = cls()
276 |     for name in cls.__dict__['__annotations__'].keys():
277 |         value = getattr(ns, name)
278 |         if name in ['lock', 'counter']:
279 |             setattr(new_ns, name, value)
280 |         else:
281 |             setattr(new_ns, name, deepcopy(value))
282 |     return new_ns
283 | 
284 | 
class AnalyzeTraitNamespace(AbstractNamespace):
    """
    Declares the fields of the namespace used for analyzing traits.

    The class only carries annotations: AbstractNamespace.create_namespace copies one value per annotated
    name onto a namespace object, and grasp_namespace reads the same names back, deep-copying every value
    except `lock` and `counter`.
    """
    counter: MockCounter  # passed by reference in grasp_namespace; presumably a shared progress counter - confirm
    queue_size: int
    lock: MockLock  # passed by reference in grasp_namespace
    outdir: str
    start_time: datetime
    genes_orig_df: pd.DataFrame
    genes_bool_df: pd.DataFrame
    gene_info_df: pd.DataFrame | None
    numeric_df: pd.DataFrame
    traits_df: pd.DataFrame
    trait_info_df: pd.DataFrame | None
    duplicates: pd.DataFrame
    tree: object  #: ScoaryTree
    all_labels: set
    # NOTE(review): mt_f_* look like the (method, cutoff) pair produced by parse_correction - confirm
    mt_f_method: str
    mt_f_cutoff: float
    trait_wise_correction: bool
    max_genes: int
    worst_cutoff: None | float
    n_permut: int
    random_state: int
    pairwise: bool
    multiple_testing_df: pd.DataFrame
309 | 
310 | 
class BinarizeTraitNamespace(AbstractNamespace):
    """
    Declares the fields of the namespace used for binarizing traits.

    The class only carries annotations: AbstractNamespace.create_namespace copies one value per annotated
    name onto a namespace object, and grasp_namespace reads the same names back, deep-copying every value
    except `lock` and `counter`.
    """
    counter: MockCounter  # passed by reference in grasp_namespace
    lock: MockLock  # passed by reference in grasp_namespace
    outdir: str
    start_time: datetime
    numeric_df: pd.DataFrame
    random_state: int  # was declared twice; one declaration yields the same set and order of fields
    method: str
    alternative: str
    covariance_type: str
    cutoff: float
323 | 


--------------------------------------------------------------------------------
/benchmarking/runtime/data/100_traits.csv:
--------------------------------------------------------------------------------
 1 | Name,lc:Compound_8069,lc:Compound_7747,lc:Compound_8286,lc:Compound_15820,vol:28.29B325,lc:Compound_15534,vol:28.34A521,vol:22.07B128,lc:Compound_7542,vol:29.32A619,lc:Compound_6322,vol:51.57B3119,lc:Compound_2501,vol:30.72B720,vol:23.12B149,vol:10.59A34,vol:43.43B1738,lc:Compound_7931,lc:Compound_10098,vol:50.04A1824,vol:39.67A1311,vol:31.09B751,lc:Compound_10065,lc:Compound_8550,lc:Compound_13565,vol:13.51A88,vol:33.59A924,lc:Compound_13311,vol:32.87B883,vol:28.63B419,lc:Compound_8016,lc:Compound_16708,lc:Compound_6217,lc:Compound_10163,lc:Compound_4586,lc:Compound_6238,lc:Compound_5186,vol:45.07B2035,lc:Compound_14592,lc:Compound_5252,lc:Compound_8405,vol:45.07B2053,vol:30.71A725,vol:45.07B2066,vol:40.16B1508,lc:Compound_12131,lc:Compound_11605,lc:Compound_12094,vol:41.79B1620,lc:Compound_7022,lc:Compound_2760,vol:23.55A329,lc:Compound_3387,lc:Compound_11587,lc:Compound_3197,vol:33.98A973,lc:Compound_2600,vol:26.27A428,lc:Compound_12011,vol:24.1A345,vol:31.72B789,vol:44.93B1925,lc:Compound_16370,vol:28.3B337,lc:Compound_12738,vol:49.37B2901,lc:Compound_6721,lc:Compound_6526,vol:47.94B2623,lc:Compound_4023,vol:50.74B2985,vol:47.94B2542,vol:47.94B2598,lc:Compound_12790,vol:34.12A980,lc:Compound_10040,lc:Compound_16135,lc:Compound_13119,lc:Compound_3793,vol:33.59A923,vol:28.29B330,lc:Compound_8627,lc:Compound_14527,lc:Compound_14036,lc:Compound_8062,lc:Compound_12717,lc:Compound_7448,lc:Compound_10182,lc:Compound_15808,lc:Compound_8001,lc:Compound_6351,lc:Compound_5738,vol:33.67B947,lc:Compound_15057,lc:Compound_15661,vol:28.49B400,lc:Compound_7986,lc:Compound_1737,lc:Compound_15484,lc:Compound_13098
 2 | FAM14177-p1-1.1,-,0,-,0,0,1,0,-,-,-,0,0,1,1,0,0,0,1,-,0,-,0,0,0,0,-,-,0,-,0,-,0,0,-,-,0,-,-,0,0,-,0,0,-,0,-,0,-,0,0,1,0,1,1,0,0,1,1,1,1,1,-,-,0,0,1,0,0,-,0,0,-,1,0,0,0,-,0,0,0,0,0,0,-,-,0,0,0,0,0,-,0,-,0,0,0,-,-,-,-
 3 | FAM14184-i1-1.1,1,0,-,0,1,1,-,-,1,1,-,-,1,1,0,0,-,1,1,0,-,0,-,0,0,-,0,0,1,0,1,0,0,1,1,0,-,-,0,0,1,-,0,-,1,1,0,-,0,0,1,0,1,1,0,0,-,1,1,0,1,-,-,0,0,0,0,0,-,-,0,-,0,0,-,0,1,0,-,0,0,0,0,-,-,0,0,0,0,0,-,-,0,0,0,0,-,-,-,-
 4 | FAM14193-i1-1.1,1,0,0,0,0,1,-,-,0,1,-,0,1,1,0,0,0,1,0,0,-,0,1,0,0,-,-,0,-,0,1,-,0,1,0,0,-,-,1,0,-,1,0,-,0,0,0,-,0,0,1,0,1,-,0,0,-,1,-,1,-,-,-,0,0,-,0,0,0,0,0,0,0,-,0,0,1,0,-,0,0,0,0,1,0,0,0,-,0,0,-,0,1,0,0,0,-,-,-,-
 5 | FAM14197-i1-1.1,-,0,-,0,0,0,0,0,-,0,1,1,0,1,1,0,1,0,-,0,-,0,0,0,0,-,-,0,0,0,1,0,1,1,0,1,1,-,0,1,0,-,0,-,0,0,0,0,-,-,1,0,1,1,0,1,0,1,1,1,0,-,1,0,0,0,1,0,1,-,0,1,1,-,0,0,-,0,0,0,0,0,0,0,-,0,0,0,0,-,1,0,0,0,0,0,-,1,-,-
 6 | FAM14217-p1-1.1,-,0,0,0,0,0,-,-,0,1,0,-,-,1,0,1,1,1,0,-,-,1,0,1,0,-,-,0,1,0,-,0,0,0,0,0,-,1,0,0,0,1,0,1,0,-,0,0,1,0,1,1,1,-,1,0,1,1,1,0,-,1,-,0,0,0,0,0,1,0,1,1,1,0,-,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,-,0,-,0,0,1,-,1,-,-
 7 | FAM14221-p1-1.1,-,0,0,0,1,0,1,1,0,1,0,1,-,0,0,0,-,0,-,0,1,0,-,0,0,-,1,0,1,0,-,0,0,-,1,0,-,1,0,0,0,1,1,1,0,-,0,-,-,0,1,0,1,0,0,0,0,1,-,1,1,1,-,1,0,0,0,-,1,-,0,1,0,-,1,0,-,0,0,1,1,0,0,-,-,0,0,0,0,1,1,0,1,0,0,0,-,-,-,-
 8 | FAM14222-p1-1.1,1,0,-,0,1,0,-,-,1,0,1,1,0,1,0,0,-,0,1,-,-,0,1,0,0,-,-,0,0,0,1,1,0,1,0,1,1,-,0,-,0,0,0,0,1,-,0,-,1,-,1,0,1,-,0,1,1,1,1,-,0,0,-,0,0,0,1,1,1,-,1,-,1,1,0,-,-,0,1,1,0,-,0,0,1,0,0,0,0,0,-,0,0,0,0,0,-,-,1,-
 9 | FAM1414-i1-1.1,-,1,0,1,0,-,0,0,0,1,0,0,0,-,0,0,0,0,0,0,-,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,-,-,0,1,0,1,0,1,0,-,0,0,0,-,1,0,-,-,1,0,0,1,0,1,1,1,-,0,1,0,0,0,0,-,0,0,0,0,0,1,0,0,0,0,0,0,1,0,-,0,0,0,1,0,-,1,1,0,0,0,-,-,-,-
10 | FAM15061-i1-1.1,1,0,0,0,0,-,-,-,-,1,0,0,-,1,0,0,0,1,0,0,-,0,1,0,0,-,-,0,-,0,-,0,-,-,0,0,-,-,0,0,0,-,0,1,0,-,0,-,0,1,0,0,1,1,1,0,-,1,1,1,-,-,1,0,0,1,0,0,-,-,0,-,-,0,0,-,1,0,0,0,0,0,0,0,-,0,0,0,0,0,-,0,0,0,0,0,-,-,-,-
11 | FAM15078-i1-1.1,1,0,1,0,0,0,-,0,1,0,1,1,-,1,0,0,0,0,1,0,0,0,-,0,0,-,-,0,0,0,1,-,-,1,0,1,1,0,0,1,0,0,0,0,1,-,0,-,0,1,1,0,1,1,1,0,-,1,1,0,0,0,-,0,0,0,1,-,-,-,0,-,-,-,0,0,-,0,0,1,0,0,0,0,1,0,0,0,-,-,-,0,0,0,0,0,-,-,1,-
12 | FAM15113-i1-1.1,-,0,1,0,0,0,-,0,0,0,1,-,1,1,0,0,1,0,1,1,-,0,0,-,1,-,-,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,-,0,0,0,-,0,0,-,-,1,1,0,1,-,-,0,0,-,0,1,0,-,-,1,0,1,1,1,1,0,-,0,0,0,1,0,1,0,0,0,0,0,0,0,0,-,0,0,0,0,0,-,-,-,-
13 | FAM15170-i1-1.1,1,0,0,1,1,-,-,-,0,1,-,-,0,1,0,-,0,0,0,1,-,0,-,0,1,-,-,1,1,0,-,0,-,1,0,0,-,1,0,-,0,1,0,1,1,0,0,-,-,-,1,0,1,-,0,0,0,1,1,1,1,1,-,0,0,0,0,-,0,-,1,0,0,0,0,-,-,0,-,0,0,0,1,0,-,1,0,0,1,0,-,1,1,0,0,0,0,-,-,-
14 | FAM15190-i1-1.1,-,0,1,0,0,1,1,-,-,1,-,0,1,1,0,1,1,1,1,0,-,0,-,0,0,-,1,0,1,0,1,0,0,-,1,0,-,-,0,0,-,1,0,1,0,-,0,-,-,0,1,1,1,-,1,0,-,1,1,1,-,-,-,0,0,0,0,0,1,-,0,1,1,0,0,0,-,1,0,0,0,0,0,0,0,0,1,0,0,0,-,0,1,0,0,-,-,1,-,-
15 | FAM15192-i1-1.1,-,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,-,0,-,0,1,0,0,0,0,0,1,-,0,-,0,1,1,0,0,-,0,0,0,0,0,-,0,-,-,1,1,0,1,-,0,0,0,1,1,1,0,0,-,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,-,0,0,0,0,0,-,-,-,-
16 | FAM15300-i1-1.1,-,0,0,1,0,-,0,0,0,1,0,0,-,1,0,0,-,-,1,0,-,0,1,0,0,-,0,0,0,0,0,0,1,0,0,0,-,-,0,1,0,1,0,1,0,-,0,0,0,1,0,0,-,-,1,0,0,1,-,1,0,1,-,0,0,0,0,0,-,0,0,-,-,0,0,0,-,0,1,-,0,0,1,0,0,0,0,0,1,0,-,1,0,0,0,0,-,-,-,-
17 | FAM15333-i1-1.1,1,0,0,1,0,-,-,0,1,1,1,0,-,1,0,0,0,0,0,0,-,0,-,0,0,0,0,0,-,0,1,-,-,1,0,0,1,-,0,1,0,1,0,1,0,-,0,-,0,-,1,0,1,-,0,0,1,1,1,1,1,1,-,0,0,0,0,1,1,1,0,-,1,0,0,0,1,0,-,0,0,0,1,0,0,0,0,0,1,-,-,1,1,1,0,0,0,-,-,-
18 | FAM15346-i1-1.1,0,0,1,0,0,-,-,-,0,1,0,-,1,1,0,-,-,1,0,0,-,0,-,0,0,-,-,0,0,0,0,0,0,0,1,0,-,-,0,0,1,1,0,1,0,1,0,0,0,0,0,0,1,-,1,0,0,1,-,1,0,1,-,0,0,0,0,0,1,-,0,-,-,1,0,1,0,0,-,-,0,0,0,1,0,0,0,0,0,0,-,0,0,0,0,0,-,0,0,-
19 | FAM15347-i1-1.1,-,0,0,0,0,1,0,-,-,0,-,0,1,1,0,1,0,1,-,0,0,0,0,0,1,-,-,0,0,0,-,0,1,-,0,0,-,0,0,0,1,0,0,0,0,-,0,0,0,0,1,0,1,-,0,0,0,1,-,1,0,0,-,0,0,0,0,0,0,-,0,0,0,-,0,0,-,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,-,-,0,-
20 | FAM15381-i1-1.1,-,0,0,0,0,-,0,-,0,1,0,-,1,1,0,0,0,0,0,1,-,0,0,0,0,-,0,0,-,0,-,0,-,-,0,0,1,1,0,0,0,1,0,1,0,-,0,0,-,0,1,0,1,-,0,0,0,1,1,0,1,1,-,0,0,0,0,0,1,1,-,-,1,-,0,1,-,0,0,0,0,1,0,0,0,0,0,0,0,0,-,0,1,0,0,0,-,1,-,-
21 | FAM15407-i1-1.1,0,0,0,0,0,0,-,-,0,1,0,0,1,1,0,-,0,1,0,0,-,0,0,-,0,-,0,0,1,0,0,0,1,0,1,0,1,1,0,0,1,1,0,-,1,-,0,0,0,0,0,0,0,0,1,-,0,1,0,0,1,1,0,0,0,0,1,0,1,0,0,-,1,1,-,1,0,0,-,0,0,-,0,1,-,0,0,0,0,0,-,0,0,0,0,0,1,1,0,-
22 | FAM19015-i1-1.1,-,0,0,0,1,0,1,1,0,1,1,1,-,1,-,0,1,0,0,1,-,0,-,0,0,1,-,0,1,-,-,0,1,1,0,1,1,1,0,-,0,1,0,1,0,-,0,-,1,-,1,1,-,-,1,0,0,1,1,-,1,1,-,1,0,0,1,-,-,-,1,1,1,1,1,0,0,0,0,1,-,1,0,0,-,0,0,0,0,0,-,0,0,0,0,0,-,-,-,-
23 | FAM19016-i1-1.1,-,0,0,1,0,0,0,0,0,1,0,0,1,0,0,-,0,1,0,0,-,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,-,1,0,-,1,1,0,1,-,1,0,0,-,1,0,0,-,0,1,0,-,1,0,0,-,1,-,0,0,0,1,-,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,-,0,0,0,-,0,-,1,1,0,0,0,1,-,0,-
24 | FAM19020-i1-1.1,1,0,0,1,0,0,-,0,0,0,0,0,0,1,0,0,-,0,0,0,-,0,-,0,1,0,0,1,-,0,-,0,0,-,0,0,-,-,0,1,0,0,0,0,0,0,0,-,0,-,1,0,-,-,0,0,-,1,1,1,0,0,-,0,-,0,0,-,0,-,0,-,0,0,0,0,0,0,0,-,0,0,1,0,0,0,0,0,1,-,-,1,0,0,0,0,-,-,1,-
25 | FAM19022-i1-1.1,1,0,-,0,0,0,-,0,1,0,-,0,-,1,0,0,-,0,-,0,-,0,1,0,0,-,-,0,0,0,1,1,0,1,0,1,-,-,0,1,0,-,0,0,0,-,0,-,0,0,1,0,1,-,0,1,0,1,1,1,0,-,-,0,0,0,1,-,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-,-,1,-
26 | FAM19023-i1-1.1,-,0,-,0,0,0,-,0,0,1,-,0,-,1,0,0,0,0,0,0,-,0,-,0,0,-,0,0,0,0,-,0,0,-,0,1,1,1,0,0,0,1,0,1,1,0,0,-,0,-,1,0,-,-,0,1,0,1,1,0,-,1,-,0,0,0,1,-,1,-,-,-,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,-,-,0,1,0,0,0,-,-,-,-
27 | FAM19024-p1-1.1,1,0,0,0,0,-,-,-,0,1,0,0,-,1,0,0,-,-,-,0,-,1,0,0,1,-,0,0,1,-,1,-,0,-,0,0,-,1,1,0,-,1,0,1,0,-,1,1,0,0,1,0,1,-,1,0,1,1,1,1,1,1,-,0,-,0,0,0,1,-,0,1,1,0,-,0,-,1,0,1,0,0,0,-,0,0,0,0,0,-,1,0,1,0,0,1,0,-,-,-
28 | FAM19025-p1-1.1,-,0,1,0,1,0,-,-,-,1,-,0,-,1,0,0,0,0,0,1,-,0,1,0,0,-,-,0,-,0,-,0,0,1,-,1,1,1,0,-,0,1,0,1,0,0,0,-,-,1,1,0,-,0,0,1,1,1,1,0,-,1,-,0,0,0,1,1,-,-,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,-,0,1,0,0,0,1,-,-,-
29 | FAM19030-i2-1.1,-,0,0,0,0,0,-,1,0,1,-,1,0,1,0,0,0,0,0,1,-,0,-,-,0,-,0,0,-,0,0,0,0,-,0,1,-,1,0,0,0,1,0,1,0,0,0,0,-,-,0,0,0,-,1,1,0,1,-,0,0,1,-,0,0,0,1,0,1,0,1,1,1,1,0,-,0,0,0,0,0,0,0,0,1,0,-,0,0,0,-,0,1,0,0,0,-,0,0,-
30 | FAM19031-i2-1.1,1,0,1,0,0,1,0,0,1,1,0,0,0,-,0,0,0,-,0,0,-,0,-,0,0,0,0,0,0,0,1,1,0,-,0,0,0,-,0,0,-,1,0,1,0,-,0,-,0,0,1,0,1,1,0,0,-,1,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,-,0,0,-,0,0,-,0,-,0,0,0,0,0,1,1
31 | FAM19034-i1-1.1,1,0,0,1,1,-,-,-,0,1,1,1,-,1,0,-,-,-,1,0,-,0,1,0,-,-,0,1,1,1,1,1,0,1,0,0,-,1,1,1,0,1,0,1,1,-,0,-,1,-,1,0,1,-,0,0,0,1,1,0,1,1,1,0,0,0,0,1,1,-,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,1,0,0,1,0,-,1,-
32 | FAM22019-i1-1.1,1,0,1,0,1,0,0,0,1,-,1,-,0,1,0,0,-,0,1,-,-,0,-,0,1,-,0,0,-,0,1,1,0,1,0,1,-,1,1,-,0,1,0,1,0,0,0,1,0,1,1,0,1,1,0,0,0,1,1,1,-,1,1,0,1,0,0,-,0,1,0,0,-,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,0,0,1,1
33 | FAM22020-i1-1.1,1,0,0,1,0,0,1,-,0,1,0,-,-,1,0,1,1,-,0,1,-,1,-,0,0,-,0,1,1,1,0,0,0,0,-,0,0,1,0,1,-,1,-,1,0,0,1,0,1,1,0,-,1,-,1,0,0,1,0,1,1,1,-,0,0,0,0,-,1,-,1,1,-,-,0,0,0,0,1,1,1,0,1,0,0,0,0,0,1,0,-,1,1,0,0,1,-,1,0,-
34 | FAM22021-p1-1.1,0,0,0,0,0,-,-,-,0,1,0,0,1,1,0,0,-,1,-,0,-,0,-,0,0,-,-,0,0,0,-,0,1,-,1,0,1,-,0,0,1,1,0,1,0,-,0,0,0,0,0,0,1,0,1,-,0,1,0,1,1,-,-,0,0,0,-,-,0,-,0,0,0,-,0,0,0,0,-,0,0,1,0,-,0,0,0,0,0,0,-,0,1,0,0,0,1,1,0,-
35 | FAM23848-i1-1.1,-,0,-,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,-,1,0,0,0,0,0,0,0,0,-,0,1,-,0,1,1,-,0,0,0,-,0,1,0,-,0,0,0,0,0,0,1,-,1,0,0,0,-,0,0,-,-,0,0,0,1,-,-,0,1,-,-,1,0,0,0,0,0,-,0,0,0,0,-,1,0,0,0,0,1,0,1,0,0,0,-,0,-,-
36 | FAM23852-i1-1.1,1,1,1,0,0,1,-,1,1,1,0,1,1,0,0,0,-,1,1,1,-,0,-,0,0,0,-,0,-,0,-,0,0,1,0,0,0,-,0,0,1,-,0,-,0,1,1,0,0,-,0,0,1,1,1,1,-,1,0,-,1,-,-,0,0,1,0,0,1,0,0,1,-,0,0,0,0,0,-,0,0,0,0,-,0,0,0,0,0,-,1,0,0,0,0,0,1,1,-,-
37 | FAM23853-i1-1.1,-,0,0,0,0,0,-,-,0,1,1,-,0,1,0,0,1,0,0,0,-,0,-,0,0,-,-,0,-,0,-,-,1,1,0,1,1,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,-,1,0,0,1,1,1,0,1,-,0,0,0,1,-,1,-,0,-,1,0,0,0,0,0,0,1,0,-,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,-,1,-
38 | FAM23855-i1-1.1,-,0,0,1,0,0,0,0,-,1,0,0,1,1,0,0,0,0,-,0,0,-,0,0,1,0,0,-,0,0,-,0,0,0,0,0,1,-,0,1,-,1,0,-,0,-,0,0,0,-,0,0,-,-,1,0,1,1,-,1,0,1,-,0,1,0,0,-,-,0,0,0,1,1,0,-,0,0,0,0,0,0,1,0,-,0,0,0,1,0,-,1,1,0,0,0,-,1,0,-
39 | FAM23864-i1-1.1,0,0,0,1,0,0,-,1,0,1,0,0,-,1,0,0,1,1,0,1,-,0,0,1,0,-,-,-,1,0,0,0,0,0,1,0,-,1,0,1,-,1,0,1,0,-,0,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,0,0,0,0,-,-,1,0,-,1,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,0,-
40 | FAM23867-i1-1.1,-,0,0,1,0,-,-,-,0,1,0,0,1,1,0,-,0,-,-,1,-,0,-,0,0,-,0,1,1,0,0,0,1,-,0,0,1,1,0,1,-,1,0,1,0,-,0,0,0,-,0,0,-,0,1,0,0,1,-,1,1,1,-,0,0,-,0,0,-,0,0,0,-,1,0,0,0,0,0,-,0,0,1,-,-,0,0,0,1,0,-,1,1,0,0,0,1,-,0,-
41 | FAM23868-i1-1.1,-,0,1,1,0,0,0,-,0,1,0,0,-,0,0,1,0,1,-,1,-,1,-,-,1,-,0,1,1,-,0,0,0,1,0,0,0,1,0,0,1,1,0,1,0,-,0,0,-,1,0,0,0,-,1,0,1,1,-,-,1,1,-,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,-,0,0,0,1,1,-,0,0,0,-,0,0,1,1,1,0,0,-,-,0,-
42 | FAM23869-i1-1.1,-,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,-,0,-,0,-,0,0,0,-,0,0,0,1,0,1,-,-,1,1,0,0,-,-,0,0,0,0,0,0,0,0,1,0,0,-,-,1,0,-,1,0,1,0,0,-,0,0,-,-,-,-,0,0,-,-,1,0,0,0,0,-,0,0,1,0,0,1,0,0,0,0,0,-,0,0,0,0,0,-,-,-,-
43 | FAM23870-i1-1.1,1,0,0,0,0,-,0,-,1,1,1,0,1,1,0,0,0,1,1,0,-,0,-,0,0,0,0,0,-,0,-,0,0,1,0,0,1,-,0,0,1,1,0,1,0,-,0,-,0,-,1,0,1,-,0,0,0,1,1,-,-,1,-,0,0,0,0,1,-,1,0,0,-,0,0,0,1,0,-,0,0,0,0,1,0,0,0,0,0,0,-,0,1,0,0,0,0,1,-,-
44 | FAM23877-p1-1.1,0,0,1,0,0,-,1,-,1,1,0,0,-,1,1,1,1,1,1,1,-,1,-,1,0,1,0,0,-,0,0,0,0,-,0,0,0,-,0,0,1,1,0,1,1,1,0,0,-,0,0,1,-,-,1,0,0,1,0,-,0,1,-,0,0,0,0,0,1,0,0,1,1,0,-,-,0,0,1,0,0,0,0,1,1,0,1,0,0,0,-,0,0,0,0,1,-,0,0,0
45 | FAM24252-i1-1.1,-,0,-,0,0,0,-,0,0,0,-,-,-,1,1,0,0,0,-,-,-,0,-,0,0,-,-,0,0,0,-,0,-,-,0,1,1,-,0,1,0,0,0,-,0,-,0,0,0,-,1,0,1,-,1,1,0,1,1,1,0,-,-,0,0,0,1,-,1,-,1,-,1,1,0,0,0,0,-,0,0,-,0,0,0,0,0,0,0,1,1,0,0,0,0,0,-,-,-,-
46 | 


--------------------------------------------------------------------------------
/tests/test_scoary.py:
--------------------------------------------------------------------------------
  1 | import shutil
  2 | 
  3 | from init_tests import *
  4 | 
  5 | from scoary.scoary import *
  6 | 
  7 | os.environ['MGWAS_LINK_ONLY'] = 'true'
  8 | 
  9 | RESTRICT_TO = 'FAM14177-p1-1.1,FAM14184-i1-1.1,FAM14193-i1-1.1,FAM14197-i1-1.1,FAM14217-p1-1.1,FAM14221-p1-1.1,' \
 10 |               'FAM14222-p1-1.1,FAM1414-i1-1.1,FAM15061-i1-1.1,FAM15078-i1-1.1,FAM15113-i1-1.1,FAM15170-i1-1.1,' \
 11 |               'FAM15190-i1-1.1,FAM15192-i1-1.1,FAM15300-i1-1.1,FAM15333-i1-1.1,FAM15346-i1-1.1,FAM15347-i1-1.1,' \
 12 |               'FAM15381-i1-1.1,FAM15407-i1-1.1,FAM19015-i1-1.1,FAM19016-i1-1.1,FAM19020-i1-1.1,FAM19022-i1-1.1,' \
 13 |               'FAM19023-i1-1.1,FAM19024-p1-1.1,FAM19025-p1-1.1,FAM19030-i2-1.1,FAM19031-i2-1.1,FAM19034-i1-1.1,' \
 14 |               'FAM22019-i1-1.1,FAM22020-i1-1.1,FAM22021-p1-1.1,FAM23848-i1-1.1,FAM23852-i1-1.1,FAM23853-i1-1.1,' \
 15 |               'FAM23855-i1-1.1,FAM23864-i1-1.1,FAM23867-i1-1.1,FAM23868-i1-1.1,FAM23869-i1-1.1,FAM23870-i1-1.1,' \
 16 |               'FAM23877-p1-1.1,FAM24252-i1-1.1'
 17 | 
 18 | 
 19 | class TestScoary(TestCase):
 20 |     def setUp(self) -> None:
 21 |         self.tempdir = get_tempdir_path()
 22 |         if os.path.isdir(self.tempdir):
 23 |             shutil.rmtree(self.tempdir)
 24 | 
 25 |     def test_scoary_single_threaded(self):
 26 |         scoary(
 27 |             trait_wise_correction=False,
 28 |             genes='../data/tetracycline/Gene_presence_absence.csv',
 29 |             traits='../data/tetracycline/Tetracycline_resistance.csv',
 30 |             n_permut=1000,
 31 |             multiple_testing='fdr_bh:0.5',
 32 |             n_cpus=1,
 33 |             outdir=self.tempdir
 34 |         )
 35 | 
 36 |     def test_scoary_multi_threaded(self):
 37 |         scoary(
 38 |             trait_wise_correction=True,
 39 |             genes='../data/tetracycline/Gene_presence_absence.csv',
 40 |             traits='../data/tetracycline/Tetracycline_resistance.csv',
 41 |             n_permut=200,
 42 |             n_cpus=4,
 43 |             outdir=self.tempdir,
 44 |             multiple_testing='native:0.05'
 45 |         )
 46 | 
 47 |     def test_scoary_gene_info(self):
 48 |         scoary(
 49 |             genes='../data/tetracycline/Gene_presence_absence.csv',
 50 |             gene_info='../data/tetracycline/gene-info.tsv',
 51 |             traits='../data/tetracycline/Tetracycline_resistance.csv',
 52 |             n_permut=10000,
 53 |             n_cpus=1,
 54 |             outdir=self.tempdir
 55 |         )
 56 | 
 57 |     def test_scoary_long_binary(self):
 58 |         scoary(
 59 |             trait_wise_correction=True,
 60 |             newicktree='../data/new_ds/SpeciesTree_rooted.txt',
 61 |             multiple_testing='fdr_bh:0.6',
 62 |             linkage_method='average',
 63 |             genes='../data/new_ds/N0.tsv',
 64 |             gene_data_type='gene-list:\t',
 65 |             traits='../data/new_ds/LC-binary.tsv',
 66 |             trait_data_type='binary:\t',
 67 |             n_permut=200,
 68 |             # ignore='Starter-only-5A,FAMIX,Starter-only-10,Starter-only-7,mixture',
 69 |             restrict_to=RESTRICT_TO,
 70 |             random_state=42,
 71 |             n_cpus=7,
 72 |             outdir=self.tempdir,
 73 |             # limit_traits=(0, 20),
 74 |             limit_traits=(320, 340),
 75 |             max_genes=100
 76 |         )
 77 | 
 78 |     def test_scoary_long_numeric(self):
 79 |         scoary(
 80 |             multiple_testing='fdr_bh:0.3',
 81 |             genes='../data/new_ds/N0.tsv',
 82 |             gene_info='../data/new_ds/N0_best_names.tsv',
 83 |             gene_data_type='gene-list:\t',
 84 |             traits='../data/new_ds/LC.tsv',
 85 |             trait_data_type='gaussian:skip:\t:tied',
 86 |             trait_info='../data/new_ds/LC-meta.tsv',
 87 |             isolate_info='../data/new_ds/isolate-meta.tsv',
 88 |             n_permut=200,
 89 |             # ignore='Starter-only-5A,FAMIX,Starter-only-10,Starter-only-7,mixture',
 90 |             restrict_to='FAM14177-p1-1.1,FAM14184-i1-1.1,FAM14193-i1-1.1,FAM14197-i1-1.1,FAM14217-p1-1.1,FAM14221-p1-1.1,FAM14222-p1-1.1,FAM1414-i1-1.1,FAM15061-i1-1.1,FAM15078-i1-1.1,FAM15113-i1-1.1,FAM15170-i1-1.1,FAM15190-i1-1.1,FAM15192-i1-1.1,FAM15300-i1-1.1,FAM15333-i1-1.1,FAM15346-i1-1.1,FAM15347-i1-1.1,FAM15381-i1-1.1,FAM15407-i1-1.1,FAM19015-i1-1.1,FAM19016-i1-1.1,FAM19020-i1-1.1,FAM19022-i1-1.1,FAM19023-i1-1.1,FAM19024-p1-1.1,FAM19025-p1-1.1,FAM19030-i2-1.1,FAM19031-i2-1.1,FAM19034-i1-1.1,FAM22019-i1-1.1,FAM22020-i1-1.1,FAM22021-p1-1.1,FAM23848-i1-1.1,FAM23852-i1-1.1,FAM23853-i1-1.1,FAM23855-i1-1.1,FAM23864-i1-1.1,FAM23867-i1-1.1,FAM23868-i1-1.1,FAM23869-i1-1.1,FAM23870-i1-1.1,FAM23877-p1-1.1,FAM24252-i1-1.1',
 91 |             random_state=42,
 92 |             n_cpus=7,
 93 |             outdir=self.tempdir,
 94 |             limit_traits=(0, 200),
 95 |             pairwise=True
 96 |         )
 97 | 
 98 |     def test_scoary_gauss_kmeans(self):
 99 |         scoary(
100 |             genes='../data/new_ds/N0.tsv',
101 |             gene_info='../data/new_ds/N0_best_names.tsv',
102 |             gene_data_type='gene-list:\t',
103 |             traits='../data/new_ds/LC.tsv',
104 |             trait_data_type=f'gaussian:kmeans:\t',
105 |             trait_info='../data/new_ds/LC-meta.tsv',
106 |             isolate_info='../data/new_ds/isolate-meta.tsv',
107 |             n_permut=200,
108 |             restrict_to=RESTRICT_TO,
109 |             random_state=42,
110 |             n_cpus=7,
111 |             outdir=self.tempdir,
112 |             # limit_traits=(0, 100),
113 |             # pairwise=False
114 |         )
115 | 
116 |     def test_scoary_full(self):
117 |         scoary(
118 |             multiple_testing='bonferroni:0.1',
119 |             genes='../data/full_ds/N0.tsv',
120 |             gene_info='../data/full_ds/N0_best_names.tsv',
121 |             gene_data_type='gene-list:\t',
122 |             traits='../data/full_ds/traits.tsv',
123 |             trait_data_type=f'gaussian:skip:\t:tied',  # {'tied', 'full', 'diag', 'spherical'}
124 |             trait_info='../data/full_ds/trait_info.tsv',
125 |             isolate_info='../data/full_ds/isolate_info.tsv',
126 |             n_permut=600,
127 |             random_state=42,
128 |             n_cpus=8,
129 |             n_cpus_binarization=1,
130 |             restrict_to=RESTRICT_TO,
131 |             max_genes=50,
132 |             # limit_traits=(12377, 12378),
133 |             limit_traits=(3750, 3760),
134 |             trait_wise_correction=True,
135 |             # limit_traits=(2330, 2340),
136 |             worst_cutoff=0.1,
137 |             outdir=self.tempdir,
138 |         )
139 | 
140 |     def test_scoary_marco(self):
141 |         scoary(
142 |             genes='../data/marco/Orthogroups.tsv',
143 |             gene_data_type='gene-list:\t',
144 |             traits='../data/marco/traits.tsv',
145 |             trait_data_type='binary: ',  # {'tied', 'full', 'diag', 'spherical'}
146 |             n_permut=1000,
147 |             random_state=42,
148 |             n_cpus=1,
149 |             outdir=self.tempdir,
150 |             multiple_testing='native:0.05',
151 |         )
152 | 
153 |     def test_scoary_jacordova(self):
154 |         scoary(
155 |             genes='../data/jacordova/GeneCount_Scoary_Ecoli.txt',
156 |             gene_data_type='gene-count:\t',
157 |             traits='../data/jacordova/Ecoli_traits.txt',
158 |             trait_data_type='gaussian:kmeans:\t',  # {'tied', 'full', 'diag', 'spherical'}
159 |             n_permut=1000,
160 |             random_state=42,
161 |             n_cpus=1,
162 |             outdir=self.tempdir,
163 |             multiple_testing='native:0.05',
164 |         )
165 | 
166 |     def test_same_hemming_result(self):
167 |         """
168 |         Check if old scoary generates the same data (hamming similarity matrix)
169 |         """
170 |         _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
171 |         tdm_new = pd.DataFrame(distance.squareform(distance.pdist(genes_df.T, 'hamming')))
172 |         tdm_old = np.flip(pd.read_csv('../data/tetracycline/tetracycline_TDM.csv', index_col=0).values)  # has to be flipped
173 |         np.fill_diagonal(tdm_old, 0)  # diagonal should be 0, not 1
174 |         tdm_old = pd.DataFrame(tdm_old)
175 |         self.assertTrue(np.isclose(tdm_old, tdm_new).all())
176 | 
177 |     def test_recursion_depth(self):
178 |         strains = [f'strain_{i}' for i in range(13000)]
179 |         genes = [f'gene_{i}' for i in range(100)]
180 |         traits = [f'trait_{i}' for i in range(4)]
181 |         genes_df = pd.DataFrame(
182 |             np.random.randint(
183 |                 low=0, high=2, size=(len(genes), len(strains))
184 |             ), index=genes, columns=strains
185 |         )
186 |         traits_df = pd.DataFrame(
187 |             np.random.randint(
188 |                 low=0, high=2, size=(len(strains), len(traits))
189 |             ), index=strains, columns=traits
190 |         )
191 |         genes_df.to_csv('../data/huge_ds/genes.tsv', sep='\t')
192 |         traits_df.to_csv('../data/huge_ds/traits.tsv', sep='\t')
193 |         # Calculating tree is very slow, but it works.
194 |         with open('../data/huge_ds/tree.nwk', 'w') as f:
195 |             f.write('(' * (len(strains) - 1))
196 |             f.write(strains[0])
197 |             f.write(',')
198 |             f.write('),'.join(strains[1:]))
199 |             f.write(');')
200 | 
201 |         scoary(
202 |             genes='../data/huge_ds/genes.tsv',
203 |             traits='../data/huge_ds/traits.tsv',
204 |             trait_data_type='binary:\t',
205 |             gene_data_type='gene-count:\t',
206 |             newicktree='../data/huge_ds/tree.nwk',
207 |             n_permut=1000,
208 |             n_cpus=4,
209 |             outdir=self.tempdir
210 |         )
211 | 
212 |     def test_scoary_roary_gene_list(self):
213 |         # GitHub issue #5
214 |         # scoary(
215 |         #     genes=get_path('roary-list', 'genes'),
216 |         #     traits=get_path('roary-list', 'traits'),
217 |         #     gene_data_type='gene-list:,',
218 |         #     n_permut=1000,
219 |         #     multiple_testing='native:0.05',
220 |         #     n_cpus=1,
221 |         #     outdir=self.tempdir
222 |         # )
223 |         scoary(
224 |             genes='../data/roary-list/gene_presence_absence-b.csv',
225 |             traits='../data/roary-list/traits-b.csv',
226 |             gene_data_type='gene-list:,',
227 |             n_permut=1000,
228 |             multiple_testing='native:0.05',
229 |             n_cpus=1,
230 |             outdir=self.tempdir
231 |         )
232 | 
233 |     def test_scoary_pyseer(self):
234 |         scoary(
235 |             genes='../data/pyseer/gene_presence_absence.Rtab',
236 |             traits='../data/pyseer/resistances.pheno',
237 |             gene_data_type='gene-count:\t',
238 |             trait_data_type='binary:\t',
239 |             multiple_testing='bonferroni:0.05',
240 |             n_cpus=1,
241 |             outdir=self.tempdir,
242 |             pairwise=False
243 |         )
244 | 


--------------------------------------------------------------------------------
/benchmarking/binarization/benchmark_binarization.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | import pandas as pd
  4 | from scipy.stats import norm
  5 | import matplotlib as mpl
  6 | import matplotlib.pyplot as plt
  7 | import seaborn as sns
  8 | 
  9 | 
 10 | def create_common_ancestor_genome(core_genes: int = 3000, pan_genes: int = 6000, causal_genes: [float] = [0.01]) -> np.array:
 11 |     return np.concatenate((
 12 |         np.full(core_genes, True, dtype=bool),  # core genes are always present
 13 |         np.full(pan_genes + len(causal_genes), False, dtype=bool)  # pan genes and causal genes are initially absent
 14 |     ))
 15 | 
 16 | 
 17 | def create_mut_chance_series(core_genes: int = 3000, pan_genes: int = 6000, causal_genes: [float] = [0.01]) -> np.array:
 18 |     return np.concatenate((
 19 |         np.zeros(core_genes),  # core genes have no mutation chance
 20 |         np.random.rand(pan_genes) / 100,  # each pan gene has a random mutation chance between 0 and 0.01
 21 |         np.array(causal_genes)  # causal genes have specified mutation chance
 22 |     )).reshape(core_genes + pan_genes + len(causal_genes), 1)  # create 2D array
 23 | 
 24 | 
 25 | def mutate_genomes(genomes: pd.DataFrame, mut_change_series: np.array):
 26 |     """Mutate genomes"""
 27 |     random_values = np.random.rand(*genomes.shape)
 28 |     mutated = random_values <= mut_change_series
 29 |     # genome_collection and mutated are arrays of the same size. Wherever mutated is True, flip the bit in genome_collection
 30 |     return pd.DataFrame(
 31 |         np.logical_xor(genomes.values, mutated),
 32 |         index=genomes.index,
 33 |         columns=genomes.columns
 34 |     )
 35 | 
 36 | 
 37 | def branch_genomes(genomes: pd.DataFrame, mut_change_series: np.array, branch_probability: float):
 38 |     """Branch genomes"""
 39 |     random_values = np.random.rand(len(genomes.columns))
 40 |     branch = random_values < branch_probability
 41 |     if branch.sum() == 0:  # no new genomes
 42 |         return genomes
 43 |     new_genomes = genomes.loc[:, branch].copy()  # .copy necessary?
 44 |     column_names = [f'genome_{i}' for i in range(len(genomes.columns), len(genomes.columns) + len(new_genomes.columns))]
 45 |     new_genomes.columns = column_names
 46 |     mutate_genomes(new_genomes, mut_change_series=mut_change_series)
 47 |     return pd.concat([genomes, new_genomes], axis=1)
 48 | 
 49 | 
 50 | def create_genomes(n_genomes: int, core_genes: int = 3000, pan_genes: int = 6000, causal_genes: [float] = [0.01]):
 51 |     mut_change_series = create_mut_chance_series(core_genes, pan_genes, causal_genes)
 52 | 
 53 |     genomes = pd.DataFrame(
 54 |         data={'genome_0': create_common_ancestor_genome(core_genes, pan_genes, causal_genes)},
 55 |         index=[f'core_{x:05}' for x in range(core_genes)] +
 56 |               [f'pan_{x:05}' for x in range(pan_genes)] +
 57 |               [f'causal_{x:05}' for x in range(len(causal_genes))],
 58 |         dtype="bool"
 59 |     )
 60 | 
 61 |     while len(genomes.columns) < n_genomes:
 62 |         # mutate all genomes
 63 |         genomes = mutate_genomes(genomes, mut_change_series)
 64 |         # create new genomes/branches
 65 |         genomes = branch_genomes(genomes, mut_change_series=mut_change_series, branch_probability=0.01)
 66 | 
 67 |     # The last iteration of branch_genomes may have created more genomes than necessary.
 68 |     # Return only first n_genomes genomes
 69 |     genomes = genomes.iloc[:, :n_genomes]
 70 |     return genomes
 71 | 
 72 | 
 73 | def calculate_phenotype(genomes: pd.DataFrame, effect_size: float):
 74 |     """
 75 |     Calculate phenotype asuming a normal distribution.
 76 |     """
 77 |     return pd.Series(
 78 |         np.random.normal(
 79 |             loc=genomes.loc['causal_00000'].astype(int) * effect_size,
 80 |             scale=1
 81 |         ),
 82 |         index=genomes.columns,
 83 |         name='phenotype'
 84 |     )
 85 | 
 86 | 
 87 | def write_files(genomes: pd.DataFrame, genomes_file: str, phenotype: pd.Series, phenotype_file: str):
 88 |     genomes.astype('int').to_csv(genomes_file, sep='\t')
 89 |     phenotype.to_csv(phenotype_file, sep='\t')
 90 | 
 91 | 
 92 | def test(n_genomes: int, effect_size: float, core_genes: int = 3000, pan_genes: int = 6000, causal_genes: [float] = [0.01]):
 93 |     genomes = create_genomes(n_genomes, core_genes, pan_genes, causal_genes)
 94 |     print(genomes)
 95 |     phenotype = calculate_phenotype(genomes, effect_size)
 96 |     print(phenotype)
 97 |     write_files(genomes, 'simulations/genomes.tsv', phenotype, 'simulations/phenotype.tsv')
 98 | 
 99 | 
100 | def _simulate(n_replicates, n, e, r):
101 |     if os.path.isdir(f'simulations/{n=}_{e=}_{r=}'):
102 |         return
103 | 
104 |     print(f'Running replicate {r} of {n_replicates} for {n} genomes and {e} effect size')
105 |     np.random.seed(n + int(e * 2) + r)  # dirty hack: each replicate gets a predictable seed
106 | 
107 |     genomes = create_genomes(n)
108 |     phenotype = calculate_phenotype(genomes, e)
109 | 
110 |     os.makedirs(f'simulations/{n=}_{e=}_{r=}', exist_ok=True)
111 |     write_files(
112 |         genomes, f'simulations/{n=}_{e=}_{r=}/genomes.tsv',
113 |         phenotype, f'simulations/{n=}_{e=}_{r=}/phenotype.tsv'
114 |     )
115 | 
116 | 
def generate_simulations():
    """
    Generate all simulated datasets in parallel.

    One dataset is created per combination of genome count, effect size and replicate;
    the work is distributed over a multiprocessing pool.
    """
    from multiprocessing import Pool

    n_replicates = 20
    n_genomes = [25, 50, 75, 100, 150, 200]
    effect_size = [0.5, 0.75, 1., 1.5, 2., 2.5, 3.]

    # one argument tuple for _simulate per dataset
    jobs = []
    for n in n_genomes:
        for e in effect_size:
            for r in range(n_replicates):
                jobs.append((n_replicates, n, e, r))

    os.makedirs('simulations', exist_ok=True)

    with Pool() as pool:
        pool.starmap(_simulate, jobs)
130 | 
131 | 
def _scoary(msg: str, simulation: str):
    """
    Run Scoary on one simulated dataset unless its output directory already exists.

    :param msg: progress prefix for the log line, e.g. '3/840'
    :param simulation: name of the directory below 'simulations'
    """
    # The environment is configured before scoary is imported
    # (presumably scoary reads these variables on import/start-up -- confirm).
    os.environ.update({
        'SCOARY_RESET_LOGGERS': 'TRUE',
        'SCOARY_LOGLEVEL_STDOUT': 'WARNING',
        'SCOARY_PRINT_CITATION': 'FALSE',
        'SCOARY_PRINT_PROGRESS': 'FALSE',
    })

    from scoary import scoary

    print(f'{msg}: Analyzing {simulation}')

    genes = f'simulations/{simulation}/genomes.tsv'
    traits = f'simulations/{simulation}/phenotype.tsv'
    outdir = f'simulations/{simulation}/scoary'

    # results from a previous run are kept as they are
    if os.path.isdir(outdir):
        return

    for required in (genes, traits):
        assert os.path.exists(required), f'{required} does not exist'

    scoary(
        genes,
        traits,
        outdir,
        trait_data_type='gaussian:kmeans:\t',
        gene_data_type='gene-count:\t',
        multiple_testing='native:0.05',
        n_permut=1000,
        n_cpus=1,
        random_state=42,
    )

    assert os.path.isdir(outdir), f'{outdir} does not exist'

    if len(os.listdir(f'{outdir}/traits')) == 0:
        print(f'{simulation=}: No traits found')
170 | 
171 | 
def analyze_scoary_results():
    """
    Collect the rank of the causal gene from every simulation and write 'out/results.tsv'.

    Each directory below 'simulations' is expected to be named like 'n=25_e=0.5_r=3'. The rank
    is the 1-based position of 'causal_00000' in 'scoary/traits/phenotype/result.tsv'; it is NaN
    if that file is missing or does not list the gene. Entries of 'simulations' that are not
    directories, or whose name cannot be parsed, are skipped.

    :return: DataFrame with the columns 'Number of genomes', 'Effect size', 'Replicate' and
             'Rank of causal gene'
    """
    datapoints = []
    # sorted: os.listdir returns entries in arbitrary order
    for simulation in sorted(os.listdir('simulations')):
        if not os.path.isdir(f'simulations/{simulation}'):
            continue  # stray file, e.g. 'genomes.tsv' written by test()

        try:
            datapoint = {key: float(value) if '.' in value else int(value)
                         for key, value in (pair.split('=') for pair in simulation.split('_'))}
        except ValueError:
            print(f'{simulation=}: name does not look like a simulation, skipping')
            continue

        # NaN unless the causal gene is found below
        datapoint['causal_rank'] = np.nan
        datapoints.append(datapoint)

        try:
            df = pd.read_csv(f'simulations/{simulation}/scoary/traits/phenotype/result.tsv', sep='\t', index_col=0)
        except FileNotFoundError as e:
            print(f'{simulation=}: {e}')
            continue

        if 'causal_00000' not in df.index:
            reason = f'{simulation=}: causal_00000 not in index. {df.shape=}'
            print(f'{simulation=}: {reason}')
            continue

        datapoint['causal_rank'] = list(df.index).index('causal_00000') + 1

    df = pd.DataFrame(datapoints)

    # rename columns
    df = df.rename(columns={
        'n': 'Number of genomes',
        'e': 'Effect size',
        'r': 'Replicate',
        'causal_rank': 'Rank of causal gene'
    })

    os.makedirs('out', exist_ok=True)
    df.to_csv('out/results.tsv', sep='\t')
    return df
205 | 
206 | 
def run_scoary():
    """
    Run Scoary once for every simulation found in the 'simulations' directory.

    Entry names are cut at the first '-' and de-duplicated before they are processed.
    """
    names = []
    for entry in os.listdir('simulations'):
        names.append(entry.split('-')[0])
    simulations = list(set(names))
    n_simulations = len(simulations)

    position = 0
    for simulation in simulations:
        position += 1
        _scoary(f'{position}/{n_simulations}', simulation)
214 | 
215 | 
def plot_all(df: pd.DataFrame, effect_sizes: [float] = [0.5, 1., 1.5, 2, 3.]):
    """
    Plot the rank of the causal gene against the number of genomes, one panel column per effect size.

    The top row sketches two unit-variance normal distributions that are separated by the effect
    size; the bottom row shows the line plot of the ranks. The figure is saved as
    'out/effect_sizes_horizontal.svg'.

    :param df: results with the columns 'Number of genomes', 'Effect size' and 'Rank of causal gene'
    :param effect_sizes: effect sizes to plot; must not be empty (the list is never modified)
    :raises ValueError: if effect_sizes is empty
    """
    n_panels = len(effect_sizes)
    if n_panels == 0:
        raise ValueError('effect_sizes must contain at least one effect size')

    try:
        # backend provided by PyCharm; it cannot be imported when the script runs elsewhere
        mpl.use('module://backend_interagg')
    except ImportError:
        # non-interactive backend, sufficient because the figure is only saved to disk
        mpl.use('Agg')

    # fill missing values with max_rank + 100
    max_rank = df['Rank of causal gene'].max()
    df = df.fillna(max_rank + 100)

    # 3 inches per panel column reproduces the original 15 inch width for five effect sizes
    fig = plt.figure(figsize=(3 * n_panels, 5))
    # squeeze=False: always a 2D array of axes, even for a single effect size
    axs = fig.subplots(2, n_panels, height_ratios=[1, 2], sharey='row', squeeze=False)
    for ax in axs.flat:
        ax.label_outer()

    def add_normal(ax, mean, sd, x, line_color='black', fill_color='red', alpha: float = 0.5):
        # draw the density of N(mean, sd) over x and shade the area below it
        y = norm.pdf(x, mean, sd)
        ax.plot(x, y, color=line_color)
        ax.fill_between(x, y, color=fill_color, alpha=alpha)

    for i, effect_size in enumerate(effect_sizes):
        effect_size_str = str(effect_size).removesuffix('.0')

        ax_effect_size = axs[0, i]
        ax_lineplot = axs[1, i]

        # plot effect size
        _center = effect_size / 2
        x = np.arange(_center - 5, _center + 5, 0.01)
        ax_effect_size.grid(False)
        ax_effect_size.set_xticks([])
        ax_effect_size.set_yticks([])
        add_normal(ax_effect_size, 0, 1, x, fill_color='#a6cee3')
        add_normal(ax_effect_size, effect_size, 1, x, fill_color='#b2df8a')
        # dotted line between the two means
        ax_effect_size.plot([0, effect_size], [0.42, 0.42], color='black', linestyle='dotted')
        # label above the dotted line; raw string so that the backslashes reach mathtext unchanged
        ax_effect_size.text(effect_size / 2, 0.445, rf'$ d \equal {effect_size_str} \sigma $',
                            horizontalalignment='center', verticalalignment='center')
        ax_effect_size.set_ylim(0, 0.5)

        # set title
        ax_effect_size.set_title(f'Effect size: {effect_size_str}')

        # plot lineplot
        sns.lineplot(
            x='Number of genomes', y='Rank of causal gene', hue='Effect size',
            palette=sns.color_palette(['black'], 1),
            data=df[df['Effect size'] == effect_size],
            ax=ax_lineplot,
            legend=False
        )
        ax_lineplot.set_ylim(1, 250)
        ax_lineplot.set_xlim(25, 200)
        # make y axis logarithmic
        ax_lineplot.set_yscale('symlog')
        ax_lineplot.set_yticks([1, 2, 5, 10, 20, 50, 100])
        ax_lineplot.get_yaxis().set_label_position("left")
        ax_lineplot.get_yaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
        ax_lineplot.set_xticks(df['Number of genomes'].unique())

    plt.tight_layout()
    os.makedirs('out', exist_ok=True)
    plt.savefig('out/effect_sizes_horizontal.svg')
281 | 
282 | 
if __name__ == '__main__':
    # Reuse the cached results if they exist; otherwise run the whole pipeline
    # (simulate, run Scoary, collect the ranks) first.
    results_file = 'out/results.tsv'
    if os.path.isfile(results_file):
        df = pd.read_csv(results_file, sep='\t', index_col=0)
    else:
        generate_simulations()
        run_scoary()
        df = analyze_scoary_results()

    plot_all(df, effect_sizes=[0.5, 1., 1.5, 2., 3.])

    print('Complete success.')
294 | 


--------------------------------------------------------------------------------
/scoary/picking.py:
--------------------------------------------------------------------------------
  1 | from functools import cache
  2 | 
  3 | from numba import njit
  4 | import numpy as np
  5 | import pandas as pd
  6 | from scipy.stats import binomtest
  7 | 
  8 | from .ScoaryTree import ScoaryTree
  9 | 
 10 | 
 11 | def pick(
 12 |         tree: [],
 13 |         label_to_trait_a: {str: bool},
 14 |         trait_b_df: pd.DataFrame,
 15 |         calc_pvals: bool = True
 16 | ) -> (np.array,):
 17 |     """
 18 |     Traverse the tree and perform pair picking
 19 | 
 20 |     :param tree: Tree in list form
 21 |     :param label_to_trait_a: maps each label of the tree to whether it has trait a
 22 |     :param trait_b_df: DataFrame (dtype:bool); columns: labels of the tree; rows: whether trait b is present
 23 |     :param calc_pvals: If False, binomial test will not be applied and best/worst will be None
 24 |     :return: (max_contr, max_suppo, max_oppos, best, worst) if calc_pvals else (max_contr, max_suppo, max_oppos)
 25 |     """
 26 | 
 27 |     assert not trait_b_df.isna().values.any()
 28 | 
 29 |     def _pick(left_label, right_label):
 30 |         # follow tree until terminal node
 31 |         if type(left_label) is not str:
 32 |             left = _pick(left_label[0], left_label[1])
 33 |         if type(right_label) is not str:
 34 |             right = _pick(right_label[0], right_label[1])
 35 | 
 36 |         # only load new leafs when needed for combination (safe RAM)
 37 |         if type(left_label) is str:
 38 |             left = init_leaf(
 39 |                 trait_a=label_to_trait_a[left_label],
 40 |                 trait_b_list=trait_b_df[left_label].to_numpy(dtype='bool')
 41 |             )
 42 |         if type(right_label) is str:
 43 |             right = init_leaf(
 44 |                 trait_a=label_to_trait_a[right_label],
 45 |                 trait_b_list=trait_b_df[right_label].to_numpy(dtype='bool')
 46 |             )
 47 | 
 48 |         combined = combine_branches(left, right)
 49 | 
 50 |         return combined
 51 | 
 52 |     values = _pick(tree[0], tree[1])
 53 | 
 54 |     max_contr = values[:, 0, :].max(axis=1)
 55 |     max_suppo = values[:, 1, :].max(axis=1)
 56 |     max_oppos = values[:, 2, :].max(axis=1)
 57 | 
 58 |     if not calc_pvals:
 59 |         return max_contr, max_suppo, max_oppos
 60 | 
 61 |     best, worst = apply_binomtest(max_contr, max_suppo, max_oppos)
 62 | 
 63 |     return max_contr, max_suppo, max_oppos, best, worst
 64 | 
 65 | 
 66 | def pick_single(
 67 |         tree: [],
 68 |         label_to_trait_a: {str: bool},
 69 |         label_to_trait_b: {str: bool},
 70 |         calc_pvals: bool = True
 71 | ) -> {str: int | float}:
 72 |     res = pick(
 73 |         tree=tree,
 74 |         label_to_trait_a=label_to_trait_a,
 75 |         trait_b_df=pd.DataFrame([label_to_trait_b]),
 76 |         calc_pvals=calc_pvals
 77 |     )
 78 |     return dict(zip(
 79 |         ['max_contrasting_pairs', 'max_supporting_pairs', 'max_opposing_pairs', 'best_pval', 'worst_pval'],
 80 |         [v[0] for v in res]
 81 |     ))
 82 | 
 83 | 
def pick_nonrecursive(
        tree: [],
        label_to_trait_a: {str: bool},
        trait_b_df: pd.DataFrame,
        calc_pvals: bool = True
) -> (np.array, np.array, np.array, np.array, np.array):
    """
    Perform pair picking like pick(), but traverse the tree with an explicit stack instead of recursion.

    Intermediate results are stored in the `_values` attribute of the tree nodes and reset to None
    as soon as they have been combined into the parent node.

    :param tree: tree object with the attributes is_leaf, label, left and right
                 (a ScoaryTree, judging by the annotation of current_node; not a list as annotated)
    :param label_to_trait_a: maps each label of the tree to whether it has trait a
    :param trait_b_df: DataFrame; columns: labels of the tree; rows: whether trait b is present
    :param calc_pvals: If False, the binomial test is skipped and best/worst are not returned
    :return: (max_contr, max_suppo, max_oppos, best, worst) if calc_pvals else (max_contr, max_suppo, max_oppos)
    """
    # NOTE(review): for a tree that consists of a single leaf, the raw init_leaf() array is
    # returned instead of the tuple described above -- confirm that callers never pass such a tree.
    if tree.is_leaf:
        return init_leaf(
            trait_a=label_to_trait_a[tree.label],
            trait_b_list=trait_b_df[tree.label].to_numpy(dtype='bool')
        )

    # Each stack entry is (parent, direction): the child of `parent` in `direction` still has to
    # be processed. The left child is on top, so it is visited first.
    stack = [[tree, 'right'], [tree, 'left']]

    while stack:
        current_parent, current_direction = stack[-1]
        current_node: ScoaryTree = getattr(current_parent, current_direction)

        if current_node.is_leaf:
            # current node is leaf
            this = init_leaf(
                trait_a=label_to_trait_a[current_node.label],
                trait_b_list=trait_b_df[current_node.label].to_numpy(dtype='bool')
            )

            # store the leaf values on the leaf itself; the parent reads them when it is combined
            current_node._values = this

            if current_direction == 'right':
                # A right leaf completes its parent. Walk up and combine every ancestor whose
                # right subtree has just been completed.
                while stack and stack[-1][1] == 'right':
                    ancestor_tree, ancestor_direction = stack.pop()
                    ancestor_tree._values = combine_branches(
                        ancestor_tree.left._values,
                        ancestor_tree.right._values
                    )
                    # the children's values are no longer needed; release them
                    ancestor_tree.left._values = None
                    ancestor_tree.right._values = None

                if not stack:
                    # arrived at root node
                    break

            # pop left node -> go right next
            stack.pop()

        else:
            # inner node: process its left child first, then its right child
            stack.extend([(current_node, 'right'), (current_node, 'left')])

    # the combined values of the whole tree end up on the root; take them and clean up
    values = tree._values
    tree._values = None

    # per trait: best value over the five conditions, for each of the three kinds of pairs
    max_contr = values[:, 0, :].max(axis=1)
    max_suppo = values[:, 1, :].max(axis=1)
    max_oppos = values[:, 2, :].max(axis=1)

    if not calc_pvals:
        return max_contr, max_suppo, max_oppos

    best, worst = apply_binomtest(max_contr, max_suppo, max_oppos)

    return max_contr, max_suppo, max_oppos, best, worst
147 | 
148 | 
149 | @cache
150 | def _binomtest(k: int, n: int) -> float:
151 |     # caching this function increases speed ~ 40x
152 |     return binomtest(k=k, n=n).pvalue
153 | 
154 | 
def apply_binomtest(max_contr, max_suppo, max_oppos):
    """
    Calculate the best and the worst binomial test p-value for each trait.

    For each trait, the number of supporting pairs and the number of opposing pairs are each
    tested against the number of contrasting pairs.

    :param max_contr: maximum number of contrasting pairs per trait
    :param max_suppo: maximum number of supporting pairs per trait
    :param max_oppos: maximum number of opposing pairs per trait
    :return: float array of shape (2, n_traits); row 0: smaller p-value, row 1: larger p-value
    """
    n_traits = max_contr.shape[0]
    result = np.empty(shape=(2, n_traits), dtype='float')

    for i in range(n_traits):
        n_pairs = max_contr[i]
        p_suppo = _binomtest(max_suppo[i], n=n_pairs)
        p_oppos = _binomtest(max_oppos[i], n=n_pairs)

        lower, upper = (p_suppo, p_oppos) if p_suppo < p_oppos else (p_oppos, p_suppo)
        result[0, i] = lower
        result[1, i] = upper
    return result
170 | 
171 | 
172 | # selecting:values[<TRAITS>, <3 TYPES OF PAIRINGS>, <5 COMBINATIONS>]
173 | # selecting:values[<TRAITS>, <0: max; 1: supporting; 2: opposing>, <0: 11; 1: 10; 2: 01; 3: 00; 4: nf>]
174 | 
175 | # values[n, 0, :] -> all max contrasting pairs for trait n
176 | # values[n, 1, :] -> all max supporting pairs for trait n
177 | # values[n, 2, :] -> all max opposing pairs for trait n
178 | 
# values[n, 0, 0] -> max contrasting pairs for trait n if condition '11' is added
# values[n, 0, 1] -> max contrasting pairs for trait n if condition '10' is added
# values[n, 0, 2] -> max contrasting pairs for trait n if condition '01' is added
# values[n, 0, 3] -> max contrasting pairs for trait n if condition '00' is added
# values[n, 0, 4] -> max contrasting pairs for trait n if condition 'nf' is added
184 | 
185 | 
@njit('int64[:, ::3, ::5](boolean, boolean[:])',
      cache=True, nogil=True, boundscheck=False, parallel=False)  # prange not better
def init_leaf(trait_a: bool, trait_b_list: np.array) -> np.array:
    """
    Create the values array of a leaf for all traits at once.

    The array has the shape (n_traits, 3, 5): axis 1 holds the three kinds of pairs
    (0: contrasting, 1: supporting, 2: opposing), axis 2 the five conditions
    (0: '11', 1: '10', 2: '01', 3: '00', 4: 'nf'). Every entry starts at -1; for each trait, the
    single condition that matches the leaf (trait a, trait b) is set to 0 for all three kinds.

    :param trait_a: whether the leaf has trait a
    :param trait_b_list: boolean array; for each trait b, whether the leaf has it
    :return: integer array of shape (n_traits, 3, 5)
    """
    n_traits = trait_b_list.shape[0]

    # -1 is the placeholder that the combine functions test with '> -1'
    values = np.full(shape=(n_traits, 3, 5), fill_value=-1, dtype='int')
    if trait_a:
        # trait a present: '11' where trait b is present, '10' where it is absent
        values[:, :, 0][trait_b_list] = 0
        values[:, :, 1][~trait_b_list] = 0

    else:
        # trait a absent: '01' where trait b is present, '00' where it is absent
        values[:, :, 2][trait_b_list] = 0
        values[:, :, 3][~trait_b_list] = 0

    return values
201 | 
202 | 
@njit('int64[::3, ::5], int64[::3, ::5]',
      cache=True, nogil=True, boundscheck=False, parallel=False)  # parallel kills performance
def calculate_max_nofree(left: np.array, right: np.array):
    """
    Calculate the values of the 'nf' condition (index 4) of a node from its two branches, for one trait.

    Five options are evaluated: both branches are 'nf' themselves, or the two branches form a new
    pair ('11' with '00', or '10' with '01', in either order). A new pair adds one contrasting
    pair; '11' vs '00' additionally counts as supporting, '10' vs '01' as opposing.
    ('nf' presumably stands for "no free" leaf, following the function name -- confirm.)

    :param left: values of the left branch for one trait; rows: contrasting/supporting/opposing,
                 columns: conditions '11', '10', '01', '00', 'nf'
    :param right: values of the right branch, same layout as left
    :return: (max_contr, max_suppo, max_oppos); max_suppo and max_oppos are each maximized
             independently over the options that reach max_contr
    """
    # Note: the columns of this local array enumerate the five options, not the five conditions.
    values = np.full(shape=(3, 5), fill_value=-1, dtype='int')

    if left[0][4] > -1 and right[0][4] > -1:  # nf vs nf
        # no new pair: simply add up both branches
        values[0][0] = left[0][4] + right[0][4]
        values[1][0] = left[1][4] + right[1][4]
        values[2][0] = left[2][4] + right[2][4]

    if left[0][0] > -1 and right[0][3] > -1:  # 11 vs 00
        # new supporting pair
        values[0][1] = left[0][0] + right[0][3] + 1
        values[1][1] = left[1][0] + right[1][3] + 1
        values[2][1] = left[2][0] + right[2][3]

    if left[0][3] > -1 and right[0][0] > -1:  # 00 vs 11
        # new supporting pair
        values[0][2] = left[0][3] + right[0][0] + 1
        values[1][2] = left[1][3] + right[1][0] + 1
        values[2][2] = left[2][3] + right[2][0]

    if left[0][1] > -1 and right[0][2] > -1:  # 10 vs 01
        # new opposing pair
        values[0][3] = left[0][1] + right[0][2] + 1
        values[1][3] = left[1][1] + right[1][2]
        values[2][3] = left[2][1] + right[2][2] + 1

    if left[0][2] > -1 and right[0][1] > -1:  # 01 vs 10
        # new opposing pair
        values[0][4] = left[0][2] + right[0][1] + 1
        values[1][4] = left[1][2] + right[1][1]
        values[2][4] = left[2][2] + right[2][1] + 1

    max_contr = values[0].max()

    # among the options with the maximum number of contrasting pairs: most supporting pairs
    max_suppo = -1
    for i in range(5):
        if values[0][i] == max_contr and values[1][i] > max_suppo:
            max_suppo = values[1][i]

    # among the options with the maximum number of contrasting pairs: most opposing pairs
    max_oppos = -1
    for i in range(5):
        if values[0][i] == max_contr and values[2][i] > max_oppos:
            max_oppos = values[2][i]

    return max_contr, max_suppo, max_oppos
246 | 
247 | 
@njit('int64, int64[::3, ::5], int64[::3, ::5]',
      cache=True, nogil=True, boundscheck=False, parallel=False)
def calculate_max_given_condition(condition: int, left: np.array, right: np.array):  # parallel kills performance
    """
    Calculate the values of a node for one condition from its two branches, for one trait.

    Up to nine combinations are evaluated: the left branch in `condition` combined with each of
    the five conditions of the right branch, and the right branch in `condition` combined with
    each of the four other conditions of the left branch. No new pair is formed here; the values
    of both branches are only added up.

    :param condition: index of the condition (0: '11', 1: '10', 2: '01', 3: '00', 4: 'nf')
    :param left: values of the left branch for one trait; rows: contrasting/supporting/opposing,
                 columns: conditions
    :param right: values of the right branch, same layout as left
    :return: (max_contr, max_suppo, max_oppos); max_suppo and max_oppos are each maximized
             independently over the combinations that reach max_contr
    """
    # columns 0-4: left branch has the condition; columns 5-8: right branch has the condition
    values = np.full(shape=(3, 9), fill_value=-1, dtype='int')

    # NOTE(review): only the branch that carries `condition` is tested against -1; the value of
    # the other branch is added even if it is -1 -- confirm that this cannot win the maximum.
    if left[0][condition] > -1:
        # compare condition with all conditions
        for i in range(5):
            values[0][i] = left[0][condition] + right[0][i]
            values[1][i] = left[1][condition] + right[1][i]
            values[2][i] = left[2][condition] + right[2][i]

    if right[0][condition] > -1:
        col_id = 5
        # compare all conditions with condition
        for i in range(5):
            if i == condition:  # this comparison has already been made above
                continue

            values[0][col_id] = left[0][i] + right[0][condition]
            values[1][col_id] = left[1][i] + right[1][condition]
            values[2][col_id] = left[2][i] + right[2][condition]

            col_id += 1

    max_contr = values[0].max()

    # among the combinations with the maximum number of contrasting pairs: most supporting pairs
    max_suppo = -1
    for i in range(9):
        if values[0][i] == max_contr and values[1][i] > max_suppo:
            max_suppo = values[1][i]

    # among the combinations with the maximum number of contrasting pairs: most opposing pairs
    max_oppos = -1
    for i in range(9):
        if values[0][i] == max_contr and values[2][i] > max_oppos:
            max_oppos = values[2][i]

    return max_contr, max_suppo, max_oppos
286 | 
287 | 
@njit('int64[:, ::3, ::5], int64[:, ::3, ::5]',
      cache=True, nogil=True, boundscheck=False, parallel=False)
def combine_branches(left: np.array, right: np.array):
    """
    Combine the values arrays of two sibling branches into the values array of their parent node.

    For every trait, the conditions '11', '10', '01' and '00' are calculated with
    calculate_max_given_condition and the condition 'nf' with calculate_max_nofree.

    :param left: values of the left branch; must have the same shape as right
    :param right: values of the right branch
    :return: array with the same shape as left, indexed as
             [trait, <0: contrasting; 1: supporting; 2: opposing>, <0: 11; 1: 10; 2: 01; 3: 00; 4: nf>]
    """
    assert left.shape == right.shape
    n_traits = left.shape[0]

    values = np.full(shape=left.shape, fill_value=-1, dtype='int')

    # selecting:values[<TRAITS>, <0: max; 1: supporting; 2: opposing>, <0: 11; 1: 10; 2: 01; 3: 00; 4: nf>]
    for trait_id in range(n_traits):  # prange kills performance
        # conditions 0-3: one branch carries the condition up to the parent
        for cond in range(4):  # prange kills performance
            # {"11": 0, "10": 1, "01": 2, "00": 3, "nf": 4}
            max_contr, max_suppo, max_oppos = calculate_max_given_condition(
                cond,
                left[trait_id, :, :],
                right[trait_id, :, :]
            )
            values[trait_id, 0, cond] = max_contr
            values[trait_id, 1, cond] = max_suppo
            values[trait_id, 2, cond] = max_oppos
        # condition 4 ('nf') is handled by its own function
        max_contr, max_suppo, max_oppos = calculate_max_nofree(
            left[trait_id, :, :],
            right[trait_id, :, :]
        )
        values[trait_id, 0, 4] = max_contr
        values[trait_id, 1, 4] = max_suppo
        values[trait_id, 2, 4] = max_oppos

    return values
317 | 


--------------------------------------------------------------------------------
/tests/test_picking.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Callable
  2 | 
  3 | from init_tests import *
  4 | 
  5 | from scoary.utils import print_tree
  6 | from scoary.scoary import *
  7 | from scoary.analyze_trait import init_result_df, pair_picking
  8 | from scoary.ScoaryTree import ScoaryTree
  9 | from scoary.picking import pick, pick_nonrecursive, pick_single
 10 | 
 11 | from scoary.scoary_1_picking import *
 12 | 
 13 | from timeit import default_timer as timer
 14 | 
 15 | boolify = lambda t1, t2: f"{'A' if t1 else 'a'}{'B' if t2 else 'b'}"
 16 | 
 17 | dummy_tree = [['isolate1', 'isolate2'], ['isolate3', 'isolate4']]
 18 | 
 19 | dummy_trait_a = {
 20 |     'isolate1': True,
 21 |     'isolate2': False,
 22 |     'isolate3': False,
 23 |     'isolate4': True,
 24 | }
 25 | 
 26 | dummy_trait_b_df = pd.DataFrame(
 27 |     [
 28 |         [True, True, False, False],
 29 |         [True, False, True, False],
 30 |         [True, False, False, True],
 31 |         [False, True, True, False],
 32 |         [False, True, False, True],
 33 |         [False, True, False, True],
 34 |         [False, True, False, True],
 35 |         [False, True, False, True],
 36 |     ], columns=['isolate1', 'isolate2', 'isolate3', 'isolate4']
 37 | )
 38 | 
 39 | 
 40 | def time_fn(fn: Callable, args=None, kwargs=None, n_times: int = 1) -> (float, Any):
 41 |     if kwargs is None:
 42 |         kwargs = {}
 43 |     if args is None:
 44 |         args = []
 45 | 
 46 |     diffs = []
 47 |     for i in range(n_times):
 48 |         start = timer()
 49 |         res = fn(*args, **kwargs)
 50 |         end = timer()
 51 |         diffs.append(end - start)  # Time in seconds, e.g. 5.38091952400282
 52 |     return np.mean(diffs), res
 53 | 
 54 | 
 55 | def scoary_1_pick(tree: [], label_to_trait_a: {str: bool}, trait_b_df: pd.DataFrame):
 56 |     labels = set(trait_b_df.columns)
 57 | 
 58 |     max_contrasting = np.empty(shape=len(trait_b_df), dtype='int')
 59 |     max_supporting = np.empty(shape=len(trait_b_df), dtype='int')
 60 |     max_opposing = np.empty(shape=len(trait_b_df), dtype='int')
 61 | 
 62 |     for i, (_, label_to_trait) in enumerate(trait_b_df.iterrows()):
 63 |         gtc = {l: boolify(label_to_trait_a[l], label_to_trait[l]) for l in labels}
 64 |         phylo_tree, result_dict = convert_upgma_to_phylotree(tree, gtc)
 65 | 
 66 |         max_contrasting[i] = result_dict['Total']
 67 |         max_supporting[i] = result_dict['Pro']
 68 |         max_opposing[i] = result_dict['Anti']
 69 | 
 70 |     return max_contrasting, max_supporting, max_opposing
 71 | 
 72 | 
 73 | class Test(TestCase):
 74 |     def test_demo(self):
 75 |         tree = [['isolate1', 'isolate2'], [['isolate3', 'isolate4'], ['isolate5', 'isolate6']]]
 76 |         label_to_trait_a = {
 77 |             'isolate1': True,
 78 |             'isolate2': False,
 79 |             'isolate3': True,
 80 |             'isolate4': False,
 81 |             'isolate5': True,
 82 |             'isolate6': False,
 83 |         }
 84 |         label_to_trait_b = {
 85 |             'isolate1': True,
 86 |             'isolate2': False,
 87 |             'isolate3': True,
 88 |             'isolate4': False,
 89 |             'isolate5': True,
 90 |             'isolate6': False,
 91 |         }
 92 | 
 93 |         print_tree(
 94 |             ScoaryTree.from_list(tree),
 95 |             label_to_trait_a, label_to_trait_b
 96 |         )
 97 |         result = pick_single(tree, label_to_trait_a, label_to_trait_b, calc_pvals=True)
 98 |         print(result)
 99 | 
100 |     def test_simple(self):
101 |         mc_1, ms_1, mo_1 = scoary_1_pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df)
102 |         mc_2, ms_2, mo_2 = pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df,
103 |                                 calc_pvals=False)
104 | 
105 |         self.assertTrue(all(np.equal(mc_1, mc_2)), msg='contrasting')
106 |         self.assertTrue(all(np.equal(ms_1, ms_2)), msg='supporting')
107 |         self.assertTrue(all(np.equal(mo_1, mo_2)), msg='opposing')
108 | 
109 |     def test_benchmark_tetracycline(self, run_scoary_1=True):
110 |         # HP Spectre x360 15-df0709nz (i7 8Gen 8565U)
111 |         # Scoary1 took 23.241052357999614 sec
112 |         # Scoary2 took 0.49214521629996855 sec
113 |         # Scoary1 vs Scoary2: 47.22397290118921x improvement
114 | 
115 |         # HP Spectre x360 14-ef2759nz (i7 13Gen 1355U)
116 |         # Scoary1 took 9.602997977599989 sec
117 |         # Scoary2 took 0.31401244160000485 sec
118 |         # Scoary1 vs Scoary2: 30.58158437503083x improvement
119 | 
120 |         tetr_tree = get_json('../data/tetracycline/expected_result.json')['as_list']
121 |         _, tetr_genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count',
122 |                                       ignore=roary_ignore)
123 |         _, tetr_traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,')
124 | 
125 |         tetr_label_to_gene = tetr_traits_df['Tetracycline_resistance'].to_dict()
126 | 
127 |         # jit compile
128 |         pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df, calc_pvals=False)
129 | 
130 |         if run_scoary_1:
131 |             print('Scoary1')
132 |             time_1, res = time_fn(
133 |                 scoary_1_pick,
134 |                 kwargs=dict(tree=tetr_tree, label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df),
135 |                 n_times=5
136 |             )
137 |             mc_1, ms_1, mo_1 = res
138 |         else:
139 |             time_1 = 19.
140 | 
141 |         print('Scoary2')
142 |         time_2, res = time_fn(
143 |             pick,
144 |             kwargs=dict(tree=tetr_tree, label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df,
145 |                         calc_pvals=False),
146 |             n_times=20
147 |         )
148 |         mc_2, ms_2, mo_2 = res
149 | 
150 |         print(f'Scoary1 took {time_1} sec')
151 |         print(f'Scoary2 took {time_2} sec')
152 |         print(f'Scoary1 vs Scoary2: {time_1 / time_2}x improvement')  # 33.88 x
153 | 
154 |         if run_scoary_1:
155 |             self.assertTrue(all(np.equal(mc_1, mc_2)), msg='contrasting')
156 |             self.assertTrue(all(np.equal(ms_1, ms_2)), msg='supporting')
157 |             self.assertTrue(all(np.equal(mo_1, mo_2)), msg='opposing')
158 | 
159 |     def test_tetracycline_norecursive(self, run_scoary_1=True):
160 |         # Scoary1 took 23.021255266200022 sec
161 |         # Scoary2nonrec took 0.5782416850000118 sec
162 |         # Scoary1 vs Scoary2nonrec: 39.81251415002976x improvement
163 |         tetr_tree = get_json('../data/tetracycline/expected_result.json')['as_list']
164 |         _, tetr_genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count',
165 |                                       ignore=roary_ignore)
166 |         _, tetr_traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,')
167 | 
168 |         tetr_label_to_gene = tetr_traits_df['Tetracycline_resistance'].to_dict()
169 | 
170 |         # jit compile
171 |         pick(tree=dummy_tree, label_to_trait_a=dummy_trait_a, trait_b_df=dummy_trait_b_df, calc_pvals=False)
172 | 
173 |         if run_scoary_1:
174 |             print('Scoary1')
175 |             time_1, res = time_fn(
176 |                 scoary_1_pick,
177 |                 kwargs=dict(tree=tetr_tree, label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df),
178 |                 n_times=5
179 |             )
180 |             mc_1, ms_1, mo_1 = res
181 |         else:
182 |             time_1 = 19.
183 | 
184 |         print('Scoary2')
185 |         tetr_scoary_tree = ScoaryTree.from_list(tetr_tree)
186 |         time_2, res = time_fn(
187 |             pick_nonrecursive,
188 |             kwargs=dict(tree=tetr_scoary_tree, label_to_trait_a=tetr_label_to_gene, trait_b_df=tetr_genes_df,
189 |                         calc_pvals=False),
190 |             n_times=20
191 |         )
192 |         mc_2, ms_2, mo_2 = res
193 | 
194 |         print(f'Scoary1 took {time_1} sec')
195 |         print(f'Scoary2nonrec took {time_2} sec')
196 |         print(f'Scoary1 vs Scoary2nonrec: {time_1 / time_2}x improvement')
197 | 
198 |         if run_scoary_1:
199 |             self.assertTrue(all(np.equal(mc_1, mc_2)), msg='contrasting')
200 |             self.assertTrue(all(np.equal(ms_1, ms_2)), msg='supporting')
201 |             self.assertTrue(all(np.equal(mo_1, mo_2)), msg='opposing')
202 | 
203 |     def test_pairs_paper(self):
204 |         scoary_tree = ScoaryTree.from_list(
205 |             [[[[[[['1', '2'], ['3', '4']], '5'], '6'], '7'], '8'],
206 |              [[[[['9', [['10', '11'], '12']], '13'], '14'], '15'], [['16', '17'], [['18', ['19', '20']], '21']]]]
207 |         )
208 |         print(scoary_tree)
209 |         labels = scoary_tree.labels()
210 |         assert labels == [str(v) for v in list(range(1, 22))]
211 | 
212 |         seq = [(0, 0), (0, 0), (1, 1), (1, 1), (1, 1), (0, 0), (0, 0), (1, 1), (1, 1), (0, 0), (1, 0), (0, 1), (0, 0),
213 |                (1, 1), (1, 1), (0, 0), (0, 0),
214 |                (1, 1), (1, 1), (0, 0), (1, 1), ]
215 | 
216 |         label_to_gene = {lab: bool(tup[0]) for tup, lab in zip(seq, labels)}
217 |         label_to_trait = {lab: bool(tup[1]) for tup, lab in zip(seq, labels)}
218 | 
219 |         print_tree(scoary_tree, label_to_gene, label_to_trait)
220 | 
221 |         res = pick(
222 |             scoary_tree.to_list,
223 |             label_to_trait_a=label_to_trait,
224 |             trait_b_df=pd.DataFrame(label_to_gene, index=['fakegene']),
225 |             calc_pvals=False
226 |         )
227 | 
228 |         max_comparisons = res[0][0]
229 |         max_supporting = res[1][0]
230 |         max_opposing = res[2][0]
231 | 
232 |         print_tree(scoary_tree, label_to_gene, label_to_trait)
233 | 
234 |         self.assertEqual(7, max_comparisons, msg='max_comparisons of pairs failed')
235 |         self.assertEqual(7, max_supporting, msg='max_supporting of pairs failed')
236 |         self.assertEqual(1, max_opposing, msg='max_opposing of pairs failed')
237 | 
238 |     def test_pairs_scoary1(self):
239 |         _, genes_df = load_genes('../data/tetracycline/Gene_presence_absence.csv', gene_data_type='gene-count', ignore=roary_ignore)
240 |         _, traits_df = load_traits('../data/tetracycline/Tetracycline_resistance.csv', trait_data_type='binary:,')
241 |         expected_result = pd.read_csv('../data/tetracycline/fisher_permute100.results.csv')
242 | 
243 |         scoary_tree = ScoaryTree.from_presence_absence(genes_df)
244 |         label_to_trait = traits_df.Tetracycline_resistance.apply(bool).to_dict()
245 | 
246 |         assert set(scoary_tree.labels()) == set(traits_df.index)
247 |         assert not traits_df.Tetracycline_resistance.hasnans
248 | 
249 |         for i, row in expected_result.iterrows():
250 |             gene = row.Gene
251 |             print(gene)
252 |             old_max_comparisons = row.Max_Pairwise_comparisons
253 |             old_max_supporting = row.Max_supporting_pairs
254 |             old_max_opposing = row.Max_opposing_pairs
255 |             old_best = row.Best_pairwise_comp_p
256 |             old_worst = row.Worst_pairwise_comp_p
257 | 
258 |             label_to_gene = genes_df.loc[gene].apply(bool).to_dict()
259 | 
260 |             res = pick(
261 |                 scoary_tree.to_list,
262 |                 label_to_trait_a=label_to_trait,
263 |                 trait_b_df=pd.DataFrame(label_to_gene, index=['fakegene']),
264 |                 calc_pvals=True
265 |             )
266 | 
267 |             comparisons = {
268 |                 'max_comparisons': (old_max_comparisons, res[0][0]),
269 |                 'max_supporting': (old_max_supporting, res[1][0]),
270 |                 'max_opposing': (old_max_opposing, res[2][0]),
271 |                 'best': (old_best, res[3][0]),
272 |                 'worst': (old_worst, res[4][0]),
273 |             }
274 | 
275 |             for comparison, (old, new) in comparisons.items():
276 |                 if not np.isclose(old, new):
277 |                     print(gene, comparison, old, new, scoary_tree)
278 |                     print_tree(scoary_tree, label_to_gene, label_to_trait)
279 |                     self.fail(msg=f'Disagreement between Scoary1 and Scoary2')
280 | 
281 |     def test_scoary1_generated(self):
282 |         _, genes_df = load_genes('../data/bigger_ds/pres_abs.csv', 'gene-count:,')
283 |         _, traits_df = load_traits('../data/bigger_ds/trait_trees.csv', trait_data_type='binary:,')
284 | 
285 |         for trait_name in ['t1', 't2']:
286 |             label_to_trait = traits_df[trait_name].apply(bool).to_dict()
287 |             expected_result = pd.read_csv(f'../data/bigger_ds/{trait_name}.results.csv')
288 | 
289 |             with open('../data/bigger_ds/newick.nwk') as f:
290 |                 newick = f.read()
291 |             scoary_tree = ScoaryTree.from_newick(newick)
292 |             # scoary_tree = ScoaryTree.from_presence_absence(genes_df)
293 | 
294 |             result_df = init_result_df(genes_df, pd.Series(label_to_trait, dtype='boolean'))
295 |             result_df = pair_picking(result_df, genes_df, scoary_tree, label_to_trait)
296 | 
297 |             assert set(scoary_tree.labels()) == set(traits_df.index)
298 | 
299 |             for i, row in expected_result.sample(frac=1, random_state=42).iterrows():
300 |                 gene = row.Gene
301 |                 old_max_comparisons = row.Max_Pairwise_comparisons
302 |                 old_max_supporting = row.Max_supporting_pairs
303 |                 old_max_opposing = row.Max_opposing_pairs
304 |                 old_best = row.Best_pairwise_comp_p
305 |                 old_worst = row.Worst_pairwise_comp_p
306 | 
307 |                 new_row = result_df[result_df['Gene'] == gene].iloc[0]
308 | 
309 |                 comparisons = {
310 |                     'max_comparisons': (old_max_comparisons, new_row.contrasting),
311 |                     'max_supporting': (old_max_supporting, new_row.supporting),
312 |                     'max_opposing': (old_max_opposing, new_row.opposing),
313 |                     'best': (old_best, new_row.best),
314 |                     'worst': (old_worst, new_row.worst),
315 |                 }
316 | 
317 |                 for comparison, (old, new) in comparisons.items():
318 |                     if not np.isclose(old, new):
319 |                         print(f'Error on {gene=} / {comparison=}')
320 |                         print(comparisons)
321 |                         label_to_gene = genes_df.loc[gene].apply(bool).to_dict()
322 |                         print_tree(scoary_tree, label_to_gene, label_to_trait)
323 |                         self.fail(msg=f'Disagreement between Scoary1 and Scoary2')
324 | 


--------------------------------------------------------------------------------
/scoary/analyze_trait.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import logging
  4 | from collections import defaultdict
  5 | import numpy as np
  6 | import pandas as pd
  7 | from statsmodels.stats.multitest import multipletests
  8 | from fast_fisher.fast_fisher_numba import odds_ratio, test1t as fisher_exact_two_tailed
  9 | from queue import Empty
 10 | 
 11 | from .ScoaryTree import ScoaryTree
 12 | from .picking import pick
 13 | from .permutations import permute_picking
 14 | from .progressbar import print_progress
 15 | from .utils import setup_logging, AnalyzeTraitNamespace, fisher_id, grasp_namespace
 16 | 
 17 | logger = logging.getLogger('scoary.analyze_trait')
 18 | 
 19 | 
 20 | def worker(
 21 |         q,
 22 |         ns: AnalyzeTraitNamespace,
 23 |         step: int,
 24 |         result_container: {dict | str | None},
 25 |         proc_id: int
 26 | ):
 27 |     logger = setup_logging(
 28 |         logger=logging.getLogger('scoary'),
 29 |         path=f'{ns.outdir}/logs/scoary-2_proc{proc_id}.log',
 30 |         print_info=False,
 31 |         reset=True
 32 |     )
 33 |     logger.info(f'Setting up trait analysis worker {proc_id}')
 34 | 
 35 |     new_ns = grasp_namespace(AnalyzeTraitNamespace, ns)
 36 |     del ns
 37 | 
 38 |     analyze_trait_fn = analyze_trait_step_1_fisher if step == 1 else analyze_trait_step_2_pairpicking
 39 | 
 40 |     local_result_container = {}
 41 | 
 42 |     while True:
 43 |         try:
 44 |             trait = q.get_nowait()
 45 |         except Empty:
 46 |             break  # completely done
 47 | 
 48 |         local_result_container[trait] = analyze_trait_fn(trait, new_ns, proc_id)
 49 |         q.task_done()
 50 | 
 51 |     result_container.update(local_result_container)
 52 | 
 53 | 
def analyze_trait_step_1_fisher(trait: str, ns: AnalyzeTraitNamespace, proc_id: int | None = None) -> pd.DataFrame | str | bool:
    """
    Step 1 of the analysis of one trait: Fisher's test for every gene.

    Writes '{ns.outdir}/traits/{trait}/result.tsv' unless the function returns early.

    :param trait: name of the trait (a column of ns.traits_df)
    :param ns: namespace with the shared data and settings
    :param proc_id: if not None, it is prepended to the progress message
    :return: one of
     - ns.duplicates[trait] if the trait is listed in ns.duplicates (presumably the name of
       the trait whose results are re-used; verify against the code that fills ns.duplicates)
     - False if no genes are left, either initially or after trait-wise correction
     - True if ns.trait_wise_correction is set and genes were retained
     - otherwise a DataFrame with columns '__pattern_id__' and 'fisher_p' (one row per
       pattern id), i.e. the uncorrected p-values
    """
    logger.debug(f"Analyzing {trait=}, step 1: Fisher's test")
    # the counter and the progress output are updated while holding ns.lock
    with ns.lock:
        ns.counter.value += 1
        message = trait if proc_id is None else f'P{proc_id} | {trait}'
        print_progress(
            ns.counter.value, ns.queue_size,
            message=message, start_time=ns.start_time, message_width=25
        )

    if trait in ns.duplicates:
        logger.debug(f'Duplicated trait: {trait} -> {ns.duplicates[trait]}')
        save_duplicated_result(trait, ns)
        return ns.duplicates[trait]

    # Prepare result_df (later written to result.tsv); isolates without a trait value are dropped
    isolate_trait_series = ns.traits_df[trait].dropna()
    result_df = init_result_df(ns.genes_bool_df, isolate_trait_series)

    # Sometimes, binarization gives extreme results and no genes are left
    if len(result_df) == 0:
        logger.info(f'Found 0 genes for {trait=}!')
        return False

    # Compute Fisher's test efficiently: test_df holds one row per unique contingency table,
    # the results are merged back onto the genes
    test_df = create_test_df(result_df)
    test_df = add_odds_ratio(test_df)
    result_df = pd.merge(test_df, result_df, how="left", on='__contingency_table__', copy=False)

    # Perform multiple testing correction (one p-value per pattern id)
    multiple_testing_df = result_df[['__pattern_id__', 'fisher_p']].drop_duplicates('__pattern_id__')
    if ns.trait_wise_correction:
        multiple_testing_df = multiple_testing_correction(
            multiple_testing_df, 'fisher_p', 'fisher_q',
            ns.mt_f_method, ns.mt_f_cutoff, True
        )
        if len(multiple_testing_df) == 0:
            logger.info(f'Found 0 genes for {trait=} after multiple testing correction!')
            return False

        # result_df already has 'fisher_p'; drop it here so the merge does not duplicate the column
        multiple_testing_df.drop('fisher_p', axis=1, inplace=True)
        result_df = pd.merge(multiple_testing_df, result_df, how="left", on='__pattern_id__', copy=False)
        result = True
    else:
        # no correction here: the uncorrected p-values are handed back to the caller
        result = multiple_testing_df

    # NOTE: os.makedirs is called without exist_ok, so this raises if the directory already exists
    os.makedirs(f'{ns.outdir}/traits/{trait}')
    result_df.to_csv(f'{ns.outdir}/traits/{trait}/result.tsv', sep='\t', index=False)

    return result
104 | 
105 | 
def analyze_trait_step_2_pairpicking(trait: str, ns: AnalyzeTraitNamespace, proc_id: int = None) -> dict | str | None:
    """
    Step 2 of the analysis of one trait: pair picking and, optionally, permutations.

    Reads '{ns.outdir}/traits/{trait}/result.tsv', extends the table and saves it again
    through save_result_df.

    :param trait: name of the trait (a column of ns.traits_df)
    :param ns: namespace with the shared data and settings
    :param proc_id: if not None, it is prepended to the progress message
    :return: summary_data (dict), or None if ns.worst_cutoff removed every gene. Keys:
     - 'best_fisher_p', 'best_fisher_q': set if ns.pairwise is falsy, or if ns.n_permut is truthy
     - 'best_empirical_p', 'best_fq*ep': set only if ns.pairwise and ns.n_permut are truthy
     - 'max_genes': set only if the table was trimmed to ns.max_genes
     NOTE: with ns.pairwise truthy and ns.n_permut falsy, no 'best_*' keys are set.
    """
    logger.debug(f'Analyzing {trait=}, step 2: Pair picking')
    # the counter and the progress output are updated while holding ns.lock
    with ns.lock:
        ns.counter.value += 1
        message = trait if proc_id is None else f'P{proc_id} | {trait}'
        print_progress(
            ns.counter.value, ns.queue_size,
            message=message, start_time=ns.start_time, message_width=25
        )
    summary_data = {}

    result_df = pd.read_csv(f'{ns.outdir}/traits/{trait}/result.tsv', sep='\t')

    if ns.trait_wise_correction:
        assert 'fisher_q' in result_df.columns, f'{result_df.columns=} must contain "fisher_q"!'
    else:
        # 'fisher_q' is not in the file in this mode; it must come from ns.multiple_testing_df
        # (see the assert below) and is merged in by pattern id
        multiple_testing_df = ns.multiple_testing_df.loc[trait, :]
        result_df = pd.merge(multiple_testing_df, result_df, how="left", on='__pattern_id__', copy=False)

    assert 'fisher_p' in result_df.columns, f'{result_df.columns=} must contain "fisher_p"!'
    assert 'fisher_q' in result_df.columns, f'{result_df.columns=} must contain "fisher_q"!'

    if not ns.pairwise:
        # Fisher's test only: report the row with the smallest p-value
        min_row = result_df.loc[result_df['fisher_p'].idxmin()]
        summary_data['best_fisher_p'] = min_row['fisher_p']
        summary_data['best_fisher_q'] = min_row['fisher_q']
    else:
        trait_series = ns.traits_df[trait].dropna()
        isolates = set(trait_series.index)
        # prune the tree only if some isolates have no value for this trait
        if ns.all_labels == isolates:
            pruned_tree = ns.tree
        else:
            pruned_tree = ns.tree.prune(labels=isolates)

        result_df = pair_picking(
            result_df,
            significant_genes_df=ns.genes_bool_df.loc[result_df.Gene],
            tree=pruned_tree,
            label_to_trait=trait_series
        )

        if ns.worst_cutoff:
            keep = result_df['worst'] <= ns.worst_cutoff
            if not keep.any():
                logger.info(f'Found 0 genes for {trait=} '
                            f'after worst_cutoff={ns.worst_cutoff} filtration')
                return None
            result_df = result_df[keep]

        # the trimming below keeps the first rows, which are the best ones only if sorted by fisher_p
        assert result_df.fisher_p.is_monotonic_increasing, f'{result_df.fisher_p=} must be monotonic increasing!'

        if ns.max_genes:
            if len(result_df) > ns.max_genes:
                logger.info(f'Found too {len(result_df)} genes for {trait=} '
                            f'keeping only {ns.max_genes} with best Fisher\'s test.')
                summary_data['max_genes'] = f'Trimmed {len(result_df)} genes to {ns.max_genes}.'
                result_df = result_df.iloc[:ns.max_genes]

        if ns.n_permut:
            result_df['empirical_p'] = permute_picking(
                trait=trait,
                result_df=result_df,
                tree=pruned_tree,
                label_to_trait=trait_series,
                n_permut=ns.n_permut,
                random_state=ns.random_state,
                genes_bool_df=ns.genes_bool_df
            )

            # combined score: product of fisher_q and empirical_p; rows are sorted by it, smallest first
            result_df['fq*ep'] = result_df['fisher_q'] * result_df['empirical_p']
            result_df.sort_values(by='fq*ep', inplace=True)

            best_row = result_df.iloc[0]
            summary_data['best_fisher_p'] = best_row['fisher_p']
            summary_data['best_fisher_q'] = best_row['fisher_q']
            summary_data['best_empirical_p'] = best_row['empirical_p']
            summary_data['best_fq*ep'] = best_row['fq*ep']

    save_result_df(trait, ns, result_df)

    # return minimal pvalues
    return summary_data
188 | 
189 | 
def _save_trait(trait: str, ns: AnalyzeTraitNamespace):
    """
    Write the values of one trait to '{ns.outdir}/traits/{trait}/values.tsv'.

    The table is indexed like ns.traits_df (index name 'isolate') and has a 'binary' column,
    plus a 'numeric' column if ns.numeric_df is available.
    """
    values_df = pd.DataFrame(index=ns.traits_df.index)

    sources = [('binary', ns.traits_df)]
    if ns.numeric_df is not None:
        sources.append(('numeric', ns.numeric_df))
    for column, source_df in sources:
        values_df[column] = source_df[trait]

    values_df.index.name = 'isolate'
    values_df.to_csv(f'{ns.outdir}/traits/{trait}/values.tsv', sep='\t')
197 | 
198 | 
def save_result_df(trait: str, ns: AnalyzeTraitNamespace, result_df: pd.DataFrame):
    """
    Write all output files of one trait into '{ns.outdir}/traits/{trait}/'.

    Files: result.tsv (annotated, columns in fixed order), meta.json, coverage-matrix.tsv
    and, through _save_trait, values.tsv.

    :param trait: name of the trait
    :param ns: namespace with the shared data and settings
    :param result_df: result table of the trait; must contain a 'Gene' column
    """
    trait_dir = f'{ns.outdir}/traits/{trait}'

    # add annotations
    additional_columns = []
    if ns.gene_info_df is not None:
        additional_columns = ns.gene_info_df.columns.to_list()
        result_df = result_df.merge(ns.gene_info_df, left_on='Gene', right_index=True, how='left', copy=False)

    # reorder columns; columns that are not in this list are dropped
    col_order = [
        'Gene', *additional_columns,
        'g+t+', 'g+t-', 'g-t+', 'g-t-',
        'sensitivity', 'specificity', 'odds_ratio',
        'fisher_p', 'fisher_q', 'empirical_p', 'fq*ep',
        'contrasting', 'supporting', 'opposing', 'best', 'worst',
    ]
    available = [col for col in col_order if col in result_df.columns]
    result_df = result_df[available]

    result_df.to_csv(f'{trait_dir}/result.tsv', sep='\t', index=False)

    binarization_info = ns.traits_df.attrs['binarization_info']
    if type(binarization_info) is str:
        # a plain string carries no per-trait information: report 'none' for every trait
        binarization_info = defaultdict(lambda: 'none')

    with open(f'{trait_dir}/meta.json', 'w') as f:
        meta_data = dict()
        meta_data['genes-content-type'] = ns.genes_orig_df.attrs['content_type']
        meta_data['binarization-method'] = ns.traits_df.attrs['binarization_method']
        meta_data['binarization-info'] = binarization_info[trait]

        # add trait info; traits without an entry are skipped silently
        if ns.trait_info_df is not None:
            try:
                info = ns.trait_info_df.loc[trait].to_dict()
                meta_data['info'] = {key: value for key, value in info.items() if not pd.isna(value)}
            except KeyError:
                pass

        json.dump(meta_data, f, indent=4, allow_nan=False)

    # original gene data of the reported genes, transposed so that isolates are rows
    is_reported = ns.genes_orig_df.index.isin(result_df.Gene)
    coverage_matrix = ns.genes_orig_df[is_reported].T
    coverage_matrix.index.name = 'Isolate'
    coverage_matrix.to_csv(f'{trait_dir}/coverage-matrix.tsv', sep='\t')

    _save_trait(trait, ns)
241 | 
242 | 
def save_duplicated_result(trait: str, ns: AnalyzeTraitNamespace):
    """
    Create the output directory of a duplicated trait by linking to its reference trait.

    result.tsv, meta.json and coverage-matrix.tsv are relative symlinks into the directory of
    ns.duplicates[trait]. values.tsv is a symlink too if ns.numeric_df is None; otherwise it is
    written for this trait by _save_trait.
    """
    trait_dir = f'{ns.outdir}/traits/{trait}'
    os.makedirs(trait_dir)

    # use data from previous duplicate
    ref_trait = ns.duplicates[trait]
    linked_files = ['result.tsv', 'meta.json', 'coverage-matrix.tsv']

    # create values.tsv only if numeric trait
    writes_own_values = ns.numeric_df is not None
    if not writes_own_values:
        linked_files.append('values.tsv')

    for file_name in linked_files:
        os.symlink(src=f'../{ref_trait}/{file_name}', dst=f'{trait_dir}/{file_name}')

    if writes_own_values:
        _save_trait(trait, ns)
256 | 
257 | 
def init_result_df(genes_bool_df: pd.DataFrame, trait_series: pd.Series) -> pd.DataFrame:
    """
    Create result_df with one row per informative gene.

    Genes that are present in none or in all of the isolates of trait_series are removed.
    If trait_series is empty, an empty result_df with the usual columns is returned.

    :param genes_bool_df: DataFrame (dtype: bool); columns: strains; rows: genes
    :param trait_series: Boolean Series (dtype 'boolean', no NANs) that indicates which isolates have the trait
    :return: result_df (DataFrame) with a default integer index; columns: the former index of
     genes_bool_df (named after the index, e.g. 'Gene'), 'g+t+', 'g+t-', 'g-t+', 'g-t-',
     '__pattern_id__', '__contingency_table__', 'sensitivity', 'specificity'
    """
    assert trait_series.dtype == 'boolean', 'trait_series must be boolean pandas.Series!'
    assert not trait_series.hasnans, 'trait_series may not contain NANs!'

    # Preparation
    trait_pos = trait_series.index[trait_series]
    trait_neg = trait_series.index[~trait_series]
    n_pos = len(trait_pos)
    n_neg = len(trait_neg)
    n_tot = n_pos + n_neg
    assert n_tot == len(trait_series)

    # Create result_df
    result_df = pd.DataFrame(index=genes_bool_df.index)
    result_df['g+t+'] = genes_bool_df[trait_pos].sum(axis=1)  # trait positive gene positive
    result_df['g+t-'] = genes_bool_df[trait_neg].sum(axis=1)  # trait negative gene positive
    result_df['g-t+'] = n_pos - result_df['g+t+']  # trait positive gene negative
    result_df['g-t-'] = n_neg - result_df['g+t-']  # trait negative gene negative

    # Remove genes that are shared by none or all
    gene_sum = result_df['g+t+'] + result_df['g+t-']
    to_keep = (gene_sum != 0) & (gene_sum != n_tot)
    # copy: columns are added below, which must not happen on a slice of the unfiltered frame
    result_df = result_df[to_keep].copy()

    # Add unique pattern ID: genes with the same presence/absence pattern share one ID
    genes_bool_df_reduced = genes_bool_df.loc[to_keep, trait_pos.to_list() + trait_neg.to_list()]
    if genes_bool_df_reduced.empty:
        # groupby() raises if there are no isolates to group by (empty trait_series);
        # without rows there is nothing to number anyway
        pattern_id = pd.Series(index=result_df.index, dtype='int64')
    else:
        pattern_id = genes_bool_df_reduced.groupby(by=genes_bool_df_reduced.columns.to_list()).ngroup()
    result_df['__pattern_id__'] = pattern_id

    # Add contingency table, sensitivity and specificity
    result_df['__contingency_table__'] = [tuple(x) for x in result_df[['g+t+', 'g+t-', 'g-t+', 'g-t-']].to_numpy()]
    if n_pos:
        pos_sensitivity = (result_df['g+t+'] / n_pos * 100)  # use if positive g/t correlation
        neg_sensitivity = (result_df['g-t+'] / n_pos * 100)  # use if negative g/t correlation
    else:
        pos_sensitivity = neg_sensitivity = pd.Series(0, index=result_df.index)

    if n_neg:
        pos_specificity = (result_df['g-t-'] / n_neg * 100)  # use if positive g/t correlation
        neg_specificity = (result_df['g+t-'] / n_neg * 100)  # use if negative g/t correlation
    else:
        pos_specificity = neg_specificity = pd.Series(0, index=result_df.index)

    # report the values of the direction (positive or negative correlation) with the larger sum
    keep_pos = (pos_sensitivity + pos_specificity) > (neg_sensitivity + neg_specificity)
    result_df["sensitivity"] = pos_sensitivity.where(keep_pos, neg_sensitivity)
    result_df["specificity"] = pos_specificity.where(keep_pos, neg_specificity)

    # Reset index so that Gene is its own column
    return result_df.reset_index()
315 | 
316 | 
def create_test_df(result_df: pd.DataFrame, sort=True) -> pd.DataFrame:
    """
    Create test_df with one row per unique contingency table and its Fisher's test p-value.

    Tables are first mapped to their fisher_id, and the test is computed only once per
    distinct fisher_id.

    :param result_df: DataFrame with column '__contingency_table__'
    :param sort: whether to sort the DataFrame by pvalue
    :return: test_df (DataFrame); columns: ['__contingency_table__', 'fisher_p']
    """

    def to_fisher_id(table):
        return fisher_id(*table)

    unique_tables = result_df['__contingency_table__'].unique()
    test_df = pd.DataFrame(unique_tables, columns=['__contingency_table__'])

    # temporary helper column: the fisher_id of each table
    test_df['__fisher_unique_table__'] = test_df['__contingency_table__'].apply(to_fisher_id)

    # calculate Fisher's exact test once per distinct fisher_id
    table_to_pval = {}
    for table in test_df['__fisher_unique_table__'].unique():
        table_to_pval[table] = fisher_exact_two_tailed(*table)

    def lookup_pval(table):
        return table_to_pval[table]

    # add Fisher's exact test
    test_df['fisher_p'] = test_df['__fisher_unique_table__'].apply(lookup_pval)

    # the helper column is no longer needed
    test_df.drop(columns=['__fisher_unique_table__'], inplace=True)

    if not sort:
        return test_df

    # sort test_df by pvalue
    test_df.sort_values(by='fisher_p', inplace=True)
    return test_df
348 | 
349 | 
def add_odds_ratio(test_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add column 'odds_ratio', computed from each row's '__contingency_table__'.

    :param test_df: DataFrame with column '__contingency_table__'; modified in place
    :return: the same DataFrame
    """

    def table_to_odds_ratio(table):
        return odds_ratio(*table)

    test_df['odds_ratio'] = test_df['__contingency_table__'].apply(table_to_odds_ratio)
    return test_df
354 | 
355 | 
def multiple_testing_correction(
        df: pd.DataFrame,
        pval_column: str,
        qval_column: str,
        method: str,
        cutoff: float,
        is_sorted: bool = False
) -> pd.DataFrame:
    """
    Add corrected p-values to df and return the rows that pass the cutoff.

    :param df: DataFrame with the p-values; qval_column is added to it in place
    :param pval_column: name of the column that contains the p-values
    :param qval_column: name of the column to write the corrected p-values to
    :param method: 'native' or a method name understood by statsmodels' multipletests.
     'native': rows are selected by comparing the uncorrected p-value with the cutoff;
     the corrected values written to qval_column are Bonferroni-corrected.
    :param cutoff: threshold; passed to multipletests as alpha unless method is 'native'
    :param is_sorted: passed on to multipletests
    :return: an independent copy of the retained rows, including qval_column
    """
    assert pval_column in df.columns, f'{pval_column=} must be in {df.columns=}!'
    if qval_column in df.columns:
        logger.warning(f'Overwriting {qval_column=} in {df.columns=}!')

    pvals = df[pval_column]

    # Apply multiple testing correction for each orthogene
    if method == 'native':
        reject = pvals <= cutoff
        _, qval, _, _ = multipletests(pvals=pvals, alpha=1, method='bonferroni', is_sorted=is_sorted)
    else:
        reject, qval, _, _ = multipletests(
            pvals=pvals, alpha=cutoff, method=method, is_sorted=is_sorted,
        )

    df[qval_column] = qval
    # copy: callers modify the returned frame in place, which should not happen on a slice of df
    return df[reject].copy()
382 | 
383 | 
def pair_picking(result_df: pd.DataFrame, significant_genes_df: pd.DataFrame, tree: ScoaryTree,
                 label_to_trait: pd.Series | dict) -> pd.DataFrame:
    """
    Run the pair picking algorithm for all genes in result_df and store the results.

    Required column:
    - Gene (must list the same genes, in the same order, as the index of significant_genes_df)

    Added columns:
    - contrasting
    - supporting
    - opposing
    - best
    - worst

    :return: result_df, which is modified in place
    """
    assert result_df.Gene.to_list() == list(significant_genes_df.index)

    contrasting, supporting, opposing, best_p, worst_p = pick(
        tree=tree.to_list,
        label_to_trait_a=label_to_trait,
        trait_b_df=significant_genes_df,
        calc_pvals=True
    )

    new_columns = {
        'contrasting': contrasting,
        'supporting': supporting,
        'opposing': opposing,
        'best': best_p,
        'worst': worst_p,
    }
    for column, values in new_columns.items():
        result_df[column] = values

    return result_df
411 | 


--------------------------------------------------------------------------------
/scoary/scoary.py:
--------------------------------------------------------------------------------
  1 | from .progressbar import print_progress
  2 | from .utils import *
  3 | from .ScoaryTree import ScoaryTree
  4 | from .load_genes import load_genes
  5 | from .load_traits import load_traits
  6 | from .final_overview import create_final_overview
  7 | from .analyze_trait import analyze_trait_step_1_fisher, analyze_trait_step_2_pairpicking, worker, multiple_testing_correction
  8 | 
  9 | logger = logging.getLogger('scoary')
 10 | 
 11 | 
 12 | def scoary(
 13 |         genes: str,
 14 |         traits: str,
 15 |         outdir: str,
 16 |         multiple_testing: str = 'bonferroni:0.999',
 17 |         trait_wise_correction: bool = False,
 18 |         worst_cutoff: float = None,
 19 |         max_genes: int = None,
 20 |         gene_info: str = None,
 21 |         trait_info: str = None,
 22 |         isolate_info: str = None,
 23 |         newicktree: str = None,
 24 |         pairwise: bool = True,
 25 |         n_permut: int = 500,
 26 |         restrict_to: str = None,
 27 |         ignore: str = None,
 28 |         n_cpus: int = 1,
 29 |         n_cpus_binarization: int = None,
 30 |         trait_data_type: str = 'binary:,',
 31 |         gene_data_type: str = 'gene-count:,',
 32 |         force_binary_clustering: bool = False,
 33 |         symmetric: bool = True,
 34 |         distance_metric: str = 'jaccard',
 35 |         linkage_method: str = 'ward',
 36 |         optimal_ordering: bool = True,
 37 |         corr_method: str = 'pearson',
 38 |         random_state: int = None,
 39 |         limit_traits: (int, int) = None,
 40 |         version: bool = False  # Dummy variable, only used to create docstring (see main function)
 41 | ) -> None:
 42 |     """
 43 |     Scoary2: Associate genes with traits!
 44 | 
 45 |     :param genes: Path to gene presence/absence table: columns=isolates, rows=genes
 46 |     :param traits: Path to trait presence/absence table: columns=traits, rows=isolates
 47 |     :param outdir: Directory to place output files
 48 |     :param multiple_testing: Apply multiple testing to the p-values of Fisher's test to account for the many
 49 |     genes/traits tested. Format: "method:cutoff".
 50 |     Cutoff is a number that specifies the FWER and method is one of [native, bonferroni, sidak, holm-sidak, holm,
 51 |     simes-hochberg, hommel, fdr_bh, fdr_by,  fdr_tsbh, fdr_tsbky].
 52 |     If method is 'native': then, the cutoff targets the uncorrected p-value from Fisher's test.
 53 |     :param trait_wise_correction: Apply multiple testing correction to each trait separately. Not recommended as
 54 |     this can lead to many false positives!
 55 |     :param worst_cutoff: Drop traits if no gene with "worst" p-value lower than threshold. Recommended if
 56 |     dataset contains multiple species
 57 |     :param max_genes: Keep only n highest-scoring genes in Fisher's test. Recommended if dataset is big and contains
 58 |      multiple species; avoids waisting computational resources on traits that simply correlate with phylogeny
 59 |     :param gene_info: Path to file that describes genes: columns=arbitrary properties, rows=genes
 60 |     :param trait_info: Path to file that describes traits: columns=arbitrary properties, rows=traits
 61 |     :param isolate_info: Path to file that describes isolates: columns=arbitrary properties, rows=isolates
 62 |     :param newicktree: Path to a custom tree in Newick format
 63 |     :param pairwise: If False, only perform Fisher's test. If True, also perform pairwise comparisons
 64 |      algorithm.
 65 |     :param n_permut: Post-hoc label-switching test: perform N permutations of the phenotype by random label switching.
 66 |      Low p-values suggest that the effect is not merely lineage-specific.
 67 |     :param restrict_to: Comma-separated list of isolates to which to restrict this analysis
 68 |     :param ignore: Comma-separated list of isolates to be ignored for this analysis
 69 |     :param n_cpus: Number of CPUs that should be used. There is overhead in multiprocessing, so if the dataset is
 70 |     small, use n_cpus=1
 71 |     :param n_cpus_binarization: Number of CPUs that should be used for binarization. Default: one tenth of n_cpus
 72 |     :param trait_data_type: "<method>:<?cutoff>:<?covariance_type>:<?alternative>:<?delimiter>" How to read the traits
 73 |      table. Example: "gene-list:\\t" for OrthoFinder N0.tsv table
 74 |     :param gene_data_type: "<data_type>:<?delimiter>" How to read the genes table. Example: "gene-list:\\t" for
 75 |      OrthoFinder N0.tsv table
 76 |     :param force_binary_clustering: Force clustering of binary data even if numeric data is available
 77 |     :param symmetric: if True, correlated and anti-correlated traits will cluster together
 78 |     :param distance_metric: distance metric (binary data only); See metric in https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
 79 |     :param linkage_method: linkage method for clustering [single, complete, average, weighted, ward, centroid, median]
 80 |     :param optimal_ordering: whether to use optimal ordering; See scipy.cluster.hierarchy.linkage.
 81 |     :param corr_method: correlation method (numeric data only) [pearson, kendall, spearman]
 82 |     :param random_state: Set a fixed seed for the random number generator
 83 |     :param limit_traits: Limit the analysis to traits n to m. Useful for debugging. Example: "(0, 10)"
 84 |     :param version: Print software version of Scoary2 and exit.
 85 |     """
 86 |     SCOARY_PRINT_CITATION = os.environ.get('SCOARY_PRINT_CITATION', 'TRUE') == 'TRUE'
 87 |     if SCOARY_PRINT_CITATION:
 88 |         print(f'Welcome to Scoary2! ({get_version()})')
 89 | 
 90 |     # parse input, create outdir, setup logging
 91 |     trait_data_type = decode_unicode(trait_data_type)
 92 |     gene_data_type = decode_unicode(gene_data_type)
 93 |     if n_cpus_binarization is None:
 94 |         n_cpus_binarization = 1 + n_cpus // 10
 95 |     outdir = setup_outdir(outdir, input=locals())
 96 | 
 97 |     setup_logging(logger, f'{outdir}/logs/scoary-2.log')
 98 | 
 99 |     logger.debug(f'Scoary2 Version: {get_version()}')
100 |     mt_f_method, mt_f_cutoff = parse_correction(multiple_testing, 'multiple_testing')
101 |     assert n_permut == 0 or n_permut >= 100, f'{n_permut=} must be at least 100.'
102 | 
103 |     # start
104 |     start_time = datetime.now()
105 | 
106 |     # load traits data  (numeric_df may be None)
107 |     logger.info('Loading traits...')
108 |     numeric_df, traits_df = load_traits(
109 |         traits=traits,
110 |         trait_data_type=trait_data_type,
111 |         restrict_to=restrict_to,
112 |         ignore=ignore,
113 |         n_cpus=n_cpus_binarization,
114 |         random_state=random_state,
115 |         outdir=outdir,
116 |         limit_traits=limit_traits
117 |     )
118 | 
119 |     # dynamically set recursion limit, should work for ~ 13'000 isolates
120 |     _recursion_limit = max(1000, 100 + len(traits_df.index) ** 2)
121 |     logger.debug(f'Setting recursion limit to {_recursion_limit}')
122 |     sys.setrecursionlimit(_recursion_limit)
123 | 
124 |     if trait_info:
125 |         logger.info('Loading trait info...')
126 |         trait_info = load_info_file(
127 |             logger=logger, info_file=trait_info, merge_col='Trait',
128 |             expected_overlap_set=set(traits_df.columns), reference_file=traits
129 |         )
130 | 
131 |     logger.info('Loading genes...')
132 |     genes_orig_df, genes_bool_df = load_genes(
133 |         genes,
134 |         gene_data_type=gene_data_type,
135 |         restrict_to=traits_df.index,
136 |     )
137 | 
138 |     if gene_info:
139 |         logger.info('Loading gene info...')
140 |         gene_info = load_info_file(
141 |             logger=logger, info_file=gene_info, merge_col='Gene',
142 |             expected_overlap_set=set(genes_bool_df.index), reference_file=genes
143 |         )
144 | 
145 |     if isolate_info:
146 |         logger.info('Loading isolate info...')
147 |         isolate_info = load_info_file(
148 |             logger=logger, info_file=isolate_info, merge_col='Isolate',
149 |             expected_overlap_set=set(genes_bool_df.columns), reference_file='placeholder'
150 |         )
151 | 
152 |     # load phylogeny
153 |     if newicktree is None:
154 |         logger.info('Generating phylogenetic tree from gene presence-absence-matrix...')
155 |         tree = ScoaryTree.from_presence_absence(genes_bool_df)
156 |     else:
157 |         logger.info('Loading phylogenetic tree from newick file...')
158 |         with open(newicktree) as f:
159 |             tree = ScoaryTree.from_newick(f.read())
160 |         tree = tree.prune(genes_bool_df.columns)
161 |     tree.write_newick(f'{outdir}/tree.nwk')
162 | 
163 |     all_labels = set(tree.labels())
164 | 
165 |     traits = traits_df.columns.to_list()
166 |     duplicates = find_duplicates(traits_df)
167 | 
168 |     logger.info('Finalizing setup...')
169 |     if n_cpus == 1:
170 |         ns, counter, lock = AnalyzeTraitNamespace(), MockCounter(), MockLock()
171 |     else:
172 |         from .init_multiprocessing import init, mp
173 |         mgr, ns, counter, lock = init()
174 | 
175 |     ns = AnalyzeTraitNamespace.create_namespace(ns, {
176 |         'start_time': datetime.now(),
177 |         'counter': counter,
178 |         'queue_size': len(traits),
179 |         'lock': lock,
180 |         'outdir': outdir,
181 |         'genes_orig_df': genes_orig_df,
182 |         'genes_bool_df': genes_bool_df,
183 |         'gene_info_df': gene_info,
184 |         'numeric_df': numeric_df,
185 |         'traits_df': traits_df,
186 |         'trait_info_df': trait_info,
187 |         'duplicates': duplicates,
188 |         'tree': tree,
189 |         'all_labels': all_labels,
190 |         'mt_f_method': mt_f_method,
191 |         'mt_f_cutoff': mt_f_cutoff,
192 |         'trait_wise_correction': trait_wise_correction,
193 |         'max_genes': max_genes,
194 |         'worst_cutoff': worst_cutoff,
195 |         'n_permut': n_permut,
196 |         'random_state': random_state,
197 |         'pairwise': pairwise,
198 |         'multiple_testing_df': None,
199 |     })
200 | 
201 |     logger.info('Starting step 1: Fisher\'s test...')
202 |     if n_cpus == 1:
203 |         step_1_start = datetime.now()
204 |         trait_to_result = {trait: analyze_trait_step_1_fisher(trait, ns) for trait in traits}
205 |     else:
206 |         mp.freeze_support()
207 |         queue = mgr.JoinableQueue()
208 |         trait_to_result = mgr.dict()
209 |         [queue.put(trait) for trait in traits]
210 |         procs = [mp.Process(target=worker, args=(queue, ns, 1, trait_to_result, i)) for i in range(n_cpus)]
211 |         step_1_start = datetime.now()
212 |         [p.start() for p in procs]
213 |         [p.join() for p in procs]
214 | 
215 |     step_1_end = datetime.now()
216 |     print_progress(
217 |         len(traits), len(traits),
218 |         message='Step 1 complete!', start_time=step_1_start, message_width=25,
219 |         end='\n'
220 |     )
221 |     logger.info(f'Step 1 took {step_1_end - step_1_start}')
222 | 
223 |     duplicated_traits = {trait: res for trait, res in trait_to_result.items() if type(res) is str}
224 |     logger.info(f'Number of duplicated traits: {len(duplicated_traits)}')
225 |     logger.info(f'Number of non-duplicated traits: {len(trait_to_result) - len(duplicated_traits)}')
226 | 
227 |     # multiple testing correction
228 |     if trait_wise_correction:
229 |         traits_left = {trait for trait, res in trait_to_result.items() if res is True}
230 |         ns.multiple_testing_df = 'Not used'
231 |     else:
232 |         trait_to_result = {trait: res for trait, res in trait_to_result.items() if type(res) is not str}
233 |         multiple_testing_df = multiple_testing_correction(
234 |             pd.concat(trait_to_result), 'fisher_p', 'fisher_q',
235 |             ns.mt_f_method, ns.mt_f_cutoff, False
236 |         )
237 |         multiple_testing_df.drop('fisher_p', axis=1, inplace=True)
238 |         traits_left = multiple_testing_df.index.get_level_values(0).unique().to_list()
239 |         ns.multiple_testing_df = multiple_testing_df
240 |     del trait_to_result
241 | 
242 |     # Step 2: Pairpicking
243 |     ns.queue_size = len(traits_left)
244 |     ns.counter.value = 0
245 |     logger.info(f'Number of traits left after multiple testing correction: {len(traits_left)}')
246 | 
247 |     logger.info('Starting step 2: Pair picking...')
248 |     if n_cpus == 1:
249 |         step_2_start = datetime.now()
250 |         trait_to_result = {trait: analyze_trait_step_2_pairpicking(trait, ns) for trait in traits_left}
251 |     else:
252 |         mp.freeze_support()
253 |         queue = mgr.JoinableQueue()
254 |         trait_to_result = mgr.dict()
255 |         [queue.put(trait) for trait in traits_left]
256 |         procs = [mp.Process(target=worker, args=(queue, ns, 2, trait_to_result, i)) for i in range(n_cpus)]
257 |         step_2_start = datetime.now()
258 |         [p.start() for p in procs]
259 |         [p.join() for p in procs]
260 | 
261 |     step_2_end = datetime.now()
262 |     print_progress(
263 |         len(traits_left), len(traits_left),
264 |         message='Step 2 complete!', start_time=step_2_start, message_width=25,
265 |         end='\n'
266 |     )
267 |     logger.info(f'Step 2 took {step_2_end - step_2_start}')
268 | 
269 |     try:
270 |         summary_df = create_summary_df(trait_to_result, duplicated_traits)
271 |     except NoTraitsLeftException as e:
272 |         logger.info(str(e))
273 |         logger.debug(f'Took {datetime.now() - start_time}')
274 |         return
275 |     del trait_to_result
276 | 
277 |     summary_df = summary_df.sort_values(
278 |         by='best_fq*ep' if 'best_fq*ep' in summary_df.columns else 'best_fisher_q',
279 |         ascending=False
280 |     )
281 | 
282 |     create_final_overview(summary_df, ns.traits_df, ns.numeric_df, ns.outdir, ns.trait_info_df, isolate_info,
283 |                           force_binary_clustering, symmetric, distance_metric, linkage_method, optimal_ordering, corr_method)
284 | 
285 |     logger.info('Cleaning up...')
286 |     clean_up(outdir, summary_df.index.to_list())
287 | 
288 |     logger.info('Complete success!')
289 | 
290 |     logger.info(f'Took {datetime.now() - start_time}')
291 | 
292 |     if SCOARY_PRINT_CITATION:
293 |         print(CITATION)
294 | 
295 | 
def create_summary_df(trait_to_result: {str: [dict | None]}, duplicated_traits: {str: str}) -> pd.DataFrame | None:
    """
    Build the summary table (one row per trait) from the per-trait results. Example:

             best_fisher_p  best_fisher_q  best_empirical_p  best_fq*ep
    Trait_1       0.574066   4.384058e-01          0.035964    0.035964
    Trait_2       0.432940   2.667931e-01          0.133866    0.133866
    Trait_3       0.194418   7.981206e-08          0.020979    0.691309

    :param trait_to_result: dictionary that maps trait names to their result; a dict becomes a row of the
     summary table, None means that the trait is dropped
    :param duplicated_traits: dictionary that maps a duplicated trait to the trait it is a copy of; the
     duplicate receives the same row as that trait, provided the latter has a result
    :return: pandas.DataFrame with traits as index
    :raises NoTraitsLeftException: if no trait has a result
    """
    # keep only the traits that actually produced a result
    rows = {}
    for trait, result in trait_to_result.items():
        if result is not None:
            rows[trait] = result

    # duplicated traits inherit the result of the trait they are identical to
    inherited = {
        duplicate: rows[original]
        for duplicate, original in duplicated_traits.items()
        if original in rows
    }
    rows.update(inherited)

    if not rows:
        raise NoTraitsLeftException('No traits left after filtering')

    # traits end up as rows; infer_objects harmonizes the dtypes
    summary_df = pd.DataFrame(rows).T.infer_objects()

    logger.debug(f'Created summary_df:\n{summary_df}')

    return summary_df
327 | 
328 | 
def find_duplicates(traits_df: pd.DataFrame) -> pd.Series:
    """
    Map every duplicated trait to the first trait that holds exactly the same values.

    Columns are compared by their values (as tuples) and not merely by the hash of these values.
    Two different columns whose hashes collide (e.g. hash(-1) == hash(-2) in CPython) are therefore
    never reported as duplicates of each other.

    :param traits_df: DataFrame with one column per trait; the cell values must be hashable
    :return: pandas.Series; index: traits that repeat an earlier column, values: name of that earlier
     column. Unique traits and first occurrences are not part of the Series.
    """
    first_trait_by_values = {}  # tuple of column values -> first trait with these values
    duplicated_traits, first_traits = [], []

    for trait, column in traits_df.items():
        values = tuple(column)
        if values in first_trait_by_values:
            duplicated_traits.append(trait)
            first_traits.append(first_trait_by_values[values])
        else:
            first_trait_by_values[values] = trait

    # The Series used to be derived from a column called 'hash'; the name is kept for compatibility.
    return pd.Series(
        first_traits,
        index=pd.Index(duplicated_traits, dtype=object, name=traits_df.columns.name),
        name='hash',
        dtype=object,
    )
343 | 
344 | 
def clean_up(outdir: str, traits_left: list[str]) -> None:
    """
    Delete the output directories of all traits that are not part of the final results.

    :param outdir: output directory; must contain a 'traits' subdirectory
    :param traits_left: names of the traits whose directories are kept
    """
    import shutil

    keep = set(traits_left)  # constant-time membership tests
    traits_dir = f'{outdir}/traits'
    for entry in os.listdir(traits_dir):
        path = f'{traits_dir}/{entry}'
        # shutil.rmtree raises on plain files, so anything that is not a directory is left untouched
        if entry not in keep and os.path.isdir(path):
            shutil.rmtree(path)
350 | 
351 | 
# Banner and citation notice that scoary() prints at the end of a run unless the environment variable
# SCOARY_PRINT_CITATION is set to something other than 'TRUE'.
# This is an f-string: get_version() is evaluated once, when the module is loaded.
# .strip('\n') removes the line breaks that directly follow/precede the triple quotes.
CITATION = f'''
  ██████  ▄████▄   ▒█████   ▄▄▄       ██▀███ ▓██   ██▓   ░▒█████▒░ 
▒██    ▒ ▒██▀ ▀█  ▒██▒  ██ ▒████▄    ▓██   ██ ▒██  ██▒   ▒█▒   ██▒░
░ ▓██▄   ▒▓█    ▄ ▒██░  ██ ▒██  ▀█▄  ▓██ ░▄█   ▒██ ██░       ░█▀   
  ▒   ██ ▒▓▓▄ ▄██▒▒██   ██ ░██▄▄▄▄██ ▒██▀▀█▄   ░ ▐██▓░      ▄█     
▒██████▒   ▓███▀ ░░ ████▓▒░ ▓█   ▓██▒░██▓ ▒██▒   ██▒▓░   ░███████▒ 
▒ ▒▓▒ ▒ ░  ░▒ ▒  ░░ ▒░▒░▒░  ▒▒   ▓▒█░░ ▒▓ ░▒▓░  ██▒▒▒    ░▒▒  ░▒░  
░ ░▒  ░ ░  ░  ▒     ░ ▒ ▒░   ▒   ▒▒ ░  ░▒ ░ ▒░▓██ ░▒░     ░░   ▒░  
░  ░  ░  ░        ░ ░ ░ ▒    ░   ▒     ░░   ░ ▒ ▒ ░░           ░   
      ░  ░ ░          ░ ░        ░  ░   ░     ░ ░                  
         ░                                    ░ ░                  
                        Microbial Pan-GWAS


If you use Scoary2 ({get_version()}), please cite:
Roder, T. et al. Scoary2: Rapid association of phenotypic multi-omics 
data with microbial pan-genomes.
BioRxiv (2023) doi:10.1101/2023.04.19.537353.
'''.strip('\n')
371 | 
372 | 
def main():
    """
    Command line entry point.

    Prints the version and exits with status 0 if '--version' is among the command line arguments.
    Otherwise, scoary() is exposed as a command line interface through fire.
    """
    import sys

    if '--version' in sys.argv:
        print(get_version())
        # sys.exit() rather than the builtin exit(): the latter is only injected by the site module
        # and is missing e.g. under `python -S` or in frozen applications.
        sys.exit(0)

    # imported only when needed, so that '--version' does not have to load fire
    import fire

    fire.Fire(scoary)
381 | 
382 | 
# Start the command line interface when this file is executed as a script (not when it is imported).
if __name__ == '__main__':
    main()
385 | 


--------------------------------------------------------------------------------