├── tests
│   ├── __init__.py
│   └── test_ismb2020_maldi.py
├── maldi-learn
│   ├── tests
│   │   ├── __init__.py
│   │   ├── preprocessing
│   │   │   ├── __init__.py
│   │   │   ├── test_topological_preprocessing.py
│   │   │   ├── test_subset_peaks.py
│   │   │   └── test_normalization.py
│   │   ├── test_maldi_learn.py
│   │   ├── mock.py
│   │   └── vectorization
│   │       └── test_binning_vectorizer.py
│   ├── maldi_learn
│   │   ├── __init__.py
│   │   ├── vectorization
│   │   │   ├── __init__.py
│   │   │   └── binning.py
│   │   ├── preprocessing
│   │   │   ├── __init__.py
│   │   │   ├── generic.py
│   │   │   ├── topological.py
│   │   │   └── normalization.py
│   │   ├── data.py
│   │   └── kernels.py
│   ├── .gitignore
│   ├── pyproject.toml
│   ├── README.md
│   └── LICENSE
├── PIKE_behaviour.png
├── ismb2020_maldi
│   ├── __init__.py
│   ├── example_usage.py
│   ├── submit_diffusion_kernel_confidence_jobs.sh
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   └── antibiotics.py
│   ├── util.py
│   ├── submit_maldiquant_baseline_confidence_jobs.sh
│   ├── submit_maldiquant_diffusion_kernel_confidence_jobs.sh
│   ├── save_preprocessed.py
│   ├── results_overview.sh
│   ├── submit_baseline_jobs_bw.sh
│   ├── demo_kernel.py
│   ├── submit_maldiquant_baseline_jobs.sh
│   ├── submit_maldiquant_diffusion_kernel_jobs.sh
│   ├── submit_baseline_gp_rbf_jobs.sh
│   ├── summarize_dataset.py
│   ├── submit_diffusion_kernel_reduced_jobs.sh
│   ├── submit_baseline_jobs.sh
│   ├── extract_baseline_parameters.py
│   ├── mean.py
│   ├── visualise_feature_map.py
│   ├── submit_diffusion_kernel_jobs.sh
│   ├── mean_rejection.py
│   ├── extract_kernel_parameters.py
│   ├── calibrate_histograms.py
│   ├── demo_kernel_confidence.py
│   ├── visualise_baseline.py
│   ├── analyse_split.py
│   ├── collect_results.py
│   ├── visualise_kernel.py
│   ├── calibrate.py
│   ├── baseline_gp_rbf.py
│   ├── diffusion_kernel.py
│   ├── baseline_maldiquant.py
│   ├── baseline_maldiquant_confidence.py
│   ├── baseline.py
│   └── diffusion_kernel_confidence.py
├── PIKE_behaviour_matplotlib.png
├── .gitignore
├── data
│   └── Example_peaks.txt
├── pyproject.toml
├── README.md
└── LICENSE

/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maldi-learn/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/__init__.py:
--------------------------------------------------------------------------------
__version__ = '0.1.0'
--------------------------------------------------------------------------------
/PIKE_behaviour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BorgwardtLab/maldi_PIKE/HEAD/PIKE_behaviour.png
--------------------------------------------------------------------------------
/ismb2020_maldi/__init__.py:
--------------------------------------------------------------------------------
"""Package with functionality used to analyze MALDI-TOF data."""
--------------------------------------------------------------------------------
/PIKE_behaviour_matplotlib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BorgwardtLab/maldi_PIKE/HEAD/PIKE_behaviour_matplotlib.png
--------------------------------------------------------------------------------
/tests/test_ismb2020_maldi.py:
--------------------------------------------------------------------------------
from ismb2020_maldi import __version__


def test_version():
    assert __version__ == '0.1.0'
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/vectorization/__init__.py:
--------------------------------------------------------------------------------
"""Transformers for vectorizing MALDI-TOF spectra."""
from .binning import BinningVectorizer
--------------------------------------------------------------------------------
/maldi-learn/tests/test_maldi_learn.py:
--------------------------------------------------------------------------------
from maldi_learn import __version__


def test_version():
    assert __version__ == '0.1.0'
--------------------------------------------------------------------------------
/maldi-learn/.gitignore:
--------------------------------------------------------------------------------
# Python cache
__pycache__/

# Pyenv files
.python-version

# Poetry
# Poetry lock
poetry.lock
# Egg generated when running poetry install
maldi_learn.egg-info/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Python cache
__pycache__/

# Pyenv files
.python-version

# Poetry
# Poetry lock
poetry.lock
# Egg generated when running poetry install
ismb2020_maldi.egg-info/

# Environment file defining path to data
.env

# Ignore any `pip` installation files
pip-wheel-metadata/
--------------------------------------------------------------------------------
/ismb2020_maldi/example_usage.py:
--------------------------------------------------------------------------------
"""Example usage file."""

from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
from maldi_learn.preprocessing import TopologicalPeakFiltering

dataset = EcoliAntibioticResistanceDataset('Ciprofloxacin')
X, y = dataset.complete_data

topf = TopologicalPeakFiltering(n_peaks=100)
X_sparse = topf.transform(X)
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_diffusion_kernel_confidence_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

for S in "${SEED[@]}"; do
    OUTPUT="Calibration_saureus_seed${S}_GP_diffusion"
    bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel_confidence.py --sigma 16.22 --seed $S"
done
--------------------------------------------------------------------------------
/ismb2020_maldi/datasets/__init__.py:
--------------------------------------------------------------------------------
"""Datasets."""
from .dataset import Dataset
from .antibiotics import AntibioticResistanceDataset, EcoliAntibioticResistanceDataset, \
    SaureusAntibioticResistanceDataset, KpneuAntibioticResistanceDataset

__all__ = ['Dataset', 'AntibioticResistanceDataset', 'EcoliAntibioticResistanceDataset', 'SaureusAntibioticResistanceDataset', 'KpneuAntibioticResistanceDataset']
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/__init__.py:
--------------------------------------------------------------------------------
"""Preprocessing of MALDI-TOF spectra."""

from .generic import SubsetPeaksTransformer
from .normalization import TotalIonCurrentNormalizer
from .normalization import ScaleNormalizer
from .topological import TopologicalPeakFiltering


__all__ = [
    'ScaleNormalizer',
    'SubsetPeaksTransformer',
    'TopologicalPeakFiltering',
    'TotalIonCurrentNormalizer'
]
--------------------------------------------------------------------------------
/data/Example_peaks.txt:
--------------------------------------------------------------------------------
1970.42 2.93
1999.58 1.64
2009.62 1.59
2024.74 1.95
2041.18 3.47
2058.96 4.46
2087.05 4.62
2105.03 9.21
2120.07 6.48
2136.46 5.02
2187.76 16.89
2208.36 1.26
2225.97 1.37
2248.97 5.04
2274.77 3.76
2290.40 7.27
2306.99 7.69
2322.29 8.49
2338.54 8.76
2355.30 5.44
2373.03 2.49
2384.43 2.59
2398.61 3.65
2415.13 11.14
2431.70 17.61
2455.74 13.73
2494.82 8.18
--------------------------------------------------------------------------------
/maldi-learn/tests/mock.py:
--------------------------------------------------------------------------------
"""Module for generating mock data of MALDI-TOF spectra."""
import numpy as np

from maldi_learn.data import MaldiTofSpectrum


def generate_mock_data(n_examples):
    """Generate random data with correct shape."""
    n_peaks = np.random.normal(1000, 100, size=n_examples).astype(int)
    print(n_peaks)
    return [
        MaldiTofSpectrum(
            np.random.uniform(0, 10000, size=(peaks, 2)))
        for peaks in n_peaks
    ]
--------------------------------------------------------------------------------
/ismb2020_maldi/util.py:
--------------------------------------------------------------------------------
'''
Utility functions
'''


def create_binary_label(df_resistances, antibiotic):
    '''
    Given a data frame of resistance information and the name of an
    antibiotic, creates a binary label vector. The antibiotic needs
    to be present in the data frame; otherwise, an error is raised.
    '''

    # TODO: check whether this conversion makes sense
    y = df_resistances[antibiotic].values
    y[y != 'R'] = 0
    y[y == 'R'] = 1

    return y.astype('int')
--------------------------------------------------------------------------------
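A minimal usage sketch for `create_binary_label`; the resistance table below is hypothetical, real data frames come from the antibiotics datasets:

```python
import pandas as pd

from ismb2020_maldi.util import create_binary_label

# Hypothetical resistance table: 'R' marks a resistant sample, any
# other entry (e.g. 'S' or 'I') is treated as non-resistant.
df_resistances = pd.DataFrame({'Ciprofloxacin': ['R', 'S', 'I', 'R']})

y = create_binary_label(df_resistances, 'Ciprofloxacin')
print(y)  # [1 0 0 1]
```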
/ismb2020_maldi/submit_maldiquant_baseline_confidence_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

for S in "${SEED[@]}"; do
    OUTPUT="Calibration_saureus_seed${S}_MQ"
    bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant_confidence.py --seed $S --suffix _peaks_warped"
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_maldiquant_diffusion_kernel_confidence_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

for S in "${SEED[@]}"; do
    OUTPUT="Calibration_saureus_seed${S}_MQ_GP_diffusion"
    bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel_confidence.py --sigma 4.18 --seed $S --suffix _peaks_warped"
done
--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/test_topological_preprocessing.py:
--------------------------------------------------------------------------------
"""Tests for topological preprocessing routines."""
import unittest

from maldi_learn.preprocessing import TopologicalPeakFiltering

from tests.mock import generate_mock_data


class TestTopologicalPreprocessing(unittest.TestCase):
    def test_correct_n_peaks(self, n_examples=10, n_peaks=100):
        mock_data = generate_mock_data(n_examples)
        transformer = TopologicalPeakFiltering(n_peaks=n_peaks)
        transformed_data = transformer.fit_transform(mock_data)
        print(transformed_data[0].shape)
        self.assertTrue(
            all([spectrum.n_peaks == n_peaks for spectrum in transformed_data])
        )
--------------------------------------------------------------------------------
/maldi-learn/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "maldi-learn"
version = "0.1.0"
description = "Software library for MALDI-TOF preprocessing and machine learning analysis."
authors = ["Caroline Weis ", "Max Horn ", "Bastian Rieck "]
readme = "README.md"
repository = "https://github.com/BorgwardtLab/maldi-learn"
homepage = "https://github.com/BorgwardtLab/maldi-learn"

[tool.poetry.dependencies]
python = ">=3.6"
scikit-learn = "^0.22.1"
topf = {git = "ssh://git@github.com/BorgwardtLab/Topf.git"}
pandas = "^0.25.3"

[tool.poetry.dev-dependencies]
pytest = "^3.0"

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
--------------------------------------------------------------------------------
/ismb2020_maldi/save_preprocessed.py:
--------------------------------------------------------------------------------
'''
Read all spectra, preprocess them with TopologicalPeakFiltering and then
save them to a new folder.
'''

from ismb2020_maldi.datasets import AntibioticResistanceDataset

from maldi_learn.data import write_spectra
from maldi_learn.preprocessing import TopologicalPeakFiltering


dataset = AntibioticResistanceDataset(test_size=0.5)

# write testing_data
X, y = dataset.testing_data

topf = TopologicalPeakFiltering(n_peaks=False)
X_sparse = topf.transform(X)

write_spectra(X_sparse, y, '/links/groups/borgwardt/Data/ismb2020_maldi/spectra_preprocessed')


# write training_data
X, y = dataset.training_data

topf = TopologicalPeakFiltering(n_peaks=False)
X_sparse = topf.transform(X)

write_spectra(X_sparse, y, '/links/groups/borgwardt/Data/ismb2020_maldi/spectra_preprocessed')
--------------------------------------------------------------------------------
/ismb2020_maldi/results_overview.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# example usage:
# ANTIBIOTICS=Ceftriaxon SPECIES=ecoli ./results_overview.sh

if [ -z ${ANTIBIOTICS+x} ]; then
    ANTIBIOTICS=(Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon)
fi

if [ -z ${SPECIES+x} ]; then
    SPECIES=(ecoli saureus kpneu)
fi

for A in "${ANTIBIOTICS[@]}"; do
    for S in "${SPECIES[@]}"; do
        echo
        echo ---- ${A} -- ${S} ----
        echo - baseline on raw -
        cat /cluster/work/borgw/ismb2020_maldi/results/raw/${A}_${S}_*.out | grep Average
        echo - baseline on preprocessed -
        cat /cluster/work/borgw/ismb2020_maldi/results/preprocessed/${A}_${S}_*.out | grep Average
        echo - GP random oversampling 200 peaks -
        cat /cluster/work/borgw/ismb2020_maldi/results/diffusion_ros/GP_diffusion_${A}_${S}_*_200_*.out | grep Average

    done
done
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "ismb2020_maldi"
version = "0.1.0"
description = "Maldi-Tof kernel and GP classifier"
authors = ["Caroline Weis ", "Max Horn ", "Dr. Bastian Alexander Rieck "]
readme = "README.md"
repository = "https://github.com/BorgwardtLab/maldi_PIKE"
homepage = "https://github.com/BorgwardtLab/maldi_PIKE"

[tool.poetry.dependencies]
python = "^3.7"
pandas = "^0.25.3"
python-dotenv = "^0.10.3"
maldi-learn = {path = "maldi-learn/"}
scikit-learn = {git = "https://github.com/BorgwardtLab/scikit-learn.git", rev = "maldi-learn"}
imbalanced-learn = "^0.6.1"
json-tricks = "^3.13.5"
tqdm = "^4.41.1"
matplotlib = "^3.1.2"
seaborn = "^0.10.0"

[tool.poetry.dev-dependencies]
pytest = "^3.0"
ipython = "^7.11.1"

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
--------------------------------------------------------------------------------
/maldi-learn/README.md:
--------------------------------------------------------------------------------
# maldi-learn
Software library for MALDI-TOF preprocessing and machine learning analysis.

## Installation - development

The installation of this package requires
[poetry](https://python-poetry.org/docs/).

In order to set up a development environment, run `poetry install` in the
project root. To run commands in the associated virtual environment of this
package, run `poetry shell` to spawn a shell.


### Python version

This project requires at least Python version `3.7`. In a development setup, it
is recommended to install an appropriate Python version using
[pyenv](https://github.com/pyenv/pyenv), and then mark this folder for usage
with this version:

```bash
$ pyenv install 3.7.4   # Install python 3.7.4 using pyenv
$ pyenv local 3.7.4     # Mark python version 3.7.4 for usage in this folder
$ poetry install        # Setup the virtual environment for development
```
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_baseline_jobs_bw.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${s}_normalized.json"
        nice poetry run python baseline.py --species saureus --antibiotic $A --seed $s --normalize --output ${OUTPUT} &
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${s}_normalized.json"
        nice poetry run python baseline.py --species ecoli --antibiotic $A --seed $s --normalize --output ${OUTPUT} &
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${s}_normalized.json"
        nice poetry run python baseline.py --species kpneu --antibiotic $A --seed $s --normalize --output ${OUTPUT} &
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/demo_kernel.py:
--------------------------------------------------------------------------------
'''
Demo file for kernel calculation.
'''

from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset

from maldi_learn.kernels import DiffusionKernel

from maldi_learn.preprocessing import SubsetPeaksTransformer

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import average_precision_score
from sklearn.svm import SVC


import numpy as np
import sys

dataset = EcoliAntibioticResistanceDataset(antibiotic='Ceftriaxon',
                                           test_size=0.20)
X_train, y_train = dataset.training_data
X_test, y_test = dataset.testing_data

st = SubsetPeaksTransformer(n_peaks=100)

X_train = st.fit_transform(X_train)
X_test = st.transform(X_test)

kernel = DiffusionKernel(sigma=400)

clf = GaussianProcessClassifier(kernel=kernel, n_jobs=-1)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)
average_precision = average_precision_score(y_test, y_pred[:, 1])

print(f'Average precision: {100 * average_precision:2.2f}')
--------------------------------------------------------------------------------
/maldi-learn/tests/vectorization/test_binning_vectorizer.py:
--------------------------------------------------------------------------------
"""Test BinningVectorizer."""
import unittest

import numpy as np

from maldi_learn.data import MaldiTofSpectrum
from maldi_learn.vectorization import BinningVectorizer


MOCK_DATA = [
    MaldiTofSpectrum(
        [[0.0, 5.0],
         [10.7, 8.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    ),
    MaldiTofSpectrum(
        [[0.0, 15.0],
         [10.7, 5.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    )
]


class TestBinningVectorizer(unittest.TestCase):
    def test_simple_binning(self):
        vectorizer = BinningVectorizer(2, min_bin=-0.1, max_bin=999)
        vectorized = vectorizer.fit_transform(MOCK_DATA)
        self.assertEqual(vectorized.ndim, 2)
        self.assertEqual(vectorized.shape[0], len(MOCK_DATA))
        self.assertEqual(vectorized.shape[1], 2)

        self.assertTrue(np.all(vectorized[0] == np.array([23., 3.])))
        self.assertTrue(np.all(vectorized[1] == np.array([30., 3.])))
--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/test_subset_peaks.py:
--------------------------------------------------------------------------------
"""Test SubsetPeaksTransformer."""
import unittest

import numpy as np

from maldi_learn.preprocessing import SubsetPeaksTransformer
from maldi_learn.data import MaldiTofSpectrum


MOCK_DATA = [
    MaldiTofSpectrum(
        [[0.0, 5.0],
         [10.7, 8.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    ),
    MaldiTofSpectrum(
        [[0.0, 15.0],
         [10.7, 5.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    )
]


class TestSubsetPeakTransformer(unittest.TestCase):
    def test_transformer(self, n_peaks=2):
        transf = SubsetPeaksTransformer(n_peaks)
        transformed = transf.fit_transform(MOCK_DATA)
        print(transformed)
        # First example
        self.assertTrue(np.all(transformed[0][0] == np.array([10.7, 8.0])))
        self.assertTrue(np.all(transformed[0][1] == np.array([150.4, 10.0])))

        # Second example
        self.assertTrue(np.all(transformed[1][0] == np.array([0.0, 15.0])))
        self.assertTrue(np.all(transformed[1][1] == np.array([150.4, 10.0])))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# maldi_PIKE — Peak Information KErnel for MALDI-TOF MS spectra

`maldi_PIKE` is a small library for Python 3 that includes the code used for
'Topological and kernel-based microbial phenotype prediction from MALDI-TOF mass
spectra'. The main method is PIKE, the Peak Information KErnel for MALDI-TOF MS spectra,
embedded in a Gaussian Process. We developed PIKE based on heat diffusion on structured
objects. It is well suited for MALDI-TOF mass spectra and able to capture interactions between
mass peaks.

# Dependencies

- Python 3.7
- packages listed in `pyproject.toml`

# Installation

- Clone the repository
- `poetry install`

Follow the instructions given by `poetry`.

# Example behaviour of PIKE

Figure 2:

![PIKE_behaviour](PIKE_behaviour.png)

Code to recreate this graphic in matplotlib can be found in
`ismb2020_maldi/visualise_feature_map.py`. Use the script as
follows:

```
poetry run python ismb2020_maldi/visualise_feature_map.py data/Example_peaks.txt
```

This should result in the following plot:

![PIKE_behaviour_matplotlib](PIKE_behaviour_matplotlib.png)

This repository is a work in progress.
--------------------------------------------------------------------------------
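For reference, the feature map plotted in the figures above is, as implemented in `ismb2020_maldi/visualise_feature_map.py` further below, the following function of the peak positions m_i and intensities p_i of a spectrum:

```latex
\Phi_\sigma(x) = \frac{1}{2\sqrt{\pi\sigma}} \sum_i p_i \exp\left(-\frac{(x - m_i)^2}{4\sigma}\right)
```

Increasing the smoothing parameter sigma spreads each peak's contribution further along the m/z axis, which is what the rows of the figure illustrate for sigma values of 1, 10 and 100.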
/ismb2020_maldi/datasets/dataset.py:
--------------------------------------------------------------------------------
"""Base class for a dataset."""
import abc
from collections.abc import Sequence
from typing import List, Tuple

from maldi_learn.data import MaldiTofSpectrum


class Dataset(metaclass=abc.ABCMeta):
    """Abstract base class for a dataset."""

    @property
    @abc.abstractmethod
    def training_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get training data of dataset.

        Returns:
            Tuple (X, y)

        """

    @property
    @abc.abstractmethod
    def validation_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get validation data of dataset.

        Returns:
            Tuple (X, y)

        """

    @property
    @abc.abstractmethod
    def testing_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get testing data of dataset.

        Returns:
            Tuple (X, y)

        """

    @property
    @abc.abstractmethod
    def complete_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get complete dataset.

        Returns:
            Tuple (X, y)

        """
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/data.py:
--------------------------------------------------------------------------------
"""Classes to standardize handling of Spectra."""

import numpy as np


class MaldiTofSpectrum(np.ndarray):
    """Numpy NDArray subclass representing a MALDI-TOF Spectrum."""

    def __new__(cls, peaks):
        """Create a MaldiTofSpectrum.

        Args:
            peaks: 2d array, list of tuples, or list of lists containing
                pairs of mass/charge to intensity.

        Raises:
            ValueError: If the input data is not in the correct format.

        """
        peaks = np.asarray(peaks).view(cls)
        if peaks.ndim != 2 or peaks.shape[1] != 2:
            raise ValueError(
                f'Input shape of {peaks.shape} does not match expected shape '
                'for spectrum [n_peaks, 2].'
            )
        return peaks

    @property
    def n_peaks(self):
        """Get number of peaks of the spectrum."""
        return self.shape[0]

    @property
    def intensities(self):
        """Get the intensities of the spectrum."""
        return self[:, 1]

    @property
    def mass_to_charge_ratios(self):
        """Get mass-to-charge ratios of spectrum."""
        return self[:, 0]
--------------------------------------------------------------------------------
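A small sketch of how `MaldiTofSpectrum` wraps raw peak lists; the values below are made up:

```python
from maldi_learn.data import MaldiTofSpectrum

# Two peaks, given as [mass-to-charge, intensity] rows.
spectrum = MaldiTofSpectrum([[2000.0, 3.5], [2100.0, 1.2]])

print(spectrum.n_peaks)                # 2
print(spectrum.mass_to_charge_ratios)  # [2000. 2100.]
print(spectrum.intensities)            # [3.5 1.2]

# Anything that is not of shape [n_peaks, 2] is rejected:
try:
    MaldiTofSpectrum([2000.0, 3.5])
except ValueError as err:
    print(err)
```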
/ismb2020_maldi/submit_maldiquant_baseline_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${s}_MQ"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant.py --suffix _peaks_warped --species saureus --antibiotic $A --seed $s"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${s}_MQ"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant.py --suffix _peaks_warped --species ecoli --antibiotic $A --seed $s"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${s}_MQ"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant.py --suffix _peaks_warped --species kpneu --antibiotic $A --seed $s"
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_maldiquant_diffusion_kernel_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${S}_MQ_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --suffix _peaks_warped"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${S}_MQ_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --suffix _peaks_warped"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${S}_MQ_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --suffix _peaks_warped"
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_baseline_gp_rbf_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Submission script for the baseline GP-RBF classifier. This uses MQ
# features because we want to ensure that the improvements are given
# by our new kernel.

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${S}_GP_RBF"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_gp_rbf.py --species saureus --antibiotic $A --seed $S"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${S}_GP_RBF"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_gp_rbf.py --species ecoli --antibiotic $A --seed $S"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${S}_GP_RBF"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_gp_rbf.py --species kpneu --antibiotic $A --seed $S"
    done
done
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/generic.py:
--------------------------------------------------------------------------------
"""Generic preprocessing transformers for spectra."""
import numpy as np

from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator


class SubsetPeaksTransformer(BaseEstimator, TransformerMixin):
    """Transform to extract subset of peaks from spectrum."""

    def __init__(self, n_peaks=None):
        """Initialize transformer for subsetting peaks.

        Args:
            n_peaks: Number of peaks to extract from spectrum. If set to
                `None`, will just pass through input data without changing
                anything.

        """
        self.n_peaks = n_peaks

    def fit(self, X, y=None):
        """Fit transformer, does nothing."""
        return self

    def transform(self, X):
        """Get the n_peaks peaks with the highest intensity."""
        # Bail out early because there is nothing to do
        if self.n_peaks is None:
            return X

        output = []
        for spectrum in X:
            intensity = spectrum[:, 1]
            peak_indices = np.argsort(intensity, kind='stable')[::-1]
            # We sort the selected indices back to preserve the original order
            output.append(spectrum[sorted(peak_indices[:self.n_peaks])])
        return output
--------------------------------------------------------------------------------
/ismb2020_maldi/summarize_dataset.py:
--------------------------------------------------------------------------------
'''
Provide a summary of the dataset, including class balance for each
species and antibiotic.
'''

from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset, SaureusAntibioticResistanceDataset, KpneuAntibioticResistanceDataset

import numpy as np
import sys


datasets_map = {
    'ecoli': EcoliAntibioticResistanceDataset,
    'saureus': SaureusAntibioticResistanceDataset,
    'kpneu': KpneuAntibioticResistanceDataset
}

antibiotic_map = {
    'ecoli': ['Ciprofloxacin', 'Ceftriaxon', 'Amoxicillin-Clavulansaeure'],
    'saureus': ['Ciprofloxacin', 'Penicillin', 'Amoxicillin-Clavulansaeure'],
    'kpneu': ['Ciprofloxacin', 'Ceftriaxon', 'Piperacillin-Tazobactam']
}

for species in datasets_map.keys():

    print(f'\n{species}')
    Dataset = datasets_map[species]
    for antibiotic in antibiotic_map[species]:

        print(f'{antibiotic}')
        dataset = Dataset(antibiotic, test_size=0.2)
        _, y_complete = dataset.complete_data
        _, y_train = dataset.training_data
        _, y_test = dataset.testing_data

        #for y in [y_complete, y_train, y_test]:
        for y in [y_complete]:
            counts = y.value_counts()
            print(counts)
            print(y.shape[0])
            assert counts.loc[0] + counts.loc[1] == y.shape[0]
            print(round(counts.loc[1] / float(y.shape[0]), 3))
            print()
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_diffusion_kernel_reduced_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Submission script for the _reduced_ set of diffusion kernel jobs, i.e.
# we do _not_ change the number of peaks and always use normalisation. A
# scenario like this closely matches that of MQ data.

SEED=(58925 15250 97412 17965 44873)
PEAKS=200
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${S}_peaks${PEAKS}_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --peaks ${PEAKS} --normalize"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${S}_peaks${PEAKS}_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --peaks ${PEAKS} --normalize"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${S}_peaks${PEAKS}_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --peaks ${PEAKS} --normalize"
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_baseline_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${s}"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species saureus --antibiotic $A --seed $s"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species saureus --antibiotic $A --seed $s --normalize"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${s}"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species ecoli --antibiotic $A --seed $s"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species ecoli --antibiotic $A --seed $s --normalize"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${s}"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species kpneu --antibiotic $A --seed $s"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species kpneu --antibiotic $A --seed $s --normalize"
    done
done
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2020, Machine Learning and Computational Biology Lab
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/ismb2020_maldi/extract_baseline_parameters.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Auxiliary script for extracting baseline parameters from a set of runs
# and reporting them per seed. This is useful to run a calibration with
# a pre-selected model.

import argparse
import re

import json_tricks as jt
import numpy as np

from tqdm import tqdm


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('INPUT', nargs='+', type=str)
    args = parser.parse_args()

    parameters = []

    for filename in tqdm(args.INPUT, desc='Loading'):
        with open(filename) as f:
            # Ensures that we can parse normal JSON files
            pos = 0

            for line in f:

                # We found *probably* the beginning of the JSON file, so
                # we can start parsing from here after resetting the
                # file pointer.
                if line.startswith('{'):
                    f.seek(pos)
                    break
                else:
                    pos += len(line)

            # Check whether file is empty for some reason. If so, we
            # skip it.
            line = f.readline()
            if line == '':
                continue

            # Not empty, so we need to reset the file pointer
            else:
                f.seek(pos)

            data_raw = jt.load(f)

            seed = data_raw['seed']
            best_parameters = data_raw['best_parameters']
            print(f'Seed {seed}: {best_parameters}')
--------------------------------------------------------------------------------
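A quick sketch of the order-preserving behaviour described in `transform`, using toy values:

```python
from maldi_learn.data import MaldiTofSpectrum
from maldi_learn.preprocessing import SubsetPeaksTransformer

spectrum = MaldiTofSpectrum([[100.0, 1.0], [200.0, 9.0], [300.0, 5.0]])

st = SubsetPeaksTransformer(n_peaks=2)
# The two most intense peaks are kept, but in their original m/z order.
print(st.fit_transform([spectrum])[0])  # [[200. 9.], [300. 5.]]
```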
/ismb2020_maldi/mean.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Calculates the mean of a set of CSVs. The CSV files are assumed to
# contain the same ranges.

import argparse
import sys

import pandas as pd


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('FILES', nargs='+', type=str)

    args = parser.parse_args()

    data = []

    for filename in args.FILES:
        df = pd.read_csv(filename, header=0, index_col=0)
        data.append(df)

    df = data[0]
    columns = df.columns

    for index, right in enumerate(data[1:]):
        df = pd.merge(df, right,
                      suffixes=('', '_' + str(index + 1)),
                      how='outer', on=['threshold']
                      )

    df = df.fillna(1.0)

    mean_auprc = df[['auprc', 'auprc_1', 'auprc_2', 'auprc_3',
                     'auprc_4']].mean(axis=1)

    std_auprc = df[['auprc', 'auprc_1', 'auprc_2', 'auprc_3',
                    'auprc_4']].std(axis=1)

    mean_accuracy = df[['accuracy', 'accuracy_1', 'accuracy_2', 'accuracy_3',
                        'accuracy_4']].mean(axis=1)

    std_accuracy = df[['accuracy', 'accuracy_1', 'accuracy_2', 'accuracy_3',
                       'accuracy_4']].std(axis=1)

    mean_n_pos_samples = df[['n_pos_samples',
                             'n_pos_samples_1',
                             'n_pos_samples_2',
                             'n_pos_samples_3',
                             'n_pos_samples_4']].mean(axis=1)

    df = pd.DataFrame({'mean_auprc': mean_auprc, 'std_auprc': std_auprc,
                       'mean_n_pos_samples': mean_n_pos_samples,
                       'mean_accuracy': mean_accuracy,
                       'std_accuracy': std_accuracy})

    df.to_csv(sys.stdout)
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/topological.py:
--------------------------------------------------------------------------------
"""Preprocessing using TOpological Peak Filtering (TOPF)."""
from sklearn.base import TransformerMixin
from topf import PersistenceTransformer
from typing import List

from ..data import MaldiTofSpectrum


class TopologicalPeakFiltering(TransformerMixin):
    """Topological peak filtering using TOPF."""

    _required_arguments = ['n_peaks']

    def __init__(self, n_peaks):
        """Topological peak filtering (TOPF) for MALDI-TOF spectra.

        Args:
            n_peaks: Number of peaks to retain. Peaks will be eliminated
                in order of increasing persistence. Thus, if `n_peaks` is
                1, only the highest peak will be kept.

        """
        self.n_peaks = n_peaks

    def fit(self, X, y=None):
        """Do nothing."""
        return self

    @staticmethod
    def _remove_non_peaks(spectrum):
        return spectrum[spectrum[:, 1] != 0.]

    def transform(self, X: List[MaldiTofSpectrum]) -> List[MaldiTofSpectrum]:
        """Apply topological peak filtering to the data array X.

        Args:
            X: List of MALDI-TOF spectra.

        Returns:
            Sparse spectra containing only n_peaks peaks.

        """
        pers_transformer = PersistenceTransformer(
            calculate_persistence_diagram=False, n_peaks=self.n_peaks)

        return [
            MaldiTofSpectrum(
                self._remove_non_peaks(
                    pers_transformer.fit_transform(spectrum)
                )
            )
            for spectrum in X
        ]
--------------------------------------------------------------------------------
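In the spirit of `ismb2020_maldi/example_usage.py`, a minimal sketch of the filter on a single toy spectrum, assuming the `topf` dependency declared in `maldi-learn/pyproject.toml` is installed:

```python
from maldi_learn.data import MaldiTofSpectrum
from maldi_learn.preprocessing import TopologicalPeakFiltering

spectrum = MaldiTofSpectrum([[100.0, 1.0], [200.0, 9.0], [300.0, 5.0]])

# With n_peaks=1 only the most persistent peak survives, which per the
# docstring above is the highest one.
topf = TopologicalPeakFiltering(n_peaks=1)
print(topf.transform([spectrum])[0])  # expected: [[200. 9.]]
```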
/ismb2020_maldi/visualise_feature_map.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
#
# Basic visualisation script for the feature map of our kernel, subject
# to a certain smoothing parameter.

from maldi_learn.data import MaldiTofSpectrum

from maldi_learn.preprocessing import ScaleNormalizer

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import argparse


def feature_map(spectrum, x, sigma=1.0):
    positions = spectrum.mass_to_charge_ratios
    peaks = spectrum.intensities

    f = np.multiply(peaks, np.exp(-(x - positions)**2 / (4 * sigma)))
    f = 1 / (2 * np.sqrt(np.pi * sigma)) * np.sum(f)

    return f


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('FILE', type=str)

    args = parser.parse_args()

    spectrum = MaldiTofSpectrum(
        pd.read_csv(args.FILE, sep=' ', comment='#').values
    )

    sn = ScaleNormalizer()
    spectrum = sn.fit_transform([spectrum])[0]

    spectrum = spectrum[spectrum.mass_to_charge_ratios < 2500]
    x_min = np.min(spectrum.mass_to_charge_ratios)
    x_max = np.max(spectrum.mass_to_charge_ratios)

    fig, ax = plt.subplots(4, 1, sharex=True)

    ax[0].stem(spectrum.mass_to_charge_ratios, spectrum.intensities,
               linefmt='k-', basefmt='black', markerfmt='None',
               use_line_collection=True)

    for axis in ax:
        axis.set_ylim(0, 6)

        axis.spines['top'].set_visible(False)
        axis.spines['right'].set_visible(False)

        axis.set_yticks([0, 2, 4, 6])

    for axis, sigma in zip(ax[1:], [1, 10, 100]):

        X = np.linspace(x_min, x_max, 300)
        Y = [feature_map(spectrum, x, sigma) for x in X]

        axis.plot(X, Y)

    plt.show()
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_diffusion_kernel_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
PEAKS=(100 200 500 700)
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        for P in "${PEAKS[@]}"; do
            OUTPUT="${A}_saureus_seed${S}_peaks${P}_GP_diffusion"
            bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --peaks $P"
            bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --peaks $P --normalize"
        done
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        for P in "${PEAKS[@]}"; do
            OUTPUT="${A}_ecoli_seed${S}_peaks${P}_GP_diffusion"
            bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --peaks $P"
            bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --peaks $P --normalize"
        done
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        for P in "${PEAKS[@]}"; do
            OUTPUT="${A}_kpneu_seed${S}_peaks${P}_GP_diffusion"
            bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --peaks $P"
            bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --peaks $P --normalize"
        done
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/mean_rejection.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Calculates the mean of a set of CSVs. The CSV files are assumed to
# contain the same ranges.

import argparse
import sys

import pandas as pd


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('FILES', nargs='+', type=str)

    args = parser.parse_args()

    data = []

    for filename in args.FILES:
        df = pd.read_csv(filename, header=0, index_col=0)
        data.append(df)

    df = data[0]
    columns = df.columns

    for index, right in enumerate(data[1:]):
        df = pd.merge(df, right,
                      suffixes=('', '_' + str(index + 1)),
                      how='outer', on=['threshold']
                      )

    df = df.fillna(1.0)

    mean_rejected_in_sample = df[['rejected_in_sample',
                                  'rejected_in_sample_1',
                                  'rejected_in_sample_2',
                                  'rejected_in_sample_3',
                                  'rejected_in_sample_4']].mean(axis=1)

    std_rejected_in_sample = df[['rejected_in_sample',
                                 'rejected_in_sample_1',
                                 'rejected_in_sample_2',
                                 'rejected_in_sample_3',
                                 'rejected_in_sample_4']].std(axis=1)

    mean_rejected_out_of_sample = df[['rejected_out_of_sample',
                                      'rejected_out_of_sample_1',
                                      'rejected_out_of_sample_2',
                                      'rejected_out_of_sample_3',
                                      'rejected_out_of_sample_4']].mean(axis=1)

    std_rejected_out_of_sample = df[['rejected_out_of_sample',
                                     'rejected_out_of_sample_1',
                                     'rejected_out_of_sample_2',
                                     'rejected_out_of_sample_3',
                                     'rejected_out_of_sample_4']].std(axis=1)

    df = pd.DataFrame({'mean_rejected_in_sample': mean_rejected_in_sample,
                       'std_rejected_in_sample': std_rejected_in_sample,
                       'mean_rejected_out_of_sample': mean_rejected_out_of_sample,
                       'std_rejected_out_of_sample': std_rejected_out_of_sample})

    df.to_csv(sys.stdout)
--------------------------------------------------------------------------------
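A worked example of `feature_map` on the shipped peak list, mirroring the script's own loading code but skipping the `ScaleNormalizer` step, so the values differ from the figure:

```python
import pandas as pd

from maldi_learn.data import MaldiTofSpectrum
from ismb2020_maldi.visualise_feature_map import feature_map

spectrum = MaldiTofSpectrum(
    pd.read_csv('data/Example_peaks.txt', sep=' ', comment='#').values
)

# At sigma = 1, neighbouring peaks are tens of Da away and contribute
# essentially nothing, so the value at a peak position is close to
# p / (2 * sqrt(pi * sigma)), i.e. roughly 16.89 / 3.545 here.
print(feature_map(spectrum, x=2187.76, sigma=1.0))
```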
/ismb2020_maldi/extract_kernel_parameters.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Auxiliary script for extracting kernel parameters from a set of runs
# and reporting their mean. This is useful to run a calibration with a
# pre-selected model.

import argparse
import re

import json_tricks as jt
import numpy as np

from tqdm import tqdm


def extract_parameter(s, name='DiffusionKernel'):
    '''
    Extracts the kernel parameter from a string. The function attempts
    to extract a float value enclosed in parentheses following the
    kernel name.

    Returns `np.nan` if no match could be found.
    '''

    pattern = rf'{name}\((.+)\)'
    m = re.match(pattern, s)

    if m:
        return float(m.group(1))
    else:
        return np.nan


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('INPUT', nargs='+', type=str)
    args = parser.parse_args()

    parameters = []

    for filename in tqdm(args.INPUT, desc='Loading'):
        with open(filename) as f:
            # Ensures that we can parse normal JSON files
            pos = 0

            for line in f:

                # We found *probably* the beginning of the JSON file, so
                # we can start parsing from here after resetting the
                # file pointer.
                if line.startswith('{'):
                    f.seek(pos)
                    break
                else:
                    pos += len(line)

            # Check whether file is empty for some reason. If so, we
            # skip it.
            line = f.readline()
            if line == '':
                continue

            # Not empty, so we need to reset the file pointer
            else:
                f.seek(pos)

            data_raw = jt.load(f)

            kernel = data_raw['kernel']
            parameter = extract_parameter(kernel)

            parameters.append(parameter)

    mu = np.mean(parameters)
    sigma = np.std(parameters)

    print('Extracted kernel parameters:', parameters)
    print(f'Mean kernel parameter: {mu:.2f} +- {sigma:.2f}')
--------------------------------------------------------------------------------
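The regex in `extract_parameter` expects the kernel's `repr`-style string as stored in the result files; a quick check:

```python
from ismb2020_maldi.extract_kernel_parameters import extract_parameter

print(extract_parameter('DiffusionKernel(16.22)'))  # 16.22
print(extract_parameter('RBF(1.0)'))                # nan, name does not match
```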
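The identity-matrix trick in `transform` above builds an [n_peaks, n_bins] indicator matrix and sums it over the peaks; a sketch of an equivalent, more memory-friendly formulation (not part of the library):

```python
import numpy as np


# Equivalent to the identity-matrix formulation in transform():
# scatter-add each peak's intensity into its bin.
def bin_intensities(indices, intensities, n_bins):
    vec = np.zeros(n_bins)
    np.add.at(vec, indices, intensities)
    return vec


# Peaks 0 and 1 fall into bin 0, peak 2 into bin 1.
print(bin_intensities(np.array([0, 0, 1]), np.array([5.0, 8.0, 10.0]), 2))
# -> [13. 10.]
```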
65 |             indices = indices[valid] - 1
66 |             identity = np.eye(self.n_bins)
67 | 
68 |             vec = np.sum(
69 |                 identity[indices] * spectrum[:, 1][:, np.newaxis], axis=0)
70 | 
71 |             output.append(vec)
72 | 
73 |         return np.stack(output, axis=0)
74 | 
--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/test_normalization.py:
--------------------------------------------------------------------------------
1 | """Test normalizers."""
2 | import unittest
3 | 
4 | import numpy as np
5 | 
6 | from maldi_learn.data import MaldiTofSpectrum
7 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
8 | 
9 | 
10 | MOCK_DATA = [
11 |     MaldiTofSpectrum(
12 |         [[0.0, 5.0],
13 |          [10.7, 8.0],
14 |          [150.4, 10.],
15 |          [1000, 3.0]
16 |          ]
17 |     ),  # Mean intensity 6.5
18 |     MaldiTofSpectrum(
19 |         [[0.0, 15.0],
20 |          [10.7, 0.0],
21 |          [150.4, 10.],
22 |          [1000, 3.0]
23 |          ]
24 |     ),  # Mean intensity 7 or 9.3333 (with ignore zero intensity)
25 | ]
26 | 
27 | # Total mean intensity: 6.75 or 7.7142857143 (with ignore zero intensity)
28 | 
29 | 
30 | class TestTotalIonCurrentNormalizer(unittest.TestCase):
31 |     def test_dont_ignore_zero_intensity(self):
32 |         transf = TotalIonCurrentNormalizer(ignore_zero_intensity=False)
33 |         transformed = transf.fit_transform(MOCK_DATA)
34 | 
35 |         # Normalization factor first example: 6.5 / 6.75 = 0.9629
36 |         transformed_intensities = transformed[0].intensities
37 |         expected_intensities = MOCK_DATA[0].intensities * (6.5 / 6.75)
38 |         self.assertTrue(np.allclose(
39 |             transformed_intensities,
40 |             expected_intensities
41 |         ))
42 | 
43 |         # Normalization factor second example: 7 / 6.75 = 1.0370
44 |         transformed_intensities = transformed[1].intensities
45 |         expected_intensities = MOCK_DATA[1].intensities * (7 / 6.75)
46 |         self.assertTrue(np.allclose(
47 |             transformed_intensities,
48 |             expected_intensities
49 |         ))
50 | 
51 |     def test_ignore_zero_intensity(self):
52 |         transf = TotalIonCurrentNormalizer(ignore_zero_intensity=True)
53 |         transformed = transf.fit_transform(MOCK_DATA)
54 | 
55 |         # Normalization factor first example: 6.5 / 7.71428 = 0.8426
56 |         transformed_intensities = transformed[0].intensities
57 |         expected_intensities = MOCK_DATA[0].intensities * (6.5 / 7.71428)
58 |         self.assertTrue(np.allclose(
59 |             transformed_intensities,
60 |             expected_intensities
61 |         ))
62 | 
63 |         # Normalization factor second example: 9.3333 / 7.71428 = 1.2099
64 |         transformed_intensities = transformed[1].intensities
65 |         expected_intensities = MOCK_DATA[1].intensities * (9.3333 / 7.71428)
66 |         self.assertTrue(np.allclose(
67 |             transformed_intensities,
68 |             expected_intensities
69 |         ))
70 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/demo_kernel_confidence.py:
--------------------------------------------------------------------------------
1 | '''
2 | Demo file for kernel-based confidence estimation.
3 | '''
4 | 
5 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
6 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | 
9 | from maldi_learn.kernels import DiffusionKernel
10 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
11 | 
12 | from maldi_learn.preprocessing import SubsetPeaksTransformer
13 | 
14 | from sklearn.gaussian_process import GaussianProcessClassifier
15 | from sklearn.metrics import average_precision_score
16 | from sklearn.svm import SVC
17 | 
18 | from imblearn.over_sampling import RandomOverSampler
19 | 
20 | from joblib import parallel_backend
21 | 
22 | import matplotlib.pyplot as plt
23 | 
24 | import numpy as np
25 | import sys
26 | 
27 | dataset = KpneuAntibioticResistanceDataset(antibiotic='Ceftriaxon',
28 |                                            test_size=0.20)
29 | X_train, y_train = dataset.training_data
30 | X_test, y_test = dataset.testing_data
31 | 
32 | X_indices = np.asarray([i for i in range(0,
33 |     len(X_train))]).reshape(-1, 1)
34 | 
35 | ros = RandomOverSampler(random_state=2020)
36 | 
37 | X_indices, y_train = ros.fit_sample(X_indices, y_train)
38 | 
39 | X_train_ = []
40 | 
41 | for index in X_indices.ravel():
42 |     X_train_.append(X_train[index])
43 | 
44 | X_train = X_train_
45 | 
46 | tic = TotalIonCurrentNormalizer()
47 | X_train = tic.fit_transform(X_train)
48 | X_test = tic.transform(X_test)
49 | 
50 | st = SubsetPeaksTransformer(n_peaks=200)
51 | 
52 | X_train = st.fit_transform(X_train)
53 | X_test = st.transform(X_test)
54 | 
55 | kernel = DiffusionKernel(sigma=10)
56 | 
57 | print('Finished pre-processing')
58 | 
59 | # Fit a GP classifier with the fixed diffusion kernel; setting
60 | # `optimizer=None` skips hyperparameter optimisation.
61 | clf = GaussianProcessClassifier(optimizer=None, kernel=kernel, n_jobs=-1)
62 | clf.fit(X_train, y_train)
63 | 
64 | test_distribution = np.amax(clf.predict_proba(X_test), axis=1).ravel()
65 | 
66 | oos_dataset = SaureusAntibioticResistanceDataset(antibiotic='Penicillin',
67 |                                                  test_size=0.20)
68 | 
69 | oos_test, _ = oos_dataset.testing_data
70 | oos_test = tic.transform(oos_test)
71 | oos_test = st.transform(oos_test)
72 | 
73 | oos_distribution = np.amax(clf.predict_proba(oos_test), axis=1).ravel()
74 | 
75 | plt.hist(test_distribution, label='test', bins=np.linspace(0.50, 0.60, 100), alpha=0.5)
76 | plt.hist(oos_distribution, label='oos saureus', bins=np.linspace(0.50, 0.60, 100), alpha=0.5)
77 | 
78 | oos_dataset = EcoliAntibioticResistanceDataset(antibiotic='Ciprofloxacin',
79 |                                                test_size=0.20)
80 | 
81 | oos_test, _ = oos_dataset.testing_data
82 | oos_test = tic.transform(oos_test)
83 | oos_test = st.transform(oos_test)
84 | 
85 | oos_distribution = np.amax(clf.predict_proba(oos_test), axis=1).ravel()
86 | 
87 | plt.hist(oos_distribution, label='oos ecoli', bins=np.linspace(0.50, 0.60, 100), alpha=0.5)
88 | plt.legend()
89 | 
90 | plt.show()
91 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/visualise_baseline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Basic visualisation script for baseline classifier.
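#
# Example invocation (species/antibiotic values are illustrative):
#
#     poetry run python visualise_baseline.py -s ecoli -a Ciprofloxacin -o data.json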
4 | 
5 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
6 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
9 | 
10 | from maldi_learn.vectorization import BinningVectorizer
11 | 
12 | from sklearn.manifold import TSNE
13 | 
14 | from joblib import parallel_backend
15 | 
16 | import matplotlib.pyplot as plt
17 | import seaborn as sns
18 | 
19 | import numpy as np
20 | import json_tricks as jt
21 | 
22 | import argparse
23 | import os
24 | import warnings
25 | 
26 | 
27 | if __name__ == '__main__':
28 | 
29 |     parser = argparse.ArgumentParser()
30 |     parser.add_argument('-s', '--species', type=str, required=True)
31 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
32 |     parser.add_argument('-S', '--seed', type=int, required=False,
33 |                         default=2020)
34 |     parser.add_argument('--suffix', default='')
35 |     parser.add_argument('-o', '--output', type=str)
36 |     args = parser.parse_args()
37 | 
38 |     species_to_dataset = {
39 |         'ecoli': EcoliAntibioticResistanceDataset,
40 |         'kpneu': KpneuAntibioticResistanceDataset,
41 |         'saureus': SaureusAntibioticResistanceDataset
42 |     }
43 | 
44 |     dataset = species_to_dataset[args.species](
45 |         test_size=0.20,
46 |         antibiotic=args.antibiotic,
47 |         random_seed=args.seed,
48 |         suffix=args.suffix
49 |     )
50 | 
51 |     X_train, y_train = dataset.training_data
52 |     X_test, y_test = dataset.testing_data
53 | 
54 |     bv = BinningVectorizer(900, min_bin=2000, max_bin=20000)
55 |     X_train = bv.fit_transform(X_train)
56 |     X_test = bv.transform(X_test)
57 | 
58 |     X = np.concatenate((X_train, X_test), axis=0)
59 | 
60 |     # Static information about the data set; will be extended later on
61 |     # with information about the training itself.
62 |     data = {
63 |         'seed': args.seed,
64 |         'species': args.species,
65 |         'antibiotic': args.antibiotic,
66 |         'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'),
67 |         'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'),
68 |     }
69 | 
70 |     tsne = TSNE(n_components=2)
71 | 
72 |     # `fit_transform` fits the embedding and projects the data in one step.
73 |     Z = tsne.fit_transform(X)
74 |     Z_train = Z[:len(X_train)]
75 |     Z_test = Z[len(X_train):]
76 | 
77 |     fig, axes = plt.subplots(ncols=2)
78 | 
79 |     sns.scatterplot(x=Z_train[:, 0], y=Z_train[:, 1], hue=y_train,
80 |                     ax=axes[0])
81 |     sns.scatterplot(x=Z_test[:, 0], y=Z_test[:, 1], hue=y_test,
82 |                     ax=axes[1])
83 | 
84 |     plt.show()
85 | 
86 |     if args.output is not None:
87 |         with open(args.output, 'w') as f:
88 |             jt.dump(data, f, indent=4)
89 |     else:
90 |         print(jt.dumps(data, indent=4))
--------------------------------------------------------------------------------
/ismb2020_maldi/analyse_split.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Analyses a given split of a data set and prints some summary
4 | # statistics. This is meant for debugging purposes only.
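#
# Example invocation (values are illustrative):
#
#     poetry run python analyse_split.py -s saureus -a Penicillin -p 100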
5 | 
6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
10 | 
11 | from maldi_learn.preprocessing import SubsetPeaksTransformer
12 | 
13 | from joblib import parallel_backend
14 | 
15 | import numpy as np
16 | 
17 | import argparse
18 | import os
19 | import warnings
20 | 
21 | 
22 | def get_mean_and_std(X, y, l):
23 | 
24 |     intensities = []
25 | 
26 |     for spectrum, label in zip(X, y):
27 |         if label == l:
28 |             intensities.extend(spectrum[:, 1])
29 | 
30 |     return np.mean(intensities), np.std(intensities)
31 | 
32 | 
33 | if __name__ == '__main__':
34 | 
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument('-s', '--species', type=str, required=True)
37 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
38 |     parser.add_argument('-S', '--seed', type=int, required=False,
39 |                         default=2020)
40 |     parser.add_argument('-p', '--peaks', type=int, required=False,
41 |                         default=100)
42 | 
43 |     args = parser.parse_args()
44 | 
45 |     species_to_dataset = {
46 |         'ecoli': EcoliAntibioticResistanceDataset,
47 |         'kpneu': KpneuAntibioticResistanceDataset,
48 |         'saureus': SaureusAntibioticResistanceDataset
49 |     }
50 | 
51 |     dataset = species_to_dataset[args.species](
52 |         test_size=0.20,
53 |         antibiotic=args.antibiotic,
54 |         random_seed=args.seed
55 |     )
56 | 
57 |     X_train, y_train = dataset.training_data
58 |     X_test, y_test = dataset.testing_data
59 | 
60 |     st = SubsetPeaksTransformer(n_peaks=args.peaks)
61 | 
62 |     X_train = st.fit_transform(X_train)
63 |     X_test = st.transform(X_test)
64 | 
65 |     print(f'Seed: {args.seed}')
66 |     print(f'Species: {args.species}')
67 |     print(f'Antibiotic: {args.antibiotic}')
68 |     print(f'Number of peaks: {args.peaks}')
69 | 
70 |     SPECTRA_PATH = os.getenv('ANTIBIOTICS_SPECTRA_PATH')
71 |     ENDPOINT_PATH = os.getenv('ANTIBIOTICS_ENDPOINT_PATH')
72 | 
73 |     print(f'SPECTRA_PATH = {SPECTRA_PATH}')
74 |     print(f'ENDPOINT_PATH = {ENDPOINT_PATH}')
75 | 
76 |     mu_train_0, std_train_0 = get_mean_and_std(X_train, y_train, 0)
77 |     mu_train_1, std_train_1 = get_mean_and_std(X_train, y_train, 1)
78 |     mu_test_0, std_test_0 = get_mean_and_std(X_test, y_test, 0)
79 |     mu_test_1, std_test_1 = get_mean_and_std(X_test, y_test, 1)
80 | 
81 |     print('y == 0 (train):', mu_train_0, std_train_0)
82 |     print('y == 1 (train):', mu_train_1, std_train_1)
83 |     print('y == 0 (test):', mu_test_0, std_test_0)
84 |     print('y == 1 (test):', mu_test_1, std_test_1)
85 | 
86 |     print(abs(mu_train_0 - mu_test_0), abs(std_train_0 - std_test_0))
87 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/collect_results.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Collection script for all results. Will create a table based on the
4 | # species and the antibiotic and summarise the performance measures.
5 | 
6 | import argparse
7 | 
8 | import json_tricks as jt
9 | import numpy as np
10 | import pandas as pd
11 | 
12 | from tqdm import tqdm
13 | 
14 | 
15 | if __name__ == '__main__':
16 | 
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument('INPUT', nargs='+', type=str)
19 | 
20 |     # Following the convention of `sklearn` here instead of referring to
21 |     # AUPRC or something like that.
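    # For example, pass `-m accuracy` to aggregate the accuracies stored
    # in the result files instead of the average precision.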
22 |     parser.add_argument(
23 |         '-m', '--metric',
24 |         default='average_precision',
25 |         type=str
26 |     )
27 | 
28 |     args = parser.parse_args()
29 | 
30 |     rows = []
31 | 
32 |     for filename in tqdm(args.INPUT, desc='Loading'):
33 |         with open(filename) as f:
34 |             # Ensures that we can parse normal JSON files
35 |             pos = 0
36 | 
37 |             for line in f:
38 | 
39 |                 # We have *probably* found the beginning of the JSON
40 |                 # payload, so we reset the file pointer to the start
41 |                 # of this line and parse from there.
42 |                 if line.startswith('{'):
43 |                     f.seek(pos)
44 |                     break
45 |                 else:
46 |                     pos += len(line)
47 | 
48 |             # Check whether file is empty for some reason. If so, we
49 |             # skip it.
50 |             line = f.readline()
51 |             if line == '':
52 |                 continue
53 | 
54 |             # Not empty, so we need to reset the file pointer
55 |             else:
56 |                 f.seek(pos)
57 | 
58 |             data_raw = jt.load(f)
59 | 
60 |             # Create one row in the table containing the relevant
61 |             # information for now.
62 |             row = {
63 |                 'species': data_raw['species'],
64 |                 'antibiotic': data_raw['antibiotic'],
65 |                 args.metric: data_raw[args.metric],
66 |             }
67 | 
68 |             # Some magic for figuring out whether we are looking at
69 |             # a baseline method or one of our own. Also creates the
70 |             # name of the method.
71 |             #
72 |             # TODO: handle multiple kernels
73 | 
74 |             is_baseline = 'best_parameters' in data_raw
75 |             method = 'baseline' if is_baseline else 'kernel'
76 | 
77 |             if not is_baseline:
78 |                 if data_raw['n_peaks'] is None:
79 |                     data_raw['n_peaks'] = 'all'
80 | 
81 |                 method += '_' + str(data_raw['n_peaks'])
82 | 
83 |                 if 'normalize' in data_raw:
84 |                     method += '_normalized' if data_raw['normalize'] else ''
85 | 
86 |             row['method'] = method
87 |             rows.append(row)
88 | 
89 |     pd.options.display.max_rows = 999
90 |     pd.options.display.float_format = '{:,.2f}'.format
91 | 
92 |     df = pd.DataFrame(rows)
93 |     df = df.groupby(['species', 'antibiotic', 'method']).agg(
94 |         {
95 |             args.metric: [np.mean, np.std]
96 |         }
97 |     )
98 | 
99 |     print(df)
100 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/visualise_kernel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Basic visualisation script for the diffusion kernel.
4 | 
5 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
6 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
9 | 
10 | from maldi_learn.kernels import DiffusionKernel
11 | from maldi_learn.preprocessing import ScaleNormalizer
12 | 
13 | from sklearn.decomposition import KernelPCA
14 | 
15 | from joblib import parallel_backend
16 | 
17 | import matplotlib.pyplot as plt
18 | import seaborn as sns
19 | 
20 | import numpy as np
21 | import json_tricks as jt
22 | 
23 | import argparse
24 | import os
25 | import warnings
26 | 
27 | 
28 | if __name__ == '__main__':
29 | 
30 |     parser = argparse.ArgumentParser()
31 |     parser.add_argument('-s', '--species', type=str, required=True)
32 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
33 |     parser.add_argument('-S', '--seed', type=int, required=False,
34 |                         default=2020)
35 |     parser.add_argument('--sigma', type=float, required=False, default=1.0)
36 |     parser.add_argument('--suffix', default='')
37 |     parser.add_argument('-o', '--output', type=str)
38 |     args = parser.parse_args()
39 | 
40 |     species_to_dataset = {
41 |         'ecoli': EcoliAntibioticResistanceDataset,
42 |         'kpneu': KpneuAntibioticResistanceDataset,
43 |         'saureus': SaureusAntibioticResistanceDataset
44 |     }
45 | 
46 |     dataset = species_to_dataset[args.species](
47 |         test_size=0.20,
48 |         antibiotic=args.antibiotic,
49 |         random_seed=args.seed,
50 |         suffix=args.suffix
51 |     )
52 | 
53 |     X_train, y_train = dataset.training_data
54 |     X_test, y_test = dataset.testing_data
55 | 
56 |     # Only perform scale normalisation if a suffix has been set; this
57 |     # should be made configurable.
58 |     if len(args.suffix) > 0:
59 |         sn = ScaleNormalizer()
60 |         X_train = sn.fit_transform(X_train)
61 |         X_test = sn.transform(X_test)
62 | 
63 |     # Static information about the data set; will be extended later on
64 |     # with information about the training itself.
65 |     data = {
66 |         'seed': args.seed,
67 |         'species': args.species,
68 |         'antibiotic': args.antibiotic,
69 |         'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'),
70 |         'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'),
71 |     }
72 | 
73 |     kernel = DiffusionKernel(args.sigma)
74 | 
75 |     with parallel_backend(backend='threading', n_jobs=-1):
76 |         K_train = kernel(X_train)
77 |         K_test = kernel(X_test)
78 | 
79 |     # Note: train and test data are embedded with *separate* PCA fits,
80 |     # so the two panels do not share a common coordinate system.
81 |     pca = KernelPCA(n_components=2, kernel="precomputed")
82 |     Z_train = pca.fit_transform(K_train)
83 |     Z_test = pca.fit_transform(K_test)
84 | 
85 |     fig, axes = plt.subplots(ncols=2)
86 | 
87 |     sns.scatterplot(x=Z_train[:, 0], y=Z_train[:, 1], hue=y_train,
88 |                     ax=axes[0])
89 |     sns.scatterplot(x=Z_test[:, 0], y=Z_test[:, 1], hue=y_test,
90 |                     ax=axes[1])
91 | 
92 |     plt.show()
93 | 
94 |     if args.output is not None:
95 |         with open(args.output, 'w') as f:
96 |             jt.dump(data, f, indent=4)
97 |     else:
98 |         print(jt.dumps(data, indent=4))
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/normalization.py:
--------------------------------------------------------------------------------
1 | """Normalization strategies for MALDI-TOF spectra."""
2 | import numpy as np
3 | 
4 | from sklearn.base import TransformerMixin
5 | from sklearn.base import BaseEstimator
6 | 
7 | 
8 | class TotalIonCurrentNormalizer(BaseEstimator, TransformerMixin):
9 |     """
10 |     Normalize spectra based on total ion current. The normalizer
11 |     supports different normalization strategies.
12 |     """
13 | 
14 |     def __init__(self, ignore_zero_intensity=True, method='mean'):
15 |         """Initialize total ion current based normalizer.
16 | 
17 |         Args:
18 |             ignore_zero_intensity: Ignore peaks with zero intensity when
19 |                 computing the average used for normalization.
20 | 
21 |             method: Determines the method that is used to perform the
22 |                 normalization. If set to 'mean', computes averages over the
23 |                 spectra to normalize. If set to 'sum', normalizes each
24 |                 spectrum individually such that its intensities sum to one.
25 |         """
26 |         self.ignore_zero_intensity = ignore_zero_intensity
27 |         self.mean_intensity = None
28 |         self.method = method
29 | 
30 |     def _normalize_spectrum(self, spectrum, method):
31 |         if method == 'mean':
32 |             if self.ignore_zero_intensity:
33 |                 intensities = spectrum.intensities[spectrum.intensities != 0.]
34 |             else:
35 |                 intensities = spectrum.intensities
36 |             mean_instance_intensity = np.mean(intensities)
37 |             scaling = mean_instance_intensity / self.mean_intensity
38 |             return spectrum * np.array([1, scaling])[np.newaxis, :]
39 |         elif method == 'sum':
40 |             scaling = 1.0 / np.sum(spectrum.intensities)
41 |             return spectrum * np.array([1, scaling])[np.newaxis, :]
42 |         else:
43 |             raise RuntimeError(
44 |                 f'Unexpected normalization method "{method}"')
45 | 
46 |     def _compute_mean_intensity_spectra(self, spectra):
47 |         if self.ignore_zero_intensity:
48 |             intensities = np.concatenate(
49 |                 [
50 |                     spectrum.intensities[spectrum.intensities != 0.]
51 |                     for spectrum in spectra
52 |                 ],
53 |                 axis=0
54 |             )
55 |         else:
56 |             intensities = np.concatenate(
57 |                 [spectrum.intensities for spectrum in spectra], axis=0)
58 |         return np.mean(intensities)
59 | 
60 |     def fit(self, X, y=None):
61 |         """Fit transformer, computes average statistics of spectra."""
62 |         self.mean_intensity = self._compute_mean_intensity_spectra(X)
63 |         return self
64 | 
65 |     def transform(self, X):
66 |         """Normalize spectra using total ion current."""
67 |         return [
68 |             self._normalize_spectrum(spectrum, method=self.method)
69 |             for spectrum in X
70 |         ]
71 | 
72 | 
73 | class ScaleNormalizer(BaseEstimator, TransformerMixin):
74 |     """
75 |     Normalizes a set of spectra such that their scales are not too
76 |     small.
77 |     """
78 | 
79 |     def _calculate_min_nonzero_intensity(self, spectra):
80 |         intensities = np.concatenate(
81 |             [
82 |                 s.intensities[s.intensities != 0] for s in spectra
83 |             ],
84 |             axis=0
85 |         )
86 |         return np.min(intensities)
87 | 
88 |     def _normalize_spectrum(self, spectrum):
89 |         scaling = 1.0 / self.min_nonzero_intensity
90 |         return spectrum * np.array([1, scaling])[np.newaxis, :]
91 | 
92 |     def fit(self, X, y=None):
93 |         self.min_nonzero_intensity = self._calculate_min_nonzero_intensity(X)
94 |         return self
95 | 
96 |     def transform(self, X):
97 |         return [
98 |             self._normalize_spectrum(spectrum) for spectrum in X
99 |         ]
100 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/calibrate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Assesses the calibration of the model by measuring how accuracy and
4 | # AUPRC change as the confidence threshold is varied.
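#
# Example invocation (the file name is illustrative; the script expects
# the JSON outputs of the confidence experiments):
#
#     poetry run python calibrate.py results_seed2020.json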
5 | 
6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
10 | 
11 | from sklearn.metrics import accuracy_score
12 | from sklearn.metrics import average_precision_score
13 | 
14 | import argparse
15 | 
16 | import numpy as np
17 | import json_tricks as jt
18 | 
19 | from tqdm import tqdm
20 | 
21 | 
22 | def process(data, in_sample_species, antibiotic, seed):
23 | 
24 |     species_to_dataset = {
25 |         'ecoli': EcoliAntibioticResistanceDataset,
26 |         'kpneu': KpneuAntibioticResistanceDataset,
27 |         'saureus': SaureusAntibioticResistanceDataset
28 |     }
29 | 
30 |     dataset = species_to_dataset[in_sample_species](
31 |         test_size=0.20,
32 |         antibiotic=antibiotic,
33 |         random_seed=seed
34 |     )
35 | 
36 |     _, y_test = dataset.testing_data
37 | 
38 |     thresholds = np.linspace(0.5, 1.0, 1000)
39 | 
40 |     test_proba = data['in_sample_test_proba']
41 |     test_proba_max = np.amax(test_proba, axis=1)
42 | 
43 |     output_rejection_ratio_curve = \
44 |         f'Calibration_{in_sample_species}_{antibiotic}_{seed}.csv'
45 | 
46 |     with open(output_rejection_ratio_curve, 'w') as f:
47 | 
48 |         print('threshold,accuracy,auprc,n_pos_samples', file=f)
49 | 
50 |         for threshold in thresholds:
51 | 
52 |             # Get the indices that we want to *keep*, i.e. those test
53 |             # samples whose maximum probability exceeds the threshold
54 |             indices = test_proba_max > threshold
55 | 
56 |             # Subset the predictions and the labels according to these
57 |             # indices and calculate an AUPRC.
58 |             y_true = y_test[indices]
59 |             y_pred_proba = test_proba[indices][:, 1]
60 | 
61 |             # Predict the positive class whenever the predicted
62 |             # probability exceeds the threshold of this iteration.
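            # For example, at threshold 0.7 a sample with class
            # probabilities (0.25, 0.75) is kept (0.75 > 0.7) and is
            # predicted as positive, since its positive-class probability
            # also exceeds 0.7.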
63 |             y_pred = np.zeros_like(y_pred_proba)
64 |             y_pred[y_pred_proba > threshold] = 1.0
65 | 
66 |             y_true_unique = set(y_true.values)
67 | 
68 |             if len(y_true_unique) != 2:
69 |                 break
70 | 
71 |             average_precision = average_precision_score(y_true, y_pred_proba)
72 |             accuracy = accuracy_score(y_true, y_pred)
73 | 
74 |             print(f'{threshold},{accuracy},{average_precision},{sum(y_true == 1)}', file=f)
75 | 
76 |     oos_species = data['out_of_sample_species']
77 | 
78 |     for species in oos_species:
79 | 
80 |         output_rejection_plot = \
81 |             f'Rejection_ratio_{in_sample_species}_{antibiotic}_{species}_{seed}.csv'
82 | 
83 |         oos_proba = data['out_of_sample_' + species + '_proba']
84 |         oos_proba_max = np.amax(oos_proba, axis=1)
85 | 
86 |         with open(output_rejection_plot, 'w') as f:
87 | 
88 |             print('threshold,rejected_in_sample,rejected_out_of_sample',
89 |                   file=f)
90 | 
91 |             for threshold in thresholds:
92 |                 rejected_test = \
93 |                     sum(test_proba_max <= threshold) / len(test_proba_max)
94 | 
95 |                 rejected_oos = \
96 |                     sum(oos_proba_max <= threshold) / len(oos_proba_max)
97 | 
98 |                 print(f'{threshold},{rejected_test},{rejected_oos}',
99 |                       file=f)
100 | 
101 | 
102 | if __name__ == '__main__':
103 | 
104 |     parser = argparse.ArgumentParser()
105 |     parser.add_argument('FILES', nargs='+', type=str)
106 | 
107 |     args = parser.parse_args()
108 | 
109 |     for filename in tqdm(args.FILES, desc='Loading'):
110 |         with open(filename) as f:
111 |             data = jt.load(f)
112 |             species = data['in_sample_species']
113 |             antibiotic = data['in_sample_antibiotic']
114 |             seed = data['seed']
115 | 
116 |             process(data, species, antibiotic, seed)
117 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/baseline_gp_rbf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Trains a baseline Gaussian process classifier with an RBF kernel on
4 | # the same binned features as the logistic regression baseline.
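#
# Example invocation (values are illustrative):
#
#     poetry run python baseline_gp_rbf.py -s ecoli -a Ciprofloxacin -o out.json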
5 | 
6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
10 | 
11 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
12 | from maldi_learn.preprocessing import SubsetPeaksTransformer
13 | from maldi_learn.vectorization import BinningVectorizer
14 | 
15 | from imblearn.over_sampling import RandomOverSampler
16 | 
17 | from sklearn.gaussian_process import GaussianProcessClassifier
18 | from sklearn.gaussian_process.kernels import RBF
19 | 
20 | from sklearn.exceptions import ConvergenceWarning
21 | from sklearn.linear_model import LogisticRegression
22 | from sklearn.metrics import average_precision_score
23 | from sklearn.metrics import accuracy_score
24 | from sklearn.model_selection import GridSearchCV
25 | from sklearn.model_selection import StratifiedKFold
26 | from sklearn.preprocessing import StandardScaler
27 | from sklearn.pipeline import Pipeline
28 | 
29 | from joblib import parallel_backend
30 | 
31 | import numpy as np
32 | import json_tricks as jt
33 | 
34 | import argparse
35 | import os
36 | import warnings
37 | 
38 | 
39 | if __name__ == '__main__':
40 | 
41 |     parser = argparse.ArgumentParser()
42 |     parser.add_argument('-s', '--species', type=str, required=True)
43 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
44 |     parser.add_argument('-S', '--seed', type=int, required=False, default=2020)
45 |     parser.add_argument('-o', '--output', type=str)
46 |     parser.add_argument('-n', '--normalize', action='store_true')
47 | 
48 |     args = parser.parse_args()
49 | 
50 |     species_to_dataset = {
51 |         'ecoli': EcoliAntibioticResistanceDataset,
52 |         'kpneu': KpneuAntibioticResistanceDataset,
53 |         'saureus': SaureusAntibioticResistanceDataset
54 |     }
55 | 
56 |     dataset = species_to_dataset[args.species](
57 |         test_size=0.20,
58 |         antibiotic=args.antibiotic,
59 |         random_seed=args.seed,
60 |         suffix='_peaks_warped'
61 |     )
62 | 
63 |     X_train, y_train = dataset.training_data
64 |     X_test, y_test = dataset.testing_data
65 | 
66 |     # Perform random oversampling in order to ensure class balance. This
67 |     # is strictly speaking not required but we do it for the GP as well,
68 |     # so in the interest of comparability, we have to do it here.
69 | 
70 |     ros = RandomOverSampler(random_state=args.seed)
71 | 
72 |     X_indices = np.asarray(
73 |         [i for i in range(0, len(X_train))]).reshape(-1, 1)
74 | 
75 |     X_indices, y_train = ros.fit_sample(X_indices, y_train)
76 |     X_train = np.take(X_train, X_indices.ravel())
77 | 
78 |     # Static information about the data set; will be extended later on
79 |     # with information about the training itself.
80 |     data = {
81 |         'seed': args.seed,
82 |         'species': args.species,
83 |         'antibiotic': args.antibiotic,
84 |         'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'),
85 |         'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'),
86 |         'normalize': args.normalize,
87 |     }
88 | 
89 |     # Fix the number of bins to the value most frequently selected in
90 |     # the logistic regression grid-search experiments.
91 | n_bins = 3600 92 | 93 | data['n_bins'] = n_bins 94 | 95 | # Define pipeline and cross-validation setup 96 | 97 | pipeline = Pipeline( 98 | [ 99 | ('bv', BinningVectorizer( 100 | n_bins=n_bins, 101 | min_bin=2000, 102 | max_bin=20000)), 103 | ('gp', GaussianProcessClassifier( 104 | kernel=RBF(), 105 | ) 106 | ) 107 | ], 108 | memory=os.getenv('TMPDIR', default=None), 109 | ) 110 | 111 | with warnings.catch_warnings(): 112 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 113 | warnings.filterwarnings('ignore', category=UserWarning) 114 | 115 | # Let's do the fitting in parallel, but the prediction can be done 116 | # without additional threading. 117 | with parallel_backend('loky', n_jobs=-1): 118 | pipeline.fit(X_train, y_train) 119 | 120 | data['kernel'] = repr(pipeline['gp'].kernel_.theta) 121 | 122 | # AUPRC 123 | 124 | y_pred = pipeline.predict_proba(X_test) 125 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 126 | 127 | data['average_precision'] = 100 * average_precision 128 | 129 | # Accuracy 130 | 131 | y_pred = pipeline.predict(X_test) 132 | accuracy = accuracy_score(y_test, y_pred) 133 | 134 | data['accuracy'] = 100 * accuracy 135 | 136 | if args.output is not None: 137 | with open(args.output, 'w') as f: 138 | jt.dump(data, f, indent=4) 139 | else: 140 | print(jt.dumps(data, indent=4)) 141 | -------------------------------------------------------------------------------- /ismb2020_maldi/diffusion_kernel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a diffusion kernel Gaussian Process classifier and reports the 4 | # results on all tasks. 5 | 6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 10 | 11 | from maldi_learn.kernels import DiffusionKernel 12 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer 13 | from maldi_learn.preprocessing import SubsetPeaksTransformer 14 | from maldi_learn.preprocessing import ScaleNormalizer 15 | 16 | from sklearn.gaussian_process import GaussianProcessClassifier 17 | from sklearn.gaussian_process.kernels import RBF 18 | from sklearn.exceptions import ConvergenceWarning 19 | from sklearn.metrics import average_precision_score 20 | from sklearn.metrics import accuracy_score 21 | 22 | from imblearn.over_sampling import RandomOverSampler 23 | 24 | from joblib import parallel_backend 25 | 26 | import numpy as np 27 | import json_tricks as jt 28 | 29 | import argparse 30 | import os 31 | import warnings 32 | 33 | 34 | if __name__ == '__main__': 35 | 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('-s', '--species', type=str, required=True) 38 | parser.add_argument('-a', '--antibiotic', type=str, required=True) 39 | parser.add_argument('-o', '--output', type=str) 40 | parser.add_argument('-S', '--seed', type=int, required=False, 41 | default=2020) 42 | parser.add_argument('-p', '--peaks', type=int, required=False, 43 | default=None) 44 | parser.add_argument('-n', '--normalize', action='store_true') 45 | parser.add_argument('--suffix', default='') 46 | 47 | args = parser.parse_args() 48 | 49 | species_to_dataset = { 50 | 'ecoli': EcoliAntibioticResistanceDataset, 51 | 'kpneu': KpneuAntibioticResistanceDataset, 52 | 'saureus': SaureusAntibioticResistanceDataset 53 | } 
54 | 55 | dataset = species_to_dataset[args.species]( 56 | test_size=0.20, 57 | antibiotic=args.antibiotic, 58 | random_seed=args.seed, 59 | suffix=args.suffix 60 | ) 61 | 62 | X_train, y_train = dataset.training_data 63 | X_test, y_test = dataset.testing_data 64 | 65 | # Perform random oversampling in order to ensure class balance. This 66 | # is strictly speaking not required but we do it for the GP as well, 67 | # so in the interest of comparability, we have to do it here. 68 | 69 | ros = RandomOverSampler(random_state=args.seed) 70 | 71 | X_indices = np.asarray( 72 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 73 | 74 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 75 | X_train = np.take(X_train, X_indices.ravel()) 76 | 77 | # Normalise on demand. This is an *external* flag because by 78 | # default, we should have no expectations about its efficacy 79 | # in practice. 80 | if args.normalize: 81 | tic = TotalIonCurrentNormalizer(method='sum') 82 | X_train = tic.fit_transform(X_train) 83 | X_test = tic.transform(X_test) 84 | 85 | # Sparsify the data by restricting everything to the peaks only. 86 | st = SubsetPeaksTransformer(n_peaks=args.peaks) 87 | 88 | X_train = st.fit_transform(X_train) 89 | X_test = st.transform(X_test) 90 | 91 | # Perform scale normalisation for the MQ data set, which is 92 | # indicated by a suffix, or whenever the client specified a 93 | # normalisation parameter manually. This ensures that every 94 | # spectrum can be fitted by the kernel. 95 | if args.normalize or len(args.suffix) > 0: 96 | sn = ScaleNormalizer() 97 | X_train = sn.fit_transform(X_train) 98 | X_test = sn.transform(X_test) 99 | 100 | # Static information about the data set; will be extended later on 101 | # with information about the training itself. 102 | data = { 103 | 'seed': args.seed, 104 | 'species': args.species, 105 | 'antibiotic': args.antibiotic, 106 | 'n_peaks': args.peaks, 107 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 108 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 109 | 'normalize': args.normalize, 110 | } 111 | 112 | kernel = DiffusionKernel(sigma=1) 113 | clf = GaussianProcessClassifier(kernel=kernel) 114 | 115 | with warnings.catch_warnings(): 116 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 117 | warnings.filterwarnings('ignore', category=UserWarning) 118 | 119 | # Let's do the fitting in parallel, but the prediction can be done 120 | # without additional threading. 
121 | with parallel_backend(backend='loky'): 122 | clf.fit(X_train, y_train) 123 | 124 | data['kernel'] = repr(clf.kernel_) 125 | data['log_marginal_likelihood'] = clf.log_marginal_likelihood_value_ 126 | 127 | y_pred = clf.predict_proba(X_test) 128 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 129 | 130 | data['average_precision'] = 100 * average_precision 131 | 132 | y_pred = clf.predict(X_test) 133 | accuracy = accuracy_score(y_test, y_pred) 134 | 135 | data['accuracy'] = 100 * accuracy 136 | 137 | if args.output is not None: 138 | with open(args.output, 'w') as f: 139 | jt.dump(data, f, indent=4) 140 | else: 141 | print(jt.dumps(data, indent=4)) 142 | -------------------------------------------------------------------------------- /ismb2020_maldi/datasets/antibiotics.py: -------------------------------------------------------------------------------- 1 | """Dataset of MALDI-TOF spectra for antibiotic resistance prediction.""" 2 | import os 3 | 4 | from dotenv import load_dotenv 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from maldi_learn.data import MaldiTofSpectrum 8 | 9 | from .dataset import Dataset 10 | 11 | # Dataset paths are specified in .env file in the root of the repository 12 | load_dotenv() 13 | SPECTRA_PATH = os.getenv('ANTIBIOTICS_SPECTRA_PATH') 14 | ENDPOINT_PATH = os.getenv('ANTIBIOTICS_ENDPOINT_PATH') 15 | 16 | 17 | class AntibioticResistanceDataset(Dataset): 18 | """Base class of datasets predicting antibiotic resistance.""" 19 | 20 | # endpoint_file_name = 'IDRES_clean.csv' 21 | 22 | def __init__(self, antibiotic, test_size=0.2, random_seed=2020, 23 | suffix=''): 24 | """Initialize the dataset. 25 | 26 | Args: 27 | antibiotic: Name (str) of the antibiotic to use for 28 | generating labels (endpoints). 29 | test_size: Fraction of the data that should be returned for 30 | testing. 31 | random_seed: Random seed for splitting the data into train and 32 | test. 33 | suffix: Suffix to use for the files to load. This suffix 34 | will be appended to the code specified in the endpoints data 35 | file. 36 | """ 37 | self.antibiotic = antibiotic 38 | self.suffix = suffix 39 | 40 | all_instances = self._make_binary_labels( 41 | self._read_endpoints_and_preprocess()) 42 | 43 | train_instances, test_instances = train_test_split( 44 | all_instances, 45 | test_size=test_size, 46 | random_state=random_seed, 47 | stratify=all_instances.values # stratify by labels 48 | ) 49 | 50 | self.all_instances, self.train_instances, self.test_instances = \ 51 | all_instances, train_instances, test_instances 52 | 53 | def _read_endpoints_and_preprocess(self): 54 | endpoint_file = os.path.join(ENDPOINT_PATH, self.endpoint_file_name) 55 | endpoints = pd.read_csv(endpoint_file, index_col='code') 56 | endpoints = endpoints.replace({ 57 | '-': float('NaN'), 58 | 'R(1)': float('NaN'), 59 | 'L(1)': float('NaN'), 60 | 'I(1)': float('NaN'), 61 | 'I(1), S(1)': float('NaN'), 62 | 'R(1), I(1)': float('NaN'), 63 | 'R(1), S(1)': float('NaN'), 64 | 'R(1), I(1), S(1)': float('NaN') 65 | }) 66 | 67 | return endpoints 68 | 69 | def _make_binary_labels(self, df): 70 | """ 71 | Creates binary labels by restricting the input data frame to the 72 | specified antibiotic. This is followed by dropping all NaNs, and 73 | making all labels binary (depending on resistance/susceptibility). 
74 | """ 75 | 76 | only_antibiotic = df[self.antibiotic] 77 | 78 | only_antibiotic = only_antibiotic.dropna( 79 | axis='index', how='any', inplace=False) 80 | 81 | return only_antibiotic.replace({'R': 1, 'I': 1, 'S': 0}) 82 | 83 | # TODO: might want to remove this 84 | def _subset_instances(self, *instance_lists): 85 | def subset_and_binarize(input_instances): 86 | """Remove unused antibiotics and not measured instances.""" 87 | only_antibiotic = input_instances[self.antibiotic] 88 | only_antibiotic = only_antibiotic.dropna(axis='index', how='any', inplace=False) 89 | return only_antibiotic.replace({'R': 1, 'I': 1, 'S': 0}) 90 | return [subset_and_binarize(instances) for instances in instance_lists] 91 | 92 | @staticmethod 93 | def _build_filepaths_from_codes(codes, suffix): 94 | return [os.path.join(SPECTRA_PATH, f'{code}{suffix}.txt') for code in codes] 95 | 96 | def _read_spectra(self, files): 97 | return [ 98 | MaldiTofSpectrum( 99 | pd.read_csv(f, sep=' ', comment='#', engine='c').values) 100 | for f in files 101 | ] 102 | 103 | def _read_data(self, instances): 104 | codes = instances.index 105 | files = self._build_filepaths_from_codes(codes, self.suffix) 106 | spectra = self._read_spectra(files) 107 | return spectra, instances 108 | 109 | @property 110 | def training_data(self): 111 | """Get spectra used for training.""" 112 | return self._read_data(self.train_instances) 113 | 114 | @property 115 | def validation_data(self): 116 | """Not implemented for now.""" 117 | raise NotImplementedError() 118 | 119 | @property 120 | def testing_data(self): 121 | """Get spectra used for testing.""" 122 | return self._read_data(self.test_instances) 123 | 124 | @property 125 | def complete_data(self): 126 | """Get all spectra.""" 127 | return self._read_data(self.all_instances) 128 | 129 | 130 | class EcoliAntibioticResistanceDataset(AntibioticResistanceDataset): 131 | """Dataset for E.coli antibiotic resistance.""" 132 | 133 | endpoint_file_name = 'IDRES_Ecoli.csv' 134 | 135 | 136 | class SaureusAntibioticResistanceDataset(AntibioticResistanceDataset): 137 | """Dataset for S.aureus antibiotic resistance.""" 138 | 139 | endpoint_file_name = 'IDRES_Saureus.csv' 140 | 141 | 142 | class KpneuAntibioticResistanceDataset(AntibioticResistanceDataset): 143 | """Dataset for K.pneumoniae antibiotic resistance.""" 144 | 145 | endpoint_file_name = 'IDRES_Kpneu.csv' 146 | -------------------------------------------------------------------------------- /ismb2020_maldi/baseline_maldiquant.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a baseline logistic regression classifier and reports the 4 | # results on all tasks. Uses pre-processed spectra and does not do 5 | # any peak calling. 
6 | 7 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 10 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 11 | 12 | from maldi_learn.vectorization import BinningVectorizer 13 | 14 | from imblearn.over_sampling import RandomOverSampler 15 | 16 | from sklearn.exceptions import FitFailedWarning 17 | from sklearn.exceptions import ConvergenceWarning 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.metrics import average_precision_score 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.model_selection import GridSearchCV 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.preprocessing import StandardScaler 24 | from sklearn.pipeline import Pipeline 25 | 26 | from joblib import parallel_backend 27 | 28 | import numpy as np 29 | import json_tricks as jt 30 | 31 | import argparse 32 | import os 33 | import warnings 34 | 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('-s', '--species', type=str, required=True) 40 | parser.add_argument('-a', '--antibiotic', type=str, required=True) 41 | parser.add_argument('-S', '--seed', type=int, required=False, default=2020) 42 | parser.add_argument('-o', '--output', type=str) 43 | parser.add_argument('--suffix', type=str, default='') 44 | 45 | args = parser.parse_args() 46 | 47 | species_to_dataset = { 48 | 'ecoli': EcoliAntibioticResistanceDataset, 49 | 'kpneu': KpneuAntibioticResistanceDataset, 50 | 'saureus': SaureusAntibioticResistanceDataset 51 | } 52 | 53 | dataset = species_to_dataset[args.species]( 54 | test_size=0.20, 55 | antibiotic=args.antibiotic, 56 | random_seed=args.seed, 57 | suffix=args.suffix 58 | ) 59 | 60 | X_train, y_train = dataset.training_data 61 | X_test, y_test = dataset.testing_data 62 | 63 | # Perform random oversampling in order to ensure class balance. This 64 | # is strictly speaking not required but we do it for the GP as well, 65 | # so in the interest of comparability, we have to do it here. 66 | 67 | ros = RandomOverSampler(random_state=args.seed) 68 | 69 | X_indices = np.asarray( 70 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 71 | 72 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 73 | X_train = np.take(X_train, X_indices.ravel()) 74 | 75 | # Static information about the data set; will be extended later on 76 | # with information about the training itself. 77 | data = { 78 | 'seed': args.seed, 79 | 'species': args.species, 80 | 'antibiotic': args.antibiotic, 81 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 82 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 83 | } 84 | 85 | param_grid = { 86 | 'bv__n_bins': [300, 600, 1800, 3600], 87 | 'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'], 88 | 'lr__C': 10. 
** np.arange(-4, 5), # 10^{-4}..10^{4} 89 | } 90 | 91 | data['param_grid'] = param_grid 92 | 93 | # Define pipeline and cross-validation setup 94 | 95 | pipeline = Pipeline( 96 | [ 97 | ('bv', BinningVectorizer( 98 | n_bins=0, 99 | min_bin=2000, 100 | max_bin=20000)), 101 | ('std', StandardScaler()), 102 | ('lr', LogisticRegression( 103 | class_weight='balanced', 104 | solver='saga' # supports L_1 and L_2 penalties 105 | ) 106 | ) 107 | ], 108 | memory=os.getenv('TMPDIR', default=None), 109 | ) 110 | 111 | grid_search = GridSearchCV( 112 | pipeline, 113 | param_grid=param_grid, 114 | scoring='average_precision', 115 | cv=StratifiedKFold(n_splits=5, shuffle=True, 116 | random_state=42), 117 | n_jobs=-1, 118 | ) 119 | 120 | with warnings.catch_warnings(): 121 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 122 | warnings.filterwarnings('ignore', category=FitFailedWarning) 123 | warnings.filterwarnings('ignore', category=UserWarning) 124 | 125 | # Let's do the fitting in parallel, but the prediction can be done 126 | # without additional threading. 127 | with parallel_backend('threading'): 128 | grid_search.fit(X_train, y_train) 129 | 130 | data['best_parameters'] = grid_search.best_params_ 131 | 132 | # AUPRC 133 | 134 | y_pred = grid_search.predict_proba(X_test) 135 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 136 | 137 | data['average_precision'] = 100 * average_precision 138 | 139 | # Accuracy 140 | 141 | y_pred = grid_search.predict(X_test) 142 | accuracy = accuracy_score(y_test, y_pred) 143 | 144 | data['accuracy'] = 100 * accuracy 145 | 146 | if args.output is not None: 147 | with open(args.output, 'w') as f: 148 | jt.dump(data, f, indent=4) 149 | else: 150 | print(jt.dumps(data, indent=4)) 151 | -------------------------------------------------------------------------------- /ismb2020_maldi/baseline_maldiquant_confidence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a baseline logistic regression classifier and reports the 4 | # confidence scores on our pre-defined task. 5 | 6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 10 | 11 | from maldi_learn.vectorization import BinningVectorizer 12 | 13 | from imblearn.over_sampling import RandomOverSampler 14 | 15 | from sklearn.exceptions import FitFailedWarning 16 | from sklearn.exceptions import ConvergenceWarning 17 | from sklearn.linear_model import LogisticRegression 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.pipeline import Pipeline 20 | 21 | from joblib import parallel_backend 22 | 23 | import numpy as np 24 | import json_tricks as jt 25 | 26 | import argparse 27 | import os 28 | import warnings 29 | 30 | 31 | if __name__ == '__main__': 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('-S', '--seed', type=int, required=False, default=2020) 35 | parser.add_argument('-o', '--output', type=str) 36 | parser.add_argument('--suffix', type=str, default='') 37 | 38 | args = parser.parse_args() 39 | 40 | in_sample_species = 'saureus' 41 | in_sample_antibiotic = 'Amoxicillin-Clavulansaeure' 42 | 43 | # Antibiotics will not be used, but are specified because our data 44 | # set selection class demands it. 
45 | out_of_sample_species = ['ecoli', 'kpneu'] 46 | out_of_sample_antibiotics = ['Ciprofloxacin', 'Ciprofloxacin'] 47 | 48 | species_to_dataset = { 49 | 'ecoli': EcoliAntibioticResistanceDataset, 50 | 'kpneu': KpneuAntibioticResistanceDataset, 51 | 'saureus': SaureusAntibioticResistanceDataset 52 | } 53 | 54 | dataset = species_to_dataset[in_sample_species]( 55 | test_size=0.20, 56 | antibiotic=in_sample_antibiotic, 57 | random_seed=args.seed, 58 | suffix=args.suffix 59 | ) 60 | 61 | X_train, y_train = dataset.training_data 62 | X_test, y_test = dataset.testing_data 63 | 64 | # Perform random oversampling in order to ensure class balance. This 65 | # is strictly speaking not required but we do it for the GP as well, 66 | # so in the interest of comparability, we have to do it here. 67 | 68 | ros = RandomOverSampler(random_state=args.seed) 69 | 70 | X_indices = np.asarray( 71 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 72 | 73 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 74 | X_train = np.take(X_train, X_indices.ravel()) 75 | 76 | # Parameters extracted from the respective runs of the baseline 77 | # classifier for this particular scenario. 78 | n_bins = 3600 79 | C = 0.01 80 | penalty = 'l2' 81 | 82 | # Static information about the data set; will be extended later on 83 | # with information about the training itself. 84 | data = { 85 | 'seed': args.seed, 86 | 'in_sample_antibiotic': in_sample_antibiotic, 87 | 'in_sample_species': in_sample_species, 88 | 'out_of_sample_species': out_of_sample_species, 89 | 'out_of_sample_antibiotics': out_of_sample_antibiotics, 90 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 91 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 92 | 'n_bins': n_bins, 93 | 'C': C, 94 | 'penalty': penalty, 95 | } 96 | 97 | pipeline = Pipeline( 98 | [ 99 | ('bv', BinningVectorizer( 100 | n_bins=n_bins, 101 | min_bin=2000, 102 | max_bin=20000)), 103 | ('std', StandardScaler()), 104 | ('lr', LogisticRegression( 105 | class_weight='balanced', 106 | C=C, 107 | penalty=penalty, 108 | solver='saga' # supports L_1 and L_2 penalties 109 | ) 110 | ) 111 | ], 112 | memory=os.getenv('TMPDIR', default=None), 113 | ) 114 | 115 | # Makes subsequent operations easier to read 116 | clf = pipeline 117 | 118 | with warnings.catch_warnings(): 119 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 120 | warnings.filterwarnings('ignore', category=FitFailedWarning) 121 | warnings.filterwarnings('ignore', category=UserWarning) 122 | 123 | # Let's do the fitting in parallel, but the prediction can be done 124 | # without additional threading. 125 | with parallel_backend('loky'): 126 | clf.fit(X_train, y_train) 127 | 128 | # Get maximum probability for classifying a sample into *any* class, 129 | # based on the test data set. 
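    # For a binary task, `predict_proba` returns an array of shape
    # (n_samples, 2); the row-wise maximum serves as a simple confidence
    # score, with values close to 0.5 indicating an undecided model.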
130 | test_proba = clf.predict_proba(X_test) 131 | test_proba_max = np.amax(test_proba, axis=1) 132 | 133 | data['in_sample_test_proba'] = test_proba 134 | 135 | for species, antibiotic in zip(out_of_sample_species, 136 | out_of_sample_antibiotics): 137 | 138 | oos_dataset = species_to_dataset[species]( 139 | test_size=0.20, 140 | antibiotic=antibiotic, 141 | random_seed=args.seed, 142 | suffix=args.suffix, 143 | ) 144 | 145 | oos_test, _ = oos_dataset.testing_data 146 | 147 | oos_proba = clf.predict_proba(oos_test) 148 | oos_proba_max = np.amax(oos_proba, axis=1) 149 | 150 | data['out_of_sample_' + species + '_proba'] = oos_proba 151 | 152 | if args.output is not None: 153 | with open(args.output, 'w') as f: 154 | jt.dump(data, f, indent=4) 155 | else: 156 | print(jt.dumps(data, indent=4)) 157 | -------------------------------------------------------------------------------- /ismb2020_maldi/baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a baseline logistic regression classifier and reports the 4 | # results on all tasks. 5 | 6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 10 | 11 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer 12 | from maldi_learn.preprocessing import SubsetPeaksTransformer 13 | from maldi_learn.vectorization import BinningVectorizer 14 | 15 | from imblearn.over_sampling import RandomOverSampler 16 | 17 | from sklearn.exceptions import ConvergenceWarning 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.metrics import average_precision_score 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.model_selection import GridSearchCV 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.preprocessing import StandardScaler 24 | from sklearn.pipeline import Pipeline 25 | 26 | from joblib import parallel_backend 27 | 28 | import numpy as np 29 | import json_tricks as jt 30 | 31 | import argparse 32 | import os 33 | import warnings 34 | 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('-s', '--species', type=str, required=True) 40 | parser.add_argument('-a', '--antibiotic', type=str, required=True) 41 | parser.add_argument('-S', '--seed', type=int, required=False, default=2020) 42 | parser.add_argument('-o', '--output', type=str) 43 | parser.add_argument('-n', '--normalize', action='store_true') 44 | 45 | args = parser.parse_args() 46 | 47 | species_to_dataset = { 48 | 'ecoli': EcoliAntibioticResistanceDataset, 49 | 'kpneu': KpneuAntibioticResistanceDataset, 50 | 'saureus': SaureusAntibioticResistanceDataset 51 | } 52 | 53 | dataset = species_to_dataset[args.species]( 54 | test_size=0.20, 55 | antibiotic=args.antibiotic, 56 | random_seed=args.seed, 57 | ) 58 | 59 | X_train, y_train = dataset.training_data 60 | X_test, y_test = dataset.testing_data 61 | 62 | # Perform random oversampling in order to ensure class balance. This 63 | # is strictly speaking not required but we do it for the GP as well, 64 | # so in the interest of comparability, we have to do it here. 
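    # Oversampling operates on integer indices rather than on the raw
    # spectra, since the spectra are variable-length arrays that the
    # resampler cannot handle directly; `np.take` maps the resampled
    # indices back to spectra.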
65 | 66 | ros = RandomOverSampler(random_state=args.seed) 67 | 68 | X_indices = np.asarray( 69 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 70 | 71 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 72 | X_train = np.take(X_train, X_indices.ravel()) 73 | 74 | # Normalise on demand. This is an *external* flag because by 75 | # default, we should have no expectations about its efficacy 76 | # in practice. 77 | if args.normalize: 78 | tic = TotalIonCurrentNormalizer(method='sum') 79 | X_train = tic.fit_transform(X_train) 80 | X_test = tic.transform(X_test) 81 | 82 | 83 | # Static information about the data set; will be extended later on 84 | # with information about the training itself. 85 | data = { 86 | 'seed': args.seed, 87 | 'species': args.species, 88 | 'antibiotic': args.antibiotic, 89 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 90 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 91 | 'normalize': args.normalize, 92 | } 93 | 94 | param_grid = [ 95 | { 96 | 'pt__n_peaks': [50, 100, 200, 500, None], 97 | 'bv__n_bins': [75, 150, 300, 600, 1800, 3600], 98 | 'lr__penalty': ['l1', 'l2'], 99 | 'lr__C': 10. ** np.arange(-4, 5), # 10^{-4}..10^{4} 100 | }, 101 | { 102 | 'pt__n_peaks': [50, 100, 200, 500, None], 103 | 'bv__n_bins': [75, 150, 300, 600, 1800, 3600], 104 | 'lr__penalty': ['none'], 105 | } 106 | ] 107 | 108 | data['param_grid'] = param_grid 109 | 110 | # Define pipeline and cross-validation setup 111 | 112 | pipeline = Pipeline( 113 | [ 114 | ('pt', SubsetPeaksTransformer(n_peaks=0)), 115 | ('bv', BinningVectorizer( 116 | n_bins=3600, 117 | min_bin=2000, 118 | max_bin=20000)), 119 | ('std', StandardScaler()), 120 | ('lr', LogisticRegression( 121 | class_weight='balanced', 122 | solver='saga' # supports L_1 and L_2 penalties 123 | ) 124 | ) 125 | ], 126 | memory=os.getenv('TMPDIR', default=None), 127 | ) 128 | 129 | grid_search = GridSearchCV( 130 | pipeline, 131 | param_grid=param_grid, 132 | scoring='average_precision', 133 | cv=StratifiedKFold(n_splits=5, shuffle=True, 134 | random_state=42), 135 | n_jobs=-1, 136 | ) 137 | 138 | with warnings.catch_warnings(): 139 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 140 | warnings.filterwarnings('ignore', category=UserWarning) 141 | 142 | # Let's do the fitting in parallel, but the prediction can be done 143 | # without additional threading. 144 | with parallel_backend('loky', n_jobs=-1): 145 | grid_search.fit(X_train, y_train) 146 | 147 | data['best_parameters'] = grid_search.best_params_ 148 | 149 | # AUPRC 150 | 151 | y_pred = grid_search.predict_proba(X_test) 152 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 153 | 154 | data['average_precision'] = 100 * average_precision 155 | 156 | # Accuracy 157 | 158 | y_pred = grid_search.predict(X_test) 159 | accuracy = accuracy_score(y_test, y_pred) 160 | 161 | data['accuracy'] = 100 * accuracy 162 | 163 | if args.output is not None: 164 | with open(args.output, 'w') as f: 165 | jt.dump(data, f, indent=4) 166 | else: 167 | print(jt.dumps(data, indent=4)) 168 | -------------------------------------------------------------------------------- /ismb2020_maldi/diffusion_kernel_confidence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Performs a confidence estimation experiment on a subset of the data 4 | # sets in order to check whether we may reject samples from another 5 | # distribution. 
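#
# Example invocation (the sigma value is illustrative):
#
#     poetry run python diffusion_kernel_confidence.py --sigma 1.0 -S 2020 -o out.json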
6 | 
7 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
10 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
11 | 
12 | from maldi_learn.kernels import DiffusionKernel
13 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
14 | from maldi_learn.preprocessing import SubsetPeaksTransformer
15 | from maldi_learn.preprocessing import ScaleNormalizer
16 | 
17 | from sklearn.gaussian_process import GaussianProcessClassifier
18 | from sklearn.gaussian_process.kernels import RBF
19 | from sklearn.exceptions import ConvergenceWarning
20 | from sklearn.metrics import average_precision_score
21 | 
22 | from imblearn.over_sampling import RandomOverSampler
23 | 
24 | from joblib import parallel_backend
25 | 
26 | import numpy as np
27 | import json_tricks as jt
28 | 
29 | import argparse
30 | import os
31 | import warnings
32 | 
33 | 
34 | if __name__ == '__main__':
35 | 
36 |     parser = argparse.ArgumentParser()
37 |     parser.add_argument('-o', '--output', type=str)
38 |     parser.add_argument('-s', '--sigma', type=float, required=True)
39 |     parser.add_argument('-S', '--seed', type=int, default=2020)
40 |     parser.add_argument('-n', '--normalize', action='store_true')
41 | 
42 |     # By default, we assume that we want to use *all* the peaks because
43 |     # we are comparing our model to a pre-processed pipeline.
44 |     parser.add_argument('-p', '--peaks', type=int, required=False,
45 |                         default=None)
46 |     parser.add_argument('--suffix', default='')
47 | 
48 |     args = parser.parse_args()
49 | 
50 |     in_sample_species = 'saureus'
51 |     in_sample_antibiotic = 'Amoxicillin-Clavulansaeure'
52 |     n_peaks = args.peaks
53 | 
54 |     # Antibiotics will not be used, but are specified because our data
55 |     # set selection class demands it.
56 |     out_of_sample_species = ['ecoli', 'kpneu']
57 |     out_of_sample_antibiotics = ['Ciprofloxacin', 'Ciprofloxacin']
58 | 
59 | 
60 |     # Resolve species identifiers to the corresponding data set classes.
61 |     species_to_dataset = {
62 |         'ecoli': EcoliAntibioticResistanceDataset,
63 |         'kpneu': KpneuAntibioticResistanceDataset,
64 |         'saureus': SaureusAntibioticResistanceDataset
65 |     }
66 | 
67 |     dataset = species_to_dataset[in_sample_species](
68 |         test_size=0.20,
69 |         antibiotic=in_sample_antibiotic,
70 |         random_seed=args.seed,
71 |         suffix=args.suffix
72 |     )
73 | 
74 |     X_train, y_train = dataset.training_data
75 |     X_test, y_test = dataset.testing_data
76 | 
77 |     # Only perform scale normalisation if a suffix has been set; this
78 |     # should be made configurable.
79 |     if len(args.suffix) > 0:
80 |         sn = ScaleNormalizer()
81 |         X_train = sn.fit_transform(X_train)
82 |         X_test = sn.transform(X_test)
83 | 
84 |     # Perform random oversampling in order to ensure class balance. This
85 |     # is strictly speaking not required, but the baselines perform it as
86 |     # well, so in the interest of comparability, we do it here, too.
87 | 
88 |     ros = RandomOverSampler(random_state=args.seed)
89 | 
90 |     X_indices = np.asarray(
91 |         [i for i in range(0, len(X_train))]).reshape(-1, 1)
92 | 
93 |     X_indices, y_train = ros.fit_resample(X_indices, y_train)
94 |     X_train = np.take(X_train, X_indices.ravel())
95 | 
96 |     # Normalise on demand. This is an *external* flag because, by
97 |     # default, we should have no expectations about its efficacy
98 |     # in practice.
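    #
    # (Total-ion-current normalisation typically rescales each spectrum
    # by its summed intensity, so that spectra acquired at different
    # overall signal levels become comparable.)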
99 | if args.normalize: 100 | tic = TotalIonCurrentNormalizer() 101 | X_train = tic.fit_transform(X_train) 102 | X_test = tic.transform(X_test) 103 | 104 | # Sparsify the data by restricting everything to the peaks only. 105 | st = SubsetPeaksTransformer(n_peaks=n_peaks) 106 | 107 | X_train = st.fit_transform(X_train) 108 | X_test = st.transform(X_test) 109 | 110 | kernel = DiffusionKernel(sigma=args.sigma) 111 | clf = GaussianProcessClassifier(kernel=kernel, optimizer=None) 112 | 113 | # Static information about the data set; will be extended later on 114 | # with information about the training itself. 115 | data = { 116 | 'seed': args.seed, 117 | 'in_sample_antibiotic': in_sample_antibiotic, 118 | 'in_sample_species': in_sample_species, 119 | 'out_of_sample_species': out_of_sample_species, 120 | 'out_of_sample_antibiotics': out_of_sample_antibiotics, 121 | 'n_peaks': n_peaks, 122 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 123 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 124 | 'sigma': args.sigma, 125 | } 126 | 127 | with warnings.catch_warnings(): 128 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 129 | warnings.filterwarnings('ignore', category=UserWarning) 130 | 131 | # Let's do the fitting in parallel, but the prediction can be done 132 | # without additional threading. 133 | with parallel_backend(backend='loky'): 134 | clf.fit(X_train, y_train) 135 | 136 | # Get maximum probability for classifying a sample into *any* class, 137 | # based on the test data set. 138 | test_proba = clf.predict_proba(X_test) 139 | test_proba_max = np.amax(test_proba, axis=1) 140 | 141 | data['in_sample_test_proba'] = test_proba 142 | 143 | for species, antibiotic in zip(out_of_sample_species, 144 | out_of_sample_antibiotics): 145 | 146 | oos_dataset = species_to_dataset[species]( 147 | test_size=0.20, 148 | antibiotic=antibiotic, 149 | random_seed=args.seed, 150 | suffix=args.suffix 151 | ) 152 | 153 | oos_test, _ = oos_dataset.testing_data 154 | 155 | # Only perform scale normalisation if a suffix has been set; this 156 | # should be made configurable. 157 | if len(args.suffix) > 0: 158 | oos_test = sn.transform(oos_test) 159 | 160 | if args.normalize: 161 | oos_test = tic.transform(oos_test) 162 | 163 | oos_test = st.transform(oos_test) 164 | 165 | oos_proba = clf.predict_proba(oos_test) 166 | oos_proba_max = np.amax(oos_proba, axis=1) 167 | 168 | data['out_of_sample_' + species + '_proba'] = oos_proba 169 | 170 | if args.output is not None: 171 | with open(args.output, 'w') as f: 172 | jt.dump(data, f, indent=4) 173 | else: 174 | print(jt.dumps(data, indent=4)) 175 | -------------------------------------------------------------------------------- /maldi-learn/maldi_learn/kernels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Kernels for assessing the similarity between MALDI-TOF spectra. 3 | ''' 4 | 5 | from sklearn.gaussian_process.kernels import Hyperparameter 6 | from sklearn.gaussian_process.kernels import StationaryKernelMixin 7 | from sklearn.gaussian_process.kernels import Kernel 8 | 9 | from sklearn.metrics import pairwise_distances 10 | from sklearn.metrics import pairwise_kernels 11 | 12 | from scipy.spatial.distance import cdist 13 | from scipy.spatial.distance import pdist 14 | 15 | import numpy as np 16 | import sys 17 | 18 | 19 | class DiffusionKernel(StationaryKernelMixin, Kernel): 20 | ''' 21 | Implements a diffusion kernel that performs iterative smoothing of 22 | a MALDI-TOF spectrum. 
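    In closed form, the kernel value between two peak lists x and y, as
    implemented below, is

        k(x, y) = 1 / (4 * pi * sigma)
                  * sum_{i, j} x_i * y_j * exp(-(m_i - n_j)**2 / (4 * sigma)),

    where m_i and n_j denote the peak positions (masses) and x_i and y_j
    the corresponding peak intensities.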
23 |     '''
24 | 
25 |     def __init__(self, sigma=1.0, sigma_bounds=(1e-5, 1e5)):
26 |         '''
27 |         Initialises a new instance of the kernel.
28 | 
29 |         Parameters:
30 |             sigma: Smoothing parameter
31 |             sigma_bounds: Tuple specifying the minimum and maximum bound
32 |                 of the sigma scale parameter.
33 |         '''
34 | 
35 |         self.sigma = sigma
36 |         self.sigma_bounds = sigma_bounds
37 |         # Monkey-patch sklearn's pairwise input validation so that ragged arrays of spectra can be passed to pairwise_kernels.
38 |         def passthrough(*args, **kwargs):
39 |             return args
40 | 
41 |         module = sys.modules['sklearn.metrics.pairwise']
42 |         module.check_pairwise_arrays = passthrough
43 | 
44 |         sys.modules['sklearn.metrics.pairwise'] = module
45 | 
46 |     @property
47 |     def hyperparameter_sigma(self):
48 |         return Hyperparameter('sigma', 'numeric', self.sigma_bounds)
49 | 
50 |     @property
51 |     def requires_vector_input(self):
52 |         '''
53 |         Returns whether the kernel works only on fixed-length feature
54 |         vectors.
55 |         '''
56 | 
57 |         return False
58 | 
59 |     def __call__(self, X, Y=None, eval_gradient=False):
60 |         '''
61 |         Returns the kernel value k(X, Y) and, if desired, its gradient
62 |         as well.
63 | 
64 |         Parameters
65 |         ----------
66 |         X : array of spectra
67 |             Left argument of the returned kernel k(X, Y)
68 |         Y : array of spectra
69 |             Right argument of the returned kernel k(X, Y). If None, k(X, X)
70 |             is evaluated instead.
71 |         eval_gradient : bool (optional, default=False)
72 |             Determines whether the gradient with respect to the kernel
73 |             hyperparameter is evaluated as well. Only supported when Y is None.
74 | 
75 |         Returns
76 |         -------
77 |         K : array, shape (n_samples_X, n_samples_Y)
78 |             Kernel k(X, Y)
79 |         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
80 |             The gradient of the kernel k(X, X) with respect to the
81 |             hyperparameter of the kernel. Only returned when eval_gradient
82 |             is True.
83 |         '''
84 | 
85 |         def evaluate_kernel(x, y):
86 | 
87 |             # Get the positions (masses) of the two spectra. This could
88 |             # be rewritten more compactly following the new interface.
89 |             #
90 |             # TODO: simplify / refactor
91 |             x_positions = np.array(x[:, 0]).reshape(-1, 1)
92 |             y_positions = np.array(y[:, 0]).reshape(-1, 1)
93 | 
94 |             distances = pairwise_distances(
95 |                 x_positions,
96 |                 y_positions,
97 |                 metric='sqeuclidean'
98 |             )
99 | 
100 |             # Calculate scale factors as the outer product of the peak
101 |             # heights of the input data.
102 |             x_peaks = np.array(x[:, 1])
103 |             y_peaks = np.array(y[:, 1])
104 | 
105 |             P = np.outer(x_peaks, y_peaks)
106 |             K = np.multiply(P, np.exp(-distances / (4 * self.sigma)))
107 | 
108 |             return np.sum(K) / (4 * self.sigma * np.pi)
109 | 
110 |         def evaluate_gradient(x, y):
111 | 
112 |             # TODO: simplify / refactor
113 |             x_positions = np.array(x[:, 0]).reshape(-1, 1)
114 |             y_positions = np.array(y[:, 0]).reshape(-1, 1)
115 | 
116 |             distances = pairwise_distances(
117 |                 x_positions,
118 |                 y_positions,
119 |                 metric='sqeuclidean'
120 |             )
121 | 
122 |             # Calculate scale factors as the outer product of the peak
123 |             # heights of the input data.
124 |             x_peaks = np.array(x[:, 1])
125 |             y_peaks = np.array(y[:, 1])
126 | 
127 |             P = np.outer(x_peaks, y_peaks)
128 |             K = np.multiply(P, np.exp(-distances / (4 * self.sigma)))
129 | 
130 |             # Thanks to the simple form of the kernel, the gradient only
131 |             # requires an additional multiplication, followed by scaling
132 |             # it appropriately.
133 |             K_gradient = np.multiply(K, (distances - 4 * self.sigma))
134 | 
135 |             # Sum over all pairwise kernel values to get the full
136 |             # gradient between the two entries.
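            # Derivation sketch: dK_ij/dsigma = K_ij * d_ij / (4 * sigma^2),
            # while the 1 / (4 * pi * sigma) prefactor of the kernel
            # contributes an additional -K_ij / sigma term; combining the
            # two yields the common factor (d_ij - 4 * sigma) used above.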
137 | return np.sum(K_gradient) / (4 * self.sigma**2) 138 | 139 | if Y is None: 140 | if eval_gradient: 141 | K = pairwise_kernels(X, metric=evaluate_kernel) 142 | K_gradient = pairwise_kernels(X, metric=evaluate_gradient) 143 | 144 | return K, K_gradient[:, :, np.newaxis] 145 | 146 | else: 147 | return pairwise_kernels(X, metric=evaluate_kernel) 148 | else: 149 | 150 | # Following the original API here, which prohibits gradient 151 | # evaluation for this case. 152 | if eval_gradient: 153 | raise ValueError( 154 | 'Gradient can only be evaluated when Y is None.') 155 | 156 | return pairwise_kernels(X, Y, metric=evaluate_kernel) 157 | 158 | def diag(self, X): 159 | ''' 160 | Returns the diagonal of the kernel k(X, X). The result of this 161 | method is identical to np.diag(self(X)); however, it can be 162 | evaluated more efficiently since only the diagonal is evaluated. 163 | 164 | Parameters 165 | ---------- 166 | X : array, shape (n_samples_X, n_features) 167 | Left argument of the returned kernel k(X, Y) 168 | Returns 169 | ------- 170 | K_diag : array, shape (n_samples_X,) 171 | Diagonal of kernel k(X, X) 172 | ''' 173 | 174 | diag_values = np.zeros(len(X)) 175 | 176 | for i, x in enumerate(X): 177 | x_positions = np.array(x[:, 0]).reshape(-1, 1) 178 | 179 | distances = pairwise_distances( 180 | x_positions, 181 | x_positions, 182 | metric='sqeuclidean' 183 | ) 184 | 185 | x_peaks = np.array(x[:, 1]) 186 | 187 | P = np.outer(x_peaks, x_peaks) 188 | K = np.multiply(P, np.exp(-distances / (4 * self.sigma))) 189 | 190 | # Diagonal value for $x_i$ 191 | diag_values[i] = np.sum(K) 192 | 193 | return diag_values / (4 * self.sigma * np.pi) 194 | 195 | def __repr__(self): 196 | return f'{self.__class__.__name__}({self.sigma:.8f})' 197 | -------------------------------------------------------------------------------- /maldi-learn/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------
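For reference, a minimal usage sketch of the DiffusionKernel defined above (not part of the repository; it assumes maldi-learn is installed, a scikit-learn version compatible with the monkey-patched validation, and made-up peak values). Each spectrum is an array of shape (n_peaks, 2), with m/z positions in column 0 and intensities in column 1; spectra may have different numbers of peaks:

import numpy as np

from maldi_learn.kernels import DiffusionKernel

# Two toy spectra with different numbers of peaks; rows are (mass, intensity).
x = np.array([[2105.0, 0.8], [5032.0, 1.5], [9870.0, 0.3]])
y = np.array([[2110.0, 0.7], [9865.0, 0.4]])

# Collect the spectra in a one-dimensional object array, since their
# shapes differ.
X = np.empty(2, dtype=object)
X[0], X[1] = x, y

kernel = DiffusionKernel(sigma=10.0)

K = kernel(X)  # (2, 2) Gram matrix
assert np.allclose(np.diag(K), kernel.diag(X))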