├── tests
│   ├── __init__.py
│   └── test_ismb2020_maldi.py
├── maldi-learn
│   ├── tests
│   │   ├── __init__.py
│   │   ├── preprocessing
│   │   │   ├── __init__.py
│   │   │   ├── test_topological_preprocessing.py
│   │   │   ├── test_subset_peaks.py
│   │   │   └── test_normalization.py
│   │   ├── test_maldi_learn.py
│   │   ├── mock.py
│   │   └── vectorization
│   │       └── test_binning_vectorizer.py
│   ├── maldi_learn
│   │   ├── __init__.py
│   │   ├── vectorization
│   │   │   ├── __init__.py
│   │   │   └── binning.py
│   │   ├── preprocessing
│   │   │   ├── __init__.py
│   │   │   ├── generic.py
│   │   │   ├── topological.py
│   │   │   └── normalization.py
│   │   ├── data.py
│   │   └── kernels.py
│   ├── .gitignore
│   ├── pyproject.toml
│   ├── README.md
│   └── LICENSE
├── PIKE_behaviour.png
├── ismb2020_maldi
│   ├── __init__.py
│   ├── example_usage.py
│   ├── submit_diffusion_kernel_confidence_jobs.sh
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── dataset.py
│   │   └── antibiotics.py
│   ├── util.py
│   ├── submit_maldiquant_baseline_confidence_jobs.sh
│   ├── submit_maldiquant_diffusion_kernel_confidence_jobs.sh
│   ├── save_preprocessed.py
│   ├── results_overview.sh
│   ├── submit_baseline_jobs_bw.sh
│   ├── demo_kernel.py
│   ├── submit_maldiquant_baseline_jobs.sh
│   ├── submit_maldiquant_diffusion_kernel_jobs.sh
│   ├── submit_baseline_gp_rbf_jobs.sh
│   ├── summarize_dataset.py
│   ├── submit_diffusion_kernel_reduced_jobs.sh
│   ├── submit_baseline_jobs.sh
│   ├── extract_baseline_parameters.py
│   ├── mean.py
│   ├── visualise_feature_map.py
│   ├── submit_diffusion_kernel_jobs.sh
│   ├── mean_rejection.py
│   ├── extract_kernel_parameters.py
│   ├── calibrate_histograms.py
│   ├── demo_kernel_confidence.py
│   ├── visualise_baseline.py
│   ├── analyse_split.py
│   ├── collect_results.py
│   ├── visualise_kernel.py
│   ├── calibrate.py
│   ├── baseline_gp_rbf.py
│   ├── diffusion_kernel.py
│   ├── baseline_maldiquant.py
│   ├── baseline_maldiquant_confidence.py
│   ├── baseline.py
│   └── diffusion_kernel_confidence.py
├── PIKE_behaviour_matplotlib.png
├── .gitignore
├── data
│   └── Example_peaks.txt
├── pyproject.toml
├── README.md
└── LICENSE

/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maldi-learn/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/__init__.py:
--------------------------------------------------------------------------------
__version__ = '0.1.0'
--------------------------------------------------------------------------------
/PIKE_behaviour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BorgwardtLab/maldi_PIKE/HEAD/PIKE_behaviour.png
--------------------------------------------------------------------------------
/ismb2020_maldi/__init__.py:
--------------------------------------------------------------------------------
"""Package with functionality used to analyze MALDI-TOF data."""
--------------------------------------------------------------------------------
/PIKE_behaviour_matplotlib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BorgwardtLab/maldi_PIKE/HEAD/PIKE_behaviour_matplotlib.png
--------------------------------------------------------------------------------
/tests/test_ismb2020_maldi.py:
--------------------------------------------------------------------------------
from ismb2020_maldi import __version__


def test_version():
    assert __version__ == '0.1.0'
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/vectorization/__init__.py:
--------------------------------------------------------------------------------
"""Transformers for vectorizing MALDI-TOF spectra."""
from .binning import BinningVectorizer
--------------------------------------------------------------------------------
/maldi-learn/tests/test_maldi_learn.py:
--------------------------------------------------------------------------------
from maldi_learn import __version__


def test_version():
    assert __version__ == '0.1.0'
--------------------------------------------------------------------------------
/maldi-learn/.gitignore:
--------------------------------------------------------------------------------
# Python cache
__pycache__/

# Pyenv files
.python-version

# Poetry
# Poetry lock
poetry.lock
# Egg generated when running poetry install
maldi_learn.egg-info/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Python cache
__pycache__/

# Pyenv files
.python-version

# Poetry
# Poetry lock
poetry.lock
# Egg generated when running poetry install
ismb2020_maldi.egg-info/

# Environment file defining path to data
.env

# Ignore any `pip` installation files
pip-wheel-metadata/
--------------------------------------------------------------------------------
/ismb2020_maldi/example_usage.py:
--------------------------------------------------------------------------------
"""Example usage file."""

from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
from maldi_learn.preprocessing import TopologicalPeakFiltering

dataset = EcoliAntibioticResistanceDataset('Ciprofloxacin')
X, y = dataset.complete_data

topf = TopologicalPeakFiltering(n_peaks=100)
X_sparse = topf.transform(X)
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_diffusion_kernel_confidence_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

for S in "${SEED[@]}"; do
    OUTPUT="Calibration_saureus_seed${S}_GP_diffusion"
    bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel_confidence.py --sigma 16.22 --seed $S"
done
--------------------------------------------------------------------------------
/ismb2020_maldi/datasets/__init__.py:
--------------------------------------------------------------------------------
"""Datasets."""
from .dataset import Dataset
from .antibiotics import AntibioticResistanceDataset, EcoliAntibioticResistanceDataset, \
    SaureusAntibioticResistanceDataset, KpneuAntibioticResistanceDataset

__all__ = ['Dataset', 'AntibioticResistanceDataset', 'EcoliAntibioticResistanceDataset', 'SaureusAntibioticResistanceDataset', 'KpneuAntibioticResistanceDataset']
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/__init__.py:
--------------------------------------------------------------------------------
"""Preprocessing of MALDI-TOF spectra."""

from .generic import SubsetPeaksTransformer
from .normalization import TotalIonCurrentNormalizer
from .normalization import ScaleNormalizer
from .topological import TopologicalPeakFiltering


__all__ = [
    'ScaleNormalizer',
    'SubsetPeaksTransformer',
    'TopologicalPeakFiltering',
    'TotalIonCurrentNormalizer'
]
--------------------------------------------------------------------------------
/data/Example_peaks.txt:
--------------------------------------------------------------------------------
1970.42 2.93
1999.58 1.64
2009.62 1.59
2024.74 1.95
2041.18 3.47
2058.96 4.46
2087.05 4.62
2105.03 9.21
2120.07 6.48
2136.46 5.02
2187.76 16.89
2208.36 1.26
2225.97 1.37
2248.97 5.04
2274.77 3.76
2290.40 7.27
2306.99 7.69
2322.29 8.49
2338.54 8.76
2355.30 5.44
2373.03 2.49
2384.43 2.59
2398.61 3.65
2415.13 11.14
2431.70 17.61
2455.74 13.73
2494.82 8.18
--------------------------------------------------------------------------------
/maldi-learn/tests/mock.py:
--------------------------------------------------------------------------------
"""Module for generating mock data of MALDI-TOF spectra."""
import numpy as np

from maldi_learn.data import MaldiTofSpectrum


def generate_mock_data(n_examples):
    """Generate random data with correct shape."""
    n_peaks = np.random.normal(1000, 100, size=n_examples).astype(int)
    print(n_peaks)
    return [
        MaldiTofSpectrum(
            np.random.uniform(0, 10000, size=(peaks, 2)))
        for peaks in n_peaks
    ]
--------------------------------------------------------------------------------
/ismb2020_maldi/util.py:
--------------------------------------------------------------------------------
'''
Utility functions
'''


def create_binary_label(df_resistances, antibiotic):
    '''
    Given a data frame of resistance information and the name of an
    antibiotic, creates a binary label vector. The antibiotic needs
    to be present in the data frame; otherwise, an error is raised.
    '''

    # TODO: check whether this conversion makes sense
    y = df_resistances[antibiotic].values
    y[y != 'R'] = 0
    y[y == 'R'] = 1

    return y.astype('int')
--------------------------------------------------------------------------------
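A minimal usage sketch for `create_binary_label`; the resistance table below is hypothetical, real data frames come from the antibiotics datasets:

```python
import pandas as pd

from ismb2020_maldi.util import create_binary_label

# Hypothetical resistance table: 'R' marks a resistant sample, any
# other entry (e.g. 'S' or 'I') is treated as non-resistant.
df_resistances = pd.DataFrame({'Ciprofloxacin': ['R', 'S', 'I', 'R']})

y = create_binary_label(df_resistances, 'Ciprofloxacin')
print(y)  # [1 0 0 1]
```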
/ismb2020_maldi/submit_maldiquant_baseline_confidence_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

for S in "${SEED[@]}"; do
    OUTPUT="Calibration_saureus_seed${S}_MQ"
    bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant_confidence.py --seed $S --suffix _peaks_warped"
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_maldiquant_diffusion_kernel_confidence_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

for S in "${SEED[@]}"; do
    OUTPUT="Calibration_saureus_seed${S}_MQ_GP_diffusion"
    bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel_confidence.py --sigma 4.18 --seed $S --suffix _peaks_warped"
done
--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/test_topological_preprocessing.py:
--------------------------------------------------------------------------------
"""Tests for topological preprocessing routines."""
import unittest

from maldi_learn.preprocessing import TopologicalPeakFiltering

from tests.mock import generate_mock_data


class TestTopologicalPreprocessing(unittest.TestCase):
    def test_correct_n_peaks(self, n_examples=10, n_peaks=100):
        mock_data = generate_mock_data(n_examples)
        transformer = TopologicalPeakFiltering(n_peaks=n_peaks)
        transformed_data = transformer.fit_transform(mock_data)
        print(transformed_data[0].shape)
        self.assertTrue(
            all([spectrum.n_peaks == n_peaks for spectrum in transformed_data])
        )
--------------------------------------------------------------------------------
/maldi-learn/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "maldi-learn"
version = "0.1.0"
description = "Software library for MALDI-TOF preprocessing and machine learning analysis."
authors = ["Caroline Weis ", "Max Horn ", "Bastian Rieck "]
readme = "README.md"
repository = "https://github.com/BorgwardtLab/maldi-learn"
homepage = "https://github.com/BorgwardtLab/maldi-learn"

[tool.poetry.dependencies]
python = ">=3.6"
scikit-learn = "^0.22.1"
topf = {git = "ssh://git@github.com/BorgwardtLab/Topf.git"}
pandas = "^0.25.3"

[tool.poetry.dev-dependencies]
pytest = "^3.0"

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
--------------------------------------------------------------------------------
/ismb2020_maldi/save_preprocessed.py:
--------------------------------------------------------------------------------
'''
Read all spectra, preprocess them with TopologicalPeakFiltering and then
save them to a new folder.
'''

from ismb2020_maldi.datasets import AntibioticResistanceDataset

from maldi_learn.data import write_spectra
from maldi_learn.preprocessing import TopologicalPeakFiltering


dataset = AntibioticResistanceDataset(test_size=0.5)

# write testing_data
X, y = dataset.testing_data

topf = TopologicalPeakFiltering(n_peaks=False)
X_sparse = topf.transform(X)

write_spectra(X_sparse, y, '/links/groups/borgwardt/Data/ismb2020_maldi/spectra_preprocessed')


# write training_data
X, y = dataset.training_data

topf = TopologicalPeakFiltering(n_peaks=False)
X_sparse = topf.transform(X)

write_spectra(X_sparse, y, '/links/groups/borgwardt/Data/ismb2020_maldi/spectra_preprocessed')
--------------------------------------------------------------------------------
/ismb2020_maldi/results_overview.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# example usage:
# ANTIBIOTICS=Ceftriaxon SPECIES=ecoli ./results_overview.sh

if [ -z ${ANTIBIOTICS+x} ]; then
    ANTIBIOTICS=(Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon)
fi

if [ -z ${SPECIES+x} ]; then
    SPECIES=(ecoli saureus kpneu)
fi

for A in "${ANTIBIOTICS[@]}"; do
    for S in "${SPECIES[@]}"; do
        echo
        echo ---- ${A} -- ${S} ----
        echo - baseline on raw -
        cat /cluster/work/borgw/ismb2020_maldi/results/raw/${A}_${S}_*.out | grep Average
        echo - baseline on preprocessed -
        cat /cluster/work/borgw/ismb2020_maldi/results/preprocessed/${A}_${S}_*.out | grep Average
        echo - GP random oversampling 200 peaks -
        cat /cluster/work/borgw/ismb2020_maldi/results/diffusion_ros/GP_diffusion_${A}_${S}_*_200_*.out | grep Average

    done
done
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "ismb2020_maldi"
version = "0.1.0"
description = "Maldi-Tof kernel and GP classifier"
authors = ["Caroline Weis ", "Max Horn ", "Dr. Bastian Alexander Rieck "]
readme = "README.md"
repository = "https://github.com/BorgwardtLab/maldi_PIKE"
homepage = "https://github.com/BorgwardtLab/maldi_PIKE"

[tool.poetry.dependencies]
python = "^3.7"
pandas = "^0.25.3"
python-dotenv = "^0.10.3"
maldi-learn = {path = "maldi-learn/"}
scikit-learn = {git = "https://github.com/BorgwardtLab/scikit-learn.git", rev = "maldi-learn"}
imbalanced-learn = "^0.6.1"
json-tricks = "^3.13.5"
tqdm = "^4.41.1"
matplotlib = "^3.1.2"
seaborn = "^0.10.0"

[tool.poetry.dev-dependencies]
pytest = "^3.0"
ipython = "^7.11.1"

[build-system]
requires = ["poetry>=0.12"]
build-backend = "poetry.masonry.api"
--------------------------------------------------------------------------------
/maldi-learn/README.md:
--------------------------------------------------------------------------------
# maldi-learn
Software library for MALDI-TOF preprocessing and machine learning analysis.

## Installation - development

The installation of this package requires
[poetry](https://python-poetry.org/docs/).

In order to set up a development environment, run `poetry install` in the
project root. To run commands in the associated virtual environment of this
package, run `poetry shell` to spawn a shell.


### Python version

This project requires at least Python version `3.7`. In a development setup, it
is recommended to install an appropriate Python version using
[pyenv](https://github.com/pyenv/pyenv), and then mark this folder for usage
with this version:

```bash
$ pyenv install 3.7.4   # Install python 3.7.4 using pyenv
$ pyenv local 3.7.4     # Mark python version 3.7.4 for usage in this folder
$ poetry install        # Setup the virtual environment for development
```
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_baseline_jobs_bw.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${s}_normalized.json"
        nice poetry run python baseline.py --species saureus --antibiotic $A --seed $s --normalize --output ${OUTPUT} &
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${s}_normalized.json"
        nice poetry run python baseline.py --species ecoli --antibiotic $A --seed $s --normalize --output ${OUTPUT} &
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${s}_normalized.json"
        nice poetry run python baseline.py --species kpneu --antibiotic $A --seed $s --normalize --output ${OUTPUT} &
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/demo_kernel.py:
--------------------------------------------------------------------------------
'''
Demo file for kernel calculation.
'''

from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset

from maldi_learn.kernels import DiffusionKernel

from maldi_learn.preprocessing import SubsetPeaksTransformer

from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import average_precision_score
from sklearn.svm import SVC


import numpy as np
import sys

dataset = EcoliAntibioticResistanceDataset(antibiotic='Ceftriaxon',
                                           test_size=0.20)
X_train, y_train = dataset.training_data
X_test, y_test = dataset.testing_data

st = SubsetPeaksTransformer(n_peaks=100)

X_train = st.fit_transform(X_train)
X_test = st.transform(X_test)

kernel = DiffusionKernel(sigma=400)

clf = GaussianProcessClassifier(kernel=kernel, n_jobs=-1)
clf.fit(X_train, y_train)

y_pred = clf.predict_proba(X_test)
average_precision = average_precision_score(y_test, y_pred[:, 1])

print(f'Average precision: {100 * average_precision:2.2f}')
--------------------------------------------------------------------------------
/maldi-learn/tests/vectorization/test_binning_vectorizer.py:
--------------------------------------------------------------------------------
"""Test BinningVectorizer."""
import unittest

import numpy as np

from maldi_learn.data import MaldiTofSpectrum
from maldi_learn.vectorization import BinningVectorizer


MOCK_DATA = [
    MaldiTofSpectrum(
        [[0.0, 5.0],
         [10.7, 8.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    ),
    MaldiTofSpectrum(
        [[0.0, 15.0],
         [10.7, 5.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    )
]


class TestBinningVectorizer(unittest.TestCase):
    def test_simple_binning(self):
        vectorizer = BinningVectorizer(2, min_bin=-0.1, max_bin=999)
        vectorized = vectorizer.fit_transform(MOCK_DATA)
        self.assertEqual(vectorized.ndim, 2)
        self.assertEqual(vectorized.shape[0], len(MOCK_DATA))
        self.assertEqual(vectorized.shape[1], 2)

        self.assertTrue(np.all(vectorized[0] == np.array([23., 3.])))
        self.assertTrue(np.all(vectorized[1] == np.array([30., 3.])))
--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/test_subset_peaks.py:
--------------------------------------------------------------------------------
"""Test SubsetPeaksTransformer."""
import unittest

import numpy as np

from maldi_learn.preprocessing import SubsetPeaksTransformer
from maldi_learn.data import MaldiTofSpectrum


MOCK_DATA = [
    MaldiTofSpectrum(
        [[0.0, 5.0],
         [10.7, 8.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    ),
    MaldiTofSpectrum(
        [[0.0, 15.0],
         [10.7, 5.0],
         [150.4, 10.],
         [1000, 3.0]
         ]
    )
]


class TestSubsetPeakTransformer(unittest.TestCase):
    def test_transformer(self, n_peaks=2):
        transf = SubsetPeaksTransformer(n_peaks)
        transformed = transf.fit_transform(MOCK_DATA)
        print(transformed)
        # First example
        self.assertTrue(np.all(transformed[0][0] == np.array([10.7, 8.0])))
        self.assertTrue(np.all(transformed[0][1] == np.array([150.4, 10.0])))

        # Second example
        self.assertTrue(np.all(transformed[1][0] == np.array([0.0, 15.0])))
        self.assertTrue(np.all(transformed[1][1] == np.array([150.4, 10.0])))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# maldi_PIKE — Peak Information KErnel for MALDI-TOF MS spectra

`maldi_PIKE` is a small library for Python 3 that includes the code used for
'Topological and kernel-based microbial phenotype prediction from MALDI-TOF mass
spectra'. The main method is PIKE, the Peak Information KErnel for MALDI-TOF MS spectra,
embedded in a Gaussian Process. We developed PIKE based on heat diffusion on structured
objects. It is well suited for MALDI-TOF mass spectra and able to capture interactions between
mass peaks.

# Dependencies

- Python 3.7
- packages listed in `pyproject.toml`

# Installation

- Clone the repository
- `poetry install`

Follow the instructions given by `poetry`.

# Example behaviour of PIKE

Figure 2:

![PIKE_behaviour](PIKE_behaviour.png)

Code to recreate this graphic in matplotlib can be found in
`ismb2020_maldi/visualise_feature_map.py`. Use the script as
follows:

```
poetry run python ismb2020_maldi/visualise_feature_map.py data/Example_peaks.txt
```

This should result in the following plot:

![PIKE_behaviour_matplotlib](PIKE_behaviour_matplotlib.png)

This repository is a work in progress.
--------------------------------------------------------------------------------
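For reference, the feature map plotted in the figures above is, as implemented in `ismb2020_maldi/visualise_feature_map.py` further below, the following function of the peak positions m_i and intensities p_i of a spectrum:

```latex
\Phi_\sigma(x) = \frac{1}{2\sqrt{\pi\sigma}} \sum_i p_i \exp\left(-\frac{(x - m_i)^2}{4\sigma}\right)
```

Increasing the smoothing parameter sigma spreads each peak's contribution further along the m/z axis, which is what the rows of the figure illustrate for sigma values of 1, 10 and 100.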
/ismb2020_maldi/datasets/dataset.py:
--------------------------------------------------------------------------------
"""Base class for a dataset."""
import abc
from collections.abc import Sequence
from typing import List, Tuple

from maldi_learn.data import MaldiTofSpectrum


class Dataset(metaclass=abc.ABCMeta):
    """Abstract base class for a dataset."""

    @property
    @abc.abstractmethod
    def training_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get training data of dataset.

        Returns:
            Tuple (X, y)

        """

    @property
    @abc.abstractmethod
    def validation_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get validation data of dataset.

        Returns:
            Tuple (X, y)

        """

    @property
    @abc.abstractmethod
    def testing_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get testing data of dataset.

        Returns:
            Tuple (X, y)

        """

    @property
    @abc.abstractmethod
    def complete_data(self) -> Tuple[List[MaldiTofSpectrum], Sequence]:
        """Get complete dataset.

        Returns:
            Tuple (X, y)

        """
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/data.py:
--------------------------------------------------------------------------------
"""Classes to standardize handling of Spectra."""

import numpy as np


class MaldiTofSpectrum(np.ndarray):
    """Numpy NDArray subclass representing a MALDI-TOF Spectrum."""

    def __new__(cls, peaks):
        """Create a MaldiTofSpectrum.

        Args:
            peaks: 2d array, list of tuples, or list of lists containing
                pairs of mass/charge to intensity.

        Raises:
            ValueError: If the input data is not in the correct format.

        """
        peaks = np.asarray(peaks).view(cls)
        if peaks.ndim != 2 or peaks.shape[1] != 2:
            raise ValueError(
                f'Input shape of {peaks.shape} does not match expected shape '
                'for spectrum [n_peaks, 2].'
            )
        return peaks

    @property
    def n_peaks(self):
        """Get number of peaks of the spectrum."""
        return self.shape[0]

    @property
    def intensities(self):
        """Get the intensities of the spectrum."""
        return self[:, 1]

    @property
    def mass_to_charge_ratios(self):
        """Get mass-to-charge ratios of spectrum."""
        return self[:, 0]
--------------------------------------------------------------------------------
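A small sketch of how `MaldiTofSpectrum` wraps raw peak lists; the values below are made up:

```python
from maldi_learn.data import MaldiTofSpectrum

# Two peaks, given as [mass-to-charge, intensity] rows.
spectrum = MaldiTofSpectrum([[2000.0, 3.5], [2100.0, 1.2]])

print(spectrum.n_peaks)                # 2
print(spectrum.mass_to_charge_ratios)  # [2000. 2100.]
print(spectrum.intensities)            # [3.5 1.2]

# Anything that is not of shape [n_peaks, 2] is rejected:
try:
    MaldiTofSpectrum([2000.0, 3.5])
except ValueError as err:
    print(err)
```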
/ismb2020_maldi/submit_maldiquant_baseline_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${s}_MQ"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant.py --suffix _peaks_warped --species saureus --antibiotic $A --seed $s"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${s}_MQ"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant.py --suffix _peaks_warped --species ecoli --antibiotic $A --seed $s"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${s}_MQ"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_maldiquant.py --suffix _peaks_warped --species kpneu --antibiotic $A --seed $s"
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_maldiquant_diffusion_kernel_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${S}_MQ_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --suffix _peaks_warped"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${S}_MQ_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --suffix _peaks_warped"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${S}_MQ_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --suffix _peaks_warped"
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_baseline_gp_rbf_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Submission script for the baseline GP-RBF classifier. This uses MQ
# features because we want to ensure that the improvements are given
# by our new kernel.

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

export ANTIBIOTICS_SPECTRA_PATH=/cluster/work/borgw/ismb2020_maldi/spectra_MaldiQuant/
export ANTIBIOTICS_ENDPOINT_PATH=/cluster/work/borgw/ismb2020_maldi/

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${S}_GP_RBF"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_gp_rbf.py --species saureus --antibiotic $A --seed $S"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${S}_GP_RBF"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_gp_rbf.py --species ecoli --antibiotic $A --seed $S"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${S}_GP_RBF"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline_gp_rbf.py --species kpneu --antibiotic $A --seed $S"
    done
done
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/generic.py:
--------------------------------------------------------------------------------
"""Generic preprocessing transformers for spectra."""
import numpy as np

from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator


class SubsetPeaksTransformer(BaseEstimator, TransformerMixin):
    """Transform to extract subset of peaks from spectrum."""

    def __init__(self, n_peaks=None):
        """Initialize transformer for subsetting peaks.

        Args:
            n_peaks: Number of peaks to extract from spectrum. If set to
                `None`, will just pass through input data without changing
                anything.

        """
        self.n_peaks = n_peaks

    def fit(self, X, y=None):
        """Fit transformer, does nothing."""
        return self

    def transform(self, X):
        """Get the n_peaks peaks with the highest intensity."""
        # Bail out early because there is nothing to do
        if self.n_peaks is None:
            return X

        output = []
        for spectrum in X:
            intensity = spectrum[:, 1]
            peak_indices = np.argsort(intensity, kind='stable')[::-1]
            # We sort the selected indices back to preserve the original order
            output.append(spectrum[sorted(peak_indices[:self.n_peaks])])
        return output
--------------------------------------------------------------------------------
/ismb2020_maldi/summarize_dataset.py:
--------------------------------------------------------------------------------
'''
Provide a summary of the dataset, including class balance for each
species and antibiotic.
'''

from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset, SaureusAntibioticResistanceDataset, KpneuAntibioticResistanceDataset

import numpy as np
import sys


datasets_map = {
    'ecoli': EcoliAntibioticResistanceDataset,
    'saureus': SaureusAntibioticResistanceDataset,
    'kpneu': KpneuAntibioticResistanceDataset
}

antibiotic_map = {
    'ecoli': ['Ciprofloxacin', 'Ceftriaxon', 'Amoxicillin-Clavulansaeure'],
    'saureus': ['Ciprofloxacin', 'Penicillin', 'Amoxicillin-Clavulansaeure'],
    'kpneu': ['Ciprofloxacin', 'Ceftriaxon', 'Piperacillin-Tazobactam']
}

for species in datasets_map.keys():

    print(f'\n{species}')
    Dataset = datasets_map[species]
    for antibiotic in antibiotic_map[species]:

        print(f'{antibiotic}')
        dataset = Dataset(antibiotic, test_size=0.2)
        _, y_complete = dataset.complete_data
        _, y_train = dataset.training_data
        _, y_test = dataset.testing_data

        #for y in [y_complete, y_train, y_test]:
        for y in [y_complete]:
            counts = y.value_counts()
            print(counts)
            print(y.shape[0])
            assert counts.loc[0] + counts.loc[1] == y.shape[0]
            print(round(counts.loc[1] / float(y.shape[0]), 3))
            print()
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_diffusion_kernel_reduced_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Submission script for the _reduced_ set of diffusion kernel jobs, i.e.
# we do _not_ change the number of peaks and always use normalisation. A
# scenario like this closely matches that of MQ data.

SEED=(58925 15250 97412 17965 44873)
PEAKS=200
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${S}_peaks${PEAKS}_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --peaks ${PEAKS} --normalize"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${S}_peaks${PEAKS}_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --peaks ${PEAKS} --normalize"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${S}_peaks${PEAKS}_GP_diffusion"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --peaks ${PEAKS} --normalize"
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_baseline_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_saureus_seed${s}"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species saureus --antibiotic $A --seed $s"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species saureus --antibiotic $A --seed $s --normalize"
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_ecoli_seed${s}"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species ecoli --antibiotic $A --seed $s"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species ecoli --antibiotic $A --seed $s --normalize"
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for s in "${SEED[@]}"; do
        OUTPUT="${A}_kpneu_seed${s}"
        bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species kpneu --antibiotic $A --seed $s"
        bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python baseline.py --species kpneu --antibiotic $A --seed $s --normalize"
    done
done
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2020, Machine Learning and Computational Biology Lab
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/ismb2020_maldi/extract_baseline_parameters.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Auxiliary script for extracting baseline parameters from a set of runs
# and reporting them per seed. This is useful to run a calibration with
# a pre-selected model.

import argparse
import re

import json_tricks as jt
import numpy as np

from tqdm import tqdm


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('INPUT', nargs='+', type=str)
    args = parser.parse_args()

    parameters = []

    for filename in tqdm(args.INPUT, desc='Loading'):
        with open(filename) as f:
            # Ensures that we can parse normal JSON files
            pos = 0

            for line in f:

                # We found *probably* the beginning of the JSON file, so
                # we can start parsing from here after resetting the
                # file pointer.
                if line.startswith('{'):
                    f.seek(pos)
                    break
                else:
                    pos += len(line)

            # Check whether file is empty for some reason. If so, we
            # skip it.
            line = f.readline()
            if line == '':
                continue

            # Not empty, so we need to reset the file pointer
            else:
                f.seek(pos)

            data_raw = jt.load(f)

            seed = data_raw['seed']
            best_parameters = data_raw['best_parameters']
            print(f'Seed {seed}: {best_parameters}')
--------------------------------------------------------------------------------
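A quick sketch of the order-preserving behaviour described in `transform`, using toy values:

```python
from maldi_learn.data import MaldiTofSpectrum
from maldi_learn.preprocessing import SubsetPeaksTransformer

spectrum = MaldiTofSpectrum([[100.0, 1.0], [200.0, 9.0], [300.0, 5.0]])

st = SubsetPeaksTransformer(n_peaks=2)
# The two most intense peaks are kept, but in their original m/z order.
print(st.fit_transform([spectrum])[0])  # [[200. 9.], [300. 5.]]
```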
/ismb2020_maldi/mean.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Calculates the mean of a set of CSVs. The CSV files are assumed to
# contain the same ranges.

import argparse
import sys

import pandas as pd


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('FILES', nargs='+', type=str)

    args = parser.parse_args()

    data = []

    for filename in args.FILES:
        df = pd.read_csv(filename, header=0, index_col=0)
        data.append(df)

    df = data[0]
    columns = df.columns

    for index, right in enumerate(data[1:]):
        df = pd.merge(df, right,
                      suffixes=('', '_' + str(index + 1)),
                      how='outer', on=['threshold']
                      )

    df = df.fillna(1.0)

    mean_auprc = df[['auprc', 'auprc_1', 'auprc_2', 'auprc_3',
                     'auprc_4']].mean(axis=1)

    std_auprc = df[['auprc', 'auprc_1', 'auprc_2', 'auprc_3',
                    'auprc_4']].std(axis=1)

    mean_accuracy = df[['accuracy', 'accuracy_1', 'accuracy_2', 'accuracy_3',
                        'accuracy_4']].mean(axis=1)

    std_accuracy = df[['accuracy', 'accuracy_1', 'accuracy_2', 'accuracy_3',
                       'accuracy_4']].std(axis=1)

    mean_n_pos_samples = df[['n_pos_samples',
                             'n_pos_samples_1',
                             'n_pos_samples_2',
                             'n_pos_samples_3',
                             'n_pos_samples_4']].mean(axis=1)

    df = pd.DataFrame({'mean_auprc': mean_auprc, 'std_auprc': std_auprc,
                       'mean_n_pos_samples': mean_n_pos_samples,
                       'mean_accuracy': mean_accuracy,
                       'std_accuracy': std_accuracy})

    df.to_csv(sys.stdout)
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/topological.py:
--------------------------------------------------------------------------------
"""Preprocessing using TOpological Peak Filtering (TOPF)."""
from sklearn.base import TransformerMixin
from topf import PersistenceTransformer
from typing import List

from ..data import MaldiTofSpectrum


class TopologicalPeakFiltering(TransformerMixin):
    """Topological peak filtering using TOPF."""

    _required_arguments = ['n_peaks']

    def __init__(self, n_peaks):
        """Topological peak filtering (TOPF) for MALDI-TOF spectra.

        Args:
            n_peaks: Number of peaks to retain. Peaks will be eliminated
                in order of increasing persistence. Thus, if `n_peaks` is
                1, only the highest peak will be kept.

        """
        self.n_peaks = n_peaks

    def fit(self, X, y=None):
        """Do nothing."""
        return self

    @staticmethod
    def _remove_non_peaks(spectrum):
        return spectrum[spectrum[:, 1] != 0.]

    def transform(self, X: List[MaldiTofSpectrum]) -> List[MaldiTofSpectrum]:
        """Apply topological peak filtering to the data array X.

        Args:
            X: List of MALDI-TOF spectra.

        Returns:
            Sparse spectra containing only n_peaks peaks.

        """
        pers_transformer = PersistenceTransformer(
            calculate_persistence_diagram=False, n_peaks=self.n_peaks)

        return [
            MaldiTofSpectrum(
                self._remove_non_peaks(
                    pers_transformer.fit_transform(spectrum)
                )
            )
            for spectrum in X
        ]
--------------------------------------------------------------------------------
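In the spirit of `ismb2020_maldi/example_usage.py`, a minimal sketch of the filter on a single toy spectrum, assuming the `topf` dependency declared in `maldi-learn/pyproject.toml` is installed:

```python
from maldi_learn.data import MaldiTofSpectrum
from maldi_learn.preprocessing import TopologicalPeakFiltering

spectrum = MaldiTofSpectrum([[100.0, 1.0], [200.0, 9.0], [300.0, 5.0]])

# With n_peaks=1 only the most persistent peak survives, which per the
# docstring above is the highest one.
topf = TopologicalPeakFiltering(n_peaks=1)
print(topf.transform([spectrum])[0])  # expected: [[200. 9.]]
```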
/ismb2020_maldi/visualise_feature_map.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
#
# Basic visualisation script for the feature map of our kernel, subject
# to a certain smoothing parameter.

from maldi_learn.data import MaldiTofSpectrum

from maldi_learn.preprocessing import ScaleNormalizer

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

import argparse


def feature_map(spectrum, x, sigma=1.0):
    positions = spectrum.mass_to_charge_ratios
    peaks = spectrum.intensities

    f = np.multiply(peaks, np.exp(-(x - positions)**2 / (4 * sigma)))
    f = 1 / (2 * np.sqrt(np.pi * sigma)) * np.sum(f)

    return f


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('FILE', type=str)

    args = parser.parse_args()

    spectrum = MaldiTofSpectrum(
        pd.read_csv(args.FILE, sep=' ', comment='#').values
    )

    sn = ScaleNormalizer()
    spectrum = sn.fit_transform([spectrum])[0]

    spectrum = spectrum[spectrum.mass_to_charge_ratios < 2500]
    x_min = np.min(spectrum.mass_to_charge_ratios)
    x_max = np.max(spectrum.mass_to_charge_ratios)

    fig, ax = plt.subplots(4, 1, sharex=True)

    ax[0].stem(spectrum.mass_to_charge_ratios, spectrum.intensities,
               linefmt='k-', basefmt='black', markerfmt='None',
               use_line_collection=True)

    for axis in ax:
        axis.set_ylim(0, 6)

        axis.spines['top'].set_visible(False)
        axis.spines['right'].set_visible(False)

        axis.set_yticks([0, 2, 4, 6])

    for axis, sigma in zip(ax[1:], [1, 10, 100]):

        X = np.linspace(x_min, x_max, 300)
        Y = [feature_map(spectrum, x, sigma) for x in X]

        axis.plot(X, Y)

    plt.show()
--------------------------------------------------------------------------------
/ismb2020_maldi/submit_diffusion_kernel_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

SEED=(58925 15250 97412 17965 44873)
PEAKS=(100 200 500 700)
MEMORY=8192
TIME=23:59

# s. aureus

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Penicillin; do
    for S in "${SEED[@]}"; do
        for P in "${PEAKS[@]}"; do
            OUTPUT="${A}_saureus_seed${S}_peaks${P}_GP_diffusion"
            bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --peaks $P"
            bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species saureus --antibiotic $A --seed $S --peaks $P --normalize"
        done
    done
done

# e. coli

for A in Amoxicillin-Clavulansaeure Ciprofloxacin Ceftriaxon; do
    for S in "${SEED[@]}"; do
        for P in "${PEAKS[@]}"; do
            OUTPUT="${A}_ecoli_seed${S}_peaks${P}_GP_diffusion"
            bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --peaks $P"
            bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species ecoli --antibiotic $A --seed $S --peaks $P --normalize"
        done
    done
done

# k. pneu

for A in Ciprofloxacin Ceftriaxon Piperacillin-Tazobactam; do
    for S in "${SEED[@]}"; do
        for P in "${PEAKS[@]}"; do
            OUTPUT="${A}_kpneu_seed${S}_peaks${P}_GP_diffusion"
            bsub -N -W $TIME -o "${OUTPUT}_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --peaks $P"
            bsub -N -W $TIME -o "${OUTPUT}_normalized_%J.json" -R "rusage[mem=${MEMORY}]" "poetry run python diffusion_kernel.py --species kpneu --antibiotic $A --seed $S --peaks $P --normalize"
        done
    done
done
--------------------------------------------------------------------------------
/ismb2020_maldi/mean_rejection.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Calculates the mean of a set of CSVs. The CSV files are assumed to
# contain the same ranges.

import argparse
import sys

import pandas as pd


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('FILES', nargs='+', type=str)

    args = parser.parse_args()

    data = []

    for filename in args.FILES:
        df = pd.read_csv(filename, header=0, index_col=0)
        data.append(df)

    df = data[0]
    columns = df.columns

    for index, right in enumerate(data[1:]):
        df = pd.merge(df, right,
                      suffixes=('', '_' + str(index + 1)),
                      how='outer', on=['threshold']
                      )

    df = df.fillna(1.0)

    mean_rejected_in_sample = df[['rejected_in_sample',
                                  'rejected_in_sample_1',
                                  'rejected_in_sample_2',
                                  'rejected_in_sample_3',
                                  'rejected_in_sample_4']].mean(axis=1)

    std_rejected_in_sample = df[['rejected_in_sample',
                                 'rejected_in_sample_1',
                                 'rejected_in_sample_2',
                                 'rejected_in_sample_3',
                                 'rejected_in_sample_4']].std(axis=1)

    mean_rejected_out_of_sample = df[['rejected_out_of_sample',
                                      'rejected_out_of_sample_1',
                                      'rejected_out_of_sample_2',
                                      'rejected_out_of_sample_3',
                                      'rejected_out_of_sample_4']].mean(axis=1)

    std_rejected_out_of_sample = df[['rejected_out_of_sample',
                                     'rejected_out_of_sample_1',
                                     'rejected_out_of_sample_2',
                                     'rejected_out_of_sample_3',
                                     'rejected_out_of_sample_4']].std(axis=1)

    df = pd.DataFrame({'mean_rejected_in_sample': mean_rejected_in_sample,
                       'std_rejected_in_sample': std_rejected_in_sample,
                       'mean_rejected_out_of_sample': mean_rejected_out_of_sample,
                       'std_rejected_out_of_sample': std_rejected_out_of_sample})

    df.to_csv(sys.stdout)
--------------------------------------------------------------------------------
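A worked example of `feature_map` on the shipped peak list, mirroring the script's own loading code but skipping the `ScaleNormalizer` step, so the values differ from the figure:

```python
import pandas as pd

from maldi_learn.data import MaldiTofSpectrum
from ismb2020_maldi.visualise_feature_map import feature_map

spectrum = MaldiTofSpectrum(
    pd.read_csv('data/Example_peaks.txt', sep=' ', comment='#').values
)

# At sigma = 1, neighbouring peaks are tens of Da away and contribute
# essentially nothing, so the value at a peak position is close to
# p / (2 * sqrt(pi * sigma)), i.e. roughly 16.89 / 3.545 here.
print(feature_map(spectrum, x=2187.76, sigma=1.0))
```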
/ismb2020_maldi/extract_kernel_parameters.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#
# Auxiliary script for extracting kernel parameters from a set of runs
# and reporting their mean. This is useful to run a calibration with a
# pre-selected model.

import argparse
import re

import json_tricks as jt
import numpy as np

from tqdm import tqdm


def extract_parameter(s, name='DiffusionKernel'):
    '''
    Extracts the kernel parameter from a string. The function attempts
    to extract a float value enclosed in parentheses following the
    kernel name.

    Returns `np.nan` if no match could be found.
    '''

    pattern = rf'{name}\((.+)\)'
    m = re.match(pattern, s)

    if m:
        return float(m.group(1))
    else:
        return np.nan


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('INPUT', nargs='+', type=str)
    args = parser.parse_args()

    parameters = []

    for filename in tqdm(args.INPUT, desc='Loading'):
        with open(filename) as f:
            # Ensures that we can parse normal JSON files
            pos = 0

            for line in f:

                # We found *probably* the beginning of the JSON file, so
                # we can start parsing from here after resetting the
                # file pointer.
                if line.startswith('{'):
                    f.seek(pos)
                    break
                else:
                    pos += len(line)

            # Check whether file is empty for some reason. If so, we
            # skip it.
            line = f.readline()
            if line == '':
                continue

            # Not empty, so we need to reset the file pointer
            else:
                f.seek(pos)

            data_raw = jt.load(f)

            kernel = data_raw['kernel']
            parameter = extract_parameter(kernel)

            parameters.append(parameter)

    mu = np.mean(parameters)
    sigma = np.std(parameters)

    print('Extracted kernel parameters:', parameters)
    print(f'Mean kernel parameter: {mu:.2f} +- {sigma:.2f}')
--------------------------------------------------------------------------------
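The regex in `extract_parameter` expects the kernel's `repr`-style string as stored in the result files; a quick check:

```python
from ismb2020_maldi.extract_kernel_parameters import extract_parameter

print(extract_parameter('DiffusionKernel(16.22)'))  # 16.22
print(extract_parameter('RBF(1.0)'))                # nan, name does not match
```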
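The identity-matrix trick in `transform` above builds an [n_peaks, n_bins] indicator matrix and sums it over the peaks; a sketch of an equivalent, more memory-friendly formulation (not part of the library):

```python
import numpy as np


# Equivalent to the identity-matrix formulation in transform():
# scatter-add each peak's intensity into its bin.
def bin_intensities(indices, intensities, n_bins):
    vec = np.zeros(n_bins)
    np.add.at(vec, indices, intensities)
    return vec


# Peaks 0 and 1 fall into bin 0, peak 2 into bin 1.
print(bin_intensities(np.array([0, 0, 1]), np.array([5.0, 8.0, 10.0]), 2))
# -> [13. 10.]
```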
65 |             indices = indices[valid] - 1
66 |             identity = np.eye(self.n_bins)
67 | 
68 |             vec = np.sum(
69 |                 identity[indices] * spectrum[:, 1][:, np.newaxis], axis=0)
70 | 
71 |             output.append(vec)
72 | 
73 |         return np.stack(output, axis=0)
74 | 
--------------------------------------------------------------------------------
/maldi-learn/tests/preprocessing/test_normalization.py:
--------------------------------------------------------------------------------
1 | """Test normalizers."""
2 | import unittest
3 | 
4 | import numpy as np
5 | 
6 | from maldi_learn.data import MaldiTofSpectrum
7 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
8 | 
9 | 
10 | MOCK_DATA = [
11 |     MaldiTofSpectrum(
12 |         [[0.0, 5.0],
13 |          [10.7, 8.0],
14 |          [150.4, 10.],
15 |          [1000, 3.0]
16 |          ]
17 |     ),  # Mean intensity 6.5
18 |     MaldiTofSpectrum(
19 |         [[0.0, 15.0],
20 |          [10.7, 0.0],
21 |          [150.4, 10.],
22 |          [1000, 3.0]
23 |          ]
24 |     ),  # Mean intensity 7 or 9.3333 (with ignore zero intensity)
25 | ]
26 | 
27 | # Total mean intensity: 6.75 or 7.7142857143 (with ignore zero intensity)
28 | 
29 | 
30 | class TestTotalIonCurrentNormalizer(unittest.TestCase):
31 |     def test_dont_ignore_zero_intensity(self):
32 |         transf = TotalIonCurrentNormalizer(ignore_zero_intensity=False)
33 |         transformed = transf.fit_transform(MOCK_DATA)
34 | 
35 |         # Normalization factor first example: 6.5 / 6.75 = 0.9629
36 |         transformed_intensities = transformed[0].intensities
37 |         expected_intensities = MOCK_DATA[0].intensities * (6.5 / 6.75)
38 |         self.assertTrue(np.allclose(
39 |             transformed_intensities,
40 |             expected_intensities
41 |         ))
42 | 
43 |         # Normalization factor second example: 7 / 6.75 = 1.0370
44 |         transformed_intensities = transformed[1].intensities
45 |         expected_intensities = MOCK_DATA[1].intensities * (7 / 6.75)
46 |         self.assertTrue(np.allclose(
47 |             transformed_intensities,
48 |             expected_intensities
49 |         ))
50 | 
51 |     def test_ignore_zero_intensity(self):
52 |         transf = TotalIonCurrentNormalizer(ignore_zero_intensity=True)
53 |         transformed = transf.fit_transform(MOCK_DATA)
54 | 
55 |         # Normalization factor first example: 6.5 / 7.71428 = 0.8426
56 |         transformed_intensities = transformed[0].intensities
57 |         expected_intensities = MOCK_DATA[0].intensities * (6.5 / 7.71428)
58 |         self.assertTrue(np.allclose(
59 |             transformed_intensities,
60 |             expected_intensities
61 |         ))
62 | 
63 |         # Normalization factor second example: 9.3333 / 7.71428 = 1.2099
64 |         transformed_intensities = transformed[1].intensities
65 |         expected_intensities = MOCK_DATA[1].intensities * (9.3333 / 7.71428)
66 |         self.assertTrue(np.allclose(
67 |             transformed_intensities,
68 |             expected_intensities
69 |         ))
70 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/demo_kernel_confidence.py:
--------------------------------------------------------------------------------
1 | '''
2 | Demo file for kernel-based confidence estimation.
3 | '''
4 | 
5 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
6 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | 
9 | from maldi_learn.kernels import DiffusionKernel
10 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
11 | 
12 | from maldi_learn.preprocessing import SubsetPeaksTransformer
13 | 
14 | from sklearn.gaussian_process import GaussianProcessClassifier
15 | from sklearn.metrics import average_precision_score
16 | from sklearn.svm import SVC
17 | 
18 | from imblearn.over_sampling import RandomOverSampler
19 | 
20 | from joblib import parallel_backend
21 | 
22 | import matplotlib.pyplot as plt
23 | 
24 | import numpy as np
25 | import sys
26 | 
27 | dataset = KpneuAntibioticResistanceDataset(antibiotic='Ceftriaxon',
28 |                                            test_size=0.20)
29 | X_train, y_train = dataset.training_data
30 | X_test, y_test = dataset.testing_data
31 | 
32 | X_indices = np.asarray([i for i in range(0,
33 |     len(X_train))]).reshape(-1, 1)
34 | 
35 | ros = RandomOverSampler(random_state=2020)
36 | 
37 | X_indices, y_train = ros.fit_sample(X_indices, y_train)
38 | 
39 | X_train_ = []
40 | 
41 | for index in X_indices.ravel():
42 |     X_train_.append(X_train[index])
43 | 
44 | X_train = X_train_
45 | 
46 | tic = TotalIonCurrentNormalizer()
47 | X_train = tic.fit_transform(X_train)
48 | X_test = tic.transform(X_test)
49 | 
50 | st = SubsetPeaksTransformer(n_peaks=200)
51 | 
52 | X_train = st.fit_transform(X_train)
53 | X_test = st.transform(X_test)
54 | 
55 | kernel = DiffusionKernel(sigma=10)
56 | 
57 | print('Finished pre-processing')
58 | 
59 | # Fit a GP classifier with the fixed diffusion kernel; setting
60 | # `optimizer=None` skips hyperparameter optimisation.
61 | clf = GaussianProcessClassifier(optimizer=None, kernel=kernel, n_jobs=-1)
62 | clf.fit(X_train, y_train)
63 | 
64 | test_distribution = np.amax(clf.predict_proba(X_test), axis=1).ravel()
65 | 
66 | oos_dataset = SaureusAntibioticResistanceDataset(antibiotic='Penicillin',
67 |                                                  test_size=0.20)
68 | 
69 | oos_test, _ = oos_dataset.testing_data
70 | oos_test = tic.transform(oos_test)
71 | oos_test = st.transform(oos_test)
72 | 
73 | oos_distribution = np.amax(clf.predict_proba(oos_test), axis=1).ravel()
74 | 
75 | plt.hist(test_distribution, label='test', bins=np.linspace(0.50, 0.60, 100), alpha=0.5)
76 | plt.hist(oos_distribution, label='oos saureus', bins=np.linspace(0.50, 0.60, 100), alpha=0.5)
77 | 
78 | oos_dataset = EcoliAntibioticResistanceDataset(antibiotic='Ciprofloxacin',
79 |                                                test_size=0.20)
80 | 
81 | oos_test, _ = oos_dataset.testing_data
82 | oos_test = tic.transform(oos_test)
83 | oos_test = st.transform(oos_test)
84 | 
85 | oos_distribution = np.amax(clf.predict_proba(oos_test), axis=1).ravel()
86 | 
87 | plt.hist(oos_distribution, label='oos ecoli', bins=np.linspace(0.50, 0.60, 100), alpha=0.5)
88 | plt.legend()
89 | 
90 | plt.show()
91 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/visualise_baseline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Basic visualisation script for baseline classifier.
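#
# Example invocation (species/antibiotic values are illustrative):
#
#     poetry run python visualise_baseline.py -s ecoli -a Ciprofloxacin -o data.json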
4 | 
5 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
6 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
9 | 
10 | from maldi_learn.vectorization import BinningVectorizer
11 | 
12 | from sklearn.manifold import TSNE
13 | 
14 | from joblib import parallel_backend
15 | 
16 | import matplotlib.pyplot as plt
17 | import seaborn as sns
18 | 
19 | import numpy as np
20 | import json_tricks as jt
21 | 
22 | import argparse
23 | import os
24 | import warnings
25 | 
26 | 
27 | if __name__ == '__main__':
28 | 
29 |     parser = argparse.ArgumentParser()
30 |     parser.add_argument('-s', '--species', type=str, required=True)
31 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
32 |     parser.add_argument('-S', '--seed', type=int, required=False,
33 |                         default=2020)
34 |     parser.add_argument('--suffix', default='')
35 |     parser.add_argument('-o', '--output', type=str)
36 |     args = parser.parse_args()
37 | 
38 |     species_to_dataset = {
39 |         'ecoli': EcoliAntibioticResistanceDataset,
40 |         'kpneu': KpneuAntibioticResistanceDataset,
41 |         'saureus': SaureusAntibioticResistanceDataset
42 |     }
43 | 
44 |     dataset = species_to_dataset[args.species](
45 |         test_size=0.20,
46 |         antibiotic=args.antibiotic,
47 |         random_seed=args.seed,
48 |         suffix=args.suffix
49 |     )
50 | 
51 |     X_train, y_train = dataset.training_data
52 |     X_test, y_test = dataset.testing_data
53 | 
54 |     bv = BinningVectorizer(900, min_bin=2000, max_bin=20000)
55 |     X_train = bv.fit_transform(X_train)
56 |     X_test = bv.transform(X_test)
57 | 
58 |     X = np.concatenate((X_train, X_test), axis=0)
59 | 
60 |     # Static information about the data set; will be extended later on
61 |     # with information about the training itself.
62 |     data = {
63 |         'seed': args.seed,
64 |         'species': args.species,
65 |         'antibiotic': args.antibiotic,
66 |         'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'),
67 |         'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'),
68 |     }
69 | 
70 |     tsne = TSNE(n_components=2)
71 | 
72 |     # `fit_transform` fits the embedding and projects the data in one step.
73 |     Z = tsne.fit_transform(X)
74 |     Z_train = Z[:len(X_train)]
75 |     Z_test = Z[len(X_train):]
76 | 
77 |     fig, axes = plt.subplots(ncols=2)
78 | 
79 |     sns.scatterplot(x=Z_train[:, 0], y=Z_train[:, 1], hue=y_train,
80 |                     ax=axes[0])
81 |     sns.scatterplot(x=Z_test[:, 0], y=Z_test[:, 1], hue=y_test,
82 |                     ax=axes[1])
83 | 
84 |     plt.show()
85 | 
86 |     if args.output is not None:
87 |         with open(args.output, 'w') as f:
88 |             jt.dump(data, f, indent=4)
89 |     else:
90 |         print(jt.dumps(data, indent=4))
--------------------------------------------------------------------------------
/ismb2020_maldi/analyse_split.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Analyses a given split of a data set and prints some summary
4 | # statistics. This is meant for debugging purposes only.
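#
# Example invocation (values are illustrative):
#
#     poetry run python analyse_split.py -s saureus -a Penicillin -p 100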
5 | 
6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
10 | 
11 | from maldi_learn.preprocessing import SubsetPeaksTransformer
12 | 
13 | from joblib import parallel_backend
14 | 
15 | import numpy as np
16 | 
17 | import argparse
18 | import os
19 | import warnings
20 | 
21 | 
22 | def get_mean_and_std(X, y, l):
23 | 
24 |     intensities = []
25 | 
26 |     for spectrum, label in zip(X, y):
27 |         if label == l:
28 |             intensities.extend(spectrum[:, 1])
29 | 
30 |     return np.mean(intensities), np.std(intensities)
31 | 
32 | 
33 | if __name__ == '__main__':
34 | 
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument('-s', '--species', type=str, required=True)
37 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
38 |     parser.add_argument('-S', '--seed', type=int, required=False,
39 |                         default=2020)
40 |     parser.add_argument('-p', '--peaks', type=int, required=False,
41 |                         default=100)
42 | 
43 |     args = parser.parse_args()
44 | 
45 |     species_to_dataset = {
46 |         'ecoli': EcoliAntibioticResistanceDataset,
47 |         'kpneu': KpneuAntibioticResistanceDataset,
48 |         'saureus': SaureusAntibioticResistanceDataset
49 |     }
50 | 
51 |     dataset = species_to_dataset[args.species](
52 |         test_size=0.20,
53 |         antibiotic=args.antibiotic,
54 |         random_seed=args.seed
55 |     )
56 | 
57 |     X_train, y_train = dataset.training_data
58 |     X_test, y_test = dataset.testing_data
59 | 
60 |     st = SubsetPeaksTransformer(n_peaks=args.peaks)
61 | 
62 |     X_train = st.fit_transform(X_train)
63 |     X_test = st.transform(X_test)
64 | 
65 |     print(f'Seed: {args.seed}')
66 |     print(f'Species: {args.species}')
67 |     print(f'Antibiotic: {args.antibiotic}')
68 |     print(f'Number of peaks: {args.peaks}')
69 | 
70 |     SPECTRA_PATH = os.getenv('ANTIBIOTICS_SPECTRA_PATH')
71 |     ENDPOINT_PATH = os.getenv('ANTIBIOTICS_ENDPOINT_PATH')
72 | 
73 |     print(f'SPECTRA_PATH = {SPECTRA_PATH}')
74 |     print(f'ENDPOINT_PATH = {ENDPOINT_PATH}')
75 | 
76 |     mu_train_0, std_train_0 = get_mean_and_std(X_train, y_train, 0)
77 |     mu_train_1, std_train_1 = get_mean_and_std(X_train, y_train, 1)
78 |     mu_test_0, std_test_0 = get_mean_and_std(X_test, y_test, 0)
79 |     mu_test_1, std_test_1 = get_mean_and_std(X_test, y_test, 1)
80 | 
81 |     print('y == 0 (train):', mu_train_0, std_train_0)
82 |     print('y == 1 (train):', mu_train_1, std_train_1)
83 |     print('y == 0 (test):', mu_test_0, std_test_0)
84 |     print('y == 1 (test):', mu_test_1, std_test_1)
85 | 
86 |     print(abs(mu_train_0 - mu_test_0), abs(std_train_0 - std_test_0))
87 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/collect_results.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Collection script for all results. Will create a table based on the
4 | # species and the antibiotic and summarise the performance measures.
5 | 
6 | import argparse
7 | 
8 | import json_tricks as jt
9 | import numpy as np
10 | import pandas as pd
11 | 
12 | from tqdm import tqdm
13 | 
14 | 
15 | if __name__ == '__main__':
16 | 
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument('INPUT', nargs='+', type=str)
19 | 
20 |     # Following the convention of `sklearn` here instead of referring to
21 |     # AUPRC or something like that.
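    # For example, pass `-m accuracy` to aggregate the accuracies stored
    # in the result files instead of the average precision.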
22 |     parser.add_argument(
23 |         '-m', '--metric',
24 |         default='average_precision',
25 |         type=str
26 |     )
27 | 
28 |     args = parser.parse_args()
29 | 
30 |     rows = []
31 | 
32 |     for filename in tqdm(args.INPUT, desc='Loading'):
33 |         with open(filename) as f:
34 |             # Ensures that we can parse normal JSON files
35 |             pos = 0
36 | 
37 |             for line in f:
38 | 
39 |                 # We have *probably* found the beginning of the JSON
40 |                 # payload, so we reset the file pointer to the start
41 |                 # of this line and parse from there.
42 |                 if line.startswith('{'):
43 |                     f.seek(pos)
44 |                     break
45 |                 else:
46 |                     pos += len(line)
47 | 
48 |             # Check whether file is empty for some reason. If so, we
49 |             # skip it.
50 |             line = f.readline()
51 |             if line == '':
52 |                 continue
53 | 
54 |             # Not empty, so we need to reset the file pointer
55 |             else:
56 |                 f.seek(pos)
57 | 
58 |             data_raw = jt.load(f)
59 | 
60 |             # Create one row in the table containing the relevant
61 |             # information for now.
62 |             row = {
63 |                 'species': data_raw['species'],
64 |                 'antibiotic': data_raw['antibiotic'],
65 |                 args.metric: data_raw[args.metric],
66 |             }
67 | 
68 |             # Some magic for figuring out whether we are looking at
69 |             # a baseline method or one of our own. Also creates the
70 |             # name of the method.
71 |             #
72 |             # TODO: handle multiple kernels
73 | 
74 |             is_baseline = 'best_parameters' in data_raw
75 |             method = 'baseline' if is_baseline else 'kernel'
76 | 
77 |             if not is_baseline:
78 |                 if data_raw['n_peaks'] is None:
79 |                     data_raw['n_peaks'] = 'all'
80 | 
81 |                 method += '_' + str(data_raw['n_peaks'])
82 | 
83 |                 if 'normalize' in data_raw:
84 |                     method += '_normalized' if data_raw['normalize'] else ''
85 | 
86 |             row['method'] = method
87 |             rows.append(row)
88 | 
89 |     pd.options.display.max_rows = 999
90 |     pd.options.display.float_format = '{:,.2f}'.format
91 | 
92 |     df = pd.DataFrame(rows)
93 |     df = df.groupby(['species', 'antibiotic', 'method']).agg(
94 |         {
95 |             args.metric: [np.mean, np.std]
96 |         }
97 |     )
98 | 
99 |     print(df)
100 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/visualise_kernel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Basic visualisation script for the diffusion kernel.
4 | 
5 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
6 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
9 | 
10 | from maldi_learn.kernels import DiffusionKernel
11 | from maldi_learn.preprocessing import ScaleNormalizer
12 | 
13 | from sklearn.decomposition import KernelPCA
14 | 
15 | from joblib import parallel_backend
16 | 
17 | import matplotlib.pyplot as plt
18 | import seaborn as sns
19 | 
20 | import numpy as np
21 | import json_tricks as jt
22 | 
23 | import argparse
24 | import os
25 | import warnings
26 | 
27 | 
28 | if __name__ == '__main__':
29 | 
30 |     parser = argparse.ArgumentParser()
31 |     parser.add_argument('-s', '--species', type=str, required=True)
32 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
33 |     parser.add_argument('-S', '--seed', type=int, required=False,
34 |                         default=2020)
35 |     parser.add_argument('--sigma', type=float, required=False, default=1.0)
36 |     parser.add_argument('--suffix', default='')
37 |     parser.add_argument('-o', '--output', type=str)
38 |     args = parser.parse_args()
39 | 
40 |     species_to_dataset = {
41 |         'ecoli': EcoliAntibioticResistanceDataset,
42 |         'kpneu': KpneuAntibioticResistanceDataset,
43 |         'saureus': SaureusAntibioticResistanceDataset
44 |     }
45 | 
46 |     dataset = species_to_dataset[args.species](
47 |         test_size=0.20,
48 |         antibiotic=args.antibiotic,
49 |         random_seed=args.seed,
50 |         suffix=args.suffix
51 |     )
52 | 
53 |     X_train, y_train = dataset.training_data
54 |     X_test, y_test = dataset.testing_data
55 | 
56 |     # Only perform scale normalisation if a suffix has been set; this
57 |     # should be made configurable.
58 |     if len(args.suffix) > 0:
59 |         sn = ScaleNormalizer()
60 |         X_train = sn.fit_transform(X_train)
61 |         X_test = sn.transform(X_test)
62 | 
63 |     # Static information about the data set; will be extended later on
64 |     # with information about the training itself.
65 |     data = {
66 |         'seed': args.seed,
67 |         'species': args.species,
68 |         'antibiotic': args.antibiotic,
69 |         'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'),
70 |         'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'),
71 |     }
72 | 
73 |     kernel = DiffusionKernel(args.sigma)
74 | 
75 |     with parallel_backend(backend='threading', n_jobs=-1):
76 |         K_train = kernel(X_train)
77 |         K_test = kernel(X_test)
78 | 
79 |     # Note: train and test data are embedded with *separate* PCA fits,
80 |     # so the two panels do not share a common coordinate system.
81 |     pca = KernelPCA(n_components=2, kernel="precomputed")
82 |     Z_train = pca.fit_transform(K_train)
83 |     Z_test = pca.fit_transform(K_test)
84 | 
85 |     fig, axes = plt.subplots(ncols=2)
86 | 
87 |     sns.scatterplot(x=Z_train[:, 0], y=Z_train[:, 1], hue=y_train,
88 |                     ax=axes[0])
89 |     sns.scatterplot(x=Z_test[:, 0], y=Z_test[:, 1], hue=y_test,
90 |                     ax=axes[1])
91 | 
92 |     plt.show()
93 | 
94 |     if args.output is not None:
95 |         with open(args.output, 'w') as f:
96 |             jt.dump(data, f, indent=4)
97 |     else:
98 |         print(jt.dumps(data, indent=4))
--------------------------------------------------------------------------------
/maldi-learn/maldi_learn/preprocessing/normalization.py:
--------------------------------------------------------------------------------
1 | """Normalization strategies for MALDI-TOF spectra."""
2 | import numpy as np
3 | 
4 | from sklearn.base import TransformerMixin
5 | from sklearn.base import BaseEstimator
6 | 
7 | 
8 | class TotalIonCurrentNormalizer(BaseEstimator, TransformerMixin):
9 |     """
10 |     Normalize spectra based on total ion current. The normalizer
11 |     supports different normalization strategies.
12 |     """
13 | 
14 |     def __init__(self, ignore_zero_intensity=True, method='mean'):
15 |         """Initialize total ion current based normalizer.
16 | 
17 |         Args:
18 |             ignore_zero_intensity: Ignore peaks with zero intensity when
19 |                 computing the average used for normalization.
20 | 
21 |             method: Determines the method that is used to perform the
22 |                 normalization. If set to 'mean', computes averages over the
23 |                 spectra to normalize. If set to 'sum', normalizes each
24 |                 spectrum individually such that its intensities sum to one.
25 |         """
26 |         self.ignore_zero_intensity = ignore_zero_intensity
27 |         self.mean_intensity = None
28 |         self.method = method
29 | 
30 |     def _normalize_spectrum(self, spectrum, method):
31 |         if method == 'mean':
32 |             if self.ignore_zero_intensity:
33 |                 intensities = spectrum.intensities[spectrum.intensities != 0.]
34 |             else:
35 |                 intensities = spectrum.intensities
36 |             mean_instance_intensity = np.mean(intensities)
37 |             scaling = mean_instance_intensity / self.mean_intensity
38 |             return spectrum * np.array([1, scaling])[np.newaxis, :]
39 |         elif method == 'sum':
40 |             scaling = 1.0 / np.sum(spectrum.intensities)
41 |             return spectrum * np.array([1, scaling])[np.newaxis, :]
42 |         else:
43 |             raise RuntimeError(
44 |                 f'Unexpected normalization method "{method}"')
45 | 
46 |     def _compute_mean_intensity_spectra(self, spectra):
47 |         if self.ignore_zero_intensity:
48 |             intensities = np.concatenate(
49 |                 [
50 |                     spectrum.intensities[spectrum.intensities != 0.]
51 |                     for spectrum in spectra
52 |                 ],
53 |                 axis=0
54 |             )
55 |         else:
56 |             intensities = np.concatenate(
57 |                 [spectrum.intensities for spectrum in spectra], axis=0)
58 |         return np.mean(intensities)
59 | 
60 |     def fit(self, X, y=None):
61 |         """Fit transformer, computes average statistics of spectra."""
62 |         self.mean_intensity = self._compute_mean_intensity_spectra(X)
63 |         return self
64 | 
65 |     def transform(self, X):
66 |         """Normalize spectra using total ion current."""
67 |         return [
68 |             self._normalize_spectrum(spectrum, method=self.method)
69 |             for spectrum in X
70 |         ]
71 | 
72 | 
73 | class ScaleNormalizer(BaseEstimator, TransformerMixin):
74 |     """
75 |     Normalizes a set of spectra such that their scales are not too
76 |     small.
77 |     """
78 | 
79 |     def _calculate_min_nonzero_intensity(self, spectra):
80 |         intensities = np.concatenate(
81 |             [
82 |                 s.intensities[s.intensities != 0] for s in spectra
83 |             ],
84 |             axis=0
85 |         )
86 |         return np.min(intensities)
87 | 
88 |     def _normalize_spectrum(self, spectrum):
89 |         scaling = 1.0 / self.min_nonzero_intensity
90 |         return spectrum * np.array([1, scaling])[np.newaxis, :]
91 | 
92 |     def fit(self, X, y=None):
93 |         self.min_nonzero_intensity = self._calculate_min_nonzero_intensity(X)
94 |         return self
95 | 
96 |     def transform(self, X):
97 |         return [
98 |             self._normalize_spectrum(spectrum) for spectrum in X
99 |         ]
100 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/calibrate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Assesses the calibration of the model by measuring how accuracy and
4 | # AUPRC change as the confidence threshold is varied.
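#
# Example invocation (the file name is illustrative; the script expects
# the JSON outputs of the confidence experiments):
#
#     poetry run python calibrate.py results_seed2020.json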
5 | 
6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
10 | 
11 | from sklearn.metrics import accuracy_score
12 | from sklearn.metrics import average_precision_score
13 | 
14 | import argparse
15 | 
16 | import numpy as np
17 | import json_tricks as jt
18 | 
19 | from tqdm import tqdm
20 | 
21 | 
22 | def process(data, in_sample_species, antibiotic, seed):
23 | 
24 |     species_to_dataset = {
25 |         'ecoli': EcoliAntibioticResistanceDataset,
26 |         'kpneu': KpneuAntibioticResistanceDataset,
27 |         'saureus': SaureusAntibioticResistanceDataset
28 |     }
29 | 
30 |     dataset = species_to_dataset[in_sample_species](
31 |         test_size=0.20,
32 |         antibiotic=antibiotic,
33 |         random_seed=seed
34 |     )
35 | 
36 |     _, y_test = dataset.testing_data
37 | 
38 |     thresholds = np.linspace(0.5, 1.0, 1000)
39 | 
40 |     test_proba = data['in_sample_test_proba']
41 |     test_proba_max = np.amax(test_proba, axis=1)
42 | 
43 |     output_rejection_ratio_curve = \
44 |         f'Calibration_{in_sample_species}_{antibiotic}_{seed}.csv'
45 | 
46 |     with open(output_rejection_ratio_curve, 'w') as f:
47 | 
48 |         print('threshold,accuracy,auprc,n_pos_samples', file=f)
49 | 
50 |         for threshold in thresholds:
51 | 
52 |             # Get the indices that we want to *keep*, i.e. those test
53 |             # samples whose maximum probability exceeds the threshold
54 |             indices = test_proba_max > threshold
55 | 
56 |             # Subset the predictions and the labels according to these
57 |             # indices and calculate an AUPRC.
58 |             y_true = y_test[indices]
59 |             y_pred_proba = test_proba[indices][:, 1]
60 | 
61 |             # Predict the positive class whenever the predicted
62 |             # probability exceeds the threshold of this iteration.
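            # For example, at threshold 0.7 a sample with class
            # probabilities (0.25, 0.75) is kept (0.75 > 0.7) and is
            # predicted as positive, since its positive-class probability
            # also exceeds 0.7.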
63 |             y_pred = np.zeros_like(y_pred_proba)
64 |             y_pred[y_pred_proba > threshold] = 1.0
65 | 
66 |             y_true_unique = set(y_true.values)
67 | 
68 |             if len(y_true_unique) != 2:
69 |                 break
70 | 
71 |             average_precision = average_precision_score(y_true, y_pred_proba)
72 |             accuracy = accuracy_score(y_true, y_pred)
73 | 
74 |             print(f'{threshold},{accuracy},{average_precision},{sum(y_true == 1)}', file=f)
75 | 
76 |     oos_species = data['out_of_sample_species']
77 | 
78 |     for species in oos_species:
79 | 
80 |         output_rejection_plot = \
81 |             f'Rejection_ratio_{in_sample_species}_{antibiotic}_{species}_{seed}.csv'
82 | 
83 |         oos_proba = data['out_of_sample_' + species + '_proba']
84 |         oos_proba_max = np.amax(oos_proba, axis=1)
85 | 
86 |         with open(output_rejection_plot, 'w') as f:
87 | 
88 |             print('threshold,rejected_in_sample,rejected_out_of_sample',
89 |                   file=f)
90 | 
91 |             for threshold in thresholds:
92 |                 rejected_test = \
93 |                     sum(test_proba_max <= threshold) / len(test_proba_max)
94 | 
95 |                 rejected_oos = \
96 |                     sum(oos_proba_max <= threshold) / len(oos_proba_max)
97 | 
98 |                 print(f'{threshold},{rejected_test},{rejected_oos}',
99 |                       file=f)
100 | 
101 | 
102 | if __name__ == '__main__':
103 | 
104 |     parser = argparse.ArgumentParser()
105 |     parser.add_argument('FILES', nargs='+', type=str)
106 | 
107 |     args = parser.parse_args()
108 | 
109 |     for filename in tqdm(args.FILES, desc='Loading'):
110 |         with open(filename) as f:
111 |             data = jt.load(f)
112 |             species = data['in_sample_species']
113 |             antibiotic = data['in_sample_antibiotic']
114 |             seed = data['seed']
115 | 
116 |             process(data, species, antibiotic, seed)
117 | 
--------------------------------------------------------------------------------
/ismb2020_maldi/baseline_gp_rbf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Trains a baseline Gaussian process classifier with an RBF kernel on
4 | # the same binned features as the logistic regression baseline.
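#
# Example invocation (values are illustrative):
#
#     poetry run python baseline_gp_rbf.py -s ecoli -a Ciprofloxacin -o out.json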
5 | 
6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
10 | 
11 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
12 | from maldi_learn.preprocessing import SubsetPeaksTransformer
13 | from maldi_learn.vectorization import BinningVectorizer
14 | 
15 | from imblearn.over_sampling import RandomOverSampler
16 | 
17 | from sklearn.gaussian_process import GaussianProcessClassifier
18 | from sklearn.gaussian_process.kernels import RBF
19 | 
20 | from sklearn.exceptions import ConvergenceWarning
21 | from sklearn.linear_model import LogisticRegression
22 | from sklearn.metrics import average_precision_score
23 | from sklearn.metrics import accuracy_score
24 | from sklearn.model_selection import GridSearchCV
25 | from sklearn.model_selection import StratifiedKFold
26 | from sklearn.preprocessing import StandardScaler
27 | from sklearn.pipeline import Pipeline
28 | 
29 | from joblib import parallel_backend
30 | 
31 | import numpy as np
32 | import json_tricks as jt
33 | 
34 | import argparse
35 | import os
36 | import warnings
37 | 
38 | 
39 | if __name__ == '__main__':
40 | 
41 |     parser = argparse.ArgumentParser()
42 |     parser.add_argument('-s', '--species', type=str, required=True)
43 |     parser.add_argument('-a', '--antibiotic', type=str, required=True)
44 |     parser.add_argument('-S', '--seed', type=int, required=False, default=2020)
45 |     parser.add_argument('-o', '--output', type=str)
46 |     parser.add_argument('-n', '--normalize', action='store_true')
47 | 
48 |     args = parser.parse_args()
49 | 
50 |     species_to_dataset = {
51 |         'ecoli': EcoliAntibioticResistanceDataset,
52 |         'kpneu': KpneuAntibioticResistanceDataset,
53 |         'saureus': SaureusAntibioticResistanceDataset
54 |     }
55 | 
56 |     dataset = species_to_dataset[args.species](
57 |         test_size=0.20,
58 |         antibiotic=args.antibiotic,
59 |         random_seed=args.seed,
60 |         suffix='_peaks_warped'
61 |     )
62 | 
63 |     X_train, y_train = dataset.training_data
64 |     X_test, y_test = dataset.testing_data
65 | 
66 |     # Perform random oversampling in order to ensure class balance. This
67 |     # is strictly speaking not required but we do it for the GP as well,
68 |     # so in the interest of comparability, we have to do it here.
69 | 
70 |     ros = RandomOverSampler(random_state=args.seed)
71 | 
72 |     X_indices = np.asarray(
73 |         [i for i in range(0, len(X_train))]).reshape(-1, 1)
74 | 
75 |     X_indices, y_train = ros.fit_sample(X_indices, y_train)
76 |     X_train = np.take(X_train, X_indices.ravel())
77 | 
78 |     # Static information about the data set; will be extended later on
79 |     # with information about the training itself.
80 |     data = {
81 |         'seed': args.seed,
82 |         'species': args.species,
83 |         'antibiotic': args.antibiotic,
84 |         'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'),
85 |         'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'),
86 |         'normalize': args.normalize,
87 |     }
88 | 
89 |     # Fix the number of bins to the value most frequently selected in
90 |     # the logistic regression grid-search experiments.
91 | n_bins = 3600 92 | 93 | data['n_bins'] = n_bins 94 | 95 | # Define pipeline and cross-validation setup 96 | 97 | pipeline = Pipeline( 98 | [ 99 | ('bv', BinningVectorizer( 100 | n_bins=n_bins, 101 | min_bin=2000, 102 | max_bin=20000)), 103 | ('gp', GaussianProcessClassifier( 104 | kernel=RBF(), 105 | ) 106 | ) 107 | ], 108 | memory=os.getenv('TMPDIR', default=None), 109 | ) 110 | 111 | with warnings.catch_warnings(): 112 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 113 | warnings.filterwarnings('ignore', category=UserWarning) 114 | 115 | # Let's do the fitting in parallel, but the prediction can be done 116 | # without additional threading. 117 | with parallel_backend('loky', n_jobs=-1): 118 | pipeline.fit(X_train, y_train) 119 | 120 | data['kernel'] = repr(pipeline['gp'].kernel_.theta) 121 | 122 | # AUPRC 123 | 124 | y_pred = pipeline.predict_proba(X_test) 125 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 126 | 127 | data['average_precision'] = 100 * average_precision 128 | 129 | # Accuracy 130 | 131 | y_pred = pipeline.predict(X_test) 132 | accuracy = accuracy_score(y_test, y_pred) 133 | 134 | data['accuracy'] = 100 * accuracy 135 | 136 | if args.output is not None: 137 | with open(args.output, 'w') as f: 138 | jt.dump(data, f, indent=4) 139 | else: 140 | print(jt.dumps(data, indent=4)) 141 | -------------------------------------------------------------------------------- /ismb2020_maldi/diffusion_kernel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a diffusion kernel Gaussian Process classifier and reports the 4 | # results on all tasks. 5 | 6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 10 | 11 | from maldi_learn.kernels import DiffusionKernel 12 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer 13 | from maldi_learn.preprocessing import SubsetPeaksTransformer 14 | from maldi_learn.preprocessing import ScaleNormalizer 15 | 16 | from sklearn.gaussian_process import GaussianProcessClassifier 17 | from sklearn.gaussian_process.kernels import RBF 18 | from sklearn.exceptions import ConvergenceWarning 19 | from sklearn.metrics import average_precision_score 20 | from sklearn.metrics import accuracy_score 21 | 22 | from imblearn.over_sampling import RandomOverSampler 23 | 24 | from joblib import parallel_backend 25 | 26 | import numpy as np 27 | import json_tricks as jt 28 | 29 | import argparse 30 | import os 31 | import warnings 32 | 33 | 34 | if __name__ == '__main__': 35 | 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('-s', '--species', type=str, required=True) 38 | parser.add_argument('-a', '--antibiotic', type=str, required=True) 39 | parser.add_argument('-o', '--output', type=str) 40 | parser.add_argument('-S', '--seed', type=int, required=False, 41 | default=2020) 42 | parser.add_argument('-p', '--peaks', type=int, required=False, 43 | default=None) 44 | parser.add_argument('-n', '--normalize', action='store_true') 45 | parser.add_argument('--suffix', default='') 46 | 47 | args = parser.parse_args() 48 | 49 | species_to_dataset = { 50 | 'ecoli': EcoliAntibioticResistanceDataset, 51 | 'kpneu': KpneuAntibioticResistanceDataset, 52 | 'saureus': SaureusAntibioticResistanceDataset 53 | } 
54 | 55 | dataset = species_to_dataset[args.species]( 56 | test_size=0.20, 57 | antibiotic=args.antibiotic, 58 | random_seed=args.seed, 59 | suffix=args.suffix 60 | ) 61 | 62 | X_train, y_train = dataset.training_data 63 | X_test, y_test = dataset.testing_data 64 | 65 | # Perform random oversampling in order to ensure class balance. This 66 | # is strictly speaking not required but we do it for the GP as well, 67 | # so in the interest of comparability, we have to do it here. 68 | 69 | ros = RandomOverSampler(random_state=args.seed) 70 | 71 | X_indices = np.asarray( 72 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 73 | 74 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 75 | X_train = np.take(X_train, X_indices.ravel()) 76 | 77 | # Normalise on demand. This is an *external* flag because by 78 | # default, we should have no expectations about its efficacy 79 | # in practice. 80 | if args.normalize: 81 | tic = TotalIonCurrentNormalizer(method='sum') 82 | X_train = tic.fit_transform(X_train) 83 | X_test = tic.transform(X_test) 84 | 85 | # Sparsify the data by restricting everything to the peaks only. 86 | st = SubsetPeaksTransformer(n_peaks=args.peaks) 87 | 88 | X_train = st.fit_transform(X_train) 89 | X_test = st.transform(X_test) 90 | 91 | # Perform scale normalisation for the MQ data set, which is 92 | # indicated by a suffix, or whenever the client specified a 93 | # normalisation parameter manually. This ensures that every 94 | # spectrum can be fitted by the kernel. 95 | if args.normalize or len(args.suffix) > 0: 96 | sn = ScaleNormalizer() 97 | X_train = sn.fit_transform(X_train) 98 | X_test = sn.transform(X_test) 99 | 100 | # Static information about the data set; will be extended later on 101 | # with information about the training itself. 102 | data = { 103 | 'seed': args.seed, 104 | 'species': args.species, 105 | 'antibiotic': args.antibiotic, 106 | 'n_peaks': args.peaks, 107 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 108 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 109 | 'normalize': args.normalize, 110 | } 111 | 112 | kernel = DiffusionKernel(sigma=1) 113 | clf = GaussianProcessClassifier(kernel=kernel) 114 | 115 | with warnings.catch_warnings(): 116 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 117 | warnings.filterwarnings('ignore', category=UserWarning) 118 | 119 | # Let's do the fitting in parallel, but the prediction can be done 120 | # without additional threading. 
121 | with parallel_backend(backend='loky'): 122 | clf.fit(X_train, y_train) 123 | 124 | data['kernel'] = repr(clf.kernel_) 125 | data['log_marginal_likelihood'] = clf.log_marginal_likelihood_value_ 126 | 127 | y_pred = clf.predict_proba(X_test) 128 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 129 | 130 | data['average_precision'] = 100 * average_precision 131 | 132 | y_pred = clf.predict(X_test) 133 | accuracy = accuracy_score(y_test, y_pred) 134 | 135 | data['accuracy'] = 100 * accuracy 136 | 137 | if args.output is not None: 138 | with open(args.output, 'w') as f: 139 | jt.dump(data, f, indent=4) 140 | else: 141 | print(jt.dumps(data, indent=4)) 142 | -------------------------------------------------------------------------------- /ismb2020_maldi/datasets/antibiotics.py: -------------------------------------------------------------------------------- 1 | """Dataset of MALDI-TOF spectra for antibiotic resistance prediction.""" 2 | import os 3 | 4 | from dotenv import load_dotenv 5 | import pandas as pd 6 | from sklearn.model_selection import train_test_split 7 | from maldi_learn.data import MaldiTofSpectrum 8 | 9 | from .dataset import Dataset 10 | 11 | # Dataset paths are specified in .env file in the root of the repository 12 | load_dotenv() 13 | SPECTRA_PATH = os.getenv('ANTIBIOTICS_SPECTRA_PATH') 14 | ENDPOINT_PATH = os.getenv('ANTIBIOTICS_ENDPOINT_PATH') 15 | 16 | 17 | class AntibioticResistanceDataset(Dataset): 18 | """Base class of datasets predicting antibiotic resistance.""" 19 | 20 | # endpoint_file_name = 'IDRES_clean.csv' 21 | 22 | def __init__(self, antibiotic, test_size=0.2, random_seed=2020, 23 | suffix=''): 24 | """Initialize the dataset. 25 | 26 | Args: 27 | antibiotic: Name (str) of the antibiotic to use for 28 | generating labels (endpoints). 29 | test_size: Fraction of the data that should be returned for 30 | testing. 31 | random_seed: Random seed for splitting the data into train and 32 | test. 33 | suffix: Suffix to use for the files to load. This suffix 34 | will be appended to the code specified in the endpoints data 35 | file. 36 | """ 37 | self.antibiotic = antibiotic 38 | self.suffix = suffix 39 | 40 | all_instances = self._make_binary_labels( 41 | self._read_endpoints_and_preprocess()) 42 | 43 | train_instances, test_instances = train_test_split( 44 | all_instances, 45 | test_size=test_size, 46 | random_state=random_seed, 47 | stratify=all_instances.values # stratify by labels 48 | ) 49 | 50 | self.all_instances, self.train_instances, self.test_instances = \ 51 | all_instances, train_instances, test_instances 52 | 53 | def _read_endpoints_and_preprocess(self): 54 | endpoint_file = os.path.join(ENDPOINT_PATH, self.endpoint_file_name) 55 | endpoints = pd.read_csv(endpoint_file, index_col='code') 56 | endpoints = endpoints.replace({ 57 | '-': float('NaN'), 58 | 'R(1)': float('NaN'), 59 | 'L(1)': float('NaN'), 60 | 'I(1)': float('NaN'), 61 | 'I(1), S(1)': float('NaN'), 62 | 'R(1), I(1)': float('NaN'), 63 | 'R(1), S(1)': float('NaN'), 64 | 'R(1), I(1), S(1)': float('NaN') 65 | }) 66 | 67 | return endpoints 68 | 69 | def _make_binary_labels(self, df): 70 | """ 71 | Creates binary labels by restricting the input data frame to the 72 | specified antibiotic. This is followed by dropping all NaNs, and 73 | making all labels binary (depending on resistance/susceptibility). 
74 | """ 75 | 76 | only_antibiotic = df[self.antibiotic] 77 | 78 | only_antibiotic = only_antibiotic.dropna( 79 | axis='index', how='any', inplace=False) 80 | 81 | return only_antibiotic.replace({'R': 1, 'I': 1, 'S': 0}) 82 | 83 | # TODO: might want to remove this 84 | def _subset_instances(self, *instance_lists): 85 | def subset_and_binarize(input_instances): 86 | """Remove unused antibiotics and not measured instances.""" 87 | only_antibiotic = input_instances[self.antibiotic] 88 | only_antibiotic = only_antibiotic.dropna(axis='index', how='any', inplace=False) 89 | return only_antibiotic.replace({'R': 1, 'I': 1, 'S': 0}) 90 | return [subset_and_binarize(instances) for instances in instance_lists] 91 | 92 | @staticmethod 93 | def _build_filepaths_from_codes(codes, suffix): 94 | return [os.path.join(SPECTRA_PATH, f'{code}{suffix}.txt') for code in codes] 95 | 96 | def _read_spectra(self, files): 97 | return [ 98 | MaldiTofSpectrum( 99 | pd.read_csv(f, sep=' ', comment='#', engine='c').values) 100 | for f in files 101 | ] 102 | 103 | def _read_data(self, instances): 104 | codes = instances.index 105 | files = self._build_filepaths_from_codes(codes, self.suffix) 106 | spectra = self._read_spectra(files) 107 | return spectra, instances 108 | 109 | @property 110 | def training_data(self): 111 | """Get spectra used for training.""" 112 | return self._read_data(self.train_instances) 113 | 114 | @property 115 | def validation_data(self): 116 | """Not implemented for now.""" 117 | raise NotImplementedError() 118 | 119 | @property 120 | def testing_data(self): 121 | """Get spectra used for testing.""" 122 | return self._read_data(self.test_instances) 123 | 124 | @property 125 | def complete_data(self): 126 | """Get all spectra.""" 127 | return self._read_data(self.all_instances) 128 | 129 | 130 | class EcoliAntibioticResistanceDataset(AntibioticResistanceDataset): 131 | """Dataset for E.coli antibiotic resistance.""" 132 | 133 | endpoint_file_name = 'IDRES_Ecoli.csv' 134 | 135 | 136 | class SaureusAntibioticResistanceDataset(AntibioticResistanceDataset): 137 | """Dataset for S.aureus antibiotic resistance.""" 138 | 139 | endpoint_file_name = 'IDRES_Saureus.csv' 140 | 141 | 142 | class KpneuAntibioticResistanceDataset(AntibioticResistanceDataset): 143 | """Dataset for K.pneumoniae antibiotic resistance.""" 144 | 145 | endpoint_file_name = 'IDRES_Kpneu.csv' 146 | -------------------------------------------------------------------------------- /ismb2020_maldi/baseline_maldiquant.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a baseline logistic regression classifier and reports the 4 | # results on all tasks. Uses pre-processed spectra and does not do 5 | # any peak calling. 
6 | 7 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 10 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 11 | 12 | from maldi_learn.vectorization import BinningVectorizer 13 | 14 | from imblearn.over_sampling import RandomOverSampler 15 | 16 | from sklearn.exceptions import FitFailedWarning 17 | from sklearn.exceptions import ConvergenceWarning 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.metrics import average_precision_score 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.model_selection import GridSearchCV 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.preprocessing import StandardScaler 24 | from sklearn.pipeline import Pipeline 25 | 26 | from joblib import parallel_backend 27 | 28 | import numpy as np 29 | import json_tricks as jt 30 | 31 | import argparse 32 | import os 33 | import warnings 34 | 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('-s', '--species', type=str, required=True) 40 | parser.add_argument('-a', '--antibiotic', type=str, required=True) 41 | parser.add_argument('-S', '--seed', type=int, required=False, default=2020) 42 | parser.add_argument('-o', '--output', type=str) 43 | parser.add_argument('--suffix', type=str, default='') 44 | 45 | args = parser.parse_args() 46 | 47 | species_to_dataset = { 48 | 'ecoli': EcoliAntibioticResistanceDataset, 49 | 'kpneu': KpneuAntibioticResistanceDataset, 50 | 'saureus': SaureusAntibioticResistanceDataset 51 | } 52 | 53 | dataset = species_to_dataset[args.species]( 54 | test_size=0.20, 55 | antibiotic=args.antibiotic, 56 | random_seed=args.seed, 57 | suffix=args.suffix 58 | ) 59 | 60 | X_train, y_train = dataset.training_data 61 | X_test, y_test = dataset.testing_data 62 | 63 | # Perform random oversampling in order to ensure class balance. This 64 | # is strictly speaking not required but we do it for the GP as well, 65 | # so in the interest of comparability, we have to do it here. 66 | 67 | ros = RandomOverSampler(random_state=args.seed) 68 | 69 | X_indices = np.asarray( 70 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 71 | 72 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 73 | X_train = np.take(X_train, X_indices.ravel()) 74 | 75 | # Static information about the data set; will be extended later on 76 | # with information about the training itself. 77 | data = { 78 | 'seed': args.seed, 79 | 'species': args.species, 80 | 'antibiotic': args.antibiotic, 81 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 82 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 83 | } 84 | 85 | param_grid = { 86 | 'bv__n_bins': [300, 600, 1800, 3600], 87 | 'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'], 88 | 'lr__C': 10. 
** np.arange(-4, 5), # 10^{-4}..10^{4} 89 | } 90 | 91 | data['param_grid'] = param_grid 92 | 93 | # Define pipeline and cross-validation setup 94 | 95 | pipeline = Pipeline( 96 | [ 97 | ('bv', BinningVectorizer( 98 | n_bins=0, 99 | min_bin=2000, 100 | max_bin=20000)), 101 | ('std', StandardScaler()), 102 | ('lr', LogisticRegression( 103 | class_weight='balanced', 104 | solver='saga' # supports L_1 and L_2 penalties 105 | ) 106 | ) 107 | ], 108 | memory=os.getenv('TMPDIR', default=None), 109 | ) 110 | 111 | grid_search = GridSearchCV( 112 | pipeline, 113 | param_grid=param_grid, 114 | scoring='average_precision', 115 | cv=StratifiedKFold(n_splits=5, shuffle=True, 116 | random_state=42), 117 | n_jobs=-1, 118 | ) 119 | 120 | with warnings.catch_warnings(): 121 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 122 | warnings.filterwarnings('ignore', category=FitFailedWarning) 123 | warnings.filterwarnings('ignore', category=UserWarning) 124 | 125 | # Let's do the fitting in parallel, but the prediction can be done 126 | # without additional threading. 127 | with parallel_backend('threading'): 128 | grid_search.fit(X_train, y_train) 129 | 130 | data['best_parameters'] = grid_search.best_params_ 131 | 132 | # AUPRC 133 | 134 | y_pred = grid_search.predict_proba(X_test) 135 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 136 | 137 | data['average_precision'] = 100 * average_precision 138 | 139 | # Accuracy 140 | 141 | y_pred = grid_search.predict(X_test) 142 | accuracy = accuracy_score(y_test, y_pred) 143 | 144 | data['accuracy'] = 100 * accuracy 145 | 146 | if args.output is not None: 147 | with open(args.output, 'w') as f: 148 | jt.dump(data, f, indent=4) 149 | else: 150 | print(jt.dumps(data, indent=4)) 151 | -------------------------------------------------------------------------------- /ismb2020_maldi/baseline_maldiquant_confidence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a baseline logistic regression classifier and reports the 4 | # confidence scores on our pre-defined task. 5 | 6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 10 | 11 | from maldi_learn.vectorization import BinningVectorizer 12 | 13 | from imblearn.over_sampling import RandomOverSampler 14 | 15 | from sklearn.exceptions import FitFailedWarning 16 | from sklearn.exceptions import ConvergenceWarning 17 | from sklearn.linear_model import LogisticRegression 18 | from sklearn.preprocessing import StandardScaler 19 | from sklearn.pipeline import Pipeline 20 | 21 | from joblib import parallel_backend 22 | 23 | import numpy as np 24 | import json_tricks as jt 25 | 26 | import argparse 27 | import os 28 | import warnings 29 | 30 | 31 | if __name__ == '__main__': 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('-S', '--seed', type=int, required=False, default=2020) 35 | parser.add_argument('-o', '--output', type=str) 36 | parser.add_argument('--suffix', type=str, default='') 37 | 38 | args = parser.parse_args() 39 | 40 | in_sample_species = 'saureus' 41 | in_sample_antibiotic = 'Amoxicillin-Clavulansaeure' 42 | 43 | # Antibiotics will not be used, but are specified because our data 44 | # set selection class demands it. 
45 | out_of_sample_species = ['ecoli', 'kpneu'] 46 | out_of_sample_antibiotics = ['Ciprofloxacin', 'Ciprofloxacin'] 47 | 48 | species_to_dataset = { 49 | 'ecoli': EcoliAntibioticResistanceDataset, 50 | 'kpneu': KpneuAntibioticResistanceDataset, 51 | 'saureus': SaureusAntibioticResistanceDataset 52 | } 53 | 54 | dataset = species_to_dataset[in_sample_species]( 55 | test_size=0.20, 56 | antibiotic=in_sample_antibiotic, 57 | random_seed=args.seed, 58 | suffix=args.suffix 59 | ) 60 | 61 | X_train, y_train = dataset.training_data 62 | X_test, y_test = dataset.testing_data 63 | 64 | # Perform random oversampling in order to ensure class balance. This 65 | # is strictly speaking not required but we do it for the GP as well, 66 | # so in the interest of comparability, we have to do it here. 67 | 68 | ros = RandomOverSampler(random_state=args.seed) 69 | 70 | X_indices = np.asarray( 71 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 72 | 73 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 74 | X_train = np.take(X_train, X_indices.ravel()) 75 | 76 | # Parameters extracted from the respective runs of the baseline 77 | # classifier for this particular scenario. 78 | n_bins = 3600 79 | C = 0.01 80 | penalty = 'l2' 81 | 82 | # Static information about the data set; will be extended later on 83 | # with information about the training itself. 84 | data = { 85 | 'seed': args.seed, 86 | 'in_sample_antibiotic': in_sample_antibiotic, 87 | 'in_sample_species': in_sample_species, 88 | 'out_of_sample_species': out_of_sample_species, 89 | 'out_of_sample_antibiotics': out_of_sample_antibiotics, 90 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 91 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 92 | 'n_bins': n_bins, 93 | 'C': C, 94 | 'penalty': penalty, 95 | } 96 | 97 | pipeline = Pipeline( 98 | [ 99 | ('bv', BinningVectorizer( 100 | n_bins=n_bins, 101 | min_bin=2000, 102 | max_bin=20000)), 103 | ('std', StandardScaler()), 104 | ('lr', LogisticRegression( 105 | class_weight='balanced', 106 | C=C, 107 | penalty=penalty, 108 | solver='saga' # supports L_1 and L_2 penalties 109 | ) 110 | ) 111 | ], 112 | memory=os.getenv('TMPDIR', default=None), 113 | ) 114 | 115 | # Makes subsequent operations easier to read 116 | clf = pipeline 117 | 118 | with warnings.catch_warnings(): 119 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 120 | warnings.filterwarnings('ignore', category=FitFailedWarning) 121 | warnings.filterwarnings('ignore', category=UserWarning) 122 | 123 | # Let's do the fitting in parallel, but the prediction can be done 124 | # without additional threading. 125 | with parallel_backend('loky'): 126 | clf.fit(X_train, y_train) 127 | 128 | # Get maximum probability for classifying a sample into *any* class, 129 | # based on the test data set. 
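    # For a binary task, `predict_proba` returns an array of shape
    # (n_samples, 2); the row-wise maximum serves as a simple confidence
    # score, with values close to 0.5 indicating an undecided model.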
130 | test_proba = clf.predict_proba(X_test) 131 | test_proba_max = np.amax(test_proba, axis=1) 132 | 133 | data['in_sample_test_proba'] = test_proba 134 | 135 | for species, antibiotic in zip(out_of_sample_species, 136 | out_of_sample_antibiotics): 137 | 138 | oos_dataset = species_to_dataset[species]( 139 | test_size=0.20, 140 | antibiotic=antibiotic, 141 | random_seed=args.seed, 142 | suffix=args.suffix, 143 | ) 144 | 145 | oos_test, _ = oos_dataset.testing_data 146 | 147 | oos_proba = clf.predict_proba(oos_test) 148 | oos_proba_max = np.amax(oos_proba, axis=1) 149 | 150 | data['out_of_sample_' + species + '_proba'] = oos_proba 151 | 152 | if args.output is not None: 153 | with open(args.output, 'w') as f: 154 | jt.dump(data, f, indent=4) 155 | else: 156 | print(jt.dumps(data, indent=4)) 157 | -------------------------------------------------------------------------------- /ismb2020_maldi/baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Trains a baseline logistic regression classifier and reports the 4 | # results on all tasks. 5 | 6 | from ismb2020_maldi.datasets import AntibioticResistanceDataset 7 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset 8 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset 9 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset 10 | 11 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer 12 | from maldi_learn.preprocessing import SubsetPeaksTransformer 13 | from maldi_learn.vectorization import BinningVectorizer 14 | 15 | from imblearn.over_sampling import RandomOverSampler 16 | 17 | from sklearn.exceptions import ConvergenceWarning 18 | from sklearn.linear_model import LogisticRegression 19 | from sklearn.metrics import average_precision_score 20 | from sklearn.metrics import accuracy_score 21 | from sklearn.model_selection import GridSearchCV 22 | from sklearn.model_selection import StratifiedKFold 23 | from sklearn.preprocessing import StandardScaler 24 | from sklearn.pipeline import Pipeline 25 | 26 | from joblib import parallel_backend 27 | 28 | import numpy as np 29 | import json_tricks as jt 30 | 31 | import argparse 32 | import os 33 | import warnings 34 | 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('-s', '--species', type=str, required=True) 40 | parser.add_argument('-a', '--antibiotic', type=str, required=True) 41 | parser.add_argument('-S', '--seed', type=int, required=False, default=2020) 42 | parser.add_argument('-o', '--output', type=str) 43 | parser.add_argument('-n', '--normalize', action='store_true') 44 | 45 | args = parser.parse_args() 46 | 47 | species_to_dataset = { 48 | 'ecoli': EcoliAntibioticResistanceDataset, 49 | 'kpneu': KpneuAntibioticResistanceDataset, 50 | 'saureus': SaureusAntibioticResistanceDataset 51 | } 52 | 53 | dataset = species_to_dataset[args.species]( 54 | test_size=0.20, 55 | antibiotic=args.antibiotic, 56 | random_seed=args.seed, 57 | ) 58 | 59 | X_train, y_train = dataset.training_data 60 | X_test, y_test = dataset.testing_data 61 | 62 | # Perform random oversampling in order to ensure class balance. This 63 | # is strictly speaking not required but we do it for the GP as well, 64 | # so in the interest of comparability, we have to do it here. 
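    # Oversampling operates on integer indices rather than on the raw
    # spectra, since the spectra are variable-length arrays that the
    # resampler cannot handle directly; `np.take` maps the resampled
    # indices back to spectra.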
65 | 66 | ros = RandomOverSampler(random_state=args.seed) 67 | 68 | X_indices = np.asarray( 69 | [i for i in range(0, len(X_train))]).reshape(-1, 1) 70 | 71 | X_indices, y_train = ros.fit_sample(X_indices, y_train) 72 | X_train = np.take(X_train, X_indices.ravel()) 73 | 74 | # Normalise on demand. This is an *external* flag because by 75 | # default, we should have no expectations about its efficacy 76 | # in practice. 77 | if args.normalize: 78 | tic = TotalIonCurrentNormalizer(method='sum') 79 | X_train = tic.fit_transform(X_train) 80 | X_test = tic.transform(X_test) 81 | 82 | 83 | # Static information about the data set; will be extended later on 84 | # with information about the training itself. 85 | data = { 86 | 'seed': args.seed, 87 | 'species': args.species, 88 | 'antibiotic': args.antibiotic, 89 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 90 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 91 | 'normalize': args.normalize, 92 | } 93 | 94 | param_grid = [ 95 | { 96 | 'pt__n_peaks': [50, 100, 200, 500, None], 97 | 'bv__n_bins': [75, 150, 300, 600, 1800, 3600], 98 | 'lr__penalty': ['l1', 'l2'], 99 | 'lr__C': 10. ** np.arange(-4, 5), # 10^{-4}..10^{4} 100 | }, 101 | { 102 | 'pt__n_peaks': [50, 100, 200, 500, None], 103 | 'bv__n_bins': [75, 150, 300, 600, 1800, 3600], 104 | 'lr__penalty': ['none'], 105 | } 106 | ] 107 | 108 | data['param_grid'] = param_grid 109 | 110 | # Define pipeline and cross-validation setup 111 | 112 | pipeline = Pipeline( 113 | [ 114 | ('pt', SubsetPeaksTransformer(n_peaks=0)), 115 | ('bv', BinningVectorizer( 116 | n_bins=3600, 117 | min_bin=2000, 118 | max_bin=20000)), 119 | ('std', StandardScaler()), 120 | ('lr', LogisticRegression( 121 | class_weight='balanced', 122 | solver='saga' # supports L_1 and L_2 penalties 123 | ) 124 | ) 125 | ], 126 | memory=os.getenv('TMPDIR', default=None), 127 | ) 128 | 129 | grid_search = GridSearchCV( 130 | pipeline, 131 | param_grid=param_grid, 132 | scoring='average_precision', 133 | cv=StratifiedKFold(n_splits=5, shuffle=True, 134 | random_state=42), 135 | n_jobs=-1, 136 | ) 137 | 138 | with warnings.catch_warnings(): 139 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 140 | warnings.filterwarnings('ignore', category=UserWarning) 141 | 142 | # Let's do the fitting in parallel, but the prediction can be done 143 | # without additional threading. 144 | with parallel_backend('loky', n_jobs=-1): 145 | grid_search.fit(X_train, y_train) 146 | 147 | data['best_parameters'] = grid_search.best_params_ 148 | 149 | # AUPRC 150 | 151 | y_pred = grid_search.predict_proba(X_test) 152 | average_precision = average_precision_score(y_test, y_pred[:, 1]) 153 | 154 | data['average_precision'] = 100 * average_precision 155 | 156 | # Accuracy 157 | 158 | y_pred = grid_search.predict(X_test) 159 | accuracy = accuracy_score(y_test, y_pred) 160 | 161 | data['accuracy'] = 100 * accuracy 162 | 163 | if args.output is not None: 164 | with open(args.output, 'w') as f: 165 | jt.dump(data, f, indent=4) 166 | else: 167 | print(jt.dumps(data, indent=4)) 168 | -------------------------------------------------------------------------------- /ismb2020_maldi/diffusion_kernel_confidence.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Performs a confidence estimation experiment on a subset of the data 4 | # sets in order to check whether we may reject samples from another 5 | # distribution. 
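#
# Example invocation (the sigma value is illustrative):
#
#     poetry run python diffusion_kernel_confidence.py --sigma 1.0 -S 2020 -o out.json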
6 | 
7 | from ismb2020_maldi.datasets import AntibioticResistanceDataset
8 | from ismb2020_maldi.datasets import EcoliAntibioticResistanceDataset
9 | from ismb2020_maldi.datasets import KpneuAntibioticResistanceDataset
10 | from ismb2020_maldi.datasets import SaureusAntibioticResistanceDataset
11 | 
12 | from maldi_learn.kernels import DiffusionKernel
13 | from maldi_learn.preprocessing import TotalIonCurrentNormalizer
14 | from maldi_learn.preprocessing import SubsetPeaksTransformer
15 | from maldi_learn.preprocessing import ScaleNormalizer
16 | 
17 | from sklearn.gaussian_process import GaussianProcessClassifier
18 | from sklearn.gaussian_process.kernels import RBF
19 | from sklearn.exceptions import ConvergenceWarning
20 | from sklearn.metrics import average_precision_score
21 | 
22 | from imblearn.over_sampling import RandomOverSampler
23 | 
24 | from joblib import parallel_backend
25 | 
26 | import numpy as np
27 | import json_tricks as jt
28 | 
29 | import argparse
30 | import os
31 | import warnings
32 | 
33 | 
34 | if __name__ == '__main__':
35 | 
36 |     parser = argparse.ArgumentParser()
37 |     parser.add_argument('-o', '--output', type=str)
38 |     parser.add_argument('-s', '--sigma', type=float, required=True)
39 |     parser.add_argument('-S', '--seed', type=int, default=2020)
40 |     parser.add_argument('-n', '--normalize', action='store_true')
41 | 
42 |     # By default, we assume that we want to use *all* the peaks because
43 |     # we are comparing our model to a pre-processed pipeline.
44 |     parser.add_argument('-p', '--peaks', type=int, required=False,
45 |                         default=None)
46 |     parser.add_argument('--suffix', default='')
47 | 
48 |     args = parser.parse_args()
49 | 
50 |     in_sample_species = 'saureus'
51 |     in_sample_antibiotic = 'Amoxicillin-Clavulansaeure'
52 |     n_peaks = args.peaks
53 | 
54 |     # Antibiotics will not be used, but are specified because our data
55 |     # set selection class demands it.
56 |     out_of_sample_species = ['ecoli', 'kpneu']
57 |     out_of_sample_antibiotics = ['Ciprofloxacin', 'Ciprofloxacin']
58 | 
59 | 
60 |     # Resolve species identifiers to the corresponding data set classes.
61 |     species_to_dataset = {
62 |         'ecoli': EcoliAntibioticResistanceDataset,
63 |         'kpneu': KpneuAntibioticResistanceDataset,
64 |         'saureus': SaureusAntibioticResistanceDataset
65 |     }
66 | 
67 |     dataset = species_to_dataset[in_sample_species](
68 |         test_size=0.20,
69 |         antibiotic=in_sample_antibiotic,
70 |         random_seed=args.seed,
71 |         suffix=args.suffix
72 |     )
73 | 
74 |     X_train, y_train = dataset.training_data
75 |     X_test, y_test = dataset.testing_data
76 | 
77 |     # Only perform scale normalisation if a suffix has been set; this
78 |     # should be made configurable.
79 |     if len(args.suffix) > 0:
80 |         sn = ScaleNormalizer()
81 |         X_train = sn.fit_transform(X_train)
82 |         X_test = sn.transform(X_test)
83 | 
84 |     # Perform random oversampling in order to ensure class balance. This
85 |     # is strictly speaking not required, but the baselines perform it as
86 |     # well, so in the interest of comparability, we do it here, too.
87 | 
88 |     ros = RandomOverSampler(random_state=args.seed)
89 | 
90 |     X_indices = np.asarray(
91 |         [i for i in range(0, len(X_train))]).reshape(-1, 1)
92 | 
93 |     X_indices, y_train = ros.fit_resample(X_indices, y_train)
94 |     X_train = np.take(X_train, X_indices.ravel())
95 | 
96 |     # Normalise on demand. This is an *external* flag because, by
97 |     # default, we should have no expectations about its efficacy
98 |     # in practice.
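    #
    # (Total-ion-current normalisation typically rescales each spectrum
    # by its summed intensity, so that spectra acquired at different
    # overall signal levels become comparable.)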
99 | if args.normalize: 100 | tic = TotalIonCurrentNormalizer() 101 | X_train = tic.fit_transform(X_train) 102 | X_test = tic.transform(X_test) 103 | 104 | # Sparsify the data by restricting everything to the peaks only. 105 | st = SubsetPeaksTransformer(n_peaks=n_peaks) 106 | 107 | X_train = st.fit_transform(X_train) 108 | X_test = st.transform(X_test) 109 | 110 | kernel = DiffusionKernel(sigma=args.sigma) 111 | clf = GaussianProcessClassifier(kernel=kernel, optimizer=None) 112 | 113 | # Static information about the data set; will be extended later on 114 | # with information about the training itself. 115 | data = { 116 | 'seed': args.seed, 117 | 'in_sample_antibiotic': in_sample_antibiotic, 118 | 'in_sample_species': in_sample_species, 119 | 'out_of_sample_species': out_of_sample_species, 120 | 'out_of_sample_antibiotics': out_of_sample_antibiotics, 121 | 'n_peaks': n_peaks, 122 | 'spectra_path': os.getenv('ANTIBIOTICS_SPECTRA_PATH'), 123 | 'endpoint_path': os.getenv('ANTIBIOTICS_ENDPOINT_PATH'), 124 | 'sigma': args.sigma, 125 | } 126 | 127 | with warnings.catch_warnings(): 128 | warnings.filterwarnings('ignore', category=ConvergenceWarning) 129 | warnings.filterwarnings('ignore', category=UserWarning) 130 | 131 | # Let's do the fitting in parallel, but the prediction can be done 132 | # without additional threading. 133 | with parallel_backend(backend='loky'): 134 | clf.fit(X_train, y_train) 135 | 136 | # Get maximum probability for classifying a sample into *any* class, 137 | # based on the test data set. 138 | test_proba = clf.predict_proba(X_test) 139 | test_proba_max = np.amax(test_proba, axis=1) 140 | 141 | data['in_sample_test_proba'] = test_proba 142 | 143 | for species, antibiotic in zip(out_of_sample_species, 144 | out_of_sample_antibiotics): 145 | 146 | oos_dataset = species_to_dataset[species]( 147 | test_size=0.20, 148 | antibiotic=antibiotic, 149 | random_seed=args.seed, 150 | suffix=args.suffix 151 | ) 152 | 153 | oos_test, _ = oos_dataset.testing_data 154 | 155 | # Only perform scale normalisation if a suffix has been set; this 156 | # should be made configurable. 157 | if len(args.suffix) > 0: 158 | oos_test = sn.transform(oos_test) 159 | 160 | if args.normalize: 161 | oos_test = tic.transform(oos_test) 162 | 163 | oos_test = st.transform(oos_test) 164 | 165 | oos_proba = clf.predict_proba(oos_test) 166 | oos_proba_max = np.amax(oos_proba, axis=1) 167 | 168 | data['out_of_sample_' + species + '_proba'] = oos_proba 169 | 170 | if args.output is not None: 171 | with open(args.output, 'w') as f: 172 | jt.dump(data, f, indent=4) 173 | else: 174 | print(jt.dumps(data, indent=4)) 175 | -------------------------------------------------------------------------------- /maldi-learn/maldi_learn/kernels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Kernels for assessing the similarity between MALDI-TOF spectra. 3 | ''' 4 | 5 | from sklearn.gaussian_process.kernels import Hyperparameter 6 | from sklearn.gaussian_process.kernels import StationaryKernelMixin 7 | from sklearn.gaussian_process.kernels import Kernel 8 | 9 | from sklearn.metrics import pairwise_distances 10 | from sklearn.metrics import pairwise_kernels 11 | 12 | from scipy.spatial.distance import cdist 13 | from scipy.spatial.distance import pdist 14 | 15 | import numpy as np 16 | import sys 17 | 18 | 19 | class DiffusionKernel(StationaryKernelMixin, Kernel): 20 | ''' 21 | Implements a diffusion kernel that performs iterative smoothing of 22 | a MALDI-TOF spectrum. 
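    In closed form, the kernel value between two peak lists x and y, as
    implemented below, is

        k(x, y) = 1 / (4 * pi * sigma)
                  * sum_{i, j} x_i * y_j * exp(-(m_i - n_j)**2 / (4 * sigma)),

    where m_i and n_j denote the peak positions (masses) and x_i and y_j
    the corresponding peak intensities.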
23 |     '''
24 | 
25 |     def __init__(self, sigma=1.0, sigma_bounds=(1e-5, 1e5)):
26 |         '''
27 |         Initialises a new instance of the kernel.
28 | 
29 |         Parameters:
30 |             sigma: Smoothing parameter
31 |             sigma_bounds: Tuple specifying the minimum and maximum bound
32 |                 of the sigma scale parameter.
33 |         '''
34 | 
35 |         self.sigma = sigma
36 |         self.sigma_bounds = sigma_bounds
37 |         # Monkey-patch sklearn's pairwise input validation so that ragged arrays of spectra can be passed to pairwise_kernels.
38 |         def passthrough(*args, **kwargs):
39 |             return args
40 | 
41 |         module = sys.modules['sklearn.metrics.pairwise']
42 |         module.check_pairwise_arrays = passthrough
43 | 
44 |         sys.modules['sklearn.metrics.pairwise'] = module
45 | 
46 |     @property
47 |     def hyperparameter_sigma(self):
48 |         return Hyperparameter('sigma', 'numeric', self.sigma_bounds)
49 | 
50 |     @property
51 |     def requires_vector_input(self):
52 |         '''
53 |         Returns whether the kernel works only on fixed-length feature
54 |         vectors.
55 |         '''
56 | 
57 |         return False
58 | 
59 |     def __call__(self, X, Y=None, eval_gradient=False):
60 |         '''
61 |         Returns the kernel value k(X, Y) and, if desired, its gradient
62 |         as well.
63 | 
64 |         Parameters
65 |         ----------
66 |         X : array of spectra
67 |             Left argument of the returned kernel k(X, Y)
68 |         Y : array of spectra
69 |             Right argument of the returned kernel k(X, Y). If None, k(X, X)
70 |             is evaluated instead.
71 |         eval_gradient : bool (optional, default=False)
72 |             Determines whether the gradient with respect to the kernel
73 |             hyperparameter is evaluated as well. Only supported when Y is None.
74 | 
75 |         Returns
76 |         -------
77 |         K : array, shape (n_samples_X, n_samples_Y)
78 |             Kernel k(X, Y)
79 |         K_gradient : array (opt.), shape (n_samples_X, n_samples_X, n_dims)
80 |             The gradient of the kernel k(X, X) with respect to the
81 |             hyperparameter of the kernel. Only returned when eval_gradient
82 |             is True.
83 |         '''
84 | 
85 |         def evaluate_kernel(x, y):
86 | 
87 |             # Get the positions (masses) of the two spectra. This could
88 |             # be rewritten more compactly following the new interface.
89 |             #
90 |             # TODO: simplify / refactor
91 |             x_positions = np.array(x[:, 0]).reshape(-1, 1)
92 |             y_positions = np.array(y[:, 0]).reshape(-1, 1)
93 | 
94 |             distances = pairwise_distances(
95 |                 x_positions,
96 |                 y_positions,
97 |                 metric='sqeuclidean'
98 |             )
99 | 
100 |             # Calculate scale factors as the outer product of the peak
101 |             # heights of the input data.
102 |             x_peaks = np.array(x[:, 1])
103 |             y_peaks = np.array(y[:, 1])
104 | 
105 |             P = np.outer(x_peaks, y_peaks)
106 |             K = np.multiply(P, np.exp(-distances / (4 * self.sigma)))
107 | 
108 |             return np.sum(K) / (4 * self.sigma * np.pi)
109 | 
110 |         def evaluate_gradient(x, y):
111 | 
112 |             # TODO: simplify / refactor
113 |             x_positions = np.array(x[:, 0]).reshape(-1, 1)
114 |             y_positions = np.array(y[:, 0]).reshape(-1, 1)
115 | 
116 |             distances = pairwise_distances(
117 |                 x_positions,
118 |                 y_positions,
119 |                 metric='sqeuclidean'
120 |             )
121 | 
122 |             # Calculate scale factors as the outer product of the peak
123 |             # heights of the input data.
124 |             x_peaks = np.array(x[:, 1])
125 |             y_peaks = np.array(y[:, 1])
126 | 
127 |             P = np.outer(x_peaks, y_peaks)
128 |             K = np.multiply(P, np.exp(-distances / (4 * self.sigma)))
129 | 
130 |             # Thanks to the simple form of the kernel, the gradient only
131 |             # requires an additional multiplication, followed by scaling
132 |             # it appropriately.
133 |             K_gradient = np.multiply(K, (distances - 4 * self.sigma))
134 | 
135 |             # Sum over all pairwise kernel values to get the full
136 |             # gradient between the two entries.
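            # Derivation sketch: dK_ij/dsigma = K_ij * d_ij / (4 * sigma^2),
            # while the 1 / (4 * pi * sigma) prefactor of the kernel
            # contributes an additional -K_ij / sigma term; combining the
            # two yields the common factor (d_ij - 4 * sigma) used above.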
137 | return np.sum(K_gradient) / (4 * self.sigma**2) 138 | 139 | if Y is None: 140 | if eval_gradient: 141 | K = pairwise_kernels(X, metric=evaluate_kernel) 142 | K_gradient = pairwise_kernels(X, metric=evaluate_gradient) 143 | 144 | return K, K_gradient[:, :, np.newaxis] 145 | 146 | else: 147 | return pairwise_kernels(X, metric=evaluate_kernel) 148 | else: 149 | 150 | # Following the original API here, which prohibits gradient 151 | # evaluation for this case. 152 | if eval_gradient: 153 | raise ValueError( 154 | 'Gradient can only be evaluated when Y is None.') 155 | 156 | return pairwise_kernels(X, Y, metric=evaluate_kernel) 157 | 158 | def diag(self, X): 159 | ''' 160 | Returns the diagonal of the kernel k(X, X). The result of this 161 | method is identical to np.diag(self(X)); however, it can be 162 | evaluated more efficiently since only the diagonal is evaluated. 163 | 164 | Parameters 165 | ---------- 166 | X : array, shape (n_samples_X, n_features) 167 | Left argument of the returned kernel k(X, Y) 168 | Returns 169 | ------- 170 | K_diag : array, shape (n_samples_X,) 171 | Diagonal of kernel k(X, X) 172 | ''' 173 | 174 | diag_values = np.zeros(len(X)) 175 | 176 | for i, x in enumerate(X): 177 | x_positions = np.array(x[:, 0]).reshape(-1, 1) 178 | 179 | distances = pairwise_distances( 180 | x_positions, 181 | x_positions, 182 | metric='sqeuclidean' 183 | ) 184 | 185 | x_peaks = np.array(x[:, 1]) 186 | 187 | P = np.outer(x_peaks, x_peaks) 188 | K = np.multiply(P, np.exp(-distances / (4 * self.sigma))) 189 | 190 | # Diagonal value for $x_i$ 191 | diag_values[i] = np.sum(K) 192 | 193 | return diag_values / (4 * self.sigma * np.pi) 194 | 195 | def __repr__(self): 196 | return f'{self.__class__.__name__}({self.sigma:.8f})' 197 | -------------------------------------------------------------------------------- /maldi-learn/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------
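For reference, a minimal usage sketch of the DiffusionKernel defined above (not part of the repository; it assumes maldi-learn is installed, a scikit-learn version compatible with the monkey-patched validation, and made-up peak values). Each spectrum is an array of shape (n_peaks, 2), with m/z positions in column 0 and intensities in column 1; spectra may have different numbers of peaks:

import numpy as np

from maldi_learn.kernels import DiffusionKernel

# Two toy spectra with different numbers of peaks; rows are (mass, intensity).
x = np.array([[2105.0, 0.8], [5032.0, 1.5], [9870.0, 0.3]])
y = np.array([[2110.0, 0.7], [9865.0, 0.4]])

# Collect the spectra in a one-dimensional object array, since their
# shapes differ.
X = np.empty(2, dtype=object)
X[0], X[1] = x, y

kernel = DiffusionKernel(sigma=10.0)

K = kernel(X)  # (2, 2) Gram matrix
assert np.allclose(np.diag(K), kernel.diag(X))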