├── .gitignore
├── Makefile
├── README.md
├── bioblp
│   ├── __init__.py
│   ├── benchmarking
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── config.py
│   │   ├── encoders.py
│   │   ├── experiment.py
│   │   ├── featurise.py
│   │   ├── hpo.py
│   │   ├── preprocess.py
│   │   ├── split.py
│   │   ├── train.py
│   │   ├── train_runner.py
│   │   └── train_utils.py
│   ├── data.py
│   ├── evaluate.py
│   ├── loaders
│   │   └── preprocessors.py
│   ├── logger.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── bioblp.py
│   │   └── encoders.py
│   ├── predict.py
│   ├── preprocess.py
│   ├── train.py
│   ├── train_argparse.py
│   └── utils
│       ├── __init__.py
│       ├── bioblp_utils.py
│       ├── pipeline.py
│       ├── training.py
│       ├── triples.py
│       └── util.py
├── conf
│   ├── complex-biokg-20220826.toml
│   ├── complex-biokg-full-20220826.toml
│   ├── complex-hetionet-20220826.toml
│   ├── dpi-benchmark-cv-20230423-lr.toml
│   ├── dpi-benchmark-cv-20230423-mlp-1.toml
│   ├── dpi-benchmark-cv-20230423-mlp-2.toml
│   ├── dpi-benchmark-cv-20230423-rf.toml
│   ├── dpi-benchmark-cv-r1-20230424-mlp.toml
│   └── dpi-benchmark-cv-r1-20230424-rflr.toml
├── data
│   └── conf
│       ├── complex-biokg-20220826.toml
│       ├── complex-biokg-full-20220826.toml
│       └── complex-hetionet-20220826.toml
├── environment.yml
├── fig.png
├── jobs
│   ├── biokg-bioblp-d-complex-initialized.sh
│   ├── biokg-bioblp-d-complex.sh
│   ├── biokg-bioblp-d-rotate-initialized.sh
│   ├── biokg-bioblp-d-rotate.sh
│   ├── biokg-bioblp-d-transe-initialized.sh
│   ├── biokg-bioblp-d-transe.sh
│   ├── biokg-bioblp-m-complex-bce-sweep.sh
│   ├── biokg-bioblp-m-complex-bce-sweep.yml
│   ├── biokg-bioblp-m-rotate-adagrad-sweep.sh
│   ├── biokg-bioblp-m-rotate-adagrad-sweep.yml
│   ├── biokg-bioblp-m-rotate-sweep.sh
│   ├── biokg-bioblp-m-rotate-sweep.yml
│   ├── biokg-bioblp-m-transe-sweep.sh
│   ├── biokg-bioblp-m-transe-sweep.yml
│   ├── biokg-bioblp-p-complex-bce-sweep.sh
│   ├── biokg-bioblp-p-complex-bce-sweep.yml
│   ├── biokg-bioblp-p-complex-initialized.sh
│   ├── biokg-bioblp-p-rotate-initialized.sh
│   ├── biokg-bioblp-p-rotate-sweep.sh
│   ├── biokg-bioblp-p-rotate-sweep.yml
│   ├── biokg-bioblp-p-transe-initialized.sh
│   ├── biokg-bioblp-p-transe-sweep.sh
│   ├── biokg-bioblp-p-transe-sweep.yml
│   ├── biokg-complex-bce-sweep.sh
│   ├── biokg-complex-bce-sweep.yml
│   ├── biokg-complex-sweep.sh
│   ├── biokg-complex-sweep.yml
│   ├── biokg-rotate-bce-sweep.sh
│   ├── biokg-rotate-bce-sweep.yml
│   ├── biokg-rotate-sweep.sh
│   ├── biokg-rotate-sweep.yml
│   ├── biokg-transe-sweep.sh
│   ├── biokg-transe-sweep.yml
│   ├── complex.sh
│   ├── hetionet-complex-bce-sweep.sh
│   ├── hetionet-complex-bce-sweep.yml
│   ├── hetionet-complex-sweep.sh
│   ├── hetionet-complex-sweep.yml
│   ├── hetionet-rotate-bce-sweep.sh
│   ├── hetionet-rotate-bce-sweep.yml
│   ├── hetionet-rotate-sweep.sh
│   ├── hetionet-rotate-sweep.yml
│   ├── hetionet-transe-sweep.sh
│   ├── hetionet-transe-sweep.yml
│   ├── rotate-dummy.sh
│   └── rotate.sh
├── loaders
│   └── placeholder.txt
├── logs
│   └── placeholder.txt
├── notebooks
│   ├── 00-clean-biokg-benchmarks.ipynb
│   ├── 01-generate-biokg-splits.ipynb
│   ├── 01_01_biokg-data-prep-for-kge.ipynb
│   ├── 01_01_disease_mesh_notes_retrieval.ipynb
│   ├── 01_02_disease_bert_encodings.ipynb
│   ├── 02-01-biokg_benchmarks_eda.ipynb
│   ├── 02-02-biokg_benchmarks_data_prep.ipynb
│   ├── 02-03-benchmark-results.ipynb
│   ├── 02-03-biokg_benchmarks_data_embedders.ipynb
│   ├── 02_01_01-biokg_benchmarks_eda.ipynb
│   ├── 02_01_02-biokg_benchmark-reconciliation.ipynb
│   ├── 02_03_01-biokg_bm_dpi_clf-mlp.ipynb
│   ├── 02_04_01-sanity-check-benchmark-ppi.ipynb
│   ├── 02_04_01_biokg_bm_dpi_clf_nestedcv.ipynb
│   ├── 02_99-benchmark-prep-yamanashi-dpi.ipynb
│   ├── 03-00-nested-cv.ipynb
│   ├── 03-frequency-baseline.ipynb
│   ├── 04_00_ProtTrans_embeddings_biokG.ipynb
│   ├── 04_01_Load & merge protein embeddings_BioKG.ipynb
│   ├── 05-00-Load HetioNet - Get Gene to Protein mappings.ipynb
│   ├── 05-01-Load HetioNet - Protein Embedding Generation.ipynb
│   ├── 06-01 - Molecular Embeddings - BioKG.ipynb
│   ├── 06-hetionet.ipynb
│   ├── 07_00_evaluate-link-prediction_archived.ipynb
│   ├── 07_01_eval_lp_deepdive.ipynb
│   ├── 07_02_eval_lp_node_degree_effect.ipynb
│   ├── 08-00-evaluate-link-prediction.ipynb
│   ├── 08-01-inductive-evaluation.ipynb
│   ├── 08-02-per-triple-evaluation.ipynb
│   ├── 09-wandb-hparam-figures.ipynb
│   ├── 10-pretraining-significance.ipynb
│   ├── 11-pretraining-curves.ipynb
│   ├── 12-per-relation-figures.ipynb
│   ├── 13-node-degree-analysis-v2.ipynb
│   ├── 99-train_hetionet.ipynb
│   └── nb_utils
│       └── eval_utils.py
├── poetry.lock
├── pyproject.toml
├── requirements.txt
└── tests
    ├── __init__.py
    ├── benchmarking
    │   ├── __init__.py
    │   ├── bm_test_conf.toml
    │   ├── test_config.py
    │   ├── test_encoders.py
    │   ├── test_featurise.py
    │   └── test_train.py
    ├── test_encoders.py
    └── test_version.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv*
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Data paths
132 | data/
133 |
134 | # Generated artifacts
135 | wandb/
136 | /models
137 |
138 | # Editor
139 | .vscode
140 |
141 | # PyCharm
142 | .idea
143 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Self-Documented Makefile https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html
2 | .PHONY: clean setup install
3 |
4 | #################################################################################
5 | # GLOBALS #
6 | #################################################################################
7 |
8 | SHELL=/bin/bash
9 | PYTHON = python
10 | PROJECT_NAME = bioblp
11 | PACKAGE_NAME = bioblp
12 | PYTHON_INTERPRETER = python3
13 | KERNEL_NAME=Python (${PROJECT_NAME})
14 | PYTHON_FULL_V = $(shell python -V)
15 | PYTHON_V := $(PYTHON_FULL_V:Python%=%)
16 | CONDA_ENV=${PROJECT_NAME}-env
17 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
18 | CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate
19 | #PYTHON_V=3.8.6
20 |
21 | #################################################################################
22 | # COMMANDS #
23 | #################################################################################
24 |
25 | default: help
26 |
27 | print-%: ## Prints a variable value. Usage: make print-VARIABLE, eg: make print-TAG, result: TAG = 0.0.0
28 | @echo $* = $($*)
29 |
30 | setup:
31 | make install_poetry
32 | @echo $(shell poetry --version) || "Install Poetry"
33 |
34 | install_poetry: ## installs poetry. Remember to `source /home/jovyan/.poetry/env` from a terminal after running this recipe. Need only be run once
35 | curl -sSL https://install.python-poetry.org | python3 -
36 |
37 | install:
38 | poetry install
39 | poetry export -f requirements.txt --without-hashes --with dev --output requirements.txt
40 |
41 |
42 | update:
43 | poetry update
44 | poetry export -f requirements.txt --without-hashes --with dev --output requirements.txt
45 |
46 | test:
47 | poetry run pytest tests -s -vv
48 |
49 | create_ipython_kernel:
50 | poetry run ipython kernel install --user --display-name="${KERNEL_NAME}"
51 |
52 | freeze_requirements: ## Writes python project dependencies as a requirements.txt
53 | poetry export -f requirements.txt --output requirements.txt --without-hashes
54 |
55 | freeze_dev_requirements: ## Writes python project dependencies (including dev) as a requirements-dev.txt
56 | poetry export -f requirements.txt --output requirements-dev.txt --without-hashes --dev
57 |
58 | dist: ## Builds a distribution package with version ${PACKAGE_NAME}.__version__, eg: dist/test_me-0.0.0.tar.gz
59 | make clean
60 | poetry build
61 |
62 |
63 | ### JH setup
64 |
65 | setup_jh_env:
66 | make conda_setup
67 | make create_conda_env
68 | make create_conda_kernel
69 |
70 | conda_setup: # ensures conda env is persistent, need run only once
71 | mkdir -p /home/jovyan/.conda/pkgs/
72 | touch /home/jovyan/.conda/pkgs/urls.txt
73 |
74 | create_conda_env:
75 | conda create --yes --prefix /home/jovyan/.conda/envs/${CONDA_ENV} ipykernel
76 | #conda create --yes --prefix /home/jovyan/.conda/envs/${CONDA_ENV} python==${PYTHON_V} ipykernel
77 | ($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make setup | source /home/jovyan/.poetry/env)
78 | # to install the project module as a dependency
79 | ($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make install)
80 | conda env export -n ${CONDA_ENV} -f ${PROJECT_DIR}/environment.yml
81 |
82 | create_conda_kernel:
83 | python -m ipykernel install --user --name=${CONDA_ENV} --display-name="${KERNEL_NAME}"
84 |
85 | update_conda_env:
86 | #($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make update)
87 | conda env update --name ${CONDA_ENV} -f ${PROJECT_DIR}/environment.yml --prune
88 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BioBLP: A Modular Framework for Learning on Multimodal Biomedical Knowledge Graphs
2 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 | This is the official repository implementing BioBLP, presented in "BioBLP: A Modular Framework for Learning on Multimodal Biomedical Knowledge Graphs", published in the Journal of Biomedical Semantics ([link](https://doi.org/10.1186/s13326-023-00301-y)).
16 |
17 | BioBLP is a framework that allows encoding a diverse set of multimodal data that can appear in biomedical knowledge graphs. It is based on the idea of learning embeddings for each modality separately, and then combining them into a single multimodal embedding space. The framework is modular, and allows for easy integration of new modalities.
18 |
19 | To cite our work, please use the following:
20 |
21 | ```bibtex
22 | @article{bioblp,
23 | author = {Daniel Daza and
24 | Dimitrios Alivanistos and
25 | Payal Mitra and
26 | Thom Pijnenburg and
27 | Michael Cochez and
28 | Paul Groth},
29 | title = {BioBLP: a modular framework for learning on multimodal biomedical
30 | knowledge graphs},
31 | journal = {J. Biomed. Semant.},
32 | volume = {14},
33 | number = {1},
34 | pages = {20},
35 | year = {2023},
36 | url = {https://doi.org/10.1186/s13326-023-00301-y},
37 | doi = {10.1186/S13326-023-00301-Y},
38 | }
39 | ```
40 |
41 | ## Usage
42 |
43 | ### 1. Install the requirements
44 |
45 | We recommend using [Anaconda](https://www.anaconda.com/) to manage the dependencies. The following command will create and activate a new conda environment with all the required dependencies.
46 |
47 | ```bash
48 | conda env create -f environment.yml && conda activate bioblp
49 | ```
50 |
51 | ### 2. Download the data
52 |
53 | The data can be downloaded from [here](https://doi.org/10.5281/zenodo.8005711) as a tar.gz file. This corresponds to our version of BioKG that has been decoupled from the benchmarks (see the paper for more details), and it also includes the necessary attribute data for proteins, molecules, and diseases.
54 | The file should be placed inside the `data` folder and decompressed:
55 |
56 | ```bash
57 | tar xzf biokgb.tar.gz
58 | ```
59 |
60 | ### 3. Training link prediction models
61 |
62 | Use the `bioblp.train` module to train a link prediction model. For example, to train a BioBLP-D model (which encodes disease descriptions) using the RotatE scoring function, use:
63 |
64 | ```sh
65 | python -m bioblp.train \
66 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
67 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
68 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
69 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
70 | --model=rotate --dimension=256 --loss_fn=crossentropy --optimizer=adam \
71 | --learning_rate=2e-5 --warmup_fraction=0.05 --num_epochs=100 \
72 | --batch_size=1024 --eval_batch_size=64 --num_negatives=512 --in_batch_negatives=True
73 | ```
74 |
75 | On an NVIDIA A100 40GB GPU, the above command takes about 9 hours to train.
76 |
77 | We use Weights and Biases to log the experiments, which is disabled by default. To enable it, add `--log_wandb=True` to the command above.
78 |
79 | More examples will be added soon.
80 |
81 | ### 4. Benchmark tasks
82 | * Pre-generate the input dataset with flags indicating whether the links are known or novel.
83 | * Run `bioblp.benchmarking.preprocess` to prepare the benchmark dataset for ML by shuffling, generating splits, etc.
84 | * `bioblp.benchmarking.featurise` can be used to featurise a list of entity pairs into vectors composed from the individual entity vectors. The full pipeline can also be run end to end; see the example at the end of this section.
85 |
86 | Custom usage:
87 | ```bash
88 | $ python -m bioblp.benchmarking.featurize -i data/benchmarks/processed/dpi_benchmark_p2n-1-10.tsv -o data/features -t kgem -f models/1baon0eg/ -j concatenate
89 | ```
90 |
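91 | The full benchmark pipeline (negative sampling, feature generation, and data splits) can also be run end to end with `bioblp/benchmarking/experiment.py`, given an experiment config file; see `bioblp/benchmarking/README.md` for details. A sketch with illustrative config and paths:
92 | 
93 | ```bash
94 | python bioblp/benchmarking/experiment.py \
95 |     --conf=conf/dpi-benchmark-cv-20230423-rf.toml \
96 |     --bm_file=data/benchmarks/transductive/dpi_fda.tsv \
97 |     --override_data_root=./ \
98 |     --n_proc=1
99 | ```
100 | 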
--------------------------------------------------------------------------------
/bioblp/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 |
--------------------------------------------------------------------------------
/bioblp/benchmarking/README.md:
--------------------------------------------------------------------------------
1 | # Benchmark
2 |
3 | ## Experiment preparation
4 | Command to prepare experimental data, given a config file. This script loads the raw benchmark dataset, performs negative sampling, and generates features and splits:
5 |
6 | ```bash
7 | python bioblp/benchmarking/experiment.py \
8 | --conf=conf/dpi-benchmark-cv-20230413.toml \
9 | --override_data_root=./ \
10 | --bm_file=data/benchmarks/transductive/dpi_fda.tsv \
11 | --n_proc=1
12 | ```
13 |
14 | You can also execute the steps in `experiment.py` individually using the commands below.
15 |
16 | 1. Negative sampling.
17 | ```bash
18 | python bioblp/benchmarking/preprocess.py \
19 | --bm_data_path=data/benchmarks/experiments/DPI/1681398697/features/raw.pt \
20 | --kg_triples_dir=data/benchmarks/experiments/encoders/rotate/training_triples/ \
21 | --num_negs_per_pos=10 \
22 | --outdir=data/benchmarks/experiments/DPI/1681398697/sampled/ \
23 | --override_run_id=1681398697
24 | ```
25 |
26 | 2. Generate features.
27 |
28 | ```bash
29 | python bioblp/benchmarking/featurise.py \
30 | --conf=conf/dpi-benchmark-cv-20230413.toml \
31 | --bm_file=data/benchmarks/experiments/DPI/1681398697/sampled/dpi_fda_p2n-1-10.tsv \
32 | --override_data_root=./ \
33 | --override_run_id=1681398697
34 |
35 | ```
36 |
37 | 3. Preparing data splits for cross validation.
38 |
39 | ```bash
40 | python bioblp/benchmarking/split.py \
41 | --conf=conf/dpi-benchmark-cv-20230413.toml \
42 | --data=data/benchmarks/experiments/DPI/1681398697/features/raw.pt \
43 | --outdir=data/benchmarks/experiments/DPI/1681398697/splits/ \
44 | --n_folds=5 \
45 | --override_data_root=./ \
46 | --override_run_id=1681398697
47 | ```
48 |
49 | ## Model training
50 |
51 | Sample command for `train.py`. This script performs the training procedure for one model configuration, on one particular data split.
52 | ```bash
53 | python bioblp/benchmarking/train.py \
54 | --model_clf=RF \
55 | --model_feature=complex \
56 | --feature_dir=data/benchmarks/experiments/dpi_fda/1681301749/features/ \
57 | --splits_path=data/benchmarks/experiments/dpi_fda/1681301749/splits/train-test-split.pt \
58 | --split_idx=0 \
59 | --n_iter=3 \
60 | --refit_params=AUCPR,AUCROC \
61 | --outdir=data/benchmarks/experiments/dpi_fda/1681301749/models/ \
62 | --model_label=complex__RF \
63 | --timestamp=1681301749 \
64 | --wandb_tag=dev
65 | ```
66 |
67 | The `train_runner` script contains the procedure to run a full experiment, given a configuration file. This performs the complete CV routine for all model configurations contained in the config file. It also supports multiprocessing through the `--n_proc` flag. For example:
68 | ```bash
69 | python bioblp/benchmarking/train_runner.py \
70 | --conf conf/dpi-benchmark-cv-20230413.toml \
71 | --override_data_root=./ \
72 | --override_run_id=1681398697 \
73 | --tag=dpi-20230413 \
74 | --n_proc=5
75 | ```
76 |
77 | In its current implementation, the multiprocessing capability conflicts with PyTorch on GPU. For MLP models using a GPU, we recommend setting `--n_proc=1`:
78 | ```bash
79 | python bioblp/benchmarking/train_runner.py \
80 | --conf conf/dpi-benchmark-cv-20230413-mlp.toml \
81 | --override_data_root=./ \
82 | --override_run_id=1681398697 \
83 | --tag=dpi-20230413 \
84 | --n_proc=1
85 | ```
86 |
87 | ## WandB logging
88 |
89 | Logging to WandB is turned off by default. Set `LOG_WANDB = True` in `train.py` to enable it.
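90 | 
91 | ## Configuration file
92 | 
93 | The commands above read a TOML experiment configuration. The sketch below is inferred from `bioblp/benchmarking/config.py`: the top-level keys and section names (`sampling`, `features`, `split`, `train`, `models`) match the code, while the values and the `encoder_args`/`models` keys shown here are illustrative.
94 | 
95 | ```toml
96 | # Top-level paths (illustrative); data_root can be overridden with --override_data_root
97 | data_root = "./"
98 | experiment_root = "data/benchmarks/experiments/DPI"
99 | 
100 | [sampling]
101 | outdir = "sampled"
102 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
103 | num_negs_per_pos = 10
104 | 
105 | [features]
106 | outdir = "features"
107 | transform = "concat"            # illustrative value
108 | missing_values = "drop"
109 | encoders = ["complex", "rotate"]
110 | 
111 | [features.encoder_args.complex]
112 | # encoder-specific arguments go here (keys are illustrative)
113 | model_dir = "models/complex/"
114 | 
115 | [split]
116 | outdir = "splits"
117 | n_splits = 5
118 | 
119 | [train]
120 | outdir = "models"
121 | splits_file = "cv-splits.pt"
122 | refit_params = ["AUCPR", "AUCROC"]
123 | n_iter = 10
124 | 
125 | [models.complex__RF]
126 | feature = "complex"             # must match a generated feature file, e.g. features/complex.pt
127 | # plus any model-specific keys consumed by train.py
128 | ```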
--------------------------------------------------------------------------------
/bioblp/benchmarking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/benchmarking/__init__.py
--------------------------------------------------------------------------------
/bioblp/benchmarking/config.py:
--------------------------------------------------------------------------------
1 |
2 | import abc
3 | import toml
4 | import json
5 |
6 | from dataclasses import dataclass, field
7 | from typing import List
8 | from pathlib import Path
9 |
10 |
11 | def load_toml(toml_path: str) -> dict:
12 | toml_path = Path(toml_path)
13 | config = {}
14 | with open(toml_path, "r") as f:
15 | config = toml.load(f)
16 |
17 | return config
18 |
19 |
20 | class ConfigJSONEncoder(json.JSONEncoder):
21 | def default(self, obj):
22 |         # add conditional logic for any data structures that require special care;
23 |         # Path objects are serialised as resolved path strings
24 | if isinstance(obj, Path):
25 | return str(obj.resolve())
26 | return json.JSONEncoder.default(self, obj)
27 |
28 |
29 | @dataclass
30 | class BenchmarkStepBaseConfig(abc.ABC):
31 | data_root: str
32 | experiment_root: str
33 | run_id: str
34 | outdir: str
35 |
36 | @classmethod
37 | def from_toml(cls, toml_path, run_id):
38 | raise NotImplementedError
39 |
40 | def resolve_outdir(self) -> Path:
41 | outdir = Path(self.data_root)\
42 | .joinpath(self.experiment_root)\
43 | .joinpath(self.run_id)\
44 | .joinpath(self.outdir)
45 |
46 | return outdir
47 |
48 |
49 | @dataclass
50 | class BenchmarkPreprocessConfig(BenchmarkStepBaseConfig):
51 | num_negs_per_pos: int
52 | kg_triples_dir: str
53 |
54 | @classmethod
55 | def from_toml(cls, toml_path: str, run_id: str):
56 | config_toml = load_toml(toml_path)
57 |
58 | cfg = config_toml.get("sampling")
59 |
60 | data_root = config_toml.get("data_root")
61 | experiment_root = config_toml.get("experiment_root")
62 |
63 | cfg.update({"data_root": data_root})
64 | cfg.update({"experiment_root": experiment_root})
65 | cfg.update({"run_id": run_id})
66 |
67 | return cls(**cfg)
68 |
69 |
70 | @dataclass
71 | class BenchmarkFeatureConfig(BenchmarkStepBaseConfig):
72 | transform: str
73 | missing_values: str
74 | encoders: list
75 | encoder_args: dict
76 |
77 | @classmethod
78 | def from_toml(cls, toml_path: str, run_id: str):
79 | conf_path = Path(toml_path)
80 | config_toml = load_toml(conf_path)
81 |
82 | data_root = config_toml.get("data_root")
83 | experiment_root = config_toml.get("experiment_root")
84 |
85 | cfg = config_toml.get("features")
86 |
87 | cfg.update({"data_root": data_root})
88 | cfg.update({"experiment_root": experiment_root})
89 | cfg.update({"run_id": run_id})
90 |
91 | return cls(**cfg)
92 |
93 |
94 | @dataclass
95 | class BenchmarkSplitConfig(BenchmarkStepBaseConfig):
96 | n_splits: int
97 |
98 | @classmethod
99 | def from_toml(cls, toml_path: str, run_id: str):
100 | conf_path = Path(toml_path)
101 | config_toml = load_toml(conf_path)
102 |
103 | data_root = config_toml.get("data_root")
104 | experiment_root = config_toml.get("experiment_root")
105 |
106 | cfg = config_toml.get("split")
107 |
108 | cfg.update({"data_root": data_root})
109 | cfg.update({"experiment_root": experiment_root})
110 | cfg.update({"run_id": run_id})
111 |
112 | return cls(**cfg)
113 |
114 |
115 | @dataclass
116 | class BenchmarkTrainConfig(BenchmarkStepBaseConfig):
117 | feature_dir: str
118 | splits_dir: str
119 | splits_file: str
120 | models: dict
121 | refit_params: List[str]
122 | n_iter: int = field(default=10, metadata={"help": "Number of HPO trials"})
123 |
124 | @classmethod
125 | def from_toml(cls, toml_path, run_id):
126 | conf = load_toml(toml_path=toml_path)
127 | cfg = {}
128 |
129 | cfg["models"] = conf.get("models")
130 |
131 | cfg.update(conf.get("train"))
132 |
133 | cfg["data_root"] = conf.get("data_root")
134 | cfg["experiment_root"] = conf.get("experiment_root")
135 | cfg["feature_dir"] = conf.get("features").get("outdir")
136 | cfg["splits_dir"] = conf.get("split").get("outdir")
137 |
138 | cfg.update({"run_id": run_id})
139 |
140 | return cls(**cfg)
141 |
142 | def resolve_feature_dir(self) -> Path:
143 | feature_dir = Path(self.data_root)\
144 | .joinpath(self.experiment_root)\
145 | .joinpath(self.run_id)\
146 | .joinpath(self.feature_dir)
147 |
148 | return feature_dir
149 |
150 | def resolve_splits_file(self) -> Path:
151 | splits_path = Path(self.data_root)\
152 | .joinpath(self.experiment_root)\
153 | .joinpath(self.run_id)\
154 | .joinpath(self.splits_dir)\
155 | .joinpath(self.splits_file)
156 |
157 | return splits_path
158 |
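159 | # Example usage (config path and run_id are illustrative):
160 | #   cfg = BenchmarkTrainConfig.from_toml("conf/dpi-benchmark-cv-20230423-rf.toml", run_id="1681398697")
161 | #   print(cfg.resolve_feature_dir(), cfg.resolve_splits_file())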
--------------------------------------------------------------------------------
/bioblp/benchmarking/experiment.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from time import time
3 | from pathlib import Path
4 | from bioblp.benchmarking.preprocess import main as sampling_main
5 | from bioblp.benchmarking.config import BenchmarkPreprocessConfig
6 |
7 | from bioblp.benchmarking.featurise import main as featurise_main
8 | from bioblp.benchmarking.split import main as split_main
9 |
10 |
11 | def run_experiment(args):
12 |
13 | experiment_id = str(int(time()))
14 |
15 | override_data_root = Path(
16 | args.override_data_root) if args.override_data_root is not None else None
17 |
18 | #
19 | # Negative sampling
20 | #
21 | preprocess_config = BenchmarkPreprocessConfig.from_toml(
22 | args.conf, run_id=experiment_id)
23 |
24 | if override_data_root:
25 | preprocess_config.data_root = override_data_root
26 |
27 | sampled_bm_filepath = sampling_main(bm_data_path=args.bm_file,
28 | kg_triples_dir=preprocess_config.kg_triples_dir,
29 | num_negs_per_pos=preprocess_config.num_negs_per_pos,
30 | outdir=preprocess_config.resolve_outdir(),
31 | override_run_id=experiment_id)
32 | #
33 | # Prepare features
34 | #
35 | featurise_main(bm_file=sampled_bm_filepath,
36 | conf=args.conf,
37 | override_data_root=override_data_root,
38 | override_run_id=experiment_id)
39 | #
40 | # Prepare splits
41 | #
42 | split_main(data=sampled_bm_filepath,
43 | conf=args.conf,
44 | override_data_root=override_data_root,
45 | override_run_id=experiment_id)
46 |
47 |
48 | def get_parser() -> ArgumentParser:
49 | parser = ArgumentParser(
50 | description="Run full benchmark experiment procedure")
51 | parser.add_argument("--conf", type=str,
52 | help="Path to experiment configuration")
53 | parser.add_argument("--bm_file", type=str, help="Path to benchmark data")
54 | parser.add_argument("--override_data_root", type=str, default=None,
55 | help="Path to root of data tree")
56 | parser.add_argument(
57 | "--n_proc", type=int, default=-1, help="Number of cores to use in process."
58 | )
59 | parser.add_argument("--tag", type=str,
60 | help="Optional tag to add to wandb runs")
61 | parser.add_argument("--dev_run", action='store_true',
62 | help="Quick dev run")
63 | return parser
64 |
65 |
66 | if __name__ == "__main__":
67 |
68 | args = get_parser().parse_args()
69 | run_experiment(args)
70 |
--------------------------------------------------------------------------------
/bioblp/benchmarking/featurise.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import json
3 |
4 | import pandas as pd
5 | import numpy as np
6 |
7 | from argparse import ArgumentParser
8 | from dataclasses import asdict
9 | from functools import reduce
10 |
11 | from torch import Tensor
12 |
13 | from pathlib import Path
14 | from time import time
15 | from tqdm import tqdm
16 |
17 | from typing import Tuple, List, Dict
18 |
19 | from bioblp.logger import get_logger
20 | from bioblp.benchmarking.encoders import get_encoder
21 | from bioblp.benchmarking.encoders import MissingValueMethod
22 | from bioblp.benchmarking.encoders import EntityPairEncoder
23 | from bioblp.benchmarking.encoders import EntityEncoder
24 | from bioblp.benchmarking.encoders import NoiseEncoder
25 | from bioblp.benchmarking.encoders import StructuralPairEncoder
26 | from bioblp.benchmarking.encoders import RandomNoisePairEncoder
27 | from bioblp.benchmarking.encoders import KGEMPairEncoder
28 | from bioblp.benchmarking.config import BenchmarkFeatureConfig, ConfigJSONEncoder
29 |
30 | from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET
31 | from bioblp.benchmarking.encoders import ROTATE, TRANSE, COMPLEX, STRUCTURAL, NOISE, LABEL
32 |
33 |
34 | logger = get_logger(__name__)
35 |
36 |
37 | #
38 | # Building script
39 | #
40 |
41 |
42 | def save_features(outdir: Path, label: str, feature: Tensor, labels: Tensor):
43 | outfile = outdir.joinpath(f"{label}.pt")
44 |
45 | torch_obj = {"X": feature, "y": labels}
46 | torch.save(torch_obj, outfile)
47 |
48 |
49 | def build_encodings(config: BenchmarkFeatureConfig, pairs: np.array, encoders: List[str],
50 | encoder_args: Dict[str, dict], entities_filter: List[str]) -> Tuple[str, Tensor, Tensor]:
51 | encoded_bm = []
52 |
53 | for encoder_i_label in tqdm(encoders, desc=f"Encoding benchmarks..."):
54 | logger.info(f"Encoding with {encoder_i_label}")
55 | encoder_i_args = encoder_args.get(encoder_i_label)
56 |
57 | pair_encoder = get_encoder(encoder_i_label,
58 | encoder_i_args,
59 | entities=entities_filter)
60 |
61 | missing_value_method = MissingValueMethod(config.missing_values)
62 |
63 | encoded_pairs, encoded_mask = pair_encoder.encode(pairs,
64 | missing_value=missing_value_method,
65 | transform=config.transform)
66 |
67 | encoded_bm.append((encoder_i_label, encoded_pairs, encoded_mask))
68 | return encoded_bm
69 |
70 |
71 | def apply_common_mask(encoded_bm: List[Tuple[str, Tensor, Tensor]], labels: Tensor) -> Tuple[List[Tuple[str, Tensor]], Tensor]:
72 | logger.info("Masking features...")
73 |
74 | all_masks = [x[2] for x in encoded_bm]
75 | common_mask = torch.from_numpy(reduce(np.intersect1d, all_masks))
76 |
77 | logger.info(f"size after common mask {len(common_mask)}")
78 |
79 | masked_encoded_bm = []
80 | for enc_label, enc_pairs, _ in encoded_bm:
81 | masked_enc_pairs = enc_pairs[common_mask]
82 | masked_encoded_bm.append((enc_label, masked_enc_pairs))
83 |
84 | masked_labels = labels[common_mask]
85 |
86 | return masked_encoded_bm, masked_labels
87 |
88 |
89 | def main(bm_file: str, conf: str, override_data_root=None, override_run_id=None):
90 |
91 | run_id = override_run_id or str(int(time()))
92 |
93 | config = BenchmarkFeatureConfig.from_toml(conf, run_id=run_id)
94 |
95 | if override_data_root is not None:
96 | config.data_root = override_data_root
97 |
98 | logger.info(
99 | f"Running process with config: {config} at time {run_id}...")
100 |
101 | # load benchmark data
102 | # here entities are strings
103 |
104 | bm_df = pd.read_csv(bm_file, sep='\t', names=[
105 | COL_SOURCE, COL_EDGE, COL_TARGET, LABEL], header=0)
106 |
107 | pairs = bm_df[[COL_SOURCE, COL_TARGET]].values
108 | all_entities = np.unique(np.ravel(pairs)).tolist()
109 |
110 | labels = torch.from_numpy(bm_df[LABEL].values)
111 |
112 | # perform encodings
113 | encoded_bm = build_encodings(config=config,
114 | pairs=pairs,
115 | encoders=config.encoders,
116 | encoder_args=config.encoder_args,
117 | entities_filter=all_entities)
118 |
119 | # add plain benchmark data too
120 | encoded_bm.append(("raw", pairs, np.arange(len(pairs))))
121 |
122 | # common mask only when dropping missing embeddings
123 | if config.missing_values == MissingValueMethod.DROP.value:
124 | masked_encoded_bm, masked_labels = apply_common_mask(
125 | encoded_bm, labels)
126 | else:
127 | masked_encoded_bm = [(x[0], x[1]) for x in encoded_bm]
128 | masked_labels = labels
129 |
130 | feature_outdir = config.resolve_outdir()
131 |
132 | feature_outdir.mkdir(parents=True, exist_ok=True)
133 |
134 | logger.info(f"Saving features to {feature_outdir}...")
135 |
136 | for enc_label, enc_pairs in masked_encoded_bm:
137 | logger.info(
138 | f"Saving {enc_label} features with shape: {enc_pairs.shape}")
139 | save_features(outdir=feature_outdir,
140 | label=enc_label,
141 | feature=enc_pairs,
142 | labels=masked_labels)
143 |
144 | with open(feature_outdir.joinpath("config.json"), "w") as f:
145 | cfg_dict = asdict(config)
146 | json.dump(cfg_dict, f, cls=ConfigJSONEncoder)
147 |
148 |
149 | def get_parser() -> ArgumentParser:
150 | parser = ArgumentParser(
151 | description="Generate features for benchmark datasets")
152 | parser.add_argument("--conf", type=str,
153 | help="Path to experiment configuration")
154 | parser.add_argument("--bm_file", type=str, help="Path to benchmark data")
155 | parser.add_argument("--override_data_root", type=str,
156 | help="Path to root of data tree")
157 | parser.add_argument("--override_run_id", type=str,
158 | help="Override run_id")
159 |
160 | return parser
161 |
162 |
163 | if __name__ == "__main__":
164 |
165 | args = get_parser().parse_args()
166 |
167 | main(**vars(args))
168 |
--------------------------------------------------------------------------------
/bioblp/benchmarking/preprocess.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from argparse import ArgumentParser
4 | from pathlib import Path
5 | from pykeen.sampling import PseudoTypedNegativeSampler
6 | from pykeen.triples import TriplesFactory
7 |
8 | from time import time
9 | from typing import Union
10 |
11 | from bioblp.logger import get_logger
12 | from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET
13 |
14 | logger = get_logger(__name__)
15 | COL_LABEL = 'label'
16 |
17 |
18 | def generate_negative_triples(pos_triples: TriplesFactory,
19 | filtered=True,
20 | num_negs_per_pos=1):
21 |
22 | neg_sampler = PseudoTypedNegativeSampler(mapped_triples=pos_triples.mapped_triples,
23 | filtered=filtered,
24 | num_negs_per_pos=num_negs_per_pos)
25 | pos_batch = pos_triples.mapped_triples
26 | neg_triples = neg_sampler.sample(pos_batch)[0]
27 |
28 | return neg_triples
29 |
30 |
31 | def prepare_dpi_samples(pos_df,
32 | num_negs_per_pos: Union[None, int, str] = 1,
33 | entity_to_id_map: Union[None, dict] = None,
34 | relation_to_id_map: Union[None, dict] = None,
35 | # map_to_kgem_ids=False,
36 | filtered=True):
37 | """
38 |     pos_df -> Expects a dataframe with true positives in the format ['src', 'edg', 'tgt'],
39 |     where the entities and relations of the triple are given as their string ids.
40 |     These will be converted to KGEM integer ids at a later stage.
41 | """
42 | pos_neg_df = pos_df.copy()
43 | pos_triples = TriplesFactory.from_labeled_triples(pos_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values,
44 | entity_to_id=entity_to_id_map,
45 | relation_to_id=relation_to_id_map)
46 |
47 | # returns a tensor object
48 | neg_triples = generate_negative_triples(pos_triples,
49 | num_negs_per_pos=num_negs_per_pos,
50 | filtered=filtered)
51 |
52 | # convert to mapped triples
53 | neg_triples_ = pos_triples.clone_and_exchange_triples(
54 | neg_triples.view(-1, 3))
55 | neg_df = pd.DataFrame(neg_triples_.triples, columns=[
56 | COL_SOURCE, COL_EDGE, COL_TARGET])
57 |
58 | # add labels
59 | pos_neg_df[COL_LABEL] = 1
60 | neg_df[COL_LABEL] = 0
61 |
62 | pos_neg_df = pd.concat([pos_neg_df, neg_df], axis=0, ignore_index=True)
63 | return pos_neg_df
64 |
65 |
66 | def main(bm_data_path: str, kg_triples_dir: str, outdir: str, num_negs_per_pos: int = 1, override_run_id=None):
67 |
68 | start = time()
69 | run_id = override_run_id or int(start)
70 |
71 | bm_data_path = Path(bm_data_path)
72 | kg_triples_dir = Path(kg_triples_dir)
73 | outdir = Path(outdir)
74 | outdir.mkdir(parents=True, exist_ok=True)
75 |
76 | num_negs_per_pos = num_negs_per_pos
77 | bm_dataset_name = bm_data_path.name.split('.tsv')[0]
78 |
79 | training_triples = TriplesFactory.from_path_binary(kg_triples_dir)
80 | entity_to_id_map = training_triples.entity_to_id
81 | relation_to_id_map = training_triples.relation_to_id
82 |
83 | # load the benchmark data
84 | bm_df = pd.read_csv(bm_data_path, sep='\t', names=[
85 | COL_SOURCE, COL_EDGE, COL_TARGET])
86 |
87 | # generate neg samples and prepare pos-neg pairs
88 | logger.info(
89 | f'Generating negative samples corresponding to benchmark triples')
90 | pos_neg_df = prepare_dpi_samples(bm_df,
91 | entity_to_id_map=entity_to_id_map,
92 | relation_to_id_map=relation_to_id_map,
93 | num_negs_per_pos=num_negs_per_pos)
94 |
95 | # save to disk
96 | bm_postprocessed_path = outdir.joinpath(
97 | f"{bm_dataset_name}_p2n-1-{num_negs_per_pos}.tsv")
98 | logger.info(f'Writing preprocessed data to {bm_postprocessed_path}')
99 | pos_neg_df.to_csv(bm_postprocessed_path, sep='\t')
100 | logger.info('Done!')
101 |
102 | return str(bm_postprocessed_path.resolve())
103 |
104 |
105 | if __name__ == "__main__":
106 |
107 | parser = ArgumentParser(
108 | description="Preprocess benchmark triples (E.g. DPI data) for downstream prediction task")
109 | parser.add_argument("--bm_data_path", type=str,
110 | help="Path to pick up benchmark data")
111 | parser.add_argument("--kg_triples_dir", type=str,
112 | help="Directory housing kg positive triples. Needed to generate negative samples")
113 | parser.add_argument("--num_negs_per_pos", type=int,
114 | help="Number of negative samples to generate per positive instance")
115 | parser.add_argument("--outdir", type=str,
116 | help="Path to data dir to write output")
117 | parser.add_argument("--override_run_id", type=str,
118 | help="Run id of experiment")
119 | args = parser.parse_args()
120 | main(**vars(args))
121 |
--------------------------------------------------------------------------------
/bioblp/benchmarking/split.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | from argparse import ArgumentParser
5 | from pathlib import Path
6 |
7 | from sklearn.model_selection import StratifiedKFold
8 | from sklearn.model_selection import train_test_split
9 |
10 |
11 | from bioblp.benchmarking.train_utils import load_feature_data
12 | from bioblp.logger import get_logger
13 | from bioblp.benchmarking.config import BenchmarkSplitConfig
14 |
15 | from typing import Union, Tuple, Dict, List
16 |
17 | RANDOM_STATE = 12
18 |
19 | logger = get_logger(__name__)
20 |
21 |
22 | def get_splits_iter(splits_path):
23 | def splits_iterable():
24 | splits_data = torch.load(splits_path)
25 | n = len(splits_data)
26 |
27 | num = 0
28 | while num < n:
29 | fold_data = splits_data[num]
30 | yield (fold_data["split_idx"], fold_data["train_idx"], fold_data["test_idx"])
31 | num += 1
32 |
33 | return splits_iterable
34 |
35 |
36 | def get_split_struct(train, test, idx) -> dict:
37 | return {
38 | "train_idx": train,
39 | "test_idx": test,
40 | "split_idx": str(idx)
41 | }
42 |
43 |
44 | def load_split(splits_file: Path, split_idx: int) -> Tuple[np.array, np.array]:
45 |
46 | splits_data = torch.load(splits_file)
47 |
48 | fold_splits = splits_data[split_idx]
49 | train_idx = fold_splits["train_idx"]
50 | test_idx = fold_splits["test_idx"]
51 | fold_idx = fold_splits["split_idx"]
52 |
53 | return (fold_idx, train_idx, test_idx)
54 |
55 |
56 | def main(data, n_folds=None, outdir=None, conf=None, override_data_root=None, override_run_id=None):
57 |
58 | if conf is not None:
59 | config = BenchmarkSplitConfig.from_toml(conf, run_id=override_run_id)
60 | if override_data_root is not None:
61 | config.data_root = override_data_root
62 |
63 | n_folds = config.n_splits
64 | data_path = Path(data)
65 | outdir = config.resolve_outdir()
66 | else:
67 | data_path = Path(data)
68 | outdir = Path(outdir)
69 |
70 | outdir.mkdir(parents=True, exist_ok=True)
71 |
72 | # load raw benchmark data
73 | X_bm, y_bm = load_feature_data(data_path)
74 |
75 | # generate train-test split
76 | logger.info("Generating train test split.")
77 |
78 | X_indices = torch.arange(len(X_bm))
79 |
80 | train_idx, test_idx, _, _ = train_test_split(
81 | X_indices, y_bm, test_size=0.1, stratify=y_bm, random_state=RANDOM_STATE)
82 |
83 | split_data = {0: get_split_struct(train_idx, test_idx, idx=0)}
84 | train_test_split_file = outdir.joinpath("train-test-split.pt")
85 | torch.save(split_data, train_test_split_file)
86 |
87 | # generate cv splits
88 | logger.info("Generating cv splits.")
89 |
90 | cv = StratifiedKFold(
91 | n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE
92 | )
93 | splits = [(train, test, idx)
94 | for idx, (train, test) in enumerate(cv.split(X_bm, y_bm))]
95 |
96 | cv_data = {x[2]: get_split_struct(x[0], x[1], x[2]) for x in splits}
97 |
98 | cv_split_file = outdir.joinpath("cv-splits.pt")
99 | torch.save(cv_data, cv_split_file)
100 |
101 | logger.info("Done.")
102 |
103 |
104 | if __name__ == "__main__":
105 |
106 | parser = ArgumentParser(
107 |         description="Generate train/test and cross-validation splits for benchmark data (e.g. DPI) for the downstream prediction task")
108 |
109 | parser.add_argument("--conf", type=str, default=None,
110 | help="Path to config file")
111 | parser.add_argument("--data", type=str,
112 | help="Path to pick up benchmark data")
113 | parser.add_argument("--n_folds", type=int, default=None,
114 | help="Number of cv folds to produce")
115 | parser.add_argument("--outdir", type=str, default=None,
116 | help="Path to data dir to write output")
117 | parser.add_argument("--override_data_root", type=str,
118 | help="Path to root of data tree")
119 | parser.add_argument("--override_run_id", type=str,
120 | help="Override run_id")
121 | args = parser.parse_args()
122 | main(**vars(args))
123 |
--------------------------------------------------------------------------------
/bioblp/benchmarking/train_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import string
3 | import numpy as np
4 | import random as rn
5 |
6 | from pathlib import Path
7 |
8 |
9 | from sklearn.metrics import roc_auc_score
10 | from sklearn.metrics import precision_score
11 | from sklearn.metrics import recall_score
12 | from sklearn.metrics import fbeta_score
13 | from sklearn.metrics import make_scorer
14 | from sklearn.metrics import accuracy_score
15 | from sklearn.metrics import precision_recall_curve
16 | from sklearn.metrics import roc_curve
17 | from sklearn.metrics import auc
18 | from sklearn.metrics import confusion_matrix
19 |
20 |
21 | from sklearn.model_selection import train_test_split
22 |
23 | from typing import Union, Tuple
24 |
25 | from bioblp.logger import get_logger
26 |
27 |
28 | logger = get_logger(__name__)
29 |
30 |
31 | def get_random_string(length):
32 |     # choose from lowercase letters and digits
33 | characters = string.ascii_lowercase + string.digits
34 | result_str = "".join(rn.choice(characters) for i in range(length))
35 |
36 | return result_str
37 |
38 |
39 | def unique_study_prefix():
40 | unique_string = get_random_string(8)
41 | return unique_string
42 |
43 |
44 | def generate_study_name(prefix, model, fold):
45 | return f"{prefix}-{model}-{fold}"
46 |
47 |
48 | def aupr_score(y_true, y_pred):
49 | """Use AUC function to calculate the area under the curve of precision recall curve"""
50 | precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
51 | return auc(recall, precision)
52 |
53 |
54 | def get_auc_scorers():
55 | scorers = {
56 | "PRCURVE": make_scorer(precision_recall_curve, needs_proba=True),
57 | "ROCCURVE": make_scorer(roc_curve, needs_proba=True),
58 | "CM": make_scorer(confusion_matrix, needs_proba=False)
59 | }
60 | return scorers
61 |
62 |
63 | def get_scorers():
64 | scorers = {
65 | "AUCROC": make_scorer(roc_auc_score, needs_proba=True),
66 | "f1": make_scorer(fbeta_score, beta=1, average="micro"),
67 | "precision": make_scorer(precision_score),
68 | "recall": make_scorer(recall_score),
69 | "accuracy": make_scorer(accuracy_score),
70 | "AUCPR": make_scorer(aupr_score, needs_proba=True),
71 | }
72 | return scorers
73 |
74 |
75 | def get_model_label(feature: str, model: str):
76 | return f"{feature}__{model}"
77 |
78 |
79 | def load_feature_data(feat_path: Union[str, Path], dev_run: bool = False) -> Tuple[np.array, np.array]:
80 | """ Load feature data into numpy arrays
81 |
82 | Parameters
83 | ----------
84 | feat_path : Union[str, Path]
85 | Filepath to feature, eg 'features/rotate.pt'
86 | dev_run : bool, optional
87 | Flag to subsample data for development only, by default False
88 |
89 | Returns
90 | -------
91 | Tuple[np.array, np.array]
92 | Return (features, labels)
93 | """
94 | logger.info("Loading training data...")
95 |
96 | data = torch.load(feat_path)
97 |
98 | X = data.get("X")
99 | y = data.get("y")
100 |
101 | if torch.is_tensor(X):
102 | X = X.detach().numpy()
103 | y = y.detach().numpy()
104 |
105 | if dev_run:
106 | X, _, y, _ = train_test_split(
107 | X, y, stratify=y, train_size=0.1, random_state=12)
108 |
109 | logger.info(
110 | "Resulting shapes X: {}, y: {}".format(
111 | X.shape, y.shape)
112 | )
113 | logger.info("Counts in y: {}".format(
114 | np.unique(y, return_counts=True)))
115 |
116 | return X, y
117 |
118 |
119 | def validate_features_exist(feature_dir: Path, models_conf: dict) -> bool:
120 | """ Check if all feature files exist in directory
121 |
122 | Parameters
123 | ----------
124 | feature_dir : Path
125 | Path to feature location
126 | models_conf : dict
127 | Definition of model and feature.
128 |
129 | Returns
130 | -------
131 | bool
132 | True if features are present.
133 | """
134 | exists = {}
135 |
136 | all_features = list(set([v.get("feature")
137 | for _, v in models_conf.items()]))
138 |
139 | for feat in all_features:
140 | exists[feat] = feature_dir.joinpath(f"{feat}.pt").is_file()
141 |
142 | logger.info(f"Validated that features exist: {exists}..")
143 |
144 | missing = [k for k, v in exists.items() if v is False]
145 | if len(missing) > 0:
146 | logger.warning(f"Missing features {missing}!!")
147 |
148 | return all([v for _, v in exists.items()])
149 |
--------------------------------------------------------------------------------
/bioblp/data.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pandas as pd
3 | from bioblp.logger import get_logger
4 | from pykeen.triples import TriplesFactory
5 |
6 | #logger = get_logger(__name__)
7 |
8 | COL_SOURCE = 'src'
9 | COL_EDGE = 'edg'
10 | COL_TARGET = 'tgt'
11 |
12 |
13 | def create_random_splits(triples: pd.DataFrame, train_ratio: float, valid_ratio: float, test_ratio: float):
14 | """Create train/valid/test based on random strategy
15 | """
16 | triples_array = triples[[COL_SOURCE, COL_EDGE, COL_TARGET]].values
17 |
18 | triples_factory = TriplesFactory.from_labeled_triples(triples_array)
19 |
20 | train, valid, test = triples_factory.split([train_ratio, valid_ratio, test_ratio], random_state=2021)
21 |
22 | train_triples = pd.DataFrame(train.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET])
23 | valid_triples = pd.DataFrame(valid.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET])
24 | test_triples = pd.DataFrame(test.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET])
25 |
26 | return train_triples, valid_triples, test_triples
27 |
28 |
29 | def save_splits(train_df, test_df, valid_df, dataset_name, out_dir):
30 | out_dir = Path(out_dir)
31 | out_dir.mkdir(exist_ok=True, parents=True)
32 |
33 |     train_df.to_csv(out_dir.joinpath(f"{dataset_name}-train.tsv"), sep='\t', index=None)
34 |     test_df.to_csv(out_dir.joinpath(f"{dataset_name}-test.tsv"), sep='\t', index=None)
35 |     valid_df.to_csv(out_dir.joinpath(f"{dataset_name}-valid.tsv"), sep='\t', index=None)
36 | print(f"saved to {out_dir}")
37 |
38 |
39 | def load_splits(dataset: str, data_path: str, dev_sample=False) -> (TriplesFactory, TriplesFactory, TriplesFactory):
40 | data_path = Path(data_path)
41 |
42 | training_path = data_path.joinpath(f"{dataset}-train.tsv")
43 | valid_path = data_path.joinpath(f"{dataset}-valid.tsv")
44 | test_path = data_path.joinpath(f"{dataset}-test.tsv")
45 |
46 | train_df = pd.read_csv(training_path, index_col=None, sep="\t", dtype=str)
47 | valid_df = pd.read_csv(valid_path, index_col=None, sep="\t", dtype=str)
48 | test_df = pd.read_csv(test_path, index_col=None, sep="\t", dtype=str)
49 |
50 | if dev_sample:
51 | dev_frac = 0.01
52 | train_df = train_df.sample(frac=dev_frac, random_state=2021)
53 | valid_df = valid_df.sample(frac=dev_frac, random_state=2021)
54 | test_df = test_df.sample(frac=dev_frac, random_state=2021)
55 |
56 | training = TriplesFactory.from_labeled_triples(train_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values)
57 | valid = TriplesFactory.from_labeled_triples(
58 | valid_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, entity_to_id=training.entity_to_id,
59 | relation_to_id=training.relation_to_id)
60 | test = TriplesFactory.from_labeled_triples(
61 | test_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, entity_to_id=training.entity_to_id,
62 | relation_to_id=training.relation_to_id)
63 |
64 | return training, valid, test
--------------------------------------------------------------------------------
/bioblp/evaluate.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | import numpy as np
4 | from pykeen.evaluation import RankBasedEvaluator, RankBasedMetricResults
5 | from pykeen.evaluation.rank_based_evaluator import _iter_ranks
6 | from pykeen.triples import TriplesFactory
7 | from tap import Tap
8 | import torch
9 |
10 |
11 | class Arguments(Tap):
12 | model_path: str
13 |
14 |
15 | class SavedRanksEvaluator(RankBasedEvaluator):
16 | def __init__(self, *args, **kwargs):
17 | super().__init__(*args, **kwargs)
18 | self.saved_ranks = None
19 |
20 | def finalize(self) -> RankBasedMetricResults:
21 | if self.num_entities is None:
22 | raise ValueError
23 |
24 | result = RankBasedMetricResults.from_ranks(
25 | metrics=self.metrics,
26 | rank_and_candidates=_iter_ranks(ranks=self.ranks, num_candidates=self.num_candidates),
27 | )
28 |
29 | self.saved_ranks = self.ranks.copy()
30 | self.ranks.clear()
31 | self.num_candidates.clear()
32 |
33 | return result
34 |
35 |
36 | def get_triple_ranks(args: Arguments):
37 | model_file = osp.join(args.model_path, 'trained_model.pkl')
38 |
39 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
40 |
41 | model = torch.load(model_file).to(device)
42 | train = TriplesFactory.from_path_binary(osp.join(args.model_path,
43 | 'training_triples'))
44 |
45 | graph_path = osp.join('data', 'biokgb', 'graph')
46 | valid_triples = 'biokg.links-valid.csv'
47 | test_triples = 'biokg.links-test.csv'
48 |
49 | valid, test = [TriplesFactory.from_path(osp.join(graph_path, f),
50 | entity_to_id=train.entity_to_id,
51 | relation_to_id=train.relation_to_id)
52 | for f in (valid_triples, test_triples)]
53 |
54 | evaluator = SavedRanksEvaluator(filtered=True)
55 | evaluator.evaluate(model,
56 | test.mapped_triples,
57 | additional_filter_triples=[train.mapped_triples,
58 | valid.mapped_triples])
59 |
60 | head_ranks = evaluator.saved_ranks[('head', 'realistic')]
61 | tail_ranks = evaluator.saved_ranks[('tail', 'realistic')]
62 | ranks = np.concatenate(head_ranks + tail_ranks)
63 | # Save ranks to a csv file, specifying the integer format
64 | np.savetxt(osp.join(args.model_path, 'ranks.csv'), ranks, fmt='%d')
65 |
66 |
67 | if __name__ == '__main__':
68 | get_triple_ranks(Arguments().parse_args())
69 |
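70 | # Example invocation (model path is illustrative); writes per-triple ranks to <model_path>/ranks.csv:
71 | #   python -m bioblp.evaluate --model_path models/1baon0eg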
--------------------------------------------------------------------------------
/bioblp/loaders/preprocessors.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple, Mapping
2 |
3 | from transformers import BertTokenizer
4 | import torch
5 | from torch import Tensor
6 | from torch.nn.utils.rnn import pad_sequence
7 | from tqdm import tqdm
8 | import numpy as np
9 |
10 |
11 | class EntityPropertyPreprocessor:
12 | """Abstract class for preprocessing entity properties of different types
13 | into tensors suitable for machine learning wizardry."""
14 | def preprocess_file(self, file_path: str,
15 | entity_to_id: Mapping[str, int]
16 | ) -> Tuple[Tensor, Tensor, Tensor]:
17 | """Read a file of entity properties, with one entity per line.
18 | Expects at each line an entity name, a tab, and a property to be
19 | encoded.
20 |
21 | Args:
22 | file_path: file mapping entities to properties
23 | entity_to_id: maps an entity name to an integer ID
24 |
25 | Returns:
26 | entity_ids: torch.Tensor containing entity IDs read by the method
27 | rows: torch.Tensor mapping each entity in entity_ids to a row in
28 | data
29 | data: torch.Tensor containing data for each entity in entity_ids
30 | """
31 | raise NotImplementedError
32 |
33 |
34 | class TextEntityPropertyPreprocessor(EntityPropertyPreprocessor):
35 | """Preprocessor for entities with textual descriptions"""
36 | def __init__(self, tokenizer: BertTokenizer, max_length: int):
37 | self.tokenizer = tokenizer
38 | self.max_length = max_length
39 |
40 | def preprocess_file(self, file_path: str,
41 | entity_to_id: Mapping[str, int]
42 | ) -> Tuple[Tensor, Tensor, Tensor]:
43 | all_tokens = []
44 | entity_ids = []
45 | rows = []
46 | row_count = 0
47 | with open(file_path) as file:
48 | for i, line in enumerate(tqdm(file, desc=f'Encoding {file_path}')):
49 | tab_idx = line.find('\t')
50 | entity, text = line[:tab_idx], line[tab_idx:].strip()
51 |
52 | if entity in entity_to_id:
53 | tokens = self.tokenizer.encode(text,
54 | max_length=self.max_length,
55 | truncation=True,
56 | padding='max_length',
57 | return_tensors='pt')
58 | all_tokens.append(tokens)
59 | entity_id = entity_to_id[entity]
60 | entity_ids.append(entity_id)
61 | rows.append(row_count)
62 | row_count += 1
63 |
64 | if len(all_tokens) > 0:
65 | all_tokens = torch.cat(all_tokens, dim=0)
66 | else:
67 | all_tokens = torch.tensor([], dtype=torch.long)
68 |
69 | return (torch.tensor(entity_ids, dtype=torch.long),
70 | torch.tensor(rows, dtype=torch.long),
71 | all_tokens)
72 |
73 |
74 | class MolecularFingerprintPreprocessor(EntityPropertyPreprocessor):
75 | """Preprocessor for molecules with known molecular fingerprints"""
76 | def preprocess_file(self, file_path: str,
77 | entity_to_id: Mapping[str, int]
78 | ) -> Tuple[Tensor, Tensor, Tensor]:
79 | all_fprints = []
80 | entity_ids = []
81 | rows = []
82 | row_count = 0
83 | with open(file_path) as file:
84 | for i, line in enumerate(tqdm(file, desc=f'Encoding {file_path}')):
85 | tab_idx = line.find('\t')
86 | entity, fprint = line[:tab_idx], line[tab_idx:].strip()
87 |
88 | if entity in entity_to_id:
89 | fprint = torch.tensor(np.array(list(fprint), dtype=float), dtype=torch.float)
90 | all_fprints.append(fprint)
91 | entity_id = entity_to_id[entity]
92 | entity_ids.append(entity_id)
93 | rows.append(row_count)
94 | row_count += 1
95 |
96 | return (torch.tensor(entity_ids, dtype=torch.long),
97 | torch.tensor(rows, dtype=torch.long),
98 | torch.stack(all_fprints, dim=0))
99 |
100 |
101 | class PretrainedEmbeddingPreprocessor(EntityPropertyPreprocessor):
102 | def preprocess_file(self, file_path: str,
103 | entity_to_id: Mapping[str, int]
104 | ) -> Tuple[Tensor, Tensor, Tensor]:
105 | data_dict = torch.load(file_path)
106 | entity_to_row = data_dict['identifiers']
107 |
108 | entity_ids = []
109 | data = []
110 | for entity, row in entity_to_row.items():
111 | if entity in entity_to_id:
112 | entity_ids.append(entity_to_id[entity])
113 | data.append(entity_to_row[entity])
114 |
115 | entity_ids = torch.tensor(entity_ids, dtype=torch.long)
116 | data_idx = torch.arange(len(entity_ids))
117 | data = torch.tensor(data, dtype=torch.long)
118 |
119 | return entity_ids, data_idx, data
120 |
121 |
122 | class MoleculeEmbeddingPreprocessor(EntityPropertyPreprocessor):
123 | def preprocess_file(self, file_path: str,
124 | entity_to_id: Mapping[str, int]
125 | ) -> Tuple[Tensor, Tensor, Tensor]:
126 | """Load embeddings for all the molecules we need, putting them
127 | in a single tensor that can be used to retrieve embeddings during
128 | training. Since molecules have variable length we use padding with
129 |         a value of -10,000 before placing them all inside a single 3D tensor
130 | of shape (N, L, D) where N is the number of molecules,
131 | L the maximum molecule length, and D the embedding dimension"""
132 | data_dict = torch.load(file_path)
133 |
134 | entity_ids = []
135 | data = []
136 | for molecule, embeddings in data_dict.items():
137 | if molecule in entity_to_id:
138 | entity_ids.append(entity_to_id[molecule])
139 | data.append(embeddings)
140 |
141 | entity_ids = torch.tensor(entity_ids, dtype=torch.long)
142 | data = pad_sequence(data, batch_first=True, padding_value=-10_000)
143 | data_idx = torch.arange(len(entity_ids))
144 |
145 | return entity_ids, data_idx, data
146 |
--------------------------------------------------------------------------------
/bioblp/logger.py:
--------------------------------------------------------------------------------
1 | import logging as lg
2 |
3 |
4 | def get_logger(logger_name=''):
5 | """Get a default logger that includes a timestamp."""
6 | logger = lg.getLogger(logger_name)
7 | logger.handlers = []
8 | ch = lg.StreamHandler()
9 | str_fmt = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
10 | formatter = lg.Formatter(str_fmt, datefmt='%H:%M:%S')
11 | ch.setFormatter(formatter)
12 | logger.addHandler(ch)
13 | logger.setLevel('INFO')
14 |
15 | return logger
16 |
--------------------------------------------------------------------------------
/bioblp/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .bioblp import *
2 |
--------------------------------------------------------------------------------
/bioblp/models/bioblp.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | from typing import Optional
3 |
4 | import pykeen.models
5 | from pykeen.nn.representation import Embedding as PyKEmbedding
6 | from pykeen.typing import InductiveMode
7 | import torch
8 |
9 | from bioblp.models.encoders import PropertyEncoderRepresentation
10 |
11 |
12 | class BioBLP:
13 | def __init__(self, *,
14 | entity_representations: PropertyEncoderRepresentation,
15 | from_checkpoint: str = None,
16 | **kwargs):
17 | self.from_checkpoint = from_checkpoint
18 |
19 | super().__init__(**kwargs)
20 |
21 | entity_embedding_lut = self.entity_representations[0]
22 | entity_embedding_lut: PyKEmbedding
23 |
24 | entity_representations.wrap_lookup_table(entity_embedding_lut)
25 | self.property_encoder = entity_representations
26 |
27 | def reset_parameters_(self):
28 | super().reset_parameters_()
29 | if self.from_checkpoint:
30 | checkpoint = torch.load(osp.join(self.from_checkpoint,
31 | 'trained_model.pkl'),
32 | map_location='cpu')
33 | self.load_state_dict(checkpoint.state_dict(), strict=False)
34 |
35 | def score_hrt_and_negatives(self,
36 | hrt_batch: torch.LongTensor,
37 | num_negatives: int,
38 | *, mode: Optional[InductiveMode] = None
39 | ) -> tuple[torch.FloatTensor, torch.FloatTensor]:
40 | batch_size = hrt_batch.shape[0]
41 |
42 | h, r, t = self._get_representations(h=hrt_batch[:, 0],
43 | r=hrt_batch[:, 1],
44 | t=hrt_batch[:, 2], mode=mode)
45 | positive_scores = self.interaction.score_hrt(h=h, r=r, t=t)
46 |
47 | num_ents = batch_size * 2
48 | idx = torch.arange(num_ents).reshape(batch_size, 2)
49 |
50 | # For each row, sample entities, assigning 0 probability to entities
51 | # of the same row
52 | zeros = torch.zeros(batch_size, 2)
53 | head_weights = torch.ones(batch_size, num_ents, dtype=torch.float)
54 | head_weights.scatter_(1, idx, zeros)
55 | random_idx = head_weights.multinomial(num_negatives, replacement=True)
56 | random_idx = random_idx.t().flatten()
57 |
58 | # Select randomly the first or the second column
59 | row_selector = torch.arange(batch_size * num_negatives)
60 | col_selector = torch.randint(0, 2, [batch_size * num_negatives])
61 |
62 | # Fill the array of negative samples with the sampled random entities
63 | # at the right positions
64 | neg_idx = idx.repeat((num_negatives, 1))
65 | neg_idx[row_selector, col_selector] = random_idx
66 | # neg_idx = neg_idx.reshape(-1, batch_size, 2)
67 | # neg_idx.transpose_(0, 1)
68 |
69 |         neg_embs = torch.stack([h, t], dim=1).view(batch_size * 2, -1)  # pool of in-batch entity embeddings
70 | neg_embs = neg_embs[neg_idx.to(neg_embs.device)]
71 | h_neg, t_neg = neg_embs[:, 0], neg_embs[:, 1]
72 |
73 | r_neg_idx = torch.arange(batch_size).repeat(num_negatives)
74 | r_neg = r[r_neg_idx.to(r.device)]
75 |
76 | negative_scores = self.interaction.score_hrt(h=h_neg, r=r_neg, t=t_neg)
77 | negative_scores = negative_scores.reshape(batch_size, num_negatives)
78 |
79 | return positive_scores, negative_scores
80 |
81 |
82 | class BioBLPTransE(BioBLP, pykeen.models.TransE):
83 | ...
84 |
85 |
86 | class BioBLPComplEx(BioBLP, pykeen.models.ComplEx):
87 | ...
88 |
89 |
90 | class BioBLPRotatE(BioBLP, pykeen.models.RotatE):
91 | ...
92 |
93 |
94 | MODELS_DICT = {
95 | 'transe': BioBLPTransE,
96 | 'complex': BioBLPComplEx,
97 | 'rotate': BioBLPRotatE
98 | }
99 |
100 |
101 | def get_model_class(model_name: str):
102 | if model_name in MODELS_DICT:
103 | return MODELS_DICT[model_name]
104 | else:
105 |         raise ValueError(f'Unknown model {model_name}')
106 |
107 |
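# Illustrative sketch (not part of the original module): the index bookkeeping in
# BioBLP.score_hrt_and_negatives, reproduced standalone with a toy batch. Each
# positive triple contributes its head and tail to a pool of 2 * batch_size
# in-batch entities; negatives corrupt either the head or the tail of each triple
# with an entity sampled from that pool, never from the same triple.
if __name__ == '__main__':
    batch_size, num_negatives = 3, 4
    num_ents = batch_size * 2
    idx = torch.arange(num_ents).reshape(batch_size, 2)  # per-row [head_idx, tail_idx]

    weights = torch.ones(batch_size, num_ents)
    weights.scatter_(1, idx, torch.zeros(batch_size, 2))  # forbid same-row entities
    random_idx = weights.multinomial(num_negatives, replacement=True).t().flatten()

    neg_idx = idx.repeat((num_negatives, 1))
    row_selector = torch.arange(batch_size * num_negatives)
    col_selector = torch.randint(0, 2, (batch_size * num_negatives,))
    neg_idx[row_selector, col_selector] = random_idx  # corrupt head or tail
    print(neg_idx.shape)  # torch.Size([12, 2]): one (head_idx, tail_idx) pair per negative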
--------------------------------------------------------------------------------
/bioblp/predict.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/predict.py
--------------------------------------------------------------------------------
/bioblp/preprocess.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pandas as pd
3 | import numpy as np
4 | import bio_embeddings
5 | from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, prottrans_t5_embedder, esm_embedder
6 |
7 |
8 | # Here we can change the Protein Embedder to w/e we want from the above.
9 | # TODO: An experiment with t5 embedding
10 | prot_trans_embedder = ProtTransBertBFDEmbedder()
11 |
12 |
13 | def get_protein_repr(amino_repr):
14 | """ Here we need to go from a collection of amino-acid embeddings to a full protein embedding
15 |
16 | # Example:
17 | #
18 | # M : (1,1024)
19 | # A : (1,1024)
20 | # S : (1,1024)
21 | #
22 | # Output: An aggregated representation for proteins
23 | #
24 |     # Overall output type: Dict[protein_id, embedding]
25 |     #
26 |     # e.g. {'LENG8_MOUSE': tensor of shape (1024,)} """
27 |
28 | emb_matrix = torch.Tensor(amino_repr)
29 |
30 | # We average over columns
31 | protein_emb = torch.mean(emb_matrix, dim=0)
32 |
33 | return protein_emb
34 |
35 |
36 | def get_protein_embedding(path, embedder="prottrans"):
37 | """
38 | Wrapper over different protein embedders
39 | Parameters
40 | ----------
41 |     path: The data path (a TSV file with 'From' and 'Sequence' columns)
42 |     embedder: The model used to embed proteins
43 | 
44 |     Returns
45 |     -------
46 |     None; the embeddings are written to ../data/processed/uniprot_seq_embeddings.tsv """
47 |     print('Loading protein sequences...')
48 |
49 | # Load sequences
50 | sequence_data = pd.read_csv(path, sep='\t')
51 |
52 | # Sample : Uncomment for testing
53 | # sequence_data = sequence_data.sample(2)
54 |
55 | # Select correct columns
56 | sequence_data = sequence_data[['From', 'Sequence']]
57 |
58 | # Embed sequences
59 | sequence_data['embedding'] = sequence_data['Sequence'].apply(lambda x: prot_trans_embedder.embed(x))
60 |
61 | # Aggregate sequences
62 | sequence_data['squashed'] = sequence_data['embedding'].apply(lambda x: get_protein_repr(x))
63 |
64 |
65 | # Save sequences
66 |     sequence_data.to_csv('../data/processed/uniprot_seq_embeddings.tsv', sep='\t')
67 |
68 |
69 | get_protein_embedding('../data/uniprot_sequences.tsv')
70 |
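# Illustrative sketch (not part of the original script): the mean pooling performed
# by get_protein_repr, with a hypothetical sequence length of 120 residues.
if __name__ == '__main__':
    amino_repr = torch.randn(120, 1024)   # one 1024-dim embedding per residue
    protein_emb = get_protein_repr(amino_repr)
    print(protein_emb.shape)              # torch.Size([1024])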
--------------------------------------------------------------------------------
/bioblp/train.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | from pykeen.pipeline import pipeline
4 | from pykeen.training import TrainingCallback
5 | from pykeen.triples import TriplesFactory
6 |
7 | from tap import Tap
8 | from transformers import get_linear_schedule_with_warmup
9 | import wandb
10 |
11 | from bioblp.logger import get_logger
12 | import bioblp.models as models
13 | from bioblp.utils.bioblp_utils import build_encoders
14 | from bioblp.utils.training import InBatchNegativesTraining
15 |
16 |
17 | class Arguments(Tap):
18 | train_triples: str
19 | valid_triples: str
20 | test_triples: str
21 |
22 | protein_data: str = None
23 | molecule_data: str = None
24 | text_data: str = None
25 |
26 | model: str = 'complex'
27 | dimension: int = 256
28 | loss_fn: str = 'crossentropy'
29 | loss_margin: float = 1.0
30 | optimizer: str = 'adagrad'
31 | learning_rate: float = 1e-2
32 | freeze_pretrained_embeddings: bool = False
33 | warmup_fraction: float = None
34 | regularizer: float = 1e-6
35 | num_epochs: int = 100
36 | batch_size: int = 1024
37 | eval_batch_size: int = 16
38 | eval_every: int = 10
39 | num_negatives: int = 512
40 | in_batch_negatives: bool = False
41 | add_inverses: bool = False
42 | early_stopper: str = 'both.realistic.inverse_harmonic_mean_rank'
43 | from_checkpoint: str = None
44 |
45 | search_train_batch_size: bool = False
46 | search_eval_batch_size: bool = False
47 | log_wandb: bool = False
48 | notes: str = None
49 |
50 |
51 | class BioBLPCallback(TrainingCallback):
52 | """A callback to get the wandb ID of the run before it gets closed.
53 | We use it to get a file name for the stored model."""
54 | id = None
55 | scheduler = None
56 |
57 | def __init__(self, num_training_steps, warmup_fraction):
58 | super().__init__()
59 | self.use_scheduler = warmup_fraction is not None
60 | if self.use_scheduler:
61 | self.num_training_steps = num_training_steps
62 | self.num_warmup_steps = int(self.num_training_steps * warmup_fraction)
63 |
64 | def post_epoch(self, *args, **kwargs):
65 | if wandb.run is not None and BioBLPCallback.id is None:
66 | BioBLPCallback.id = wandb.run.id
67 |
68 | def pre_step(self, **kwargs):
69 | if not self.use_scheduler:
70 | return
71 |
72 | if self.scheduler is None:
73 | self.scheduler = get_linear_schedule_with_warmup(
74 | self.optimizer,
75 | self.num_warmup_steps,
76 | self.num_training_steps
77 | )
78 | else:
79 | self.scheduler.step()
80 |
81 |
82 | def run(args: Arguments):
83 | cli_args_dict = {f'cli_{k}': v for k, v in args.as_dict().items()}
84 | if args.search_train_batch_size:
85 | args.batch_size = None
86 | if args.search_eval_batch_size:
87 | args.eval_batch_size = None
88 |
89 | logger = get_logger()
90 | logger.info('Loading triples...')
91 |
92 | entity_to_id = relation_to_id = None
93 | if args.from_checkpoint:
94 | checkpoint_triples = TriplesFactory.from_path_binary(
95 | osp.join(args.from_checkpoint, 'training_triples')
96 | )
97 | entity_to_id = checkpoint_triples.entity_to_id
98 | relation_to_id = checkpoint_triples.relation_to_id
99 |
100 | training = TriplesFactory.from_path(
101 | args.train_triples,
102 | create_inverse_triples=args.add_inverses,
103 | entity_to_id=entity_to_id,
104 | relation_to_id=relation_to_id
105 | )
106 | validation = TriplesFactory.from_path(args.valid_triples,
107 | entity_to_id=training.entity_to_id,
108 | relation_to_id=training.relation_to_id)
109 | testing = TriplesFactory.from_path(args.test_triples,
110 | entity_to_id=training.entity_to_id,
111 | relation_to_id=training.relation_to_id)
112 |
113 | logger.info(f'Loaded graph with {training.num_entities:,} entities')
114 | logger.info(f'{training.num_triples:,} training triples')
115 | logger.info(f'{validation.num_triples:,} validation triples')
116 | logger.info(f'{testing.num_triples:,} test triples')
117 |
118 | loss_kwargs = None
119 | if args.loss_fn in {'nssa', 'marginranking'}:
120 | loss_kwargs = {'margin': args.loss_margin}
121 | model = args.model
122 | model_kwargs = {'embedding_dim': args.dimension, 'loss': args.loss_fn}
123 |
124 | if any((args.protein_data, args.molecule_data, args.text_data)):
125 | model = models.get_model_class(args.model)
126 | dimension = args.dimension
127 | if args.model in ('complex', 'rotate'):
128 | dimension *= 2
129 |
130 | freeze_pretrained_embeddings = args.freeze_pretrained_embeddings
131 | encoders = build_encoders(dimension,
132 | training.entity_to_id,
133 | args.protein_data,
134 | args.molecule_data,
135 | args.text_data,
136 | freeze_pretrained_embeddings)
137 | model_kwargs['entity_representations'] = encoders
138 |
139 | if args.from_checkpoint:
140 | model_kwargs['from_checkpoint'] = args.from_checkpoint
141 |
142 | if args.warmup_fraction:
143 | if args.batch_size is None:
144 | raise ValueError('Batch size is needed to apply learning rate'
145 | ' warmup.')
146 | num_steps = (training.num_triples // args.batch_size) * args.num_epochs
147 | else:
148 | num_steps = None
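    # Worked example (hypothetical numbers): with 1,000,000 training triples,
    # batch_size = 1024 and num_epochs = 100 this gives
    # num_steps = (1_000_000 // 1024) * 100 = 97,600 optimizer steps, of which
    # int(num_steps * warmup_fraction) are used for linear warmup in BioBLPCallback.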
149 |
150 | training_loop = InBatchNegativesTraining if args.in_batch_negatives else None
151 |
152 | result = pipeline(training=training,
153 | validation=validation,
154 | testing=testing,
155 | model=model,
156 | model_kwargs=model_kwargs,
157 | loss_kwargs=loss_kwargs,
158 | optimizer=args.optimizer,
159 | optimizer_kwargs={'lr': args.learning_rate},
160 | regularizer='LpRegularizer',
161 | regularizer_kwargs={'weight': args.regularizer},
162 | training_kwargs={'num_epochs': args.num_epochs,
163 | 'batch_size': args.batch_size,
164 | 'callbacks': BioBLPCallback,
165 | 'callback_kwargs': {
166 | 'num_training_steps': num_steps,
167 | 'warmup_fraction': args.warmup_fraction
168 | }},
169 | training_loop=training_loop,
170 | negative_sampler='basic',
171 | negative_sampler_kwargs={
172 | 'num_negs_per_pos': args.num_negatives
173 | },
174 | stopper='early',
175 | stopper_kwargs={
176 | 'evaluation_batch_size': args.eval_batch_size,
177 | 'metric': args.early_stopper,
178 | 'frequency': args.eval_every,
179 | 'patience': 5,
180 | 'relative_delta': 0.0001,
181 | 'larger_is_better': True
182 | },
183 | evaluator_kwargs={'batch_size': args.eval_batch_size},
184 | result_tracker='wandb',
185 | result_tracker_kwargs={
186 | 'entity': 'discoverylab',
187 | 'project': 'bioblp',
188 | 'notes': args.notes,
189 | 'config': cli_args_dict,
190 | 'offline': not args.log_wandb
191 | }
192 | )
193 |
194 | result.save_to_directory(osp.join('models', BioBLPCallback.id))
195 |
196 |
197 | if __name__ == '__main__':
198 | run(Arguments(explicit_bool=True).parse_args())
199 |
--------------------------------------------------------------------------------
/bioblp/train_argparse.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | from pathlib import Path
3 | from pykeen.pipeline import pipeline
4 | from pykeen.training import TrainingCallback
5 | from pykeen.triples import TriplesFactory
6 | from dataclasses import dataclass, asdict
7 | # from tap import Tap
8 | from argparse import ArgumentParser
9 | import wandb
10 | import toml
11 |
12 | from bioblp.logger import get_logger
13 |
14 | @dataclass
15 | class Arguments:
16 | #data_splits_path: str
17 | #dataset_name: str
18 | train_triples: str
19 | valid_triples: str
20 | test_triples: str
21 |
22 | model: str = 'complex'
23 | dimension: int = 256
24 | loss_fn: str = 'crossentropy'
25 | loss_margin: float = 1.0
26 | optimizer: str = 'adagrad'
27 | learning_rate: float = 1e-2
28 | regularizer: float = 1e-6
29 | num_epochs: int = 100
30 | batch_size: int = 1024
31 | eval_batch_size: int = 16
32 | num_negatives: int = 512
33 | add_inverses: bool = False
34 | early_stopper: str = 'both.realistic.inverse_harmonic_mean_rank'
35 |
36 | search_train_batch_size: bool = False
37 | search_eval_batch_size: bool = False
38 | log_wandb: bool = False
39 | notes: str = None
40 |
41 |
42 | class WBIDCallback(TrainingCallback):
43 | """A callback to get the wandb ID of the run before it gets closed.
44 | We use it to get a file name for the stored model."""
45 | id = None
46 |
47 | def post_train(self, *args, **kwargs):
48 | if wandb.run is not None:
49 | WBIDCallback.id = wandb.run.id
50 |
51 |
52 | def load_toml(toml_path: str) -> dict:
53 | toml_path = Path(toml_path)
54 | config = {}
55 | with open(toml_path, "r") as f:
56 | config = toml.load(f)
57 |
58 | return config
59 |
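# Illustrative sketch (not part of the original script): keys in the TOML file must
# match the Arguments dataclass fields one-to-one, shown here with a reduced
# stand-in dataclass and an inline config string.
def _toml_to_dataclass_example():
    @dataclass
    class MiniArguments:
        train_triples: str
        model: str = 'complex'
        dimension: int = 256

    conf_dict = toml.loads(
        "train_triples = 'data/processed/biokg-train.tsv'\n"
        "model = 'complex'\n"
        "dimension = 128\n"
    )
    return MiniArguments(**conf_dict)  # MiniArguments(train_triples='...', model='complex', dimension=128)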
60 |
61 | def run(args: Arguments):
62 | cli_args_dict = {f'cli_{k}': v for k, v in asdict(args).items()}
63 | if args.search_train_batch_size:
64 | args.batch_size = None
65 | if args.search_eval_batch_size:
66 | args.eval_batch_size = None
67 |
68 | logger = get_logger()
69 | logger.info('Loading triples...')
70 |
71 | training = TriplesFactory.from_path(
72 | args.train_triples,
73 | create_inverse_triples=args.add_inverses
74 | )
75 | validation = TriplesFactory.from_path(args.valid_triples)
76 | testing = TriplesFactory.from_path(args.test_triples)
77 |
78 | logger.info(f'Loaded graph with {training.num_entities:,} entities')
79 | logger.info(f'{training.num_triples:,} training triples')
80 | logger.info(f'{validation.num_triples:,} validation triples')
81 | logger.info(f'{testing.num_triples:,} test triples')
82 |
83 | loss_kwargs = None
84 | if args.loss_fn in {'nssa', 'marginranking'}:
85 | loss_kwargs = {'margin': args.loss_margin}
86 |
87 | result = pipeline(training=training,
88 | validation=validation,
89 | testing=testing,
90 | model=args.model,
91 | model_kwargs={'embedding_dim': args.dimension,
92 | 'loss': args.loss_fn},
93 | loss_kwargs=loss_kwargs,
94 | optimizer=args.optimizer,
95 | optimizer_kwargs={'lr': args.learning_rate},
96 | regularizer='LpRegularizer',
97 | #regularizer_kwargs={'weight': args.regularizer},
98 | training_kwargs={'num_epochs': args.num_epochs,
99 | 'batch_size': args.batch_size,
100 | 'callbacks': WBIDCallback},
101 | negative_sampler='basic',
102 | negative_sampler_kwargs={
103 | 'num_negs_per_pos': args.num_negatives
104 | },
105 | stopper='early',
106 | stopper_kwargs={
107 | 'evaluation_batch_size': args.eval_batch_size,
108 | 'metric': args.early_stopper,
109 | 'frequency': 10,
110 | 'patience': 5,
111 | 'relative_delta': 0.0001,
112 | 'larger_is_better': True
113 | },
114 | evaluator_kwargs={'batch_size': args.eval_batch_size},
115 | result_tracker='wandb',
116 | result_tracker_kwargs={
117 | 'entity': 'discoverylab',
118 | 'project': 'bioblp',
119 | 'notes': args.notes,
120 | 'config': cli_args_dict,
121 | 'offline': not args.log_wandb
122 | }
123 | )
124 |
125 | result.save_to_directory(osp.join('models', WBIDCallback.id))
126 |
127 |
128 | if __name__ == '__main__':
129 |     parser = ArgumentParser(description="Model training routine")
130 | parser.add_argument("--conf", type=str,
131 | help="Path to experiment toml file")
132 | #parser.add_argument('--out_path', type=str,
133 | # help='Path to write models output')
134 |
135 | args = parser.parse_args()
136 | conf = load_toml(args.conf)
137 | args = Arguments(**conf)
138 | run(args)
139 | #run(Arguments(explicit_bool=True).parse_args())
140 |
--------------------------------------------------------------------------------
/bioblp/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/utils/__init__.py
--------------------------------------------------------------------------------
/bioblp/utils/bioblp_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Mapping
2 |
3 | import bioblp.models.encoders as encoders
4 |
5 |
6 | def build_encoders(dim: int,
7 | entity_to_id: Mapping[str, int],
8 | protein_data: str = None,
9 | molecule_data: str = None,
10 | text_data: str = None,
11 | freeze_pretrained_embeddings: bool = False
12 | ) -> encoders.PropertyEncoderRepresentation:
13 | if not any((protein_data, molecule_data, text_data)):
14 | raise ValueError("No entity data provided to build encoders.")
15 |
16 | encoders_list = []
17 |
18 | if protein_data:
19 | protein_encoder = encoders.PretrainedLookupTableEncoder(
20 | file_path=protein_data,
21 | dim=dim,
22 | freeze_pretrained_embeddings=freeze_pretrained_embeddings
23 | )
24 | encoders_list.append(protein_encoder)
25 |
26 | if molecule_data:
27 | # TODO: We might want to set different learning rates for different
28 | # modules, potentially also with learning rate scheduling
29 | molecule_encoder = encoders.MoleculeEmbeddingEncoder(
30 | file_path=molecule_data,
31 | dim=dim
32 | )
33 | encoders_list.append(molecule_encoder)
34 |
35 | if text_data:
36 | text_encoder = encoders.TransformerTextEncoder(
37 | file_path=text_data,
38 | dim=dim
39 | )
40 | encoders_list.append(text_encoder)
41 |
42 | entity_encoders = encoders.PropertyEncoderRepresentation(
43 | dim=dim,
44 | entity_to_id=entity_to_id,
45 | encoders=encoders_list
46 | )
47 |
48 | return entity_encoders
49 |
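# Illustrative call (not part of the original module): file paths are hypothetical,
# and dim is already doubled for complex/rotate, as done in bioblp/train.py.
def _build_encoders_example(entity_to_id: Mapping[str, int]):
    return build_encoders(
        dim=512,                                                # 2 * 256
        entity_to_id=entity_to_id,
        protein_data='data/properties/protein_embeddings.pt',   # hypothetical paths
        molecule_data='data/properties/molecule_embeddings.pt',
        text_data='data/properties/disease_descriptions.pt',
        freeze_pretrained_embeddings=True,
    )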
--------------------------------------------------------------------------------
/bioblp/utils/pipeline.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | def create_pipeline(functions: list):
4 | """Sequentially executes a list of functions"""
5 | def pipeline(input):
6 | res = input
7 | for function in functions:
8 | res = function(res)
9 | return res
10 |
11 | return pipeline
12 |
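# Illustrative usage (not part of the original module): each step receives the
# previous step's output.
if __name__ == '__main__':
    normalise = create_pipeline([str.strip, str.lower, lambda s: s.replace(' ', '_')])
    print(normalise('  Protein Kinase C  '))  # prints: protein_kinase_c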
--------------------------------------------------------------------------------
/bioblp/utils/training.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Union
2 |
3 | from pykeen.training.slcwa import SLCWATrainingLoop
4 | from pykeen.models.base import Model
5 | from pykeen.losses import Loss
6 | from pykeen.typing import InductiveMode
7 | from pykeen.triples.instances import SLCWABatch
8 | import torch
9 |
10 | from bioblp.models import BioBLP
11 |
12 |
13 | class InBatchNegativesTraining(SLCWATrainingLoop):
14 | @staticmethod
15 | def _process_batch_static(
16 | model: Union[BioBLP, Model],
17 | loss: Loss,
18 | mode: Optional[InductiveMode],
19 | batch: SLCWABatch,
20 | start: Optional[int],
21 | stop: Optional[int],
22 | label_smoothing: float = 0.0,
23 | slice_size: Optional[int] = None,
24 | ) -> torch.FloatTensor:
25 | # Slicing is not possible in sLCWA training loops
26 | if slice_size is not None:
27 | raise AttributeError(
28 | "Slicing is not possible for sLCWA training loops.")
29 |
30 | positive_batch, negative_batch, positive_filter = batch
31 | positive_batch = positive_batch[start:stop].to(device=model.device)
32 |
33 | positive_scores, negative_scores = model.score_hrt_and_negatives(
34 | positive_batch,
35 | num_negatives=negative_batch.shape[1],
36 | mode=mode
37 | )
38 |
39 | return (
40 | loss.process_slcwa_scores(
41 | positive_scores=positive_scores,
42 | negative_scores=negative_scores,
43 | label_smoothing=label_smoothing,
44 | batch_filter=positive_filter,
45 | num_entities=model._get_entity_len(mode=mode),
46 | )
47 | + model.collect_regularization_term()
48 | )
--------------------------------------------------------------------------------
/bioblp/utils/triples.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import os.path as osp
4 | from collections import Counter
5 | from argparse import ArgumentParser
6 |
7 | import pandas as pd
8 | import numpy as np
9 | from tqdm import tqdm
10 | from pykeen.triples import TriplesFactory
11 |
12 | from bioblp.data import COL_SOURCE
13 | from bioblp.data import COL_EDGE
14 | from bioblp.data import COL_TARGET
15 | from bioblp.data import COL_PUBYEAR
16 |
17 | DIR_PROCESSED = 'processed'
18 |
19 | logger = logging.getLogger(__name__)
20 | handler = logging.StreamHandler()
21 | logger.addHandler(handler)
22 | logger.setLevel(logging.INFO)
23 |
24 |
25 | def get_entity_relation_counts(triples: pd.DataFrame):
26 | """Count frequency of entities and relations across triples.
27 | Entities are not counted twice if there is a self-loop."""
28 | relation_counts = triples[COL_EDGE].value_counts()
29 |
30 | no_loops = triples[COL_SOURCE] != triples[COL_TARGET]
31 | tails_no_loops = triples[COL_TARGET].where(no_loops).dropna()
32 | entities = pd.concat([triples[COL_SOURCE], tails_no_loops])
33 | entity_counts = entities.value_counts()
34 |
35 | return entity_counts, relation_counts
36 |
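# Illustrative sketch (not part of the original module, hypothetical triples):
# entity counts combine heads and tails but count self-loops only once; these
# counts are what later guard against dropping an entity's last triple from the
# training set.
def _entity_count_example():
    triples = pd.DataFrame([('P1', 'interacts', 'P2'),
                            ('P1', 'interacts', 'P3'),
                            ('P2', 'regulates', 'P2')],  # self-loop: P2 counted once here
                           columns=[COL_SOURCE, COL_EDGE, COL_TARGET])
    entity_counts, _relation_counts = get_entity_relation_counts(triples)
    return entity_counts.to_dict()  # {'P1': 2, 'P2': 2, 'P3': 1}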
37 |
38 | def split_train_test_triples(triples: pd.DataFrame, ratio: float):
39 | """Split a dataset of triples into training and test sets, so that all
40 | entities in the test set are in the training set.
41 | Triples are removed in order starting from index 0. Edges are deleted so
42 | that the initial proportion of relation types is preserved in the training
43 | set."""
44 | entity_counts, relation_counts = get_entity_relation_counts(triples)
45 | new_relation_counts = np.floor(relation_counts * ratio).astype(int)
46 |
47 | train_triples = []
48 | test_triples = []
49 | removed_relation_counts = Counter()
50 | done = {r: count == 0 for r, count in new_relation_counts.items()}
51 |
52 | with tqdm(total=new_relation_counts.sum(), desc='Removing triples') as bar:
53 | for i in range(len(triples)):
54 | row = triples.iloc[i]
55 | head = row[COL_SOURCE]
56 | rel = row[COL_EDGE]
57 | tail = row[COL_TARGET]
58 |
59 |             # Check that removing the triple does not remove its entities from the
60 |             # training set; a count larger than two is required if head == tail
61 | if entity_counts[head] > 2 and entity_counts[tail] > 2 and not done[rel]:
62 | entity_counts[head] -= 1
63 | entity_counts[tail] -= 1
64 | test_triples.append(row)
65 |
66 | removed_relation_counts[rel] += 1
67 | bar.update(1)
68 | if removed_relation_counts[rel] == new_relation_counts[rel]:
69 | done[rel] = True
70 | if all(done.values()):
71 | break
72 | else:
73 | train_triples.append(row)
74 |
75 | test_triples = pd.DataFrame(test_triples, columns=triples.columns)
76 | train_triples = pd.DataFrame(train_triples, columns=triples.columns)
77 | # Add the rest of the triples that were not removed
78 | train_triples = pd.concat([train_triples, triples.iloc[i + 1:]])
79 |
80 | print('Done!')
81 |
82 | return train_triples, test_triples
83 |
84 |
85 | def create_splits(triples_path: str, random: bool = False):
86 | """Create train/valid/test splits based on timestamps."""
87 | print('Reading triples...')
88 | triples = pd.read_csv(triples_path, sep='\t')
89 | initial_length = len(triples)
90 |
91 | triples = triples.dropna(subset=[COL_SOURCE, COL_EDGE, COL_TARGET,
92 | COL_PUBYEAR])
93 | triples[COL_PUBYEAR] = triples[COL_PUBYEAR].astype(int)
94 |
95 | # Sort whole dataframe first to ensure repeatability
96 | triples = triples.sort_values(by=list(triples.columns), kind='mergesort')
97 |
98 | if not random:
99 | # Sort by pubyear before deduplicating and removing triples!
100 | triples = triples.sort_values(by=COL_PUBYEAR, ascending=False,
101 | ignore_index=True, kind='mergesort')
102 | else:
103 | triples = triples.sample(frac=1, random_state=0)
104 |
105 | # In case of duplicates, keep most recent edge
106 | triples = triples.drop_duplicates(subset=[COL_SOURCE, COL_EDGE,
107 | COL_TARGET],
108 | keep='first')
109 |
110 | print(f'Read {initial_length:,} lines, got {len(triples):,} '
111 | 'after keeping triples with dates and deduplicating.')
112 |
113 | train_triples, test_triples = split_train_test_triples(triples, ratio=0.1)
114 |
115 | num_test_triples = len(test_triples)
116 | split_idx = num_test_triples // 2
117 | valid_triples = test_triples.iloc[split_idx:]
118 | test_triples = test_triples.iloc[:split_idx]
119 |
120 | filename = osp.basename(triples_path)
121 | name, ext = osp.splitext(filename)
122 | data_path = osp.join(osp.dirname(osp.dirname(triples_path)), DIR_PROCESSED)
123 |
124 | if not osp.exists(data_path):
125 | os.mkdir(data_path)
126 |
127 | splits = {'train': train_triples,
128 | 'valid': valid_triples,
129 | 'test': test_triples}
130 | for s, dataframe in splits.items():
131 | out_path = osp.join(data_path, f'{name}-{s}{ext}')
132 | dataframe.to_csv(out_path, sep='\t', index=False)
133 | print(f'Saved {len(dataframe):,} triples at {out_path}')
134 |
135 |
136 | def load_triples_array(path: str):
137 | """Given a path to a dataset file, extract only the colums containing
138 | (head, relation, tail) - i.e. the triples."""
139 | triples = pd.read_csv(path, sep='\t', dtype=str)
140 | triples = triples[[COL_SOURCE, COL_EDGE, COL_TARGET]].to_numpy()
141 |
142 | return triples
143 |
144 |
145 | def load_triples_factories(data_path: str, dataset: str):
146 | """Load a pykeen.triples.TriplesFactory tuple for training, validation,
147 | and testing triples."""
148 | processed_path = osp.join(data_path, DIR_PROCESSED)
149 |
150 | train_triples = load_triples_array(osp.join(processed_path,
151 | f'{dataset}-train.tsv'))
152 | valid_triples = load_triples_array(osp.join(processed_path,
153 | f'{dataset}-valid.tsv'))
154 | test_triples = load_triples_array(osp.join(processed_path,
155 | f'{dataset}-test.tsv'))
156 |
157 | training = TriplesFactory.from_labeled_triples(train_triples)
158 | validation = TriplesFactory.from_labeled_triples(
159 | valid_triples,
160 | entity_to_id=training.entity_to_id,
161 | relation_to_id=training.relation_to_id
162 | )
163 | testing = TriplesFactory.from_labeled_triples(
164 | test_triples,
165 | entity_to_id=training.entity_to_id,
166 | relation_to_id=training.relation_to_id
167 | )
168 |
169 | return training, validation, testing
170 |
171 |
172 | def reuse_existing_splits(triples_path, dataset_existing_splits):
173 | """"""
174 |
175 | triples = pd.read_csv(triples_path, sep='\t', dtype=str)
176 | initial_length = len(triples)
177 | logger.info(f"{initial_length} triples in input")
178 |
179 | triples = triples.dropna(subset=[COL_SOURCE, COL_EDGE, COL_TARGET,
180 | COL_PUBYEAR])
181 | cols = [COL_SOURCE, COL_EDGE, COL_TARGET]
182 | triples = triples[cols]
183 |
184 | filename = osp.basename(triples_path)
185 | name, ext = osp.splitext(filename)
186 | data_path = osp.join(osp.dirname(osp.dirname(triples_path)), DIR_PROCESSED)
187 |
188 | existing_train_path = osp.join(data_path, f'{dataset_existing_splits}-train{ext}')
189 | existing_val_path = osp.join(data_path, f'{dataset_existing_splits}-valid{ext}')
190 | existing_test_path = osp.join(data_path, f'{dataset_existing_splits}-test{ext}')
191 |
192 | existing_train = pd.read_csv(existing_train_path, sep='\t', dtype=str)[cols]
193 | existing_valid = pd.read_csv(existing_val_path, sep='\t', dtype=str)[cols]
194 | existing_test = pd.read_csv(existing_test_path, sep='\t', dtype=str)[cols]
195 |
196 |     all_existing_triples = pd.concat([existing_train, existing_valid,
197 |                                       existing_test]).sort_values(by=cols, kind='mergesort')
198 |
199 | logger.info(f"{len(all_existing_triples)} triples in existing {dataset_existing_splits}")
200 |
201 | all_existing_triples_records = set([tuple(x) for x in all_existing_triples.values])
202 | triple_records = [tuple(x) for x in triples.sort_values(by=cols, kind='mergesort').values]
203 |
204 | new_records = []
205 | with tqdm(total=len(triple_records), desc='Checking triple overlap') as bar:
206 | for i in range(len(triple_records)):
207 | row = triple_records[i]
208 |
209 | try:
210 | all_existing_triples_records.remove(row)
211 | except KeyError:
212 | new_records.append(row)
213 |
214 | bar.update(1)
215 | bar.set_description(
216 | f"Checking triple overlap. Remaining set: {len(all_existing_triples_records)}", refresh=True)
217 |
218 | # merge new triples plus existing train for new train
219 | new_triples = pd.DataFrame.from_records(new_records, columns=cols)
220 |     train_triples = pd.concat([new_triples, existing_train])
221 |
222 | splits = {'train': train_triples,
223 | 'valid': existing_valid,
224 | 'test': existing_test}
225 |
226 | for s, dataframe in splits.items():
227 | out_path = osp.join(data_path, f'{name}-{s}{ext}')
228 | dataframe.to_csv(out_path, sep='\t', index=False)
229 | print(f'Saved {len(dataframe):,} triples at {out_path}')
230 |
231 |
232 | if __name__ == '__main__':
233 | parser = ArgumentParser(description='Split a file of triples into '
234 | 'train/valid/test sets based on time.')
235 | parser.add_argument('file', type=str)
236 | parser.add_argument('--random', action='store_true',
237 | help='Split randomly instead.')
238 | parser.add_argument('--existing_dataset_splits', type=str,
239 | help='Name of existing splits (assumed to be in processed)')
240 |
241 | args = parser.parse_args()
242 |
243 | if args.existing_dataset_splits is not None:
244 | reuse_existing_splits(args.file, args.existing_dataset_splits)
245 | else:
246 | create_splits(args.file, args.random)
247 |
--------------------------------------------------------------------------------
/bioblp/utils/util.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import pickle
4 | import torch
5 | import dill
6 | def save_object(obj, filename):
7 | with open(filename, 'wb') as output: # Overwrites any existing file.
8 | torch.save(obj, output, pickle_module=dill)
9 |
10 |
11 | def load_object(filename):
12 |     with open(filename, 'rb') as f:
13 |         obj = torch.load(f, pickle_module=dill, encoding='utf-8')
14 |     return obj
15 |
16 | def read_query(query_filename):
17 | """
18 | Read a query from file and return as a string
19 | Parameters
20 | ----------
21 | query_filename: str name of the query. It will be looked for in the queries folder of this project
22 | Returns
23 | -------
24 | query: str the query with placeholders for the query parameters, as a string to be formatted
25 | """
26 | # query_filepath = Path(RAW_DIR / QUERY_DIR / query_filename)
27 |
28 | with open(query_filename) as fr:
29 | query = fr.read()
30 | return query
31 |
32 |
33 | def loading_animation(process, message="Loading"):
34 |     while process.is_alive():
35 |         chars = '/—\\|'
36 | for char in chars:
37 | sys.stdout.write('\r' + f'{message} {char} ')
38 | time.sleep(.1)
39 | sys.stdout.flush()
40 |
41 |
42 | def write_dict_as_pkl(dict_object, filename):
43 | """
44 |     filename: path to pickle file, should include appropriate .pkl extension
45 | """
46 | with open(filename, "wb") as pkl_handle:
47 | pickle.dump(dict_object, pkl_handle)
48 |
49 |
50 | def load_dict_from_pkl(filename):
51 | """
52 |     filename: path to pickle file, should include appropriate .pkl extension
53 | """
54 | with open(filename, "rb") as pkl_handle:
55 | dict_object = pickle.load(pkl_handle)
56 |
57 | return dict_object
58 |
59 |
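# Illustrative round trip (not part of the original module): the two pickle helpers
# above are inverses of each other; the file path is hypothetical.
if __name__ == '__main__':
    embeddings = {'P05067': [0.1, 0.2], 'P31749': [0.3, 0.4]}
    write_dict_as_pkl(embeddings, '/tmp/embeddings.pkl')
    assert load_dict_from_pkl('/tmp/embeddings.pkl') == embeddings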
--------------------------------------------------------------------------------
/conf/complex-biokg-20220826.toml:
--------------------------------------------------------------------------------
1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-train.tsv'
2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-valid.tsv'
3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-test.tsv'
4 |
5 | model = 'complex'
6 | dimension = 256
7 | loss_fn = 'crossentropy'
8 | loss_margin = 1.0
9 | optimizer = 'adagrad'
10 | learning_rate = 1e-2
11 | regularizer = 1e-6
12 | num_epochs = 20
13 | batch_size = 128
14 | eval_batch_size = 16
15 | num_negatives = 512
16 | add_inverses = false
17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank'
18 |
19 | search_train_batch_size = false
20 | search_eval_batch_size = false
21 | log_wandb = false
22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline'
23 |
--------------------------------------------------------------------------------
/conf/complex-biokg-full-20220826.toml:
--------------------------------------------------------------------------------
1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-train.tsv'
2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-valid.tsv'
3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-test.tsv'
4 |
5 | model = 'complex'
6 | dimension = 256
7 | loss_fn = 'crossentropy'
8 | loss_margin = 1.0
9 | optimizer = 'adagrad'
10 | learning_rate = 1e-2
11 | regularizer = 1e-6
12 | num_epochs = 2
13 | batch_size = 128
14 | eval_batch_size = 16
15 | num_negatives = 512
16 | add_inverses = false
17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank'
18 |
19 | search_train_batch_size = false
20 | search_eval_batch_size = false
21 | log_wandb = false
22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline'
23 |
--------------------------------------------------------------------------------
/conf/complex-hetionet-20220826.toml:
--------------------------------------------------------------------------------
1 | train_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-train.tsv'
2 | valid_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-valid.tsv'
3 | test_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-test.tsv'
4 |
5 | model = 'complex'
6 | dimension = 256
7 | loss_fn = 'crossentropy'
8 | loss_margin = 1.0
9 | optimizer = 'adagrad'
10 | learning_rate = 1e-2
11 | regularizer = 1e-6
12 | num_epochs = 200
13 | batch_size = 128
14 | eval_batch_size = 16
15 | num_negatives = 128
16 | add_inverses = false
17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank'
18 |
19 | search_train_batch_size = false
20 | search_eval_batch_size = false
21 | log_wandb = true
22 | notes = 'attempt to reproduce hetionet reported results'
--------------------------------------------------------------------------------
/conf/dpi-benchmark-cv-20230423-lr.toml:
--------------------------------------------------------------------------------
1 |
2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/"
3 | experiment_root = "data/benchmarks/experiments/DPI/"
4 |
5 | [sampling]
6 | outdir = "sampled"
7 | num_negs_per_pos = 10
8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
9 |
10 | [features]
11 | outdir = "features"
12 | transform = "concat"
13 | missing_values = "mean"
14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ]
15 |
16 | [features.encoder_args.noise]
17 | random_seed = 24
18 |
19 | [features.encoder_args.structural]
20 | proteins = "data/benchmarks/experiments/encoders/proteins"
21 | molecules = "data/benchmarks/experiments/encoders/molecules"
22 |
23 | [features.encoder_args.complex]
24 | model_dir = "data/benchmarks/experiments/encoders/complex/"
25 |
26 | [features.encoder_args.rotate]
27 | model_dir = "data/benchmarks/experiments/encoders/rotate/"
28 |
29 | [features.encoder_args.transe]
30 | model_dir = "data/benchmarks/experiments/encoders/transe/"
31 |
32 | [features.encoder_args.bioblpd]
33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/"
34 |
35 | [features.encoder_args.bioblpm]
36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/"
37 |
38 | [features.encoder_args.bioblpp]
39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/"
40 |
41 |
42 | [split]
43 | n_splits = 5
44 | outdir = "splits"
45 |
46 |
47 | [models]
48 |
49 | [models.noise_lr]
50 | feature = "noise"
51 | model = "LR"
52 |
53 | [models.structural_lr]
54 | feature = "structural"
55 | model = "LR"
56 |
57 | [models.transe_lr]
58 | feature = "transe"
59 | model = "LR"
60 |
61 | [models.complex_lr]
62 | feature = "complex"
63 | model = "LR"
64 |
65 | [models.rotate_lr]
66 | feature = "rotate"
67 | model = "LR"
68 |
69 | [models.bioblpd_lr]
70 | feature = "bioblpd"
71 | model = "LR"
72 |
73 | [models.bioblpm_lr]
74 | feature = "bioblpm"
75 | model = "LR"
76 |
77 | [models.bioblpp_lr]
78 | feature = "bioblpp"
79 | model = "LR"
80 |
81 |
82 | [train]
83 | n_iter = 10
84 | splits_file = "cv-splits.pt"
85 | refit_params = ["AUCPR", "AUCROC"]
86 | outdir = "models"
87 |
--------------------------------------------------------------------------------
/conf/dpi-benchmark-cv-20230423-mlp-1.toml:
--------------------------------------------------------------------------------
1 |
2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/"
3 | experiment_root = "data/benchmarks/experiments/DPI/"
4 |
5 | [sampling]
6 | outdir = "sampled"
7 | num_negs_per_pos = 10
8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
9 |
10 | [features]
11 | outdir = "features"
12 | transform = "concat"
13 | missing_values = "mean"
14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ]
15 |
16 | [features.encoder_args.noise]
17 | random_seed = 24
18 |
19 | [features.encoder_args.structural]
20 | proteins = "data/benchmarks/experiments/encoders/proteins"
21 | molecules = "data/benchmarks/experiments/encoders/molecules"
22 |
23 | [features.encoder_args.complex]
24 | model_dir = "data/benchmarks/experiments/encoders/complex/"
25 |
26 | [features.encoder_args.rotate]
27 | model_dir = "data/benchmarks/experiments/encoders/rotate/"
28 |
29 | [features.encoder_args.transe]
30 | model_dir = "data/benchmarks/experiments/encoders/transe/"
31 |
32 | [features.encoder_args.bioblpd]
33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/"
34 |
35 | [features.encoder_args.bioblpm]
36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/"
37 |
38 | [features.encoder_args.bioblpp]
39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/"
40 |
41 |
42 | [split]
43 | n_splits = 5
44 | outdir = "splits"
45 |
46 |
47 | [models]
48 |
49 | [models.noise_mlp]
50 | feature = "noise"
51 | model = "MLP"
52 |
53 | [models.structural_mlp]
54 | feature = "structural"
55 | model = "MLP"
56 |
57 | [models.transe_mlp]
58 | feature = "transe"
59 | model = "MLP"
60 |
61 | [models.complex_mlp]
62 | feature = "complex"
63 | model = "MLP"
64 |
65 |
66 | [train]
67 | n_iter = 10
68 | splits_file = "cv-splits.pt"
69 | refit_params = ["AUCPR", "AUCROC"]
70 | outdir = "models"
71 |
--------------------------------------------------------------------------------
/conf/dpi-benchmark-cv-20230423-mlp-2.toml:
--------------------------------------------------------------------------------
1 |
2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/"
3 | experiment_root = "data/benchmarks/experiments/DPI/"
4 |
5 | [sampling]
6 | outdir = "sampled"
7 | num_negs_per_pos = 10
8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
9 |
10 | [features]
11 | outdir = "features"
12 | transform = "concat"
13 | missing_values = "mean"
14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ]
15 |
16 | [features.encoder_args.noise]
17 | random_seed = 24
18 |
19 | [features.encoder_args.structural]
20 | proteins = "data/benchmarks/experiments/encoders/proteins"
21 | molecules = "data/benchmarks/experiments/encoders/molecules"
22 |
23 | [features.encoder_args.complex]
24 | model_dir = "data/benchmarks/experiments/encoders/complex/"
25 |
26 | [features.encoder_args.rotate]
27 | model_dir = "data/benchmarks/experiments/encoders/rotate/"
28 |
29 | [features.encoder_args.transe]
30 | model_dir = "data/benchmarks/experiments/encoders/transe/"
31 |
32 | [features.encoder_args.bioblpd]
33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/"
34 |
35 | [features.encoder_args.bioblpm]
36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/"
37 |
38 | [features.encoder_args.bioblpp]
39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/"
40 |
41 |
42 | [split]
43 | n_splits = 5
44 | outdir = "splits"
45 |
46 |
47 | [models]
48 |
49 |
50 | [models.rotate_mlp]
51 | feature = "rotate"
52 | model = "MLP"
53 |
54 | [models.bioblpd_mlp]
55 | feature = "bioblpd"
56 | model = "MLP"
57 |
58 | [models.bioblpm_mlp]
59 | feature = "bioblpm"
60 | model = "MLP"
61 |
62 | [models.bioblpp_mlp]
63 | feature = "bioblpp"
64 | model = "MLP"
65 |
66 |
67 | [train]
68 | n_iter = 10
69 | splits_file = "cv-splits.pt"
70 | refit_params = ["AUCPR", "AUCROC"]
71 | outdir = "models"
72 |
--------------------------------------------------------------------------------
/conf/dpi-benchmark-cv-20230423-rf.toml:
--------------------------------------------------------------------------------
1 |
2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/"
3 | experiment_root = "data/benchmarks/experiments/DPI/"
4 |
5 | [sampling]
6 | outdir = "sampled"
7 | num_negs_per_pos = 10
8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
9 |
10 | [features]
11 | outdir = "features"
12 | transform = "concat"
13 | missing_values = "mean"
14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ]
15 |
16 | [features.encoder_args.noise]
17 | random_seed = 24
18 |
19 | [features.encoder_args.structural]
20 | proteins = "data/benchmarks/experiments/encoders/proteins"
21 | molecules = "data/benchmarks/experiments/encoders/molecules"
22 |
23 | [features.encoder_args.complex]
24 | model_dir = "data/benchmarks/experiments/encoders/complex/"
25 |
26 | [features.encoder_args.rotate]
27 | model_dir = "data/benchmarks/experiments/encoders/rotate/"
28 |
29 | [features.encoder_args.transe]
30 | model_dir = "data/benchmarks/experiments/encoders/transe/"
31 |
32 | [features.encoder_args.bioblpd]
33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/"
34 |
35 | [features.encoder_args.bioblpm]
36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/"
37 |
38 | [features.encoder_args.bioblpp]
39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/"
40 |
41 |
42 | [split]
43 | n_splits = 5
44 | outdir = "splits"
45 |
46 |
47 | [models]
48 |
49 | [models.noise_rf]
50 | feature = "noise"
51 | model = "RF"
52 |
53 | [models.structural_rf]
54 | feature = "structural"
55 | model = "RF"
56 |
57 | [models.transe_rf]
58 | feature = "transe"
59 | model = "RF"
60 |
61 | [models.complex_rf]
62 | feature = "complex"
63 | model = "RF"
64 |
65 | [models.rotate_rf]
66 | feature = "rotate"
67 | model = "RF"
68 |
69 | [models.bioblpd_rf]
70 | feature = "bioblpd"
71 | model = "RF"
72 |
73 | [models.bioblpm_rf]
74 | feature = "bioblpm"
75 | model = "RF"
76 |
77 | [models.bioblpp_rf]
78 | feature = "bioblpp"
79 | model = "RF"
80 |
81 |
82 | [train]
83 | n_iter = 10
84 | splits_file = "cv-splits.pt"
85 | refit_params = ["AUCPR", "AUCROC"]
86 | outdir = "models"
87 |
--------------------------------------------------------------------------------
/conf/dpi-benchmark-cv-r1-20230424-mlp.toml:
--------------------------------------------------------------------------------
1 |
2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/"
3 | experiment_root = "data/benchmarks/experiments/DPI/"
4 |
5 | [sampling]
6 | outdir = "sampled"
7 | num_negs_per_pos = 1
8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
9 |
10 | [features]
11 | outdir = "features"
12 | transform = "concat"
13 | missing_values = "mean"
14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ]
15 |
16 | [features.encoder_args.noise]
17 | random_seed = 24
18 |
19 | [features.encoder_args.structural]
20 | proteins = "data/benchmarks/experiments/encoders/proteins"
21 | molecules = "data/benchmarks/experiments/encoders/molecules"
22 |
23 | [features.encoder_args.complex]
24 | model_dir = "data/benchmarks/experiments/encoders/complex/"
25 |
26 | [features.encoder_args.rotate]
27 | model_dir = "data/benchmarks/experiments/encoders/rotate/"
28 |
29 | [features.encoder_args.transe]
30 | model_dir = "data/benchmarks/experiments/encoders/transe/"
31 |
32 | [features.encoder_args.bioblpd]
33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/"
34 |
35 | [features.encoder_args.bioblpm]
36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/"
37 |
38 | [features.encoder_args.bioblpp]
39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/"
40 |
41 |
42 | [split]
43 | n_splits = 5
44 | outdir = "splits"
45 |
46 |
47 | [models]
48 |
49 | [models.noise_mlp]
50 | feature = "noise"
51 | model = "MLP"
52 |
53 | [models.structural_mlp]
54 | feature = "structural"
55 | model = "MLP"
56 |
57 | [models.transe_mlp]
58 | feature = "transe"
59 | model = "MLP"
60 |
61 | [models.complex_mlp]
62 | feature = "complex"
63 | model = "MLP"
64 |
65 | [models.rotate_mlp]
66 | feature = "rotate"
67 | model = "MLP"
68 |
69 | [models.bioblpd_mlp]
70 | feature = "bioblpd"
71 | model = "MLP"
72 |
73 | [models.bioblpm_mlp]
74 | feature = "bioblpm"
75 | model = "MLP"
76 |
77 | [models.bioblpp_mlp]
78 | feature = "bioblpp"
79 | model = "MLP"
80 |
81 |
82 | [train]
83 | n_iter = 10
84 | splits_file = "cv-splits.pt"
85 | refit_params = ["AUCPR", "AUCROC"]
86 | outdir = "models"
87 |
--------------------------------------------------------------------------------
/conf/dpi-benchmark-cv-r1-20230424-rflr.toml:
--------------------------------------------------------------------------------
1 |
2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/"
3 | experiment_root = "data/benchmarks/experiments/DPI/"
4 |
5 | [sampling]
6 | outdir = "sampled"
7 | num_negs_per_pos = 1
8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
9 |
10 | [features]
11 | outdir = "features"
12 | transform = "concat"
13 | missing_values = "mean"
14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ]
15 |
16 | [features.encoder_args.noise]
17 | random_seed = 24
18 |
19 | [features.encoder_args.structural]
20 | proteins = "data/benchmarks/experiments/encoders/proteins"
21 | molecules = "data/benchmarks/experiments/encoders/molecules"
22 |
23 | [features.encoder_args.complex]
24 | model_dir = "data/benchmarks/experiments/encoders/complex/"
25 |
26 | [features.encoder_args.rotate]
27 | model_dir = "data/benchmarks/experiments/encoders/rotate/"
28 |
29 | [features.encoder_args.transe]
30 | model_dir = "data/benchmarks/experiments/encoders/transe/"
31 |
32 | [features.encoder_args.bioblpd]
33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/"
34 |
35 | [features.encoder_args.bioblpm]
36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/"
37 |
38 | [features.encoder_args.bioblpp]
39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/"
40 |
41 |
42 | [split]
43 | n_splits = 5
44 | outdir = "splits"
45 |
46 |
47 | [models]
48 |
49 | [models.noise_lr]
50 | feature = "noise"
51 | model = "LR"
52 |
53 | [models.structural_lr]
54 | feature = "structural"
55 | model = "LR"
56 |
57 | [models.transe_lr]
58 | feature = "transe"
59 | model = "LR"
60 |
61 | [models.complex_lr]
62 | feature = "complex"
63 | model = "LR"
64 |
65 | [models.rotate_lr]
66 | feature = "rotate"
67 | model = "LR"
68 |
69 | [models.bioblpd_lr]
70 | feature = "bioblpd"
71 | model = "LR"
72 |
73 | [models.bioblpm_lr]
74 | feature = "bioblpm"
75 | model = "LR"
76 |
77 | [models.bioblpp_lr]
78 | feature = "bioblpp"
79 | model = "LR"
80 |
81 |
82 | [models.noise_rf]
83 | feature = "noise"
84 | model = "RF"
85 |
86 | [models.structural_rf]
87 | feature = "structural"
88 | model = "RF"
89 |
90 | [models.transe_rf]
91 | feature = "transe"
92 | model = "RF"
93 |
94 | [models.complex_rf]
95 | feature = "complex"
96 | model = "RF"
97 |
98 | [models.rotate_rf]
99 | feature = "rotate"
100 | model = "RF"
101 |
102 | [models.bioblpd_rf]
103 | feature = "bioblpd"
104 | model = "RF"
105 |
106 | [models.bioblpm_rf]
107 | feature = "bioblpm"
108 | model = "RF"
109 |
110 | [models.bioblpp_rf]
111 | feature = "bioblpp"
112 | model = "RF"
113 |
114 |
115 | [train]
116 | n_iter = 10
117 | splits_file = "cv-splits.pt"
118 | refit_params = ["AUCPR", "AUCROC"]
119 | outdir = "models"
120 |
--------------------------------------------------------------------------------
/data/conf/complex-biokg-20220826.toml:
--------------------------------------------------------------------------------
1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-train.tsv'
2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-valid.tsv'
3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-test.tsv'
4 |
5 | model = 'complex'
6 | dimension = 256
7 | loss_fn = 'crossentropy'
8 | loss_margin = 1.0
9 | optimizer = 'adagrad'
10 | learning_rate = 1e-2
11 | regularizer = 1e-6
12 | num_epochs = 20
13 | batch_size = 128
14 | eval_batch_size = 16
15 | num_negatives = 512
16 | add_inverses = false
17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank'
18 |
19 | search_train_batch_size = false
20 | search_eval_batch_size = false
21 | log_wandb = false
22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline'
23 |
--------------------------------------------------------------------------------
/data/conf/complex-biokg-full-20220826.toml:
--------------------------------------------------------------------------------
1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-train.tsv'
2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-valid.tsv'
3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-test.tsv'
4 |
5 | model = 'complex'
6 | dimension = 256
7 | loss_fn = 'crossentropy'
8 | loss_margin = 1.0
9 | optimizer = 'adagrad'
10 | learning_rate = 1e-2
11 | regularizer = 1e-6
12 | num_epochs = 2
13 | batch_size = 128
14 | eval_batch_size = 16
15 | num_negatives = 512
16 | add_inverses = false
17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank'
18 |
19 | search_train_batch_size = false
20 | search_eval_batch_size = false
21 | log_wandb = false
22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline'
23 |
--------------------------------------------------------------------------------
/data/conf/complex-hetionet-20220826.toml:
--------------------------------------------------------------------------------
1 | train_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-train.tsv'
2 | valid_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-valid.tsv'
3 | test_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-test.tsv'
4 |
5 | model = 'complex'
6 | dimension = 256
7 | loss_fn = 'crossentropy'
8 | loss_margin = 1.0
9 | optimizer = 'adagrad'
10 | learning_rate = 1e-2
11 | regularizer = 1e-6
12 | num_epochs = 200
13 | batch_size = 128
14 | eval_batch_size = 16
15 | num_negatives = 128
16 | add_inverses = false
17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank'
18 |
19 | search_train_batch_size = false
20 | search_eval_batch_size = false
21 | log_wandb = true
22 | notes = 'attempt to reproduce hetionet reported results'
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: bioblp
2 | channels:
3 | - huggingface
4 | - pytorch
5 | - conda-forge
6 | - defaults
7 | dependencies:
8 | - _libgcc_mutex=0.1=main
9 | - _openmp_mutex=5.1=1_gnu
10 | - anyio=3.5.0=py39h06a4308_0
11 | - appdirs=1.4.4=pyh9f0ad1d_0
12 | - argon2-cffi=21.3.0=pyhd3eb1b0_0
13 | - argon2-cffi-bindings=21.2.0=py39h7f8727e_0
14 | - asttokens=2.0.5=pyhd3eb1b0_0
15 | - babel=2.9.1=pyhd3eb1b0_0
16 | - backcall=0.2.0=pyhd3eb1b0_0
17 | - beautifulsoup4=4.11.1=py39h06a4308_0
18 | - blas=1.0=mkl
19 | - bleach=4.1.0=pyhd3eb1b0_0
20 | - bottleneck=1.3.5=py39h7deecbd_0
21 | - brotli=1.0.9=h166bdaf_7
22 | - brotli-bin=1.0.9=h166bdaf_7
23 | - brotlipy=0.7.0=py39h27cfd23_1003
24 | - bzip2=1.0.8=h7b6447c_0
25 | - ca-certificates=2022.12.7=ha878542_0
26 | - certifi=2022.12.7=pyhd8ed1ab_0
27 | - cffi=1.15.1=py39h74dc2b5_0
28 | - charset-normalizer=2.0.4=pyhd3eb1b0_0
29 | - click=8.0.4=py39h06a4308_0
30 | - contourpy=1.0.5=py39hdb19cb5_0
31 | - cryptography=37.0.1=py39h9ce1e76_0
32 | - cudatoolkit=11.3.1=h2bc3f7f_2
33 | - cycler=0.11.0=pyhd8ed1ab_0
34 | - dataclasses=0.8=pyh6d0b6a4_7
35 | - dbus=1.13.18=hb2f20db_0
36 | - debugpy=1.5.1=py39h295c915_0
37 | - decorator=5.1.1=pyhd3eb1b0_0
38 | - defusedxml=0.7.1=pyhd3eb1b0_0
39 | - entrypoints=0.4=py39h06a4308_0
40 | - executing=0.8.3=pyhd3eb1b0_0
41 | - expat=2.4.9=h6a678d5_0
42 | - ffmpeg=4.3=hf484d3e_0
43 | - filelock=3.6.0=pyhd3eb1b0_0
44 | - fontconfig=2.13.1=h6c09931_0
45 | - fonttools=4.25.0=pyhd3eb1b0_0
46 | - freetype=2.11.0=h70c0345_0
47 | - giflib=5.2.1=h7b6447c_0
48 | - glib=2.69.1=h4ff587b_1
49 | - gmp=6.2.1=h295c915_3
50 | - gnutls=3.6.15=he1e5248_0
51 | - gst-plugins-base=1.14.0=h8213a91_2
52 | - gstreamer=1.14.0=h28cd5cc_2
53 | - huggingface_hub=0.10.1=py_0
54 | - icu=58.2=he6710b0_3
55 | - idna=3.4=py39h06a4308_0
56 | - importlib-metadata=4.11.3=py39h06a4308_0
57 | - importlib_metadata=4.11.3=hd3eb1b0_0
58 | - intel-openmp=2021.4.0=h06a4308_3561
59 | - ipykernel=6.15.2=py39h06a4308_0
60 | - ipython=8.4.0=py39h06a4308_0
61 | - ipython_genutils=0.2.0=pyhd3eb1b0_1
62 | - ipywidgets=7.6.5=pyhd3eb1b0_1
63 | - jedi=0.18.1=py39h06a4308_1
64 | - jinja2=3.0.3=pyhd3eb1b0_0
65 | - joblib=1.1.0=pyhd3eb1b0_0
66 | - jpeg=9e=h7f8727e_0
67 | - json5=0.9.6=pyhd3eb1b0_0
68 | - jsonschema=4.16.0=py39h06a4308_0
69 | - jupyter=1.0.0=py39h06a4308_8
70 | - jupyter_client=7.3.5=py39h06a4308_0
71 | - jupyter_console=6.4.3=pyhd3eb1b0_0
72 | - jupyter_core=4.11.1=py39h06a4308_0
73 | - jupyter_server=1.18.1=py39h06a4308_0
74 | - jupyterlab=3.4.4=py39h06a4308_0
75 | - jupyterlab_pygments=0.1.2=py_0
76 | - jupyterlab_server=2.15.2=py39h06a4308_0
77 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1
78 | - kiwisolver=1.4.2=py39h295c915_0
79 | - krb5=1.19.2=hac12032_0
80 | - lame=3.100=h7b6447c_0
81 | - lcms2=2.12=h3be6417_0
82 | - ld_impl_linux-64=2.38=h1181459_1
83 | - lerc=3.0=h295c915_0
84 | - libbrotlicommon=1.0.9=h166bdaf_7
85 | - libbrotlidec=1.0.9=h166bdaf_7
86 | - libbrotlienc=1.0.9=h166bdaf_7
87 | - libclang=10.0.1=default_hb85057a_2
88 | - libdeflate=1.8=h7f8727e_5
89 | - libedit=3.1.20210910=h7f8727e_0
90 | - libevent=2.1.12=h8f2d780_0
91 | - libffi=3.3=he6710b0_2
92 | - libgcc-ng=11.2.0=h1234567_1
93 | - libgfortran-ng=12.2.0=h69a702a_19
94 | - libgfortran5=12.2.0=h337968e_19
95 | - libgomp=11.2.0=h1234567_1
96 | - libiconv=1.16=h7f8727e_2
97 | - libidn2=2.3.2=h7f8727e_0
98 | - libllvm10=10.0.1=hbcb73fb_5
99 | - libpng=1.6.37=hbc83047_0
100 | - libpq=12.9=h16c4e8d_3
101 | - libprotobuf=3.20.1=h4ff587b_0
102 | - libsodium=1.0.18=h7b6447c_0
103 | - libstdcxx-ng=11.2.0=h1234567_1
104 | - libtasn1=4.16.0=h27cfd23_0
105 | - libtiff=4.4.0=hecacb30_0
106 | - libunistring=0.9.10=h27cfd23_0
107 | - libuuid=1.0.3=h7f8727e_2
108 | - libwebp=1.2.4=h11a3e52_0
109 | - libwebp-base=1.2.4=h5eee18b_0
110 | - libxcb=1.15=h7f8727e_0
111 | - libxkbcommon=1.0.1=hfa300c1_0
112 | - libxml2=2.9.14=h74e7548_0
113 | - libxslt=1.1.35=h4e12654_0
114 | - lz4-c=1.9.3=h295c915_1
115 | - markupsafe=2.1.1=py39h7f8727e_0
116 | - matplotlib=3.6.2=py39hf3d152e_0
117 | - matplotlib-base=3.6.2=py39h945d387_0
118 | - matplotlib-inline=0.1.6=py39h06a4308_0
119 | - mistune=0.8.4=py39h27cfd23_1000
120 | - mkl=2021.4.0=h06a4308_640
121 | - mkl-service=2.4.0=py39h7f8727e_0
122 | - mkl_fft=1.3.1=py39hd3c417c_0
123 | - mkl_random=1.2.2=py39h51133e4_0
124 | - munkres=1.1.4=pyh9f0ad1d_0
125 | - nbclassic=0.3.5=pyhd3eb1b0_0
126 | - nbclient=0.5.13=py39h06a4308_0
127 | - nbconvert=6.4.4=py39h06a4308_0
128 | - nbformat=5.5.0=py39h06a4308_0
129 | - ncurses=6.3=h5eee18b_3
130 | - nest-asyncio=1.5.5=py39h06a4308_0
131 | - nettle=3.7.3=hbbd107a_1
132 | - notebook=6.4.12=py39h06a4308_0
133 | - nspr=4.33=h295c915_0
134 | - nss=3.74=h0370c37_0
135 | - numexpr=2.8.4=py39he184ba9_0
136 | - numpy=1.23.3=py39h14f4228_0
137 | - numpy-base=1.23.3=py39h31eccc5_0
138 | - openh264=2.1.1=h4ff587b_0
139 | - openssl=1.1.1t=h7f8727e_0
140 | - packaging=21.3=pyhd3eb1b0_0
141 | - pandocfilters=1.5.0=pyhd3eb1b0_0
142 | - parso=0.8.3=pyhd3eb1b0_0
143 | - patsy=0.5.3=pyhd8ed1ab_0
144 | - pcre=8.45=h295c915_0
145 | - pexpect=4.8.0=pyhd3eb1b0_3
146 | - pickleshare=0.7.5=pyhd3eb1b0_1003
147 | - pillow=9.2.0=py39hace64e9_1
148 | - pip=22.2.2=py39h06a4308_0
149 | - ply=3.11=py39h06a4308_0
150 | - pooch=1.6.0=pyhd8ed1ab_0
151 | - prometheus_client=0.14.1=py39h06a4308_0
152 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0
153 | - prompt_toolkit=3.0.20=hd3eb1b0_0
154 | - protobuf=3.20.1=py39h295c915_0
155 | - ptyprocess=0.7.0=pyhd3eb1b0_2
156 | - pure_eval=0.2.2=pyhd3eb1b0_0
157 | - pycparser=2.21=pyhd3eb1b0_0
158 | - pygments=2.11.2=pyhd3eb1b0_0
159 | - pyopenssl=22.0.0=pyhd3eb1b0_0
160 | - pyparsing=3.0.9=py39h06a4308_0
161 | - pyqt=5.15.7=py39h6a678d5_1
162 | - pyqt5-sip=12.11.0=py39h6a678d5_1
163 | - pyrsistent=0.18.0=py39heee7806_0
164 | - pysocks=1.7.1=py39h06a4308_0
165 | - python=3.9.13=haa1d7c7_2
166 | - python-dateutil=2.8.2=pyhd3eb1b0_0
167 | - python-fastjsonschema=2.16.2=py39h06a4308_0
168 | - python_abi=3.9=2_cp39
169 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0
170 | - pytorch-mutex=1.0=cuda
171 | - pyyaml=6.0=py39h7f8727e_1
172 | - pyzmq=23.2.0=py39h6a678d5_0
173 | - qt-main=5.15.2=h327a75a_7
174 | - qt-webengine=5.15.9=hd2b0992_4
175 | - qtconsole=5.3.2=py39h06a4308_0
176 | - qtpy=2.2.0=py39h06a4308_0
177 | - qtwebkit=5.212=h4eab89a_4
178 | - readline=8.1.2=h7f8727e_1
179 | - regex=2022.7.9=py39h5eee18b_0
180 | - requests=2.28.1=py39h06a4308_0
181 | - sacremoses=master=py_0
182 | - seaborn=0.12.2=hd8ed1ab_0
183 | - seaborn-base=0.12.2=pyhd8ed1ab_0
184 | - send2trash=1.8.0=pyhd3eb1b0_1
185 | - setuptools=63.4.1=py39h06a4308_0
186 | - sip=6.6.2=py39h6a678d5_0
187 | - six=1.16.0=pyhd3eb1b0_1
188 | - sniffio=1.2.0=py39h06a4308_1
189 | - soupsieve=2.3.1=pyhd3eb1b0_0
190 | - sqlite=3.39.3=h5082296_0
191 | - stack_data=0.2.0=pyhd3eb1b0_0
192 | - statsmodels=0.13.5=py39h7deecbd_1
193 | - terminado=0.13.1=py39h06a4308_0
194 | - testpath=0.6.0=py39h06a4308_0
195 | - tk=8.6.12=h1ccaba5_0
196 | - toml=0.10.2=pyhd3eb1b0_0
197 | - torchaudio=0.12.1=py39_cu113
198 | - torchvision=0.13.1=py39_cu113
199 | - tornado=6.2=py39h5eee18b_0
200 | - tqdm=4.64.1=py39h06a4308_0
201 | - traitlets=5.1.1=pyhd3eb1b0_0
202 | - typing-extensions=4.3.0=py39h06a4308_0
203 | - typing_extensions=4.3.0=py39h06a4308_0
204 | - tzdata=2022e=h04d1e81_0
205 | - urllib3=1.26.11=py39h06a4308_0
206 | - wcwidth=0.2.5=pyhd3eb1b0_0
207 | - webencodings=0.5.1=py39h06a4308_1
208 | - websocket-client=0.58.0=py39h06a4308_4
209 | - wheel=0.37.1=pyhd3eb1b0_0
210 | - widgetsnbextension=3.5.2=py39h06a4308_0
211 | - xz=5.2.6=h5eee18b_0
212 | - yaml=0.2.5=h7b6447c_0
213 | - zeromq=4.3.4=h2531618_0
214 | - zipp=3.8.0=py39h06a4308_0
215 | - zlib=1.2.12=h5eee18b_3
216 | - zstd=1.5.2=ha4553b6_0
217 | - pip:
218 | - alembic==1.8.1
219 | - attrs==22.1.0
220 | - autopage==0.5.1
221 | - class-resolver==0.3.10
222 | - click-default-group==1.2.2
223 | - cliff==4.0.0
224 | - cmaes==0.8.2
225 | - cmd2==2.4.2
226 | - colorlog==6.7.0
227 | - dataclasses-json==0.5.7
228 | - dill==0.3.6
229 | - docdata==0.0.3
230 | - docker-pycreds==0.4.0
231 | - gitdb==4.0.9
232 | - gitpython==3.1.29
233 | - greenlet==1.1.3.post0
234 | - mako==1.2.3
235 | - marshmallow==3.18.0
236 | - marshmallow-enum==1.5.1
237 | - more-click==0.1.1
238 | - more-itertools==9.0.0
239 | - mypy-extensions==0.4.3
240 | - networkx==3.0
241 | - optuna==3.0.3
242 | - pandas==1.5.1
243 | - pathtools==0.1.2
244 | - pbr==5.10.0
245 | - prettytable==3.4.1
246 | - promise==2.3
247 | - psutil==5.9.3
248 | - pykeen==1.9.0
249 | - pyperclip==1.8.2
250 | - pystow==0.4.6
251 | - pytz==2022.5
252 | - rexmex==0.1.2
253 | - scikit-learn==1.1.2
254 | - scipy==1.8.1
255 | - sentry-sdk==1.9.10
256 | - setproctitle==1.3.2
257 | - shortuuid==1.0.9
258 | - sklearn==0.0
259 | - smmap==5.0.0
260 | - sqlalchemy==1.4.42
261 | - stevedore==4.0.1
262 | - tabulate==0.9.0
263 | - threadpoolctl==3.1.0
264 | - tokenizers==0.10.3
265 | - torch-max-mem==0.0.4
266 | - torch-ppr==0.0.8
267 | - transformers==4.11.3
268 | - typed-argument-parser==1.7.2
269 | - typing-inspect==0.8.0
270 | - wandb==0.13.4
271 |
--------------------------------------------------------------------------------
/fig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/fig.png
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-d-complex-initialized.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-d
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout fix_bioblp_init
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
26 | --model=complex \
27 | --dimension=256 \
28 | --loss_fn=bcewithlogits \
29 | --optimizer=adam \
30 | --learning_rate=2e-5 \
31 | --warmup_fraction=0.05 \
32 | --num_epochs=100 \
33 | --batch_size=1024 \
34 | --eval_batch_size=64 \
35 | --num_negatives=512 \
36 | --in_batch_negatives=True \
37 | --from_checkpoint=models/1e9b4f4o \
38 | --log_wandb=True \
39 | --notes="ComplEx BioBLP-D initialized with 1e9b4f4o"
40 |
41 | # Keep files generated during job
42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
43 | mkdir -p $RESULTS_FOLDER
44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
45 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-d-complex.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-d-complex
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout develop
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
26 | --model=complex \
27 | --dimension=256 \
28 | --loss_fn=bcewithlogits \
29 | --optimizer=adam \
30 | --learning_rate=2e-5 \
31 | --warmup_fraction=0.05 \
32 | --num_epochs=100 \
33 | --batch_size=1024 \
34 | --eval_batch_size=64 \
35 | --num_negatives=512 \
36 | --in_batch_negatives=True \
37 | --log_wandb=True \
38 | --notes="ComplEx BioBLP-D"
39 |
40 | # Keep files generated during job
41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
42 | mkdir -p $RESULTS_FOLDER
43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
44 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-d-rotate-initialized.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-d
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout disease-encoder-checkpoint
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
26 | --model=rotate \
27 | --dimension=256 \
28 | --loss_fn=crossentropy \
29 | --optimizer=adam \
30 | --learning_rate=2e-5 \
31 | --warmup_fraction=0.05 \
32 | --num_epochs=100 \
33 | --batch_size=1024 \
34 | --eval_batch_size=64 \
35 | --num_negatives=512 \
36 | --in_batch_negatives=True \
37 | --from_checkpoint=models/36viovqn \
38 | --log_wandb=True \
39 | --notes="RotatE BioBLP-D initialized with 36viovqn, higher patience"
40 |
41 | # Keep files generated during job
42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
43 | mkdir -p $RESULTS_FOLDER
44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
45 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-d-rotate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-d
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout disease-encoder-dummy
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
26 | --model=rotate \
27 | --dimension=256 \
28 | --loss_fn=crossentropy \
29 | --optimizer=adam \
30 | --learning_rate=2e-5 \
31 | --warmup_fraction=0.05 \
32 | --num_epochs=100 \
33 | --batch_size=1024 \
34 | --eval_batch_size=64 \
35 | --num_negatives=512 \
36 | --in_batch_negatives=True \
37 | --log_wandb=True \
38 | --notes="BioBLP-D"
39 |
40 | # Keep files generated during job
41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
42 | mkdir -p $RESULTS_FOLDER
43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
44 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-d-transe-initialized.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-d-transe
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout fix_bioblp_init
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
26 | --model=transe \
27 | --dimension=512 \
28 | --loss_fn=marginranking \
29 | --loss_margin=8.155451890616455 \
30 | --optimizer=adam \
31 | --learning_rate=2e-5 \
32 | --warmup_fraction=0.05 \
33 | --num_epochs=100 \
34 | --batch_size=1024 \
35 | --eval_batch_size=64 \
36 | --num_negatives=512 \
37 | --in_batch_negatives=True \
38 | --from_checkpoint=models/394htt2x \
39 | --log_wandb=True \
40 | --notes="TransE BioBLP-D initialized with 394htt2x"
41 |
42 | # Keep files generated during job
43 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
44 | mkdir -p $RESULTS_FOLDER
45 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
46 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-d-transe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-d-transe
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout fix_bioblp_init
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
26 | --model=transe \
27 | --dimension=512 \
28 | --loss_fn=marginranking \
29 | --loss_margin=8.155451890616455 \
30 | --optimizer=adam \
31 | --learning_rate=2e-5 \
32 | --warmup_fraction=0.05 \
33 | --num_epochs=100 \
34 | --batch_size=1024 \
35 | --eval_batch_size=64 \
36 | --num_negatives=512 \
37 | --in_batch_negatives=True \
38 | --log_wandb=True \
39 | --notes="TransE BioBLP-D, margin from sage-shadow-1047"
40 |
41 | # Keep files generated during job
42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
43 | mkdir -p $RESULTS_FOLDER
44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
45 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-complex-bce-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-bioblp-m-complex-sweep
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=72:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout develop
20 | wandb agent --count 1 discoverylab/bioblp/70t4kuu5
21 |
22 | # Keep files generated during job
23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
24 | mkdir -p $RESULTS_FOLDER
25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
26 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-complex-bce-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: random
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: complex
11 | loss_fn:
12 | value: bcewithlogits
13 | optimizer:
14 | value: adam
15 | learning_rate:
16 | distribution: log_uniform_values
17 | min: 1e-3
18 | max: 1.0
19 | regularizer:
20 | distribution: log_uniform_values
21 | min: 1e-6
22 | max: 1e-3
23 | batch_size:
24 | value: 1024
25 | eval_batch_size:
26 | value: 64
27 | in_batch_negatives:
28 | value: true
29 | command:
30 | - ${env}
31 | - python
32 | - "-m"
33 | - ${program}
34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
37 | - '--search_eval_batch_size=True'
38 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt'
39 | - '--log_wandb=True'
40 | - '--notes="BioBLP-M ComplEx sweep"'
41 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-rotate-adagrad-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-bioblp-m-rotate-sweep
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=72:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 | wandb agent --count 1 discoverylab/bioblp/oouxbq6p
21 |
22 | # Keep files generated during job
23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
24 | mkdir -p $RESULTS_FOLDER
25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
26 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-rotate-adagrad-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: random
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: rotate
11 | loss_fn:
12 | value: crossentropy
13 | optimizer:
14 | value: adagrad
15 | learning_rate:
16 | distribution: log_uniform_values
17 | min: 1e-3
18 | max: 1e-1
19 | regularizer:
20 | distribution: log_uniform_values
21 | min: 1e-6
22 | max: 1e-3
23 | batch_size:
24 | value: 1024
25 | eval_batch_size:
26 | value: 64
27 | in_batch_negatives:
28 | value: true
29 | command:
30 | - ${env}
31 | - python
32 | - "-m"
33 | - ${program}
34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
37 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt'
38 | - '--log_wandb=True'
39 | - '--notes=BioBLP-M RotatE sweep'
40 | - ${args}
41 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-rotate-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-bioblp-m-rotate-sweep
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=72:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 | wandb agent --count 1 discoverylab/bioblp/liqycjns
21 |
22 | # Keep files generated during job
23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
24 | mkdir -p $RESULTS_FOLDER
25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
26 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-rotate-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: random
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: rotate
11 | loss_fn:
12 | value: crossentropy
13 | optimizer:
14 | value: adam
15 | learning_rate:
16 | distribution: log_uniform_values
17 | min: 1e-4
18 | max: 1e-1
19 | regularizer:
20 | distribution: log_uniform_values
21 | min: 1e-6
22 | max: 1e-3
23 | batch_size:
24 | value: 1024
25 | eval_batch_size:
26 | value: 64
27 | in_batch_negatives:
28 | value: true
29 | command:
30 | - ${env}
31 | - python
32 | - "-m"
33 | - ${program}
34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
37 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt'
38 | - '--log_wandb=True'
39 | - '--notes=BioBLP-M RotatE sweep'
40 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-transe-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-bioblp-m-transe-sweep
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=72:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout develop
20 | wandb agent --count 1 discoverylab/bioblp/pgx00fqa
21 |
22 | # Keep files generated during job
23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
24 | mkdir -p $RESULTS_FOLDER
25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
26 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-m-transe-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: random
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: transe
11 | dimension:
12 | value: 512
13 | loss_fn:
14 | value: marginranking
15 | optimizer:
16 | value: adam
17 | loss_margin:
18 | distribution: uniform
19 | min: 0.5
20 | max: 10.0
21 | learning_rate:
22 | distribution: log_uniform_values
23 | min: 1e-4
24 | max: 1e-1
25 | regularizer:
26 | distribution: log_uniform_values
27 | min: 1e-6
28 | max: 1e-3
29 | batch_size:
30 | value: 1024
31 | eval_batch_size:
32 | value: 64
33 | in_batch_negatives:
34 | value: true
35 | command:
36 | - ${env}
37 | - python
38 | - "-m"
39 | - ${program}
40 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
41 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
42 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
43 | - '--search_eval_batch_size=True'
44 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt'
45 | - '--log_wandb=True'
46 | - '--notes=BioBLP-M TransE sweep'
47 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-complex-bce-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-p-complex
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=72:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 | wandb agent --count 1 discoverylab/bioblp/6d2bwmy4
21 |
22 | # Keep files generated during job
23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
24 | mkdir -p $RESULTS_FOLDER
25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
26 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-complex-bce-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: random
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | loss_fn:
10 | value: bcewithlogits
11 | freeze_pretrained_embeddings:
12 | value: true
13 | learning_rate:
14 | distribution: log_uniform_values
15 | min: 1e-3
16 | max: 1.0
17 | regularizer:
18 | distribution: log_uniform_values
19 | min: 1e-6
20 | max: 1e-3
21 | batch_size:
22 | values:
23 | - 128
24 | - 256
25 | - 512
26 | command:
27 | - ${env}
28 | - python
29 | - "-m"
30 | - ${program}
31 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
32 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
33 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
34 | - '--search_eval_batch_size=True'
35 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt'
36 | - '--log_wandb=True'
37 | - '--notes="BioBLP-P ComplEx sweep"'
38 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-complex-initialized.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-p
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \
26 | --model=complex \
27 | --dimension=256 \
28 | --loss_fn=bcewithlogits \
29 | --regularizer=7.54616261352196e-05 \
30 | --freeze_pretrained_embeddings=True \
31 | --learning_rate=0.344274380857535 \
32 | --num_epochs=100 \
33 | --batch_size=512 \
34 | --eval_batch_size=64 \
35 | --num_negatives=512 \
36 | --from_checkpoint=models/1e9b4f4o \
37 | --log_wandb=True \
38 | --notes="ComplEx BioBLP-P initialized with 1e9b4f4o"
39 |
40 | # Keep files generated during job
41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
42 | mkdir -p $RESULTS_FOLDER
43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
44 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-rotate-initialized.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-p
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \
26 | --model=rotate \
27 | --dimension=256 \
28 | --loss_fn=crossentropy \
29 | --regularizer=0.0003536270470551425 \
30 | --freeze_pretrained_embeddings=True \
31 | --learning_rate=0.04972680094809032 \
32 | --num_epochs=100 \
33 | --batch_size=512 \
34 | --eval_batch_size=64 \
35 | --num_negatives=512 \
36 | --from_checkpoint=models/36viovqn \
37 | --log_wandb=True \
38 | --notes="RotatE BioBLP-P initialized with 36viovqn"
39 |
40 | # Keep files generated during job
41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
42 | mkdir -p $RESULTS_FOLDER
43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
44 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-rotate-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-p-rotate
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=72:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 | wandb agent --count 1 discoverylab/bioblp/u02tzec7
21 |
22 | # Keep files generated during job
23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
24 | mkdir -p $RESULTS_FOLDER
25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
26 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-rotate-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: random
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: rotate
11 | loss_fn:
12 | value: crossentropy
13 | freeze_pretrained_embeddings:
14 | value: true
15 | learning_rate:
16 | distribution: log_uniform_values
17 | min: 1e-3
18 | max: 1.0
19 | regularizer:
20 | distribution: log_uniform_values
21 | min: 1e-6
22 | max: 1e-3
23 | batch_size:
24 | values:
25 | - 128
26 | - 256
27 | - 512
28 | command:
29 | - ${env}
30 | - python
31 | - "-m"
32 | - ${program}
33 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
34 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
35 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
36 | - '--search_eval_batch_size=True'
37 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt'
38 | - '--log_wandb=True'
39 | - '--notes=BioBLP-P RotatE sweep'
40 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-transe-initialized.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-p
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=24:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \
26 | --model=transe \
27 | --dimension=512 \
28 | --loss_fn=marginranking \
29 | --loss_margin=7.234906889602847 \
30 | --regularizer=0.0006031667561379036 \
31 | --freeze_pretrained_embeddings=True \
32 | --learning_rate=0.03569964236328523 \
33 | --num_epochs=100 \
34 | --batch_size=256 \
35 | --eval_batch_size=64 \
36 | --num_negatives=512 \
37 | --from_checkpoint=models/394htt2x \
38 | --log_wandb=True \
39 | --notes="TransE BioBLP-P initialized with 394htt2x"
40 |
41 | # Keep files generated during job
42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
43 | mkdir -p $RESULTS_FOLDER
44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
45 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-transe-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-p-transe
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=72:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout freeze-embeddings
20 | wandb agent --count 1 discoverylab/bioblp/rw6nzzyx
21 |
22 | # Keep files generated during job
23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
24 | mkdir -p $RESULTS_FOLDER
25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
26 |
--------------------------------------------------------------------------------
/jobs/biokg-bioblp-p-transe-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: random
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: transe
11 | dimension:
12 | value: 512
13 | loss_fn:
14 | value: marginranking
15 | freeze_pretrained_embeddings:
16 | value: true
17 | loss_margin:
18 | distribution: uniform
19 | min: 0.5
20 | max: 10.0
21 | learning_rate:
22 | distribution: log_uniform_values
23 | min: 1e-3
24 | max: 1.0
25 | regularizer:
26 | distribution: log_uniform_values
27 | min: 1e-6
28 | max: 1e-3
29 | batch_size:
30 | values:
31 | - 128
32 | - 256
33 | - 512
34 | command:
35 | - ${env}
36 | - python
37 | - "-m"
38 | - ${program}
39 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
40 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
41 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
42 | - '--search_eval_batch_size=True'
43 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt'
44 | - '--log_wandb=True'
45 | - '--notes=BioBLP-P TransE sweep'
46 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-complex-bce-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-complex-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/21oekub7
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/biokg-complex-bce-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | loss_fn:
10 | value: bcewithlogits
11 | learning_rate:
12 | distribution: log_uniform_values
13 | min: 1e-3
14 | max: 1.0
15 | regularizer:
16 | distribution: log_uniform_values
17 | min: 1e-6
18 | max: 1e-3
19 | batch_size:
20 | values:
21 | - 128
22 | - 256
23 | - 512
24 | - 1024
25 | command:
26 | - ${env}
27 | - python
28 | - "-m"
29 | - ${program}
30 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
31 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
32 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
33 | - '--log_wandb=True'
34 | - '--notes="ComplEx sweep"'
35 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-complex-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-complex-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/9m2x48u3
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/biokg-complex-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | learning_rate:
10 | distribution: log_uniform_values
11 | min: 1e-3
12 | max: 1.0
13 | regularizer:
14 | distribution: log_uniform_values
15 | min: 1e-6
16 | max: 1e-3
17 | batch_size:
18 | values:
19 | - 128
20 | - 256
21 | - 512
22 | - 1024
23 | command:
24 | - ${env}
25 | - python
26 | - "-m"
27 | - ${program}
28 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
29 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
30 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
31 | - '--log_wandb=True'
32 | - '--notes="ComplEx sweep"'
33 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-rotate-bce-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-rotate-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/7q2851co
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/biokg-rotate-bce-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: rotate
11 | loss_fn:
12 | value: bcewithlogits
13 | learning_rate:
14 | distribution: log_uniform_values
15 | min: 1e-3
16 | max: 1.0
17 | regularizer:
18 | distribution: log_uniform_values
19 | min: 1e-6
20 | max: 1e-3
21 | batch_size:
22 | values:
23 | - 128
24 | - 256
25 | - 512
26 | - 1024
27 | command:
28 | - ${env}
29 | - python
30 | - "-m"
31 | - ${program}
32 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
33 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
34 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
35 | - '--log_wandb=True'
36 | - '--notes=RotatE sweep, bcewithlogits'
37 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-rotate-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-rotate-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/u75h00fl
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/biokg-rotate-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: rotate
11 | loss_fn:
12 | value: crossentropy
13 | learning_rate:
14 | distribution: log_uniform_values
15 | min: 1e-3
16 | max: 1.0
17 | regularizer:
18 | distribution: log_uniform_values
19 | min: 1e-6
20 | max: 1e-3
21 | batch_size:
22 | values:
23 | - 128
24 | - 256
25 | - 512
26 | - 1024
27 | command:
28 | - ${env}
29 | - python
30 | - "-m"
31 | - ${program}
32 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
33 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
34 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
35 | - '--log_wandb=True'
36 | - '--notes=RotatE sweep'
37 | - ${args}
--------------------------------------------------------------------------------
/jobs/biokg-transe-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=biokg-transe-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/n4zgfrhb
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/biokg-transe-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: transe
11 | dimension:
12 | value: 512
13 | loss_fn:
14 | value: marginranking
15 | loss_margin:
16 | distribution: uniform
17 | min: 0.5
18 | max: 10.0
19 | learning_rate:
20 | distribution: log_uniform_values
21 | min: 1e-3
22 | max: 1.0
23 | regularizer:
24 | distribution: log_uniform_values
25 | min: 1e-6
26 | max: 1e-3
27 | batch_size:
28 | values:
29 | - 128
30 | - 256
31 | - 512
32 | - 1024
33 | command:
34 | - ${env}
35 | - python
36 | - "-m"
37 | - ${program}
38 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv'
39 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv'
40 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv'
41 | - '--log_wandb=True'
42 | - '--notes=TransE sweep'
43 | - ${args}
--------------------------------------------------------------------------------
/jobs/complex.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=complex
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=10:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout fix_bioblp_init
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --model=complex \
26 | --dimension=256 \
27 | --loss_fn=bcewithlogits \
28 | --learning_rate=0.3595182058943781 \
29 | --regularizer=3.7579365087382533e-05 \
30 | --num_epochs=100 \
31 | --batch_size=256 \
32 | --eval_batch_size=64 \
33 | --num_negatives=512 \
34 | --log_wandb=True \
35 | --notes="ComplEx best hparams, rep"
36 |
37 | # Keep files generated during job
38 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
39 | mkdir -p $RESULTS_FOLDER
40 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
41 |
--------------------------------------------------------------------------------
/jobs/hetionet-complex-bce-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=hetionet-complex-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/ydoydkmt
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/hetionet-complex-bce-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | loss_fn:
10 | value: bcewithlogits
11 | learning_rate:
12 | distribution: log_uniform_values
13 | min: 1e-3
14 | max: 1.0
15 | regularizer:
16 | distribution: log_uniform_values
17 | min: 1e-6
18 | max: 1e-3
19 | batch_size:
20 | values:
21 | - 128
22 | - 256
23 | - 512
24 | - 1024
25 | command:
26 | - ${env}
27 | - python
28 | - "-m"
29 | - ${program}
30 | - '--train_triples=data/hetionet/hetionet.train.csv'
31 | - '--valid_triples=data/hetionet/hetionet.valid.csv'
32 | - '--test_triples=data/hetionet/hetionet.test.csv'
33 | - '--log_wandb=True'
34 | - '--notes="ComplEx sweep"'
35 | - ${args}
--------------------------------------------------------------------------------
/jobs/hetionet-complex-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=hetionet-complex-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/uvgnrmka
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/hetionet-complex-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | learning_rate:
10 | distribution: log_uniform_values
11 | min: 1e-3
12 | max: 1.0
13 | regularizer:
14 | distribution: log_uniform_values
15 | min: 1e-6
16 | max: 1e-3
17 | batch_size:
18 | values:
19 | - 128
20 | - 256
21 | - 512
22 | - 1024
23 | command:
24 | - ${env}
25 | - python
26 | - "-m"
27 | - ${program}
28 | - '--train_triples=data/hetionet/hetionet.train.csv'
29 | - '--valid_triples=data/hetionet/hetionet.valid.csv'
30 | - '--test_triples=data/hetionet/hetionet.test.csv'
31 | - '--log_wandb=True'
32 | - '--notes="ComplEx sweep"'
33 | - ${args}
--------------------------------------------------------------------------------
/jobs/hetionet-rotate-bce-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=hetionet-rotate-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/ge1smc54
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/hetionet-rotate-bce-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: rotate
11 | loss_fn:
12 | value: bcewithlogits
13 | learning_rate:
14 | distribution: log_uniform_values
15 | min: 1e-3
16 | max: 1.0
17 | regularizer:
18 | distribution: log_uniform_values
19 | min: 1e-6
20 | max: 1e-3
21 | batch_size:
22 | values:
23 | - 128
24 | - 256
25 | - 512
26 | - 1024
27 | command:
28 | - ${env}
29 | - python
30 | - "-m"
31 | - ${program}
32 | - '--train_triples=data/hetionet/hetionet.train.csv'
33 | - '--valid_triples=data/hetionet/hetionet.valid.csv'
34 | - '--test_triples=data/hetionet/hetionet.test.csv'
35 | - '--log_wandb=True'
36 | - '--notes=RotatE sweep, bcewithlogits'
37 | - ${args}
--------------------------------------------------------------------------------
/jobs/hetionet-rotate-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=hetionet-rotate-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/2iderrf0
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/hetionet-rotate-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: rotate
11 | loss_fn:
12 | value: crossentropy
13 | learning_rate:
14 | distribution: log_uniform_values
15 | min: 1e-3
16 | max: 1.0
17 | regularizer:
18 | distribution: log_uniform_values
19 | min: 1e-6
20 | max: 1e-3
21 | batch_size:
22 | values:
23 | - 128
24 | - 256
25 | - 512
26 | - 1024
27 | command:
28 | - ${env}
29 | - python
30 | - "-m"
31 | - ${program}
32 | - '--train_triples=data/hetionet/hetionet.train.csv'
33 | - '--valid_triples=data/hetionet/hetionet.valid.csv'
34 | - '--test_triples=data/hetionet/hetionet.test.csv'
35 | - '--log_wandb=True'
36 | - '--notes=RotatE sweep'
37 | - ${args}
--------------------------------------------------------------------------------
/jobs/hetionet-transe-sweep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=hetionet-transe-sweep
3 | #SBATCH --output=array_%A_%a.out
4 | #SBATCH --error=array_%A_%a.err
5 | #SBATCH --ntasks=1
6 | #SBATCH --cpus-per-task=6
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --time=40:00:00
9 | #SBATCH --mem=10G
10 | #SBATCH --partition=gpu_shared
11 | #SBATCH --gres=gpu:1
12 |
13 | PROJ_FOLDER=bioblp
14 | OUT_FOLDER=models
15 |
16 | # Copy data to scratch
17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
18 | cd $TMPDIR/$PROJ_FOLDER
19 |
20 | source activate bioblp
21 |
22 | wandb agent --count 1 discoverylab/bioblp/jfb6wo19
23 |
24 | # Keep files generated during job
25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
26 | mkdir -p $RESULTS_FOLDER
27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
28 |
--------------------------------------------------------------------------------
/jobs/hetionet-transe-sweep.yml:
--------------------------------------------------------------------------------
1 | entity: discoverylab
2 | project: bioblp
3 | program: bioblp.train
4 | method: bayes
5 | metric:
6 | name: validation.both.realistic.inverse_harmonic_mean_rank
7 | goal: maximize
8 | parameters:
9 | model:
10 | value: transe
11 | dimension:
12 | value: 512
13 | loss_fn:
14 | value: marginranking
15 | loss_margin:
16 | distribution: uniform
17 | min: 0.5
18 | max: 10.0
19 | learning_rate:
20 | distribution: log_uniform_values
21 | min: 1e-3
22 | max: 1.0
23 | regularizer:
24 | distribution: log_uniform_values
25 | min: 1e-6
26 | max: 1e-3
27 | batch_size:
28 | values:
29 | - 128
30 | - 256
31 | - 512
32 | - 1024
33 | command:
34 | - ${env}
35 | - python
36 | - "-m"
37 | - ${program}
38 | - '--train_triples=data/hetionet/hetionet.train.csv'
39 | - '--valid_triples=data/hetionet/hetionet.valid.csv'
40 | - '--test_triples=data/hetionet/hetionet.test.csv'
41 | - '--log_wandb=True'
42 | - '--notes=TransE sweep'
43 | - ${args}
--------------------------------------------------------------------------------
/jobs/rotate-dummy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=bioblp-rotate-dummy
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=08:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout disease-encoder-dummy
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/dummy_biokg_meshid_to_descr_name.tsv \
26 | --model=rotate \
27 | --dimension=256 \
28 | --loss_fn=crossentropy \
29 | --optimizer=adagrad \
30 | --regularizer=0.0002757262741946316 \
31 | --learning_rate=0.07300713133641318 \
32 | --num_epochs=100 \
33 | --batch_size=1024 \
34 | --eval_batch_size=64 \
35 | --num_negatives=512 \
36 | --in_batch_negatives=False \
37 | --log_wandb=True \
38 | --notes="BioBLP-D RotatE, no descriptions, fixed eval batch size"
39 |
40 | # Keep files generated during job
41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
42 | mkdir -p $RESULTS_FOLDER
43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
44 |
--------------------------------------------------------------------------------
/jobs/rotate.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
 2 | #SBATCH --job-name=bioblp-rotate
3 | #SBATCH --ntasks=1
4 | #SBATCH --cpus-per-task=18
5 | #SBATCH --time=01:00:00
6 | #SBATCH --mem=16G
7 | #SBATCH --partition=gpu
8 | #SBATCH --gpus=1
9 |
10 | PROJ_FOLDER=bioblp
11 | OUT_FOLDER=models
12 |
13 | # Copy data to scratch
14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR
15 | cd $TMPDIR/$PROJ_FOLDER
16 |
17 | source activate bioblp
18 |
19 | git checkout disease-encoder
20 |
21 | python -m bioblp.train \
22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \
23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \
24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \
25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \
26 | --model=rotate \
27 | --dimension=256 \
28 | --loss_fn=crossentropy \
29 | --optimizer=adam \
30 | --learning_rate=2e-5 \
31 | --warmup_fraction=0.05 \
32 | --num_epochs=10 \
33 | --batch_size=1024 \
34 | --search_eval_batch_size=True \
35 | --eval_every=1 \
36 | --num_negatives=512 \
37 | --in_batch_negatives=True \
38 | --log_wandb=True \
39 | --notes="BioBLP-D 10 epoch test"
40 |
41 | # Keep files generated during job
42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER
43 | mkdir -p $RESULTS_FOLDER
44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER
45 |
--------------------------------------------------------------------------------
/loaders/placeholder.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/loaders/placeholder.txt
--------------------------------------------------------------------------------
/logs/placeholder.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/logs/placeholder.txt
--------------------------------------------------------------------------------
/notebooks/01_01_biokg-data-prep-for-kge.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "id": "dd58a8cf",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "%load_ext autoreload\n",
11 | "%autoreload 2"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 11,
17 | "id": "b05d473c",
18 | "metadata": {
19 | "tags": []
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import pandas as pd \n",
24 | "from pathlib import Path\n",
25 | "import toml\n",
26 | "\n",
27 | "from bioblp.data import COL_SOURCE, COL_TARGET,COL_EDGE\n",
28 | "from bioblp.data import create_random_splits"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "id": "f36dd753",
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "DATA_DIR = Path(\"../data\")\n",
39 | "SHARED_DATA_DIR = Path(\"/home/jovyan/workbench-shared-folder/bioblp/data\")\n",
40 | "config_path = DATA_DIR.joinpath(\"conf/complex-biokg-20220826.toml\")\n",
41 | "biokg_mini_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links_sample.tsv\")\n",
42 | "biokg_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links.tsv\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "id": "f4732983-308b-44d7-8fd9-43a3b1506819",
48 | "metadata": {
49 | "tags": []
50 | },
51 | "source": [
52 | "### BIOKG Data Prep"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 7,
58 | "id": "918f0203",
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/html": [
 64 |        "<table border=\"1\" class=\"dataframe\">\n",
 65 |        "  <thead>\n",
 66 |        "    <tr style=\"text-align: right;\">\n",
 67 |        "      <th></th>\n",
 68 |        "      <th>src</th>\n",
 69 |        "      <th>edg</th>\n",
 70 |        "      <th>tgt</th>\n",
 71 |        "    </tr>\n",
 72 |        "  </thead>\n",
 73 |        "  <tbody>\n",
 74 |        "    <tr>\n",
 75 |        "      <th>0</th>\n",
 76 |        "      <td>C566487</td>\n",
 77 |        "      <td>DISEASE_PATHWAY_ASSOCIATION</td>\n",
 78 |        "      <td>hsa00071</td>\n",
 79 |        "    </tr>\n",
 80 |        "    <tr>\n",
 81 |        "      <th>1</th>\n",
 82 |        "      <td>C567839</td>\n",
 83 |        "      <td>DISEASE_PATHWAY_ASSOCIATION</td>\n",
 84 |        "      <td>map04810</td>\n",
 85 |        "    </tr>\n",
 86 |        "  </tbody>\n",
 87 |        "</table>"
 88 |       ],
104 | "text/plain": [
105 | " src edg tgt\n",
106 | "0 C566487 DISEASE_PATHWAY_ASSOCIATION hsa00071\n",
107 | "1 C567839 DISEASE_PATHWAY_ASSOCIATION map04810"
108 | ]
109 | },
110 | "execution_count": 7,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "#df = pd.read_csv(biokg_mini_path, delimiter=\"\\t\", names=[\"idx\", COL_SOURCE, COL_EDGE, COL_TARGET], header=0)\n",
117 | "df = pd.read_csv(biokg_path, delimiter=\"\\t\", names=[COL_SOURCE, COL_EDGE, COL_TARGET], header=None)\n",
118 | "df.head(2)"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "id": "37dac0a0-108d-4f4c-a1f3-95e985ca9db7",
124 | "metadata": {},
125 | "source": [
126 | "Create data splits"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 8,
132 | "id": "cb5e4b6d",
133 | "metadata": {},
134 | "outputs": [
135 | {
136 | "name": "stderr",
137 | "output_type": "stream",
138 | "text": [
139 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n",
140 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n",
141 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n"
142 | ]
143 | }
144 | ],
145 | "source": [
146 | "train, test, valid = create_random_splits(df, 0.9, 0.05, 0.05)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 11,
152 | "id": "d06a6c1e",
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "saved to ../data/raw/biokg_full_splits\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "SAVE_SPLITS_TO_DISK = False\n",
165 | "dataset_name = 'biokg_random_900505'\n",
166 | "datasplits_dir = DATA_DIR.joinpath(\"raw/biokg_full_splits\")\n",
167 | "\n",
168 | "if SAVE_SPLITS_TO_DISK:\n",
169 | " save_splits(train_df=train,\n",
170 | " test_df=test, \n",
171 | " valid_df=valid,\n",
172 | " dataset_name=dataset_name,\n",
173 | " out_dir=datasplits_dir)"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "id": "ed5633c4-cf9f-477f-a468-582bbf91146d",
179 | "metadata": {},
180 | "source": [
181 | "### Training"
182 | ]
183 | },
184 | {
185 | "cell_type": "markdown",
186 | "id": "388a8210-89f0-435f-8405-81b8c38caa12",
187 | "metadata": {},
188 | "source": [
189 | "```bash\n",
190 | "$ python -m bioblp.train_argparse --conf /home/jovyan/BioBLP/data/conf/complex-biokg-full-20220826.toml\n",
191 | "```"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "id": "773a6c74-333b-49e8-b2df-022574889217",
198 | "metadata": {},
199 | "outputs": [],
200 | "source": []
201 | }
202 | ],
203 | "metadata": {
204 | "kernelspec": {
205 | "display_name": ".conda-bioblp-env [Python]",
206 | "language": "python",
207 | "name": "conda-env-.conda-bioblp-env-py"
208 | },
209 | "language_info": {
210 | "codemirror_mode": {
211 | "name": "ipython",
212 | "version": 3
213 | },
214 | "file_extension": ".py",
215 | "mimetype": "text/x-python",
216 | "name": "python",
217 | "nbconvert_exporter": "python",
218 | "pygments_lexer": "ipython3",
219 | "version": "3.8.13"
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 5
224 | }
225 |
--------------------------------------------------------------------------------
/notebooks/03-00-nested-cv.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "f8467842-5b37-4dc9-83f0-a684ed4a5fdd",
6 | "metadata": {},
7 | "source": [
8 | "# Run nested CV routine"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "id": "259edda9-e110-4e05-b1de-2965c45ef58b",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import random\n",
 19 |     "from pathlib import Path\n",
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "\n",
23 | "from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET\n",
 24 |     "from bioblp.logger import get_logger\n",
25 | "import torch\n",
26 | "\n",
27 | "\n",
28 | "logger = get_logger(__name__)\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "id": "134fd3c5",
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "DATA_DIR = Path(\"../data/\")\n",
39 | "DATA_SHARED = Path(\"/home/jovyan/workbench-shared-folder/bioblp\")"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "id": "eee761be",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "from time import time\n",
50 | "from pathlib import Path\n",
51 | "from collections import defaultdict\n",
52 | "\n",
53 | "from bioblp.benchmarking.train import run_nested_cv\n",
54 | "from bioblp.benchmarking.train import get_scorers\n",
55 | "\n"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": null,
61 | "id": "326edf30",
62 | "metadata": {},
63 | "outputs": [],
64 | "source": [
65 | "\"\"\"Perform train run\"\"\"\n",
66 | "\n",
67 | "# reproducibility\n",
 68 |     "SEED = 42  # assumed value for the global reproducibility seed\n",
69 | "shuffle = True\n",
70 | "refit_params = [\"AUCPR\", \"AUCROC\"]\n",
71 | "\n",
72 | "data_dir = Path(\"../data/features/kge-1baon0eg/\")\n",
73 | "out_dir = Path(\"../data/runs/\")\n",
74 | "\n",
75 | "n_proc = 1\n",
76 | "n_iter = 2\n",
77 | "inner_n_folds = 3\n",
78 | "outer_n_folds = 5\n",
79 | "\n",
80 | "exp_output = defaultdict(dict)\n",
81 | "exp_output[\"config\"] = {\n",
82 | " \"n_proc\": n_proc,\n",
83 | " \"n_iter\": n_iter,\n",
84 | " \"inner_n_folds\": inner_n_folds,\n",
85 | " \"outer_n_folds\": outer_n_folds,\n",
86 | " \"data_dir\": data_dir,\n",
87 | " \"seed\": SEED,\n",
88 | " \"shuffle\": shuffle\n",
89 | "}\n",
90 | "\n",
91 | "start = time()\n",
92 | "run_timestamp = int(start)\n",
93 | "\n",
94 | "logger.info(\"Starting model building script at {}.\".format(start))\n",
95 | "\n",
96 | "############\n",
97 | "# Load data\n",
98 | "############\n",
99 | "logger.info(\"Loading training data...\")\n",
100 | "\n",
101 | "X_train = np.load(data_dir.joinpath(\"X.npy\"))\n",
102 | "y_train = np.load(data_dir.joinpath(\"y.npy\"))\n",
103 | "\n",
104 | "logger.info(\n",
105 | " \"Resulting shapes X_train: {}, y_train: {}\".format(\n",
106 | " X_train.shape, y_train.shape)\n",
107 | ")\n",
108 | "logger.info(\"Counts in y_train: {}\".format(\n",
109 | " np.unique(y_train, return_counts=True)))\n",
110 | "\n",
111 | "############\n",
112 | "# Setup classifiers & pipelines\n",
113 | "############\n",
114 | "\n",
115 | "lr_label = \"LR\"\n",
116 | "rf_label = \"RF\"\n",
117 | "MLP_label = \"MLP\"\n",
118 | "\n",
119 | "############\n",
120 | "# Compare models\n",
121 | "############\n",
122 | "\n",
123 | "candidates = [\n",
124 | " lr_label,\n",
125 | " # rf_label,\n",
126 | " # MLP_label\n",
127 | "\n",
128 | "]\n",
129 | "\n",
130 | "scorer = get_scorers()\n",
131 | "\n",
132 | "nested_cv_scores = run_nested_cv(\n",
133 | " candidates=candidates,\n",
134 | " X=X_train,\n",
135 | " y=y_train,\n",
136 | " scoring=scorer,\n",
137 | " inner_n_folds=inner_n_folds,\n",
138 | " inner_n_iter=n_iter,\n",
139 | " outer_n_folds=outer_n_folds,\n",
140 | " shuffle=shuffle,\n",
141 | " n_jobs=n_proc,\n",
142 | " refit_params=refit_params,\n",
143 | " random_state=SEED,\n",
144 | " outdir=out_dir,\n",
145 | " timestamp=run_timestamp\n",
146 | ")\n",
147 | "\n",
148 | "for algo, scores in nested_cv_scores.items():\n",
149 | " logger.info(\"Scores {}: {}\".format(algo, scores))\n",
150 | "\n",
151 | "exp_output[\"results\"] = nested_cv_scores\n",
152 | "\n",
153 | "logger.info(exp_output)\n",
154 | "\n",
155 | "file_out = out_dir.joinpath(\n",
156 | " \"nested_cv_scores_{}.npy\".format(run_timestamp))\n",
157 | "logger.info(\"Saving to {}\".format(file_out))\n",
158 | "np.save(file_out, exp_output)\n",
159 | "\n",
160 | "end = time()\n",
161 | "\n",
162 | "logger.info(\"Ran script in {} seconds\".format(str(end - start)))"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "id": "703ff89a-dd11-4fb0-bdcb-87e9fa41e20a",
168 | "metadata": {},
169 | "source": [
170 | "_____"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": null,
176 | "id": "a6594c30-e73d-4214-989c-54512bef0e5b",
177 | "metadata": {},
178 | "outputs": [],
179 | "source": []
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "id": "df67346c-124a-49ec-9cfe-913d273f66c2",
185 | "metadata": {},
186 | "outputs": [],
187 | "source": []
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "id": "58d97f92-0a46-4bd0-92be-7124e6c91768",
193 | "metadata": {},
194 | "outputs": [],
195 | "source": []
196 | }
197 | ],
198 | "metadata": {
199 | "kernelspec": {
200 | "display_name": ".conda-bioblp-env [Python]",
201 | "language": "python",
202 | "name": "conda-env-.conda-bioblp-env-py"
203 | },
204 | "language_info": {
205 | "codemirror_mode": {
206 | "name": "ipython",
207 | "version": 3
208 | },
209 | "file_extension": ".py",
210 | "mimetype": "text/x-python",
211 | "name": "python",
212 | "nbconvert_exporter": "python",
213 | "pygments_lexer": "ipython3",
214 | "version": "3.9.13"
215 | },
216 | "vscode": {
217 | "interpreter": {
218 | "hash": "c313b0b0929f94c03130caa81adcdac46c3c408d7f1caca6c1104b192c16f937"
219 | }
220 | }
221 | },
222 | "nbformat": 4,
223 | "nbformat_minor": 5
224 | }
225 |
--------------------------------------------------------------------------------
/notebooks/03-frequency-baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "# Evaluating frequency-based baselines for link prediction\n",
7 | "\n",
  8 |     "Some knowledge graphs contain particularly frequent instances (either relations or entities) that a model can exploit to learn spurious correlations, which inflate ranking metrics because these are computed as micro-averages.\n",
9 | "A sanity check thus consists of running a baseline that simply uses counts, which can be compared with models that are supposed to generalize much better."
10 | ],
11 | "metadata": {
12 | "collapsed": false,
13 | "pycharm": {
14 | "name": "#%% md\n"
15 | }
16 | }
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 5,
21 | "metadata": {
22 | "collapsed": true,
23 | "pycharm": {
24 | "name": "#%%\n"
25 | }
26 | },
27 | "outputs": [],
28 | "source": [
29 | "import os.path as osp\n",
30 | "\n",
31 | "from pykeen.models.baseline import MarginalDistributionBaseline\n",
32 | "from pykeen.triples import TriplesFactory\n",
33 | "from pykeen.evaluation import RankBasedEvaluator, evaluate\n",
34 | "import torch"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "source": [
40 | "## Data loading"
41 | ],
42 | "metadata": {
43 | "collapsed": false,
44 | "pycharm": {
45 | "name": "#%% md\n"
46 | }
47 | }
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 6,
52 | "outputs": [],
53 | "source": [
54 | "graph_path = osp.join('..', 'data', 'biokgb', 'graph')\n",
55 | "train_triples = 'biokg.links-train.csv'\n",
56 | "valid_triples = 'biokg.links-valid.csv'\n",
57 | "test_triples = 'biokg.links-test.csv'\n",
58 | "\n",
59 | "train, valid, test = [TriplesFactory.from_path(osp.join(graph_path, f)) for f in (train_triples, valid_triples, test_triples)]"
60 | ],
61 | "metadata": {
62 | "collapsed": false,
63 | "pycharm": {
64 | "name": "#%%\n"
65 | }
66 | }
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "source": [
71 | "## Instantiating a frequency-based baseline\n",
72 | "\n",
73 | "PyKEEN comes with a set of interesting baselines that, ideally, any machine learning model should outperform. Here we will use the [`MarginalDistributionBaseline`](https://pykeen.readthedocs.io/en/stable/api/pykeen.models.MarginalDistributionBaseline.html).\n",
74 | "\n",
75 | "When predicting the tail for a triple (h, r, t), the model scores each possible tail t as the probability that t co-occurs with r times the probability that t co-occurs with h:\n",
76 | "\n",
77 | "$$\n",
78 | "P(t\\vert h, r) = P(t\\vert r) P(t\\vert h)\n",
79 | "$$"
80 | ],
81 | "metadata": {
82 | "collapsed": false,
83 | "pycharm": {
84 | "name": "#%% md\n"
85 | }
86 | }
87 | },
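  {
   "cell_type": "markdown",
   "source": [
    "The next cell is an illustrative sketch (it is not part of PyKEEN, and the entity and relation names are made up): it computes this kind of marginal score from raw co-occurrence counts on a toy triple list."
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% md\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Toy sketch of the marginal score P(t|r) * P(t|h), estimated from counts\n",
    "# (hypothetical entities/relations; not the PyKEEN implementation).\n",
    "from collections import Counter\n",
    "\n",
    "toy_triples = [('d1', 'treats', 'p1'), ('d1', 'treats', 'p2'),\n",
    "               ('d2', 'treats', 'p1'), ('d2', 'interacts', 'p3')]\n",
    "\n",
    "tail_given_rel = Counter((r, t) for _, r, t in toy_triples)\n",
    "tail_given_head = Counter((h, t) for h, _, t in toy_triples)\n",
    "rel_counts = Counter(r for _, r, _ in toy_triples)\n",
    "head_counts = Counter(h for h, _, _ in toy_triples)\n",
    "\n",
    "def score_tail(h, r, t):\n",
    "    # probability that t co-occurs with r, times probability that t co-occurs with h\n",
    "    return (tail_given_rel[(r, t)] / rel_counts[r]) * (tail_given_head[(h, t)] / head_counts[h])\n",
    "\n",
    "score_tail('d1', 'treats', 'p1')  # (2/3) * (1/2) = 0.333..."
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },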
88 | {
89 | "cell_type": "code",
90 | "execution_count": 7,
91 | "outputs": [],
92 | "source": [
93 | "model = MarginalDistributionBaseline(train)\n",
94 | "# An ugly hack to add a dummy parameter to this non-parametric baseline\n",
95 | "# so that evaluation works as for models with learnable parameters\n",
96 | "model.foo = torch.nn.Embedding(1, 2)"
97 | ],
98 | "metadata": {
99 | "collapsed": false,
100 | "pycharm": {
101 | "name": "#%%\n"
102 | }
103 | }
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "source": [
108 | "## Evaluation\n",
109 | "\n",
110 | "We now get the ranking metrics on the test set, using triples in the training, validation, and test sets for filtering.\n",
111 | "\n",
112 | "**Warning:** the next cell can take around half an hour to run."
113 | ],
114 | "metadata": {
115 | "collapsed": false,
116 | "pycharm": {
117 | "name": "#%% md\n"
118 | }
119 | }
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 10,
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": "Evaluating on cpu: 0%| | 0.00/185k [00:00, ?triple/s]",
128 | "application/vnd.jupyter.widget-view+json": {
129 | "version_major": 2,
130 | "version_minor": 0,
131 | "model_id": "c19c7651c4d148c4a90c6c58a905d73d"
132 | }
133 | },
134 | "metadata": {},
135 | "output_type": "display_data"
136 | }
137 | ],
138 | "source": [
139 | "evaluator = RankBasedEvaluator()\n",
140 | "results = evaluate(model, test.mapped_triples, evaluator, batch_size=1024, mode=None, device=torch.device('cpu'),\n",
141 | " additional_filter_triples=[train.mapped_triples, valid.mapped_triples, test.mapped_triples])"
142 | ],
143 | "metadata": {
144 | "collapsed": false,
145 | "pycharm": {
146 | "name": "#%%\n"
147 | }
148 | }
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 17,
153 | "outputs": [
154 | {
155 | "name": "stdout",
156 | "output_type": "stream",
157 | "text": [
158 | "both.inverse_harmonic_mean_rank 0.07\n",
159 | "both.hits_at_1 0.07\n",
160 | "both.hits_at_3 0.07\n",
161 | "both.hits_at_10 0.07\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "metrics = ['both.inverse_harmonic_mean_rank',\n",
167 | " 'both.hits_at_1',\n",
168 | " 'both.hits_at_3',\n",
169 | " 'both.hits_at_10']\n",
170 | "\n",
171 | "for m in metrics:\n",
172 | " print(f'{m:<40}{results.get_metric(m) * 100:.2f}')"
173 | ],
174 | "metadata": {
175 | "collapsed": false,
176 | "pycharm": {
177 | "name": "#%%\n"
178 | }
179 | }
180 | }
181 | ],
182 | "metadata": {
183 | "kernelspec": {
184 | "display_name": "Python 3",
185 | "language": "python",
186 | "name": "python3"
187 | },
188 | "language_info": {
189 | "codemirror_mode": {
190 | "name": "ipython",
191 | "version": 2
192 | },
193 | "file_extension": ".py",
194 | "mimetype": "text/x-python",
195 | "name": "python",
196 | "nbconvert_exporter": "python",
197 | "pygments_lexer": "ipython2",
198 | "version": "2.7.6"
199 | }
200 | },
201 | "nbformat": 4,
202 | "nbformat_minor": 0
203 | }
--------------------------------------------------------------------------------
/notebooks/06-hetionet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "source": [
6 | "# Exporting Hetionet from PyKEEN\n",
7 | "\n",
8 | "We will use PyKEEN to export Hetionet, because we need the string identifiers to retrieve properties.\n",
9 | "\n",
10 | "The resulting triples will be stored in `data/hetionet`."
11 | ],
12 | "metadata": {
13 | "collapsed": false
14 | }
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 1,
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "outputs": [],
23 | "source": [
24 | "import os.path as osp\n",
25 | "import os\n",
26 | "\n",
27 | "import pandas as pd\n",
28 | "from pykeen.datasets import Hetionet"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "outputs": [],
35 | "source": [
36 | "dataset = Hetionet()\n",
37 | "train, valid, test = dataset.training, dataset.validation, dataset.testing\n",
38 | "splits_dict = {'train': train, 'valid': valid, 'test':test}"
39 | ],
40 | "metadata": {
41 | "collapsed": false
42 | }
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "source": [
47 | "## Some stats"
48 | ],
49 | "metadata": {
50 | "collapsed": false
51 | }
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 3,
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "There are 45,158 entities and 24 relations.\n",
62 | " Split Triples\n",
63 | "--------------------\n",
64 | " train 1,800,157\n",
65 | " valid 225,020\n",
66 | " test 225,020\n"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "print(f'There are {dataset.num_entities:,} entities and {dataset.num_relations:,} relations.')\n",
72 | "print(f'{\"Split\":^10}{\"Triples\":>10}')\n",
73 | "print('-' * 20)\n",
74 | "for name, split in splits_dict.items():\n",
75 | " print(f'{name:^10}{split.num_triples:>10,}')"
76 | ],
77 | "metadata": {
78 | "collapsed": false
79 | }
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 4,
84 | "outputs": [
85 | {
86 | "name": "stderr",
87 | "output_type": "stream",
88 | "text": [
89 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n",
90 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n",
91 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "out_path = osp.join('..', 'data', 'hetionet')\n",
97 | "if not osp.exists(out_path):\n",
98 | " os.mkdir(out_path)\n",
99 | "\n",
100 | "for name, split in splits_dict.items():\n",
101 | " pd.DataFrame(split.triples).to_csv(osp.join(out_path, f'hetionet.{name}.csv'), sep='\\t', index=False, header=False)"
102 | ],
103 | "metadata": {
104 | "collapsed": false
105 | }
106 | }
107 | ],
108 | "metadata": {
109 | "kernelspec": {
110 | "display_name": "Python 3",
111 | "language": "python",
112 | "name": "python3"
113 | },
114 | "language_info": {
115 | "codemirror_mode": {
116 | "name": "ipython",
117 | "version": 2
118 | },
119 | "file_extension": ".py",
120 | "mimetype": "text/x-python",
121 | "name": "python",
122 | "nbconvert_exporter": "python",
123 | "pygments_lexer": "ipython2",
124 | "version": "2.7.6"
125 | }
126 | },
127 | "nbformat": 4,
128 | "nbformat_minor": 0
129 | }
130 |
--------------------------------------------------------------------------------
/notebooks/99-train_hetionet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "id": "dd58a8cf",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "%load_ext autoreload\n",
11 | "%autoreload 2"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 11,
17 | "id": "b05d473c",
18 | "metadata": {
19 | "tags": []
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import pandas as pd \n",
24 | "from pathlib import Path\n",
25 | "import toml\n",
26 | "\n",
27 | "from bioblp.data import COL_SOURCE, COL_TARGET,COL_EDGE\n",
28 | "from bioblp.data import create_random_splits"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "id": "f36dd753",
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "DATA_DIR = Path(\"../data\")\n",
39 | "SHARED_DATA_DIR = Path(\"/home/jovyan/workbench-shared-folder/bioblp/data\")\n",
40 | "config_path = DATA_DIR.joinpath(\"conf/complex-biokg-20220826.toml\")\n",
41 | "biokg_mini_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links_sample.tsv\")\n",
42 | "biokg_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links.tsv\")"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "id": "56b4e0a0",
48 | "metadata": {},
49 | "source": [
50 | "## Hetionet"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 21,
56 | "id": "cbbb5a42",
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | "EagerDataset (create_inverse_triples=False)\n",
64 | "Name Entities Relations Triples\n",
65 | "---------- ---------- ----------- ---------\n",
66 | "Training 45158 24 1800157\n",
67 | "Testing 45158 24 225020\n",
68 | "Validation 45158 24 225020\n",
69 | "Total - - 2250197\n",
70 | "Head Relation tail\n",
71 | "----------------------- ---------- ------------\n",
72 | "Anatomy::UBERON:0000002 AdG Gene::10005\n",
73 | "Anatomy::UBERON:0000002 AdG Gene::114804\n",
74 | "Anatomy::UBERON:0000002 AdG Gene::118670\n",
75 | "Anatomy::UBERON:0000002 AdG Gene::128989\n",
76 | "Anatomy::UBERON:0000002 AdG Gene::132851\n",
77 | "\n"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "from pykeen.datasets import Hetionet\n",
83 | "from pykeen.datasets import get_dataset\n",
84 | "\n",
85 | "ds = get_dataset(dataset=Hetionet)\n",
86 | "ds.summarize()"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 39,
92 | "id": "35ad86ee",
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "name": "stderr",
97 | "output_type": "stream",
98 | "text": [
99 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n",
100 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n",
101 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "triples = Hetionet().factory_dict\n",
107 | "test = pd.DataFrame(triples['testing'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])\n",
108 | "train = pd.DataFrame(triples['training'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])\n",
109 | "valid = pd.DataFrame(triples['validation'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 41,
115 | "id": "978049a9",
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "0.10000013332166029"
122 | ]
123 | },
124 | "execution_count": 41,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "len(test)/(len(train)+ len(test) +len(valid))"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 42,
136 | "id": "d6068102",
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "saved to ../data/raw/hetionet_splits\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "SAVE_SPLITS_TO_DISK = False\n",
149 | "hetio_dataset_name = 'hetionet_random_801010'\n",
150 | "hetio_datasplits_dir = DATA_DIR.joinpath(\"raw/hetionet_splits\")\n",
151 | "\n",
152 | "if SAVE_SPLITS_TO_DISK:\n",
153 | " save_splits(train_df=train,\n",
154 | " test_df=test, \n",
155 | " valid_df=valid,\n",
156 |     "                dataset_name=hetio_dataset_name,\n",
157 | " out_dir=hetio_datasplits_dir)"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": null,
163 | "id": "3459292c",
164 | "metadata": {},
165 | "outputs": [],
166 | "source": []
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 15,
171 | "id": "527f6a4d",
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/plain": [
177 | "{'train_triples': 'data',\n",
178 | " 'valid_triples': 'data',\n",
179 | " 'test_triples': 'data',\n",
180 | " 'model': 'complex',\n",
181 | " 'dimension': 256,\n",
182 | " 'loss_fn': 'crossentropy',\n",
183 | " 'loss_margin': 1.0,\n",
184 | " 'optimizer': 'adagrad',\n",
185 | " 'learning_rate': 0.01,\n",
186 | " 'regularizer': 1e-06,\n",
187 | " 'num_epochs': 100,\n",
188 | " 'batch_size': 1024,\n",
189 | " 'eval_batch_size': 16,\n",
190 | " 'num_negatives': 512,\n",
191 | " 'add_inverses': False,\n",
192 | " 'early_stopper': 'both.realistic.inverse_harmonic_mean_rank',\n",
193 | " 'search_train_batch_size': False,\n",
194 | " 'search_eval_batch_size': False,\n",
195 | " 'log_wandb': False}"
196 | ]
197 | },
198 | "execution_count": 15,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "def load_toml(toml_path: str) -> dict:\n",
205 | " toml_path = Path(toml_path)\n",
206 | "\n",
207 | " config = {}\n",
208 | "\n",
209 | " with open(toml_path, \"r\") as f:\n",
210 | " config = toml.load(f)\n",
211 | "\n",
212 | " return config\n",
213 | "\n",
214 | "config = load_toml(config_path)\n",
215 | "config"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "id": "ed5633c4-cf9f-477f-a468-582bbf91146d",
221 | "metadata": {},
222 | "source": [
223 | "### Training"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "id": "388a8210-89f0-435f-8405-81b8c38caa12",
229 | "metadata": {},
230 | "source": [
231 | "```bash\n",
232 | "$ python -m bioblp.train_argparse --conf /home/jovyan/BioBLP/data/conf/complex-hetionet-20220826.toml\n",
233 | "```"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "id": "773a6c74-333b-49e8-b2df-022574889217",
240 | "metadata": {},
241 | "outputs": [],
242 | "source": []
243 | }
244 | ],
245 | "metadata": {
246 | "kernelspec": {
247 | "display_name": ".conda-bioblp-env [Python]",
248 | "language": "python",
249 | "name": "conda-env-.conda-bioblp-env-py"
250 | },
251 | "language_info": {
252 | "codemirror_mode": {
253 | "name": "ipython",
254 | "version": 3
255 | },
256 | "file_extension": ".py",
257 | "mimetype": "text/x-python",
258 | "name": "python",
259 | "nbconvert_exporter": "python",
260 | "pygments_lexer": "ipython3",
261 | "version": "3.8.13"
262 | }
263 | },
264 | "nbformat": 4,
265 | "nbformat_minor": 5
266 | }
267 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "bioblp"
3 | version = "0.1.0"
4 | description = "Link Prediction for biomedical data using KGE"
5 | authors = []
6 | packages = [{include = "bioblp"}]
7 |
8 | [tool.poetry.dependencies]
9 | python = "^3.9,<3.11"
10 | tqdm = "^4.60.0"
11 | pykeen = "^1.4.0"
12 | toml = "^0.10.2"
13 | pandas = "^1.4.2"
14 | torch = "^1.11.0"
15 | scikit-learn = "^1.1.0"
16 | skorch = "^0.11.0"
17 | optuna = "3.0.1"
18 | dill = "^0.3.6"
19 |
20 | [tool.poetry.dev-dependencies]
21 |
22 |
23 | [tool.poetry.group.dev.dependencies]
24 | pytest = "^7.2.1"
25 | pycodestyle = "^2.10.0"
26 | autopep8 = "^2.0.1"
27 |
28 | [build-system]
29 | requires = ["poetry-core>=1.0.0"]
30 | build-backend = "poetry.core.masonry.api"
31 |
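32 | # Illustrative usage (assumption; no install procedure is documented in this file):
33 | #   poetry install            # resolve and install the dependencies above (Poetry >= 1.2 for dependency groups)
34 | #   poetry run pytest tests/  # run the test suite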
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | alembic==1.8.1 ; python_version >= "3.8" and python_version < "3.11"
2 | attrs==22.1.0 ; python_version >= "3.8" and python_version < "3.11"
3 | autopage==0.5.1 ; python_version >= "3.8" and python_version < "3.11"
4 | certifi==2022.6.15.1 ; python_version >= "3.8" and python_version < "3.11"
5 | charset-normalizer==2.1.1 ; python_version >= "3.8" and python_version < "3.11"
6 | class-resolver==0.3.10 ; python_version >= "3.8" and python_version < "3.11"
7 | click-default-group==1.2.2 ; python_version >= "3.8" and python_version < "3.11"
8 | click==8.1.3 ; python_version >= "3.8" and python_version < "3.11"
9 | cliff==4.0.0 ; python_version >= "3.8" and python_version < "3.11"
10 | cmaes==0.8.2 ; python_version >= "3.8" and python_version < "3.11"
11 | cmd2==2.4.2 ; python_version >= "3.8" and python_version < "3.11"
12 | colorama==0.4.5 ; python_version >= "3.8" and python_version < "3.11" and platform_system == "Windows" or python_version >= "3.8" and python_version < "3.11" and sys_platform == "win32"
13 | colorlog==6.7.0 ; python_version >= "3.8" and python_version < "3.11"
14 | dataclasses-json==0.5.7 ; python_version >= "3.8" and python_version < "3.11"
15 | docdata==0.0.3 ; python_version >= "3.8" and python_version < "3.11"
16 | greenlet==1.1.3 ; python_version >= "3.8" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "3.11"
17 | idna==3.3 ; python_version >= "3.8" and python_version < "3.11"
18 | importlib-metadata==4.12.0 ; python_version >= "3.8" and python_version < "3.11"
19 | importlib-resources==5.9.0 ; python_version >= "3.8" and python_version < "3.9"
20 | joblib==1.1.0 ; python_version >= "3.8" and python_version < "3.11"
21 | mako==1.2.2 ; python_version >= "3.8" and python_version < "3.11"
22 | markupsafe==2.1.1 ; python_version >= "3.8" and python_version < "3.11"
23 | marshmallow-enum==1.5.1 ; python_version >= "3.8" and python_version < "3.11"
24 | marshmallow==3.17.1 ; python_version >= "3.8" and python_version < "3.11"
25 | more-click==0.1.1 ; python_version >= "3.8" and python_version < "3.11"
26 | more-itertools==8.14.0 ; python_version >= "3.8" and python_version < "3.11"
27 | mypy-extensions==0.4.3 ; python_version >= "3.8" and python_version < "3.11"
28 | numpy==1.23.3 ; python_version < "3.11" and python_version >= "3.8"
29 | optuna==3.0.1 ; python_version >= "3.8" and python_version < "3.11"
30 | packaging==21.3 ; python_version >= "3.8" and python_version < "3.11"
31 | pandas==1.4.4 ; python_version >= "3.8" and python_version < "3.11"
32 | pbr==5.10.0 ; python_version >= "3.8" and python_version < "3.11"
33 | prettytable==3.4.1 ; python_version >= "3.8" and python_version < "3.11"
34 | protobuf==3.20.1 ; python_version >= "3.8" and python_version < "3.11"
35 | pykeen==1.9.0 ; python_version >= "3.8" and python_version < "3.11"
36 | pyparsing==3.0.9 ; python_version >= "3.8" and python_version < "3.11"
37 | pyperclip==1.8.2 ; python_version >= "3.8" and python_version < "3.11"
38 | pyreadline3==3.4.1 ; python_version >= "3.8" and python_version < "3.11" and sys_platform == "win32"
39 | pystow==0.4.6 ; python_version >= "3.8" and python_version < "3.11"
40 | python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "3.11"
41 | pytz==2022.2.1 ; python_version >= "3.8" and python_version < "3.11"
42 | pyyaml==6.0 ; python_version >= "3.8" and python_version < "3.11"
43 | requests==2.28.1 ; python_version >= "3.8" and python_version < "3.11"
44 | rexmex==0.0.15 ; python_version >= "3.8" and python_version < "3.11"
45 | scikit-learn==1.1.2 ; python_version >= "3.8" and python_version < "3.11"
46 | scipy==1.8.1 ; python_version >= "3.8" and python_version < "3.11"
47 | six==1.16.0 ; python_version >= "3.8" and python_version < "3.11"
49 | skorch==0.11.0 ; python_version >= "3.8" and python_version < "3.11"
50 | sqlalchemy==1.4.41 ; python_version >= "3.8" and python_version < "3.11"
51 | stevedore==4.0.0 ; python_version >= "3.8" and python_version < "3.11"
52 | tabulate==0.8.10 ; python_version >= "3.8" and python_version < "3.11"
53 | threadpoolctl==3.1.0 ; python_version >= "3.8" and python_version < "3.11"
54 | toml==0.10.2 ; python_version >= "3.8" and python_version < "3.11"
55 | torch-max-mem==0.0.4 ; python_version >= "3.8" and python_version < "3.11"
56 | torch-ppr==0.0.8 ; python_version >= "3.8" and python_version < "3.11"
57 | torch==1.12.1 ; python_version >= "3.8" and python_version < "3.11"
58 | tqdm==4.64.1 ; python_version >= "3.8" and python_version < "3.11"
59 | typing-extensions==4.3.0 ; python_version >= "3.8" and python_version < "3.11"
60 | typing-inspect==0.8.0 ; python_version >= "3.8" and python_version < "3.11"
61 | urllib3==1.26.12 ; python_version >= "3.8" and python_version < "3.11"
62 | wcwidth==0.2.5 ; python_version >= "3.8" and python_version < "3.11"
63 | zipp==3.8.1 ; python_version >= "3.8" and python_version < "3.11"
64 |
65 | bioblp~=0.1.0
66 | torch~=1.13.1
67 | transformers~=4.26.1
68 | pandas~=1.5.3
69 | numpy~=1.24.2
70 | tqdm~=4.64.1
71 | pykeen~=1.10.0
72 | wandb~=0.13.10
73 | optuna~=3.0.1
74 | scikit-learn~=1.2.1
75 | skorch~=0.11.0
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/tests/__init__.py
--------------------------------------------------------------------------------
/tests/benchmarking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/tests/benchmarking/__init__.py
--------------------------------------------------------------------------------
/tests/benchmarking/bm_test_conf.toml:
--------------------------------------------------------------------------------
1 |
2 | data_root = "/home/skywalker/bioblp/"
3 | experiment_root = "data/benchmarks/experiments/dpi_fda/20230224/"
4 |
5 | [sampling]
6 | outdir = "sampled"
7 | num_negs_per_pos = 10
8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/"
9 |
10 | [features]
11 | outdir = "features"
12 | transform = "concat"
13 | missing_values = "random"
14 | encoders = ["structural", "complex", "rotate", "noise"]
15 |
16 | [features.encoder_args.noise]
17 | random_seed = 24
18 |
19 | [features.encoder_args.structural]
20 | proteins = "data/benchmarks/experiments/encoders/proteins"
21 | molecules = "data/benchmarks/experiments/encoders/molecules"
22 |
23 | [features.encoder_args.complex]
24 | model_dir = "data/benchmarks/experiments/encoders/complex/"
25 |
26 | [features.encoder_args.rotate]
27 | model_dir = "data/benchmarks/experiments/encoders/rotate/"
28 |
29 | [features.encoder_args.transe]
30 | model_dir = "data/benchmarks/experiments/encoders/transe/"
31 |
32 | [split]
33 | n_splits = 5
34 | outdir = "splits"
35 |
36 | [models]
37 |
38 | [models.noise_lr]
39 | feature = "noise"
40 | model = "LR"
41 |
42 | [models.noise_rf]
43 | feature = "noise"
44 | model = "RF"
45 |
46 | [models.noise_mlp]
47 | feature = "noise"
48 | model = "MLP"
49 |
50 | [models.structural_lr]
51 | feature = "structural"
52 | model = "LR"
53 |
54 | [models.complex_lr]
55 | feature = "complex"
56 | model = "LR"
57 |
58 | [models.rotate_lr]
59 | feature = "rotate"
60 | model = "LR"
61 |
62 | [train]
63 | n_iter = 2
64 | splits_file = "cv-splits.pt"
65 | refit_params = ["AUCPR", "AUCROC"]
66 | outdir = "models"
67 |
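68 | # Each benchmark step resolves its output directory by composing
69 | #   <data_root>/<experiment_root>/<run_id>/<outdir>
70 | # e.g. /home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/<run_id>/sampled
71 | # for the [sampling] step; tests/benchmarking/test_config.py asserts exactly these paths.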
--------------------------------------------------------------------------------
/tests/benchmarking/test_config.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from dataclasses import fields
4 |
5 | from pathlib import Path
6 | from bioblp.benchmarking.config import BenchmarkStepBaseConfig
7 | from bioblp.benchmarking.config import BenchmarkPreprocessConfig
8 | from bioblp.benchmarking.config import BenchmarkFeatureConfig
9 | from bioblp.benchmarking.config import BenchmarkTrainConfig
10 |
11 |
12 | from bioblp.logger import get_logger
13 |
14 |
15 | logger = get_logger(__name__)
16 |
17 | test_toml_file = Path(__file__).parent.joinpath("bm_test_conf.toml")
18 |
19 |
20 | class TestBenchmarkStepBaseConfig():
21 |
22 | dr = "/home/skywalker/bioblp/data/"
23 | exp = "benchmark/experiments"
24 | step_out = "step_out"
25 | run_id = "123"
26 |
27 | def test_resolve_outdir(self):
28 |
29 | cfg = BenchmarkStepBaseConfig(
30 | data_root=self.dr,
31 | experiment_root=self.exp,
32 | run_id=self.run_id,
33 | outdir=self.step_out
34 | )
35 |
36 | full_outdir = cfg.resolve_outdir()
37 |
38 | assert str(full_outdir) == self.dr + self.exp + \
39 | "/" + self.run_id + "/" + self.step_out
40 |
 41 |     def test_resolve_outdir_mutated(self):
42 | cfg = BenchmarkStepBaseConfig(
43 | data_root=self.dr,
44 | experiment_root=self.exp,
45 | run_id=self.run_id,
46 | outdir=self.step_out
47 | )
48 |
49 | override_data_root = "/home/vader/bioblp/data/"
50 |
51 | cfg.data_root = override_data_root
52 |
53 | full_outdir = cfg.resolve_outdir()
54 |
55 | assert str(full_outdir) == override_data_root + self.exp + \
56 | "/" + self.run_id + "/" + self.step_out
57 |
58 |
59 | class TestBenchmarkPreprocessConfig():
60 |
61 | def test_from_toml(self):
62 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir",
63 | "num_negs_per_pos", "kg_triples_dir"]
64 |
65 | run_id = "123"
66 | cfg = BenchmarkPreprocessConfig.from_toml(
67 | test_toml_file, run_id=run_id)
68 |
69 | cfg_fields = [field.name for field in fields(cfg)]
70 |
71 | assert cfg.num_negs_per_pos == 10
72 | assert cfg.data_root == "/home/skywalker/bioblp/"
73 | assert len(set(cfg_fields).difference(set(expected_fields))
74 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}"
75 |
76 | def test_resolve_outdir(self):
77 |
78 | run_id = "123"
79 | cfg = BenchmarkPreprocessConfig.from_toml(
80 | test_toml_file, run_id=run_id)
81 |
82 | outdir = cfg.resolve_outdir()
83 |
84 | assert str(
85 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/sampled"
86 |
87 |
88 | class TestBenchmarkFeatureConfig():
89 |
90 | def test_from_toml(self):
91 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir",
92 | "transform", "missing_values", "encoders", "encoder_args"]
93 |
94 | run_id = "123"
95 | cfg = BenchmarkFeatureConfig.from_toml(test_toml_file, run_id=run_id)
96 |
97 | cfg_fields = [field.name for field in fields(cfg)]
98 |
99 | assert len(set(cfg_fields).difference(set(expected_fields))
100 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}"
101 |
102 | def test_resolve_outdir(self):
103 |
104 | run_id = "123"
105 | cfg = BenchmarkFeatureConfig.from_toml(test_toml_file, run_id=run_id)
106 |
107 | outdir = cfg.resolve_outdir()
108 |
109 | assert str(
110 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/features"
111 |
112 |
113 | class TestBenchmarkTrainConfig():
114 |
115 | def test_from_toml(self):
116 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir",
117 | "feature_dir", "models", "refit_params", "n_iter", "splits_dir", "splits_file"]
118 |
119 | run_id = "123"
120 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id)
121 |
122 | cfg_fields = [field.name for field in fields(cfg)]
123 |
124 | assert len(set(cfg_fields).difference(set(expected_fields))
125 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}"
126 |
127 | def test_resolve_outdir(self):
128 |
129 | run_id = "123"
130 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id)
131 |
132 | outdir = cfg.resolve_outdir()
133 |
134 | assert str(
135 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/models"
136 |
137 | def test_resolve_feature_outdir(self):
138 |
139 | run_id = "123"
140 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id)
141 |
142 | outdir = cfg.resolve_feature_dir()
143 |
144 | assert str(
145 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/features"
146 |
--------------------------------------------------------------------------------
/tests/benchmarking/test_featurise.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import torch
3 |
4 | from bioblp.benchmarking.featurise import apply_common_mask
5 |
6 |
7 | class TestApplyCommonMask:
8 |
  9 |     data_A = torch.arange(0., 9.).reshape(3, 3)
 10 |     data_B = torch.arange(9., 21.).reshape(3, 4)
11 |
12 | labels = torch.ones(3)
13 |
14 | def test_mask_consistency(self):
15 | mask_A = torch.tensor([0, 1])
16 | mask_B = torch.tensor([0, 1, 2])
17 |
18 | inputs = [("A", self.data_A, mask_A), ("B", self.data_B, mask_B)]
19 |
20 | masked_inputs, _ = apply_common_mask(inputs, labels=self.labels)
21 |
22 | assert masked_inputs[0][1].size(0) == len(mask_A)
23 | assert masked_inputs[0][1].size(0) == masked_inputs[1][1].size(0)
24 |
25 | def test_mask_consistency_labels(self):
26 | mask_A = torch.tensor([0, 2])
27 | mask_B = torch.tensor([0, 1, 2])
28 |
29 | labels = torch.tensor([1, 1, 0])
30 | expected_labels = torch.tensor([1, 0])
31 |
32 | inputs = [("A", self.data_A, mask_A), ("B", self.data_B, mask_B)]
33 |
34 | _, masked_labels = apply_common_mask(inputs, labels=labels)
35 |
36 | assert len(masked_labels) == len(mask_A)
37 | assert torch.sum((masked_labels - expected_labels)) == 0
38 |
--------------------------------------------------------------------------------
/tests/benchmarking/test_train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from bioblp.benchmarking.train_utils import validate_features_exist
3 | from bioblp.benchmarking.config import BenchmarkTrainConfig
4 |
5 | from bioblp.logger import get_logger
6 |
7 |
8 | logger = get_logger(__name__)
9 |
10 |
11 | CONFIG_PATH = "conf/dpi-benchmark-cv-20230423-lr.toml"
12 |
13 |
14 | def test_parse_train_config():
15 | cfg = BenchmarkTrainConfig.from_toml(CONFIG_PATH, run_id="abc")
16 |
17 | logger.info(cfg)
18 |
19 |
20 | class TestValidateFeatures():
21 |
22 | models_conf = {
23 | "noise_lr": {
24 | "feature": "noise",
25 | "model": "LR"
26 | },
27 | "complex_lr": {
28 | "feature": "complex",
29 | "model": "LR"
30 | }
31 | }
32 |
33 | existing_feats = ["noise", "complex"]
34 |
35 | def setup_feats(self, dir):
 36 |         data = torch.arange(0., 12.).reshape(3, 4)
37 |
38 | for feat in self.existing_feats:
39 | torch.save(data, dir.joinpath(f"{feat}.pt"))
40 |
41 | def test_validate_features_exist(self, tmp_path):
42 | dir = tmp_path.joinpath("features")
43 | dir.mkdir()
44 | self.setup_feats(dir)
45 |
46 | exists = validate_features_exist(dir, self.models_conf)
47 |
48 | assert exists is True
49 |
50 | def test_validate_features_exist_missing(self, tmp_path):
51 | dir = tmp_path.joinpath("features")
52 | dir.mkdir()
53 | self.setup_feats(dir)
54 |
55 | missing_feat = {
56 | "feature": "rotate",
57 | "model": "LR"
58 | }
59 | conf = self.models_conf
60 | conf.update({"rotate_LR": missing_feat})
61 |
62 | exists = validate_features_exist(dir, conf)
63 |
64 | assert exists is False
65 |
--------------------------------------------------------------------------------
/tests/test_encoders.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import unittest
3 | import tempfile
4 | import os
5 | import os.path as osp
6 | import pytest
7 | import torch
8 | from transformers import BertTokenizer
9 |
10 | from bioblp.models.encoders import TransformerTextEncoder
11 | import bioblp.loaders.preprocessors as preprocessors
12 |
13 |
14 | class TestPropertyEncoders(unittest.TestCase):
15 | DISEASES = ['Irreversible FIBROSIS of the submucosal tissue of the MOUTH.',
16 | 'The co-occurrence of pregnancy and parasitic diseases.',
17 | 'Benign epidermal proliferations or tumors of viral in origin.',
18 | 'Infections with bacteria of the genus PASTEURELLA.']
19 |
20 | MOLECULES = ['101010101010101010101010101010101010']
21 |
22 | def setUp(self):
23 | self.temp_file = None
24 |
25 | def tearDown(self):
26 | if self.temp_file is not None:
27 | if osp.exists(self.temp_file):
28 | os.remove(self.temp_file)
29 |
 30 |     def make_test_file(self, entities: List[str], choices: List[str]):
31 | if self.temp_file is None:
32 | file_name = tempfile.NamedTemporaryFile().name
33 | self.temp_file = file_name
34 | else:
35 | file_name = self.temp_file
36 |
37 | with open(file_name, 'w') as file:
38 | for i, entity in enumerate(entities):
39 | sample = choices[i % len(choices)]
40 | file.write(f'{entity}\t{sample}\n')
41 |
42 | return file_name
43 |
44 | def make_protein_test_file(self, emb_dim: int, entities: List[str]):
45 | if self.temp_file is None:
46 | file_name = tempfile.NamedTemporaryFile().name
47 | self.temp_file = file_name
48 | else:
49 | file_name = self.temp_file
50 |
51 | embeddings = torch.rand([len(entities), emb_dim])
52 |
 53 |         # Save directly to the path; wrapping torch.save in a text-mode open() is unnecessary
 54 |         torch.save({'identifiers': entities, 'embeddings': embeddings},
 55 |                    file_name)
56 |
57 | return file_name
58 |
59 | @pytest.mark.skip(reason="no way of currently testing this")
60 | def test_text_preprocessor(self):
61 | entity_to_id = {str(i): i for i in range(10)}
62 | entities = list(entity_to_id.keys())
63 | file = self.make_test_file(entities, choices=self.DISEASES)
64 |
65 | max_length = 32
66 | tokenizer = BertTokenizer.from_pretrained(
67 | TransformerTextEncoder.BASE_MODEL)
68 | preprocessor = preprocessors.TextEntityPropertyPreprocessor(tokenizer,
69 | max_length)
70 |
71 | entities_tensor, data_idx, data = preprocessor.preprocess_file(file,
72 | entity_to_id)
73 | self.assertEqual(len(entities_tensor), len(entities))
74 | self.assertEqual(len(data_idx), len(entities))
75 | self.assertTupleEqual(data.shape, (len(entities), max_length))
76 |
77 | def test_molecule_preprocessor(self):
78 | entity_to_id = {str(i): i for i in range(10)}
79 | entities = list(entity_to_id.keys())
80 | file = self.make_test_file(entities, choices=self.MOLECULES)
81 |
82 | preprocessor = preprocessors.MolecularFingerprintPreprocessor()
83 | entities_tensor, data_idx, data = preprocessor.preprocess_file(file,
84 | entity_to_id)
85 |
86 | self.assertEqual(len(entities_tensor), len(entities))
87 | self.assertEqual(len(data_idx), len(entities))
88 | self.assertTupleEqual(
89 | data.shape, (len(entities), len(self.MOLECULES[0])))
90 |
91 | @pytest.mark.skip(reason="faulty test")
92 | def test_pretrained_protein_preprocessor(self):
93 | emb_dim = 32
94 | entity_to_id = {str(i): i for i in range(10)}
95 | entities = list(entity_to_id.keys())
96 | file = self.make_protein_test_file(emb_dim, entities)
97 |
98 | preprocessor = preprocessors.PretrainedEmbeddingPreprocessor()
99 | entities_tensor, data_idx, data = preprocessor.preprocess_file(file,
100 | entity_to_id)
101 |
102 | self.assertEqual(len(entities_tensor), len(entities))
103 | self.assertEqual(len(data_idx), len(entities))
104 | self.assertTupleEqual(data.shape, (len(entities), emb_dim))
105 |
--------------------------------------------------------------------------------
/tests/test_version.py:
--------------------------------------------------------------------------------
1 | from bioblp import __version__
2 |
3 |
4 | def test_version():
5 | assert __version__ == "0.1.0"
6 |
--------------------------------------------------------------------------------