├── .gitignore ├── Makefile ├── README.md ├── bioblp ├── __init__.py ├── benchmarking │ ├── README.md │ ├── __init__.py │ ├── config.py │ ├── encoders.py │ ├── experiment.py │ ├── featurise.py │ ├── hpo.py │ ├── preprocess.py │ ├── split.py │ ├── train.py │ ├── train_runner.py │ └── train_utils.py ├── data.py ├── evaluate.py ├── loaders │ └── preprocessors.py ├── logger.py ├── models │ ├── __init__.py │ ├── bioblp.py │ └── encoders.py ├── predict.py ├── preprocess.py ├── train.py ├── train_argparse.py └── utils │ ├── __init__.py │ ├── bioblp_utils.py │ ├── pipeline.py │ ├── training.py │ ├── triples.py │ └── util.py ├── conf ├── complex-biokg-20220826.toml ├── complex-biokg-full-20220826.toml ├── complex-hetionet-20220826.toml ├── dpi-benchmark-cv-20230423-lr.toml ├── dpi-benchmark-cv-20230423-mlp-1.toml ├── dpi-benchmark-cv-20230423-mlp-2.toml ├── dpi-benchmark-cv-20230423-rf.toml ├── dpi-benchmark-cv-r1-20230424-mlp.toml └── dpi-benchmark-cv-r1-20230424-rflr.toml ├── data └── conf │ ├── complex-biokg-20220826.toml │ ├── complex-biokg-full-20220826.toml │ └── complex-hetionet-20220826.toml ├── environment.yml ├── fig.png ├── jobs ├── biokg-bioblp-d-complex-initialized.sh ├── biokg-bioblp-d-complex.sh ├── biokg-bioblp-d-rotate-initialized.sh ├── biokg-bioblp-d-rotate.sh ├── biokg-bioblp-d-transe-initialized.sh ├── biokg-bioblp-d-transe.sh ├── biokg-bioblp-m-complex-bce-sweep.sh ├── biokg-bioblp-m-complex-bce-sweep.yml ├── biokg-bioblp-m-rotate-adagrad-sweep.sh ├── biokg-bioblp-m-rotate-adagrad-sweep.yml ├── biokg-bioblp-m-rotate-sweep.sh ├── biokg-bioblp-m-rotate-sweep.yml ├── biokg-bioblp-m-transe-sweep.sh ├── biokg-bioblp-m-transe-sweep.yml ├── biokg-bioblp-p-complex-bce-sweep.sh ├── biokg-bioblp-p-complex-bce-sweep.yml ├── biokg-bioblp-p-complex-initialized.sh ├── biokg-bioblp-p-rotate-initialized.sh ├── biokg-bioblp-p-rotate-sweep.sh ├── biokg-bioblp-p-rotate-sweep.yml ├── biokg-bioblp-p-transe-initialized.sh ├── biokg-bioblp-p-transe-sweep.sh ├── biokg-bioblp-p-transe-sweep.yml ├── biokg-complex-bce-sweep.sh ├── biokg-complex-bce-sweep.yml ├── biokg-complex-sweep.sh ├── biokg-complex-sweep.yml ├── biokg-rotate-bce-sweep.sh ├── biokg-rotate-bce-sweep.yml ├── biokg-rotate-sweep.sh ├── biokg-rotate-sweep.yml ├── biokg-transe-sweep.sh ├── biokg-transe-sweep.yml ├── complex.sh ├── hetionet-complex-bce-sweep.sh ├── hetionet-complex-bce-sweep.yml ├── hetionet-complex-sweep.sh ├── hetionet-complex-sweep.yml ├── hetionet-rotate-bce-sweep.sh ├── hetionet-rotate-bce-sweep.yml ├── hetionet-rotate-sweep.sh ├── hetionet-rotate-sweep.yml ├── hetionet-transe-sweep.sh ├── hetionet-transe-sweep.yml ├── rotate-dummy.sh └── rotate.sh ├── loaders └── placeholder.txt ├── logs └── placeholder.txt ├── notebooks ├── 00-clean-biokg-benchmarks.ipynb ├── 01-generate-biokg-splits.ipynb ├── 01_01_biokg-data-prep-for-kge.ipynb ├── 01_01_disease_mesh_notes_retrieval.ipynb ├── 01_02_disease_bert_encodings.ipynb ├── 02-01-biokg_benchmarks_eda.ipynb ├── 02-02-biokg_benchmarks_data_prep.ipynb ├── 02-03-benchmark-results.ipynb ├── 02-03-biokg_benchmarks_data_embedders.ipynb ├── 02_01_01-biokg_benchmarks_eda.ipynb ├── 02_01_02-biokg_benchmark-reconciliation.ipynb ├── 02_03_01-biokg_bm_dpi_clf-mlp.ipynb ├── 02_04_01-sanity-check-benchmark-ppi.ipynb ├── 02_04_01_biokg_bm_dpi_clf_nestedcv.ipynb ├── 02_99-benchmark-prep-yamanashi-dpi.ipynb ├── 03-00-nested-cv.ipynb ├── 03-frequency-baseline.ipynb ├── 04_00_ProtTrans_embeddings_biokG.ipynb ├── 04_01_Load & merge protein embeddings_BioKG.ipynb ├── 05-00-Load HetioNet - Get Gene 
to Protein mappings.ipynb ├── 05-01-Load HetioNet - Protein Embedding Generation.ipynb ├── 06-01 - Molecular Embeddings - BioKG.ipynb ├── 06-hetionet.ipynb ├── 07_00_evaluate-link-prediction_archived.ipynb ├── 07_01_eval_lp_deepdive.ipynb ├── 07_02_eval_lp_node_degree_effect.ipynb ├── 08-00-evaluate-link-prediction.ipynb ├── 08-01-inductive-evaluation.ipynb ├── 08-02-per-triple-evaluation.ipynb ├── 09-wandb-hparam-figures.ipynb ├── 10-pretraining-significance.ipynb ├── 11-pretraining-curves.ipynb ├── 12-per-relation-figures.ipynb ├── 13-node-degree-analysis-v2.ipynb ├── 99-train_hetionet.ipynb └── nb_utils │ └── eval_utils.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tests ├── __init__.py ├── benchmarking ├── __init__.py ├── bm_test_conf.toml ├── test_config.py ├── test_encoders.py ├── test_featurise.py └── test_train.py ├── test_encoders.py └── test_version.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv* 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Data paths 132 | data/ 133 | 134 | # Generated artifacts 135 | wandb/ 136 | /models 137 | 138 | # Editor 139 | .vscode 140 | 141 | # PyCharm 142 | .idea 143 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Self-Documented Makefile https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html 2 | .PHONY: clean setup install 3 | 4 | ################################################################################# 5 | # GLOBALS # 6 | ################################################################################# 7 | 8 | SHELL=/bin/bash 9 | PYTHON = python 10 | PROJECT_NAME = bioblp 11 | PACKAGE_NAME = bioblp 12 | PYTHON_INTERPRETER = python3 13 | KERNEL_NAME=Python (${PROJECT_NAME}) 14 | PYTHON_FULL_V = $(shell python -V) 15 | PYTHON_V := $(PYTHON_FULL_V:Python%=%) 16 | CONDA_ENV=${PROJECT_NAME}-env 17 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 18 | CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate 19 | #PYTHON_V=3.8.6 20 | 21 | ################################################################################# 22 | # COMMANDS # 23 | ################################################################################# 24 | 25 | default: help 26 | 27 | print-%: ## Prints a variable value. Usage: make print-VARIABLE, eg: make print-TAG, result: TAG = 0.0.0 28 | @echo $* = $($*) 29 | 30 | setup: 31 | make install_poetry 32 | @echo $(shell poetry --version) || "Install Poetry" 33 | 34 | install_poetry: ## installs poetry. Remember to `source /home/jovyan/.poetry/env` from a terminal after running this recipe. 
Need only be run once 35 | curl -sSL https://install.python-poetry.org | python3 - 36 | 37 | install: 38 | poetry install 39 | poetry export -f requirements.txt --without-hashes --with dev --output requirements.txt 40 | 41 | 42 | update: 43 | poetry update 44 | poetry export -f requirements.txt --without-hashes --with dev --output requirements.txt 45 | 46 | test: 47 | poetry run pytest tests -s -vv 48 | 49 | create_ipython_kernel: 50 | poetry run ipython kernel install --user --display-name="${KERNEL_NAME}" 51 | 52 | freeze_requirements: ## Writes python project dependencies as a requirements.txt 53 | poetry export -f requirements.txt --output requirements.txt --without-hashes 54 | 55 | freeze_dev_requirements: ## Writes python project dependencies (including dev) as a requirements-dev.txt 56 | poetry export -f requirements.txt --output requirements-dev.txt --without-hashes --dev 57 | 58 | dist: ## Builds a distribution package with version ${PACKAGE_NAME}.__version__, eg: dist/test_me-0.0.0.tar.gz 59 | make clean 60 | poetry build 61 | 62 | 63 | ### JH setup 64 | 65 | setup_jh_env: 66 | make conda_setup 67 | make create_conda_env 68 | make create_conda_kernel 69 | 70 | conda_setup: # ensures conda env is persistent, need run only once 71 | mkdir -p /home/jovyan/.conda/pkgs/ 72 | touch /home/jovyan/.conda/pkgs/urls.txt 73 | 74 | create_conda_env: 75 | conda create --yes --prefix /home/jovyan/.conda/envs/${CONDA_ENV} ipykernel 76 | #conda create --yes --prefix /home/jovyan/.conda/envs/${CONDA_ENV} python==${PYTHON_V} ipykernel 77 | ($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make setup | source /home/jovyan/.poetry/env) 78 | # to install the project module as a dependency 79 | ($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make install) 80 | conda env export -n ${CONDA_ENV} -f ${PROJECT_DIR}/environment.yml 81 | 82 | create_conda_kernel: 83 | python -m ipykernel install --user --name=${CONDA_ENV} --display-name="${KERNEL_NAME}" 84 | 85 | update_conda_env: 86 | #($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make update) 87 | conda env update --name ${CONDA_ENV} -f ${PROJECT_DIR}/environment.yml --prune 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BioBLP: A Modular Framework for Learning on Multimodal Biomedical Knowledge Graphs 2 |
3 | 4 | 5 | [DOI badge] 6 | 7 | 8 | 9 | 10 | 11 | [overview figure: fig.png] 12 | 13 |
14 | 15 | This is the official repository implementing BioBLP, presented in "BioBLP: A Modular Framework for Learning on Multimodal Biomedical Knowledge Graphs", published in the Journal of Biomedical Semantics ([link](https://doi.org/10.1186/s13326-023-00301-y)). 16 | 17 | BioBLP is a framework for encoding the diverse, multimodal data that can appear in biomedical knowledge graphs. It is based on the idea of learning embeddings for each modality separately, and then combining them into a single multimodal embedding space. The framework is modular and allows for easy integration of new modalities. 18 | 19 | To cite our work, please use the following: 20 | 21 | ```bibtex 22 | @article{bioblp, 23 | author = {Daniel Daza and 24 | Dimitrios Alivanistos and 25 | Payal Mitra and 26 | Thom Pijnenburg and 27 | Michael Cochez and 28 | Paul Groth}, 29 | title = {BioBLP: a modular framework for learning on multimodal biomedical 30 | knowledge graphs}, 31 | journal = {J. Biomed. Semant.}, 32 | volume = {14}, 33 | number = {1}, 34 | pages = {20}, 35 | year = {2023}, 36 | url = {https://doi.org/10.1186/s13326-023-00301-y}, 37 | doi = {10.1186/S13326-023-00301-Y}, 38 | } 39 | ``` 40 | 41 | ## Usage 42 | 43 | ### 1. Install the requirements 44 | 45 | We recommend using [Anaconda](https://www.anaconda.com/) to manage the dependencies. The following command will create and activate a new conda environment with all the required dependencies. 46 | 47 | ```bash 48 | conda env create -f environment.yml && conda activate bioblp 49 | ``` 50 | 51 | ### 2. Download the data 52 | 53 | The data can be downloaded from [here](https://doi.org/10.5281/zenodo.8005711) as a tar.gz file. This corresponds to our version of BioKG that has been decoupled from the benchmarks (see the paper for more details), and it also includes the necessary attribute data for proteins, molecules, and diseases. 54 | The file should be placed inside the `data` folder and decompressed: 55 | 56 | ```bash 57 | tar xzf biokgb.tar.gz 58 | ``` 59 | 60 | ### 3. Training link prediction models 61 | 62 | Use the `bioblp.train` module to train a link prediction model. For example, to train a BioBLP-D model (which encodes disease descriptions) using the RotatE scoring function, use: 63 | 64 | ```sh 65 | python -m bioblp.train \ 66 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 67 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 68 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 69 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 70 | --model=rotate --dimension=256 --loss_fn=crossentropy --optimizer=adam \ 71 | --learning_rate=2e-5 --warmup_fraction=0.05 --num_epochs=100 \ 72 | --batch_size=1024 --eval_batch_size=64 --num_negatives=512 --in_batch_negatives=True 73 | ``` 74 | 75 | On an NVIDIA A100 40GB GPU, the above command takes about 9 hours to train. 76 | 77 | We use Weights & Biases to log the experiments; logging is disabled by default. To enable it, add `--log_wandb=True` to the command above. 78 | 79 | More examples will be added soon. 80 | 81 | ### 4. Benchmark tasks 82 | * Pre-generate the input dataset with flags indicating whether the links are known or novel. 83 | * Run `bioblp.benchmarking.preprocess.py` to prepare the benchmark dataset for ML by shuffling, splitting, etc. 84 | * `bioblp.benchmarking.featurize.py` can be used to turn a list of entity pairs into feature vectors composed from the individual entity vectors (a minimal sketch of this composition is shown below).
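The sketch below is an illustration only of this pairwise composition step, not the project's featurisation code: it assumes entity embeddings from a trained KGEM are available as 1-D tensors and concatenates them into a single feature vector for the downstream classifier. The embedding dictionary and the entity identifiers are hypothetical placeholders.

```python
# Minimal sketch of pairwise feature composition (illustrative only; the
# embedding table and identifiers below are placeholders, not project data).
import torch

entity_embeddings = {
    "drug_X": torch.randn(256),     # e.g. a drug embedding from a trained KGEM
    "protein_Y": torch.randn(256),  # e.g. a protein target embedding
}

def concatenate_pair(src: str, tgt: str) -> torch.Tensor:
    """Compose a (src, tgt) entity pair into one feature vector by concatenation."""
    return torch.cat([entity_embeddings[src], entity_embeddings[tgt]], dim=-1)

pair_feature = concatenate_pair("drug_X", "protein_Y")
print(pair_feature.shape)  # torch.Size([512])
```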
85 | 86 | Custom usage: 87 | ```bash 88 | $ python -m bioblp.benchmarking.featurize -i data/benchmarks/processed/dpi_benchmark_p2n-1-10.tsv -o data/features -t kgem -f models/1baon0eg/ -j concatenate 89 | ``` 90 | -------------------------------------------------------------------------------- /bioblp/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /bioblp/benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | ## Experiment preparation 4 | Command to prepare experimental data, given config file. This script will load the raw benchmark dataset, perform negative sampling, generate features and splits: 5 | 6 | ```bash 7 | python bioblp/benchmarking/experiment.py \ 8 | --conf=conf/dpi-benchmark-cv-20230413.toml \ 9 | --override_data_root=./ \ 10 | --bm_file=data/benchmarks/transductive/dpi_fda.tsv \ 11 | --n_proc=1 12 | ``` 13 | 14 | You can execute the steps in `experiment.py` individually with the below. 15 | 16 | 1. Negative sampling. 17 | ```bash 18 | python bioblp/benchmarking/preprocess.py \ 19 | --bm_data_path=data/benchmarks/experiments/DPI/1681398697/features/raw.pt \ 20 | --kg_triples_dir=data/benchmarks/experiments/encoders/rotate/training_triples/ \ 21 | --num_negs_per_pos=10 \ 22 | --outdir=data/benchmarks/experiments/DPI/1681398697/sampled/ \ 23 | --override_run_id=1681398697 24 | ``` 25 | 26 | 2. Generate features. 27 | 28 | ```bash 29 | python bioblp/benchmarking/featurise.py \ 30 | --conf=conf/dpi-benchmark-cv-20230413.toml \ 31 | --bm_file=data/benchmarks/experiments/DPI/1681398697/sampled/dpi_fda_p2n-1-10.tsv \ 32 | --override_data_root=./ \ 33 | --override_run_id=1681398697 34 | 35 | ``` 36 | 37 | 3. Preparing data splits for cross validation. 38 | 39 | ```bash 40 | python bioblp/benchmarking/split.py \ 41 | --conf=conf/dpi-benchmark-cv-20230413.toml \ 42 | --data=data/benchmarks/experiments/DPI/1681398697/features/raw.pt \ 43 | --outdir=data/benchmarks/experiments/DPI/1681398697/splits/ \ 44 | --n_folds=5 \ 45 | --override_data_root=./ \ 46 | --override_run_id=1681398697 47 | ``` 48 | 49 | ## Model training 50 | 51 | Sample command for `train.py`. This script performs the training procedure for one model configuration, on one particular data split. 52 | ```bash 53 | python bioblp/benchmarking/train.py \ 54 | --model_clf=RF \ 55 | --model_feature=complex \ 56 | --feature_dir=data/benchmarks/experiments/dpi_fda/1681301749/features/ \ 57 | --splits_path=data/benchmarks/experiments/dpi_fda/1681301749/splits/train-test-split.pt \ 58 | --split_idx=0 \ 59 | --n_iter=3 \ 60 | --refit_params=AUCPR,AUCROC \ 61 | --outdir=data/benchmarks/experiments/dpi_fda/1681301749/models/ \ 62 | --model_label=complex__RF \ 63 | --timestamp=1681301749 \ 64 | --wandb_tag=dev 65 | ``` 66 | 67 | The `train_runner` script contains the procedure to run a full experiment, given a configuration file. This will perform the complete CV routine for all model configurations contained in the config file. Also supports multiprocessing through the `--n_proc` flag. 
For example, 68 | ```bash 69 | python bioblp/benchmarking/train_runner.py \ 70 | --conf conf/dpi-benchmark-cv-20230413.toml \ 71 | --override_data_root=./ \ 72 | --override_run_id=1681398697 \ 73 | --tag=dpi-20230413 \ 74 | --n_proc=5 75 | ``` 76 | 77 | In its current implementations here, the multiprocessing capability conflicts with PyTorch on GPU. For MLP models using GPU, we recommend setting `--n_proc=1`: 78 | ```bash 79 | python bioblp/benchmarking/train_runner.py \ 80 | --conf conf/dpi-benchmark-cv-20230413-mlp.toml \ 81 | --override_data_root=./ \ 82 | --override_run_id=1681398697 \ 83 | --tag=dpi-20230413 \ 84 | --n_proc=1 85 | ``` 86 | 87 | ## WandB logging 88 | 89 | By default logging to WandB is turned off. Change the assignments to `LOG_WANDB = True` in `train.py` for logging. -------------------------------------------------------------------------------- /bioblp/benchmarking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/benchmarking/__init__.py -------------------------------------------------------------------------------- /bioblp/benchmarking/config.py: -------------------------------------------------------------------------------- 1 | 2 | import abc 3 | import toml 4 | import json 5 | 6 | from dataclasses import dataclass, field 7 | from typing import List 8 | from pathlib import Path 9 | 10 | 11 | def load_toml(toml_path: str) -> dict: 12 | toml_path = Path(toml_path) 13 | config = {} 14 | with open(toml_path, "r") as f: 15 | config = toml.load(f) 16 | 17 | return config 18 | 19 | 20 | class ConfigJSONEncoder(json.JSONEncoder): 21 | def default(self, obj): 22 | # add conditional logic for any data structures that require special care 23 | # handling serialisation of Enum objects 24 | if isinstance(obj, Path): 25 | return str(obj.resolve()) 26 | return json.JSONEncoder.default(self, obj) 27 | 28 | 29 | @dataclass 30 | class BenchmarkStepBaseConfig(abc.ABC): 31 | data_root: str 32 | experiment_root: str 33 | run_id: str 34 | outdir: str 35 | 36 | @classmethod 37 | def from_toml(cls, toml_path, run_id): 38 | raise NotImplementedError 39 | 40 | def resolve_outdir(self) -> Path: 41 | outdir = Path(self.data_root)\ 42 | .joinpath(self.experiment_root)\ 43 | .joinpath(self.run_id)\ 44 | .joinpath(self.outdir) 45 | 46 | return outdir 47 | 48 | 49 | @dataclass 50 | class BenchmarkPreprocessConfig(BenchmarkStepBaseConfig): 51 | num_negs_per_pos: int 52 | kg_triples_dir: str 53 | 54 | @classmethod 55 | def from_toml(cls, toml_path: str, run_id: str): 56 | config_toml = load_toml(toml_path) 57 | 58 | cfg = config_toml.get("sampling") 59 | 60 | data_root = config_toml.get("data_root") 61 | experiment_root = config_toml.get("experiment_root") 62 | 63 | cfg.update({"data_root": data_root}) 64 | cfg.update({"experiment_root": experiment_root}) 65 | cfg.update({"run_id": run_id}) 66 | 67 | return cls(**cfg) 68 | 69 | 70 | @dataclass 71 | class BenchmarkFeatureConfig(BenchmarkStepBaseConfig): 72 | transform: str 73 | missing_values: str 74 | encoders: list 75 | encoder_args: dict 76 | 77 | @classmethod 78 | def from_toml(cls, toml_path: str, run_id: str): 79 | conf_path = Path(toml_path) 80 | config_toml = load_toml(conf_path) 81 | 82 | data_root = config_toml.get("data_root") 83 | experiment_root = config_toml.get("experiment_root") 84 | 85 | cfg = config_toml.get("features") 86 | 87 | cfg.update({"data_root": data_root}) 88 | 
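        # The experiment-level settings below (experiment_root, run_id) are attached so that resolve_outdir() can later build this step's output path as data_root/experiment_root/run_id/outdir.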
cfg.update({"experiment_root": experiment_root}) 89 | cfg.update({"run_id": run_id}) 90 | 91 | return cls(**cfg) 92 | 93 | 94 | @dataclass 95 | class BenchmarkSplitConfig(BenchmarkStepBaseConfig): 96 | n_splits: int 97 | 98 | @classmethod 99 | def from_toml(cls, toml_path: str, run_id: str): 100 | conf_path = Path(toml_path) 101 | config_toml = load_toml(conf_path) 102 | 103 | data_root = config_toml.get("data_root") 104 | experiment_root = config_toml.get("experiment_root") 105 | 106 | cfg = config_toml.get("split") 107 | 108 | cfg.update({"data_root": data_root}) 109 | cfg.update({"experiment_root": experiment_root}) 110 | cfg.update({"run_id": run_id}) 111 | 112 | return cls(**cfg) 113 | 114 | 115 | @dataclass 116 | class BenchmarkTrainConfig(BenchmarkStepBaseConfig): 117 | feature_dir: str 118 | splits_dir: str 119 | splits_file: str 120 | models: dict 121 | refit_params: List[str] 122 | n_iter: int = field(default=10, metadata={"help": "Number of HPO trials"}) 123 | 124 | @classmethod 125 | def from_toml(cls, toml_path, run_id): 126 | conf = load_toml(toml_path=toml_path) 127 | cfg = {} 128 | 129 | cfg["models"] = conf.get("models") 130 | 131 | cfg.update(conf.get("train")) 132 | 133 | cfg["data_root"] = conf.get("data_root") 134 | cfg["experiment_root"] = conf.get("experiment_root") 135 | cfg["feature_dir"] = conf.get("features").get("outdir") 136 | cfg["splits_dir"] = conf.get("split").get("outdir") 137 | 138 | cfg.update({"run_id": run_id}) 139 | 140 | return cls(**cfg) 141 | 142 | def resolve_feature_dir(self) -> Path: 143 | feature_dir = Path(self.data_root)\ 144 | .joinpath(self.experiment_root)\ 145 | .joinpath(self.run_id)\ 146 | .joinpath(self.feature_dir) 147 | 148 | return feature_dir 149 | 150 | def resolve_splits_file(self) -> Path: 151 | splits_path = Path(self.data_root)\ 152 | .joinpath(self.experiment_root)\ 153 | .joinpath(self.run_id)\ 154 | .joinpath(self.splits_dir)\ 155 | .joinpath(self.splits_file) 156 | 157 | return splits_path 158 | -------------------------------------------------------------------------------- /bioblp/benchmarking/experiment.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from time import time 3 | from pathlib import Path 4 | from bioblp.benchmarking.preprocess import main as sampling_main 5 | from bioblp.benchmarking.config import BenchmarkPreprocessConfig 6 | 7 | from bioblp.benchmarking.featurise import main as featurise_main 8 | from bioblp.benchmarking.split import main as split_main 9 | 10 | 11 | def run_experiment(args): 12 | 13 | experiment_id = str(int(time())) 14 | 15 | override_data_root = Path( 16 | args.override_data_root) if args.override_data_root is not None else None 17 | 18 | # 19 | # Negative sampling 20 | # 21 | preprocess_config = BenchmarkPreprocessConfig.from_toml( 22 | args.conf, run_id=experiment_id) 23 | 24 | if override_data_root: 25 | preprocess_config.data_root = override_data_root 26 | 27 | sampled_bm_filepath = sampling_main(bm_data_path=args.bm_file, 28 | kg_triples_dir=preprocess_config.kg_triples_dir, 29 | num_negs_per_pos=preprocess_config.num_negs_per_pos, 30 | outdir=preprocess_config.resolve_outdir(), 31 | override_run_id=experiment_id) 32 | # 33 | # Prepare features 34 | # 35 | featurise_main(bm_file=sampled_bm_filepath, 36 | conf=args.conf, 37 | override_data_root=override_data_root, 38 | override_run_id=experiment_id) 39 | # 40 | # Prepare splits 41 | # 42 | split_main(data=sampled_bm_filepath, 43 | conf=args.conf, 44 | 
override_data_root=override_data_root, 45 | override_run_id=experiment_id) 46 | 47 | 48 | def get_parser() -> ArgumentParser: 49 | parser = ArgumentParser( 50 | description="Run full benchmark experiment procedure") 51 | parser.add_argument("--conf", type=str, 52 | help="Path to experiment configuration") 53 | parser.add_argument("--bm_file", type=str, help="Path to benchmark data") 54 | parser.add_argument("--override_data_root", type=str, default=None, 55 | help="Path to root of data tree") 56 | parser.add_argument( 57 | "--n_proc", type=int, default=-1, help="Number of cores to use in process." 58 | ) 59 | parser.add_argument("--tag", type=str, 60 | help="Optional tag to add to wandb runs") 61 | parser.add_argument("--dev_run", action='store_true', 62 | help="Quick dev run") 63 | return parser 64 | 65 | 66 | if __name__ == "__main__": 67 | 68 | args = get_parser().parse_args() 69 | run_experiment(args) 70 | -------------------------------------------------------------------------------- /bioblp/benchmarking/featurise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from argparse import ArgumentParser 8 | from dataclasses import asdict 9 | from functools import reduce 10 | 11 | from torch import Tensor 12 | 13 | from pathlib import Path 14 | from time import time 15 | from tqdm import tqdm 16 | 17 | from typing import Tuple, List, Dict 18 | 19 | from bioblp.logger import get_logger 20 | from bioblp.benchmarking.encoders import get_encoder 21 | from bioblp.benchmarking.encoders import MissingValueMethod 22 | from bioblp.benchmarking.encoders import EntityPairEncoder 23 | from bioblp.benchmarking.encoders import EntityEncoder 24 | from bioblp.benchmarking.encoders import NoiseEncoder 25 | from bioblp.benchmarking.encoders import StructuralPairEncoder 26 | from bioblp.benchmarking.encoders import RandomNoisePairEncoder 27 | from bioblp.benchmarking.encoders import KGEMPairEncoder 28 | from bioblp.benchmarking.config import BenchmarkFeatureConfig, ConfigJSONEncoder 29 | 30 | from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET 31 | from bioblp.benchmarking.encoders import ROTATE, TRANSE, COMPLEX, STRUCTURAL, NOISE, LABEL 32 | 33 | 34 | logger = get_logger(__name__) 35 | 36 | 37 | # 38 | # Building script 39 | # 40 | 41 | 42 | def save_features(outdir: Path, label: str, feature: Tensor, labels: Tensor): 43 | outfile = outdir.joinpath(f"{label}.pt") 44 | 45 | torch_obj = {"X": feature, "y": labels} 46 | torch.save(torch_obj, outfile) 47 | 48 | 49 | def build_encodings(config: BenchmarkFeatureConfig, pairs: np.array, encoders: List[str], 50 | encoder_args: Dict[str, dict], entities_filter: List[str]) -> Tuple[str, Tensor, Tensor]: 51 | encoded_bm = [] 52 | 53 | for encoder_i_label in tqdm(encoders, desc=f"Encoding benchmarks..."): 54 | logger.info(f"Encoding with {encoder_i_label}") 55 | encoder_i_args = encoder_args.get(encoder_i_label) 56 | 57 | pair_encoder = get_encoder(encoder_i_label, 58 | encoder_i_args, 59 | entities=entities_filter) 60 | 61 | missing_value_method = MissingValueMethod(config.missing_values) 62 | 63 | encoded_pairs, encoded_mask = pair_encoder.encode(pairs, 64 | missing_value=missing_value_method, 65 | transform=config.transform) 66 | 67 | encoded_bm.append((encoder_i_label, encoded_pairs, encoded_mask)) 68 | return encoded_bm 69 | 70 | 71 | def apply_common_mask(encoded_bm: List[Tuple[str, Tensor, Tensor]], labels: Tensor) -> 
Tuple[List[Tuple[str, Tensor]], Tensor]: 72 | logger.info("Masking features...") 73 | 74 | all_masks = [x[2] for x in encoded_bm] 75 | common_mask = torch.from_numpy(reduce(np.intersect1d, all_masks)) 76 | 77 | logger.info(f"size after common mask {len(common_mask)}") 78 | 79 | masked_encoded_bm = [] 80 | for enc_label, enc_pairs, _ in encoded_bm: 81 | masked_enc_pairs = enc_pairs[common_mask] 82 | masked_encoded_bm.append((enc_label, masked_enc_pairs)) 83 | 84 | masked_labels = labels[common_mask] 85 | 86 | return masked_encoded_bm, masked_labels 87 | 88 | 89 | def main(bm_file: str, conf: str, override_data_root=None, override_run_id=None): 90 | 91 | run_id = override_run_id or str(int(time())) 92 | 93 | config = BenchmarkFeatureConfig.from_toml(conf, run_id=run_id) 94 | 95 | if override_data_root is not None: 96 | config.data_root = override_data_root 97 | 98 | logger.info( 99 | f"Running process with config: {config} at time {run_id}...") 100 | 101 | # load benchmark data 102 | # here entities are strings 103 | 104 | bm_df = pd.read_csv(bm_file, sep='\t', names=[ 105 | COL_SOURCE, COL_EDGE, COL_TARGET, LABEL], header=0) 106 | 107 | pairs = bm_df[[COL_SOURCE, COL_TARGET]].values 108 | all_entities = np.unique(np.ravel(pairs)).tolist() 109 | 110 | labels = torch.from_numpy(bm_df[LABEL].values) 111 | 112 | # perform encodings 113 | encoded_bm = build_encodings(config=config, 114 | pairs=pairs, 115 | encoders=config.encoders, 116 | encoder_args=config.encoder_args, 117 | entities_filter=all_entities) 118 | 119 | # add plain benchmark data too 120 | encoded_bm.append(("raw", pairs, np.arange(len(pairs)))) 121 | 122 | # common mask only when dropping missing embeddings 123 | if config.missing_values == MissingValueMethod.DROP.value: 124 | masked_encoded_bm, masked_labels = apply_common_mask( 125 | encoded_bm, labels) 126 | else: 127 | masked_encoded_bm = [(x[0], x[1]) for x in encoded_bm] 128 | masked_labels = labels 129 | 130 | feature_outdir = config.resolve_outdir() 131 | 132 | feature_outdir.mkdir(parents=True, exist_ok=True) 133 | 134 | logger.info(f"Saving features to {feature_outdir}...") 135 | 136 | for enc_label, enc_pairs in masked_encoded_bm: 137 | logger.info( 138 | f"Saving {enc_label} features with shape: {enc_pairs.shape}") 139 | save_features(outdir=feature_outdir, 140 | label=enc_label, 141 | feature=enc_pairs, 142 | labels=masked_labels) 143 | 144 | with open(feature_outdir.joinpath("config.json"), "w") as f: 145 | cfg_dict = asdict(config) 146 | json.dump(cfg_dict, f, cls=ConfigJSONEncoder) 147 | 148 | 149 | def get_parser() -> ArgumentParser: 150 | parser = ArgumentParser( 151 | description="Generate features for benchmark datasets") 152 | parser.add_argument("--conf", type=str, 153 | help="Path to experiment configuration") 154 | parser.add_argument("--bm_file", type=str, help="Path to benchmark data") 155 | parser.add_argument("--override_data_root", type=str, 156 | help="Path to root of data tree") 157 | parser.add_argument("--override_run_id", type=str, 158 | help="Override run_id") 159 | 160 | return parser 161 | 162 | 163 | if __name__ == "__main__": 164 | 165 | args = get_parser().parse_args() 166 | 167 | main(**vars(args)) 168 | -------------------------------------------------------------------------------- /bioblp/benchmarking/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from pykeen.sampling import 
PseudoTypedNegativeSampler 6 | from pykeen.triples import TriplesFactory 7 | 8 | from time import time 9 | from typing import Union 10 | 11 | from bioblp.logger import get_logger 12 | from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET 13 | 14 | logger = get_logger(__name__) 15 | COL_LABEL = 'label' 16 | 17 | 18 | def generate_negative_triples(pos_triples: TriplesFactory, 19 | filtered=True, 20 | num_negs_per_pos=1): 21 | 22 | neg_sampler = PseudoTypedNegativeSampler(mapped_triples=pos_triples.mapped_triples, 23 | filtered=filtered, 24 | num_negs_per_pos=num_negs_per_pos) 25 | pos_batch = pos_triples.mapped_triples 26 | neg_triples = neg_sampler.sample(pos_batch)[0] 27 | 28 | return neg_triples 29 | 30 | 31 | def prepare_dpi_samples(pos_df, 32 | num_negs_per_pos: Union[None, int, str] = 1, 33 | entity_to_id_map: Union[None, dict] = None, 34 | relation_to_id_map: Union[None, dict] = None, 35 | # map_to_kgem_ids=False, 36 | filtered=True): 37 | """ 38 | pos_df -> Expects dataframe with true positives in format ['src', edge', 'tgt'], 39 | where the entities and relations of the triple are in their string ids. 40 | These will be converted to KGEM integer ids at a later state 41 | """ 42 | pos_neg_df = pos_df.copy() 43 | pos_triples = TriplesFactory.from_labeled_triples(pos_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, 44 | entity_to_id=entity_to_id_map, 45 | relation_to_id=relation_to_id_map) 46 | 47 | # returns a tensor object 48 | neg_triples = generate_negative_triples(pos_triples, 49 | num_negs_per_pos=num_negs_per_pos, 50 | filtered=filtered) 51 | 52 | # convert to mapped triples 53 | neg_triples_ = pos_triples.clone_and_exchange_triples( 54 | neg_triples.view(-1, 3)) 55 | neg_df = pd.DataFrame(neg_triples_.triples, columns=[ 56 | COL_SOURCE, COL_EDGE, COL_TARGET]) 57 | 58 | # add labels 59 | pos_neg_df[COL_LABEL] = 1 60 | neg_df[COL_LABEL] = 0 61 | 62 | pos_neg_df = pd.concat([pos_neg_df, neg_df], axis=0, ignore_index=True) 63 | return pos_neg_df 64 | 65 | 66 | def main(bm_data_path: str, kg_triples_dir: str, outdir: str, num_negs_per_pos: int = 1, override_run_id=None): 67 | 68 | start = time() 69 | run_id = override_run_id or int(start) 70 | 71 | bm_data_path = Path(bm_data_path) 72 | kg_triples_dir = Path(kg_triples_dir) 73 | outdir = Path(outdir) 74 | outdir.mkdir(parents=True, exist_ok=True) 75 | 76 | num_negs_per_pos = num_negs_per_pos 77 | bm_dataset_name = bm_data_path.name.split('.tsv')[0] 78 | 79 | training_triples = TriplesFactory.from_path_binary(kg_triples_dir) 80 | entity_to_id_map = training_triples.entity_to_id 81 | relation_to_id_map = training_triples.relation_to_id 82 | 83 | # load the benchmark data 84 | bm_df = pd.read_csv(bm_data_path, sep='\t', names=[ 85 | COL_SOURCE, COL_EDGE, COL_TARGET]) 86 | 87 | # generate neg samples and prepare pos-neg pairs 88 | logger.info( 89 | f'Generating negative samples corresponding to benchmark triples') 90 | pos_neg_df = prepare_dpi_samples(bm_df, 91 | entity_to_id_map=entity_to_id_map, 92 | relation_to_id_map=relation_to_id_map, 93 | num_negs_per_pos=num_negs_per_pos) 94 | 95 | # save to disk 96 | bm_postprocessed_path = outdir.joinpath( 97 | f"{bm_dataset_name}_p2n-1-{num_negs_per_pos}.tsv") 98 | logger.info(f'Writing preprocessed data to {bm_postprocessed_path}') 99 | pos_neg_df.to_csv(bm_postprocessed_path, sep='\t') 100 | logger.info('Done!') 101 | 102 | return str(bm_postprocessed_path.resolve()) 103 | 104 | 105 | if __name__ == "__main__": 106 | 107 | parser = ArgumentParser( 108 | description="Preprocess benchmark 
triples (E.g. DPI data) for downstream prediction task") 109 | parser.add_argument("--bm_data_path", type=str, 110 | help="Path to pick up benchmark data") 111 | parser.add_argument("--kg_triples_dir", type=str, 112 | help="Directory housing kg positive triples. Needed to generate negative samples") 113 | parser.add_argument("--num_negs_per_pos", type=int, 114 | help="Number of negative samples to generate per positive instance") 115 | parser.add_argument("--outdir", type=str, 116 | help="Path to data dir to write output") 117 | parser.add_argument("--override_run_id", type=str, 118 | help="Run id of experiment") 119 | args = parser.parse_args() 120 | main(**vars(args)) 121 | -------------------------------------------------------------------------------- /bioblp/benchmarking/split.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from argparse import ArgumentParser 5 | from pathlib import Path 6 | 7 | from sklearn.model_selection import StratifiedKFold 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | from bioblp.benchmarking.train_utils import load_feature_data 12 | from bioblp.logger import get_logger 13 | from bioblp.benchmarking.config import BenchmarkSplitConfig 14 | 15 | from typing import Union, Tuple, Dict, List 16 | 17 | RANDOM_STATE = 12 18 | 19 | logger = get_logger(__name__) 20 | 21 | 22 | def get_splits_iter(splits_path): 23 | def splits_iterable(): 24 | splits_data = torch.load(splits_path) 25 | n = len(splits_data) 26 | 27 | num = 0 28 | while num < n: 29 | fold_data = splits_data[num] 30 | yield (fold_data["split_idx"], fold_data["train_idx"], fold_data["test_idx"]) 31 | num += 1 32 | 33 | return splits_iterable 34 | 35 | 36 | def get_split_struct(train, test, idx) -> dict: 37 | return { 38 | "train_idx": train, 39 | "test_idx": test, 40 | "split_idx": str(idx) 41 | } 42 | 43 | 44 | def load_split(splits_file: Path, split_idx: int) -> Tuple[np.array, np.array]: 45 | 46 | splits_data = torch.load(splits_file) 47 | 48 | fold_splits = splits_data[split_idx] 49 | train_idx = fold_splits["train_idx"] 50 | test_idx = fold_splits["test_idx"] 51 | fold_idx = fold_splits["split_idx"] 52 | 53 | return (fold_idx, train_idx, test_idx) 54 | 55 | 56 | def main(data, n_folds=None, outdir=None, conf=None, override_data_root=None, override_run_id=None): 57 | 58 | if conf is not None: 59 | config = BenchmarkSplitConfig.from_toml(conf, run_id=override_run_id) 60 | if override_data_root is not None: 61 | config.data_root = override_data_root 62 | 63 | n_folds = config.n_splits 64 | data_path = Path(data) 65 | outdir = config.resolve_outdir() 66 | else: 67 | data_path = Path(data) 68 | outdir = Path(outdir) 69 | 70 | outdir.mkdir(parents=True, exist_ok=True) 71 | 72 | # load raw benchmark data 73 | X_bm, y_bm = load_feature_data(data_path) 74 | 75 | # generate train-test split 76 | logger.info("Generating train test split.") 77 | 78 | X_indices = torch.arange(len(X_bm)) 79 | 80 | train_idx, test_idx, _, _ = train_test_split( 81 | X_indices, y_bm, test_size=0.1, stratify=y_bm, random_state=RANDOM_STATE) 82 | 83 | split_data = {0: get_split_struct(train_idx, test_idx, idx=0)} 84 | train_test_split_file = outdir.joinpath("train-test-split.pt") 85 | torch.save(split_data, train_test_split_file) 86 | 87 | # generate cv splits 88 | logger.info("Generating cv splits.") 89 | 90 | cv = StratifiedKFold( 91 | n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE 92 | ) 93 | splits = [(train, test, 
idx) 94 | for idx, (train, test) in enumerate(cv.split(X_bm, y_bm))] 95 | 96 | cv_data = {x[2]: get_split_struct(x[0], x[1], x[2]) for x in splits} 97 | 98 | cv_split_file = outdir.joinpath("cv-splits.pt") 99 | torch.save(cv_data, cv_split_file) 100 | 101 | logger.info("Done.") 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | parser = ArgumentParser( 107 | description="Preprocess benchmark triples (E.g. DPI data) for downstream prediction task") 108 | 109 | parser.add_argument("--conf", type=str, default=None, 110 | help="Path to config file") 111 | parser.add_argument("--data", type=str, 112 | help="Path to pick up benchmark data") 113 | parser.add_argument("--n_folds", type=int, default=None, 114 | help="Number of cv folds to produce") 115 | parser.add_argument("--outdir", type=str, default=None, 116 | help="Path to data dir to write output") 117 | parser.add_argument("--override_data_root", type=str, 118 | help="Path to root of data tree") 119 | parser.add_argument("--override_run_id", type=str, 120 | help="Override run_id") 121 | args = parser.parse_args() 122 | main(**vars(args)) 123 | -------------------------------------------------------------------------------- /bioblp/benchmarking/train_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import string 3 | import numpy as np 4 | import random as rn 5 | 6 | from pathlib import Path 7 | 8 | 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.metrics import precision_score 11 | from sklearn.metrics import recall_score 12 | from sklearn.metrics import fbeta_score 13 | from sklearn.metrics import make_scorer 14 | from sklearn.metrics import accuracy_score 15 | from sklearn.metrics import precision_recall_curve 16 | from sklearn.metrics import roc_curve 17 | from sklearn.metrics import auc 18 | from sklearn.metrics import confusion_matrix 19 | 20 | 21 | from sklearn.model_selection import train_test_split 22 | 23 | from typing import Union, Tuple 24 | 25 | from bioblp.logger import get_logger 26 | 27 | 28 | logger = get_logger(__name__) 29 | 30 | 31 | def get_random_string(length): 32 | # choose from all lowercase letter 33 | characters = string.ascii_lowercase + string.digits 34 | result_str = "".join(rn.choice(characters) for i in range(length)) 35 | 36 | return result_str 37 | 38 | 39 | def unique_study_prefix(): 40 | unique_string = get_random_string(8) 41 | return unique_string 42 | 43 | 44 | def generate_study_name(prefix, model, fold): 45 | return f"{prefix}-{model}-{fold}" 46 | 47 | 48 | def aupr_score(y_true, y_pred): 49 | """Use AUC function to calculate the area under the curve of precision recall curve""" 50 | precision, recall, thresholds = precision_recall_curve(y_true, y_pred) 51 | return auc(recall, precision) 52 | 53 | 54 | def get_auc_scorers(): 55 | scorers = { 56 | "PRCURVE": make_scorer(precision_recall_curve, needs_proba=True), 57 | "ROCCURVE": make_scorer(roc_curve, needs_proba=True), 58 | "CM": make_scorer(confusion_matrix, needs_proba=False) 59 | } 60 | return scorers 61 | 62 | 63 | def get_scorers(): 64 | scorers = { 65 | "AUCROC": make_scorer(roc_auc_score, needs_proba=True), 66 | "f1": make_scorer(fbeta_score, beta=1, average="micro"), 67 | "precision": make_scorer(precision_score), 68 | "recall": make_scorer(recall_score), 69 | "accuracy": make_scorer(accuracy_score), 70 | "AUCPR": make_scorer(aupr_score, needs_proba=True), 71 | } 72 | return scorers 73 | 74 | 75 | def get_model_label(feature: str, model: str): 76 | return 
f"{feature}__{model}" 77 | 78 | 79 | def load_feature_data(feat_path: Union[str, Path], dev_run: bool = False) -> Tuple[np.array, np.array]: 80 | """ Load feature data into numpy arrays 81 | 82 | Parameters 83 | ---------- 84 | feat_path : Union[str, Path] 85 | Filepath to feature, eg 'features/rotate.pt' 86 | dev_run : bool, optional 87 | Flag to subsample data for development only, by default False 88 | 89 | Returns 90 | ------- 91 | Tuple[np.array, np.array] 92 | Return (features, labels) 93 | """ 94 | logger.info("Loading training data...") 95 | 96 | data = torch.load(feat_path) 97 | 98 | X = data.get("X") 99 | y = data.get("y") 100 | 101 | if torch.is_tensor(X): 102 | X = X.detach().numpy() 103 | y = y.detach().numpy() 104 | 105 | if dev_run: 106 | X, _, y, _ = train_test_split( 107 | X, y, stratify=y, train_size=0.1, random_state=12) 108 | 109 | logger.info( 110 | "Resulting shapes X: {}, y: {}".format( 111 | X.shape, y.shape) 112 | ) 113 | logger.info("Counts in y: {}".format( 114 | np.unique(y, return_counts=True))) 115 | 116 | return X, y 117 | 118 | 119 | def validate_features_exist(feature_dir: Path, models_conf: dict) -> bool: 120 | """ Check if all feature files exist in directory 121 | 122 | Parameters 123 | ---------- 124 | feature_dir : Path 125 | Path to feature location 126 | models_conf : dict 127 | Definition of model and feature. 128 | 129 | Returns 130 | ------- 131 | bool 132 | True if features are present. 133 | """ 134 | exists = {} 135 | 136 | all_features = list(set([v.get("feature") 137 | for _, v in models_conf.items()])) 138 | 139 | for feat in all_features: 140 | exists[feat] = feature_dir.joinpath(f"{feat}.pt").is_file() 141 | 142 | logger.info(f"Validated that features exist: {exists}..") 143 | 144 | missing = [k for k, v in exists.items() if v is False] 145 | if len(missing) > 0: 146 | logger.warning(f"Missing features {missing}!!") 147 | 148 | return all([v for _, v in exists.items()]) 149 | -------------------------------------------------------------------------------- /bioblp/data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from bioblp.logger import get_logger 4 | from pykeen.triples import TriplesFactory 5 | 6 | #logger = get_logger(__name__) 7 | 8 | COL_SOURCE = 'src' 9 | COL_EDGE = 'edg' 10 | COL_TARGET = 'tgt' 11 | 12 | 13 | def create_random_splits(triples: pd.DataFrame, train_ratio: float, valid_ratio: float, test_ratio: float): 14 | """Create train/valid/test based on random strategy 15 | """ 16 | triples_array = triples[[COL_SOURCE, COL_EDGE, COL_TARGET]].values 17 | 18 | triples_factory = TriplesFactory.from_labeled_triples(triples_array) 19 | 20 | train, valid, test = triples_factory.split([train_ratio, valid_ratio, test_ratio], random_state=2021) 21 | 22 | train_triples = pd.DataFrame(train.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET]) 23 | valid_triples = pd.DataFrame(valid.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET]) 24 | test_triples = pd.DataFrame(test.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET]) 25 | 26 | return train_triples, valid_triples, test_triples 27 | 28 | 29 | def save_splits(train, test, valid, dataset_name, out_dir): 30 | out_dir = Path(out_dir) 31 | out_dir.mkdir(exist_ok=True, parents=True) 32 | 33 | train.to_csv(out_dir.joinpath(f"{dataset_name}-train.tsv"), sep='\t', index=None) 34 | test.to_csv(out_dir.joinpath(f"{dataset_name}-test.tsv"), sep='\t', index=None) 35 |
valid.to_csv(out_dir.joinpath(f"{dataset_name}-valid.tsv"), sep='\t', index=None) 36 | print(f"saved to {out_dir}") 37 | 38 | 39 | def load_splits(dataset: str, data_path: str, dev_sample=False) -> (TriplesFactory, TriplesFactory, TriplesFactory): 40 | data_path = Path(data_path) 41 | 42 | training_path = data_path.joinpath(f"{dataset}-train.tsv") 43 | valid_path = data_path.joinpath(f"{dataset}-valid.tsv") 44 | test_path = data_path.joinpath(f"{dataset}-test.tsv") 45 | 46 | train_df = pd.read_csv(training_path, index_col=None, sep="\t", dtype=str) 47 | valid_df = pd.read_csv(valid_path, index_col=None, sep="\t", dtype=str) 48 | test_df = pd.read_csv(test_path, index_col=None, sep="\t", dtype=str) 49 | 50 | if dev_sample: 51 | dev_frac = 0.01 52 | train_df = train_df.sample(frac=dev_frac, random_state=2021) 53 | valid_df = valid_df.sample(frac=dev_frac, random_state=2021) 54 | test_df = test_df.sample(frac=dev_frac, random_state=2021) 55 | 56 | training = TriplesFactory.from_labeled_triples(train_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values) 57 | valid = TriplesFactory.from_labeled_triples( 58 | valid_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, entity_to_id=training.entity_to_id, 59 | relation_to_id=training.relation_to_id) 60 | test = TriplesFactory.from_labeled_triples( 61 | test_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, entity_to_id=training.entity_to_id, 62 | relation_to_id=training.relation_to_id) 63 | 64 | return training, valid, test -------------------------------------------------------------------------------- /bioblp/evaluate.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | import numpy as np 4 | from pykeen.evaluation import RankBasedEvaluator, RankBasedMetricResults 5 | from pykeen.evaluation.rank_based_evaluator import _iter_ranks 6 | from pykeen.triples import TriplesFactory 7 | from tap import Tap 8 | import torch 9 | 10 | 11 | class Arguments(Tap): 12 | model_path: str 13 | 14 | 15 | class SavedRanksEvaluator(RankBasedEvaluator): 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | self.saved_ranks = None 19 | 20 | def finalize(self) -> RankBasedMetricResults: 21 | if self.num_entities is None: 22 | raise ValueError 23 | 24 | result = RankBasedMetricResults.from_ranks( 25 | metrics=self.metrics, 26 | rank_and_candidates=_iter_ranks(ranks=self.ranks, num_candidates=self.num_candidates), 27 | ) 28 | 29 | self.saved_ranks = self.ranks.copy() 30 | self.ranks.clear() 31 | self.num_candidates.clear() 32 | 33 | return result 34 | 35 | 36 | def get_triple_ranks(args: Arguments): 37 | model_file = osp.join(args.model_path, 'trained_model.pkl') 38 | 39 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 40 | 41 | model = torch.load(model_file).to(device) 42 | train = TriplesFactory.from_path_binary(osp.join(args.model_path, 43 | 'training_triples')) 44 | 45 | graph_path = osp.join('data', 'biokgb', 'graph') 46 | valid_triples = 'biokg.links-valid.csv' 47 | test_triples = 'biokg.links-test.csv' 48 | 49 | valid, test = [TriplesFactory.from_path(osp.join(graph_path, f), 50 | entity_to_id=train.entity_to_id, 51 | relation_to_id=train.relation_to_id) 52 | for f in (valid_triples, test_triples)] 53 | 54 | evaluator = SavedRanksEvaluator(filtered=True) 55 | evaluator.evaluate(model, 56 | test.mapped_triples, 57 | additional_filter_triples=[train.mapped_triples, 58 | valid.mapped_triples]) 59 | 60 | head_ranks = evaluator.saved_ranks[('head', 'realistic')] 61 | 
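    # 'realistic' is PyKEEN's tie-aware rank (the mean of the optimistic and pessimistic ranks); head and tail ranks are concatenated below before being written to ranks.csv.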
tail_ranks = evaluator.saved_ranks[('tail', 'realistic')] 62 | ranks = np.concatenate(head_ranks + tail_ranks) 63 | # Save ranks to a csv file, specifying the integer format 64 | np.savetxt(osp.join(args.model_path, 'ranks.csv'), ranks, fmt='%d') 65 | 66 | 67 | if __name__ == '__main__': 68 | get_triple_ranks(Arguments().parse_args()) 69 | -------------------------------------------------------------------------------- /bioblp/loaders/preprocessors.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Mapping 2 | 3 | from transformers import BertTokenizer 4 | import torch 5 | from torch import Tensor 6 | from torch.nn.utils.rnn import pad_sequence 7 | from tqdm import tqdm 8 | import numpy as np 9 | 10 | 11 | class EntityPropertyPreprocessor: 12 | """Abstract class for preprocessing entity properties of different types 13 | into tensors suitable for machine learning wizardry.""" 14 | def preprocess_file(self, file_path: str, 15 | entity_to_id: Mapping[str, int] 16 | ) -> Tuple[Tensor, Tensor, Tensor]: 17 | """Read a file of entity properties, with one entity per line. 18 | Expects at each line an entity name, a tab, and a property to be 19 | encoded. 20 | 21 | Args: 22 | file_path: file mapping entities to properties 23 | entity_to_id: maps an entity name to an integer ID 24 | 25 | Returns: 26 | entity_ids: torch.Tensor containing entity IDs read by the method 27 | rows: torch.Tensor mapping each entity in entity_ids to a row in 28 | data 29 | data: torch.Tensor containing data for each entity in entity_ids 30 | """ 31 | raise NotImplementedError 32 | 33 | 34 | class TextEntityPropertyPreprocessor(EntityPropertyPreprocessor): 35 | """Preprocessor for entities with textual descriptions""" 36 | def __init__(self, tokenizer: BertTokenizer, max_length: int): 37 | self.tokenizer = tokenizer 38 | self.max_length = max_length 39 | 40 | def preprocess_file(self, file_path: str, 41 | entity_to_id: Mapping[str, int] 42 | ) -> Tuple[Tensor, Tensor, Tensor]: 43 | all_tokens = [] 44 | entity_ids = [] 45 | rows = [] 46 | row_count = 0 47 | with open(file_path) as file: 48 | for i, line in enumerate(tqdm(file, desc=f'Encoding {file_path}')): 49 | tab_idx = line.find('\t') 50 | entity, text = line[:tab_idx], line[tab_idx:].strip() 51 | 52 | if entity in entity_to_id: 53 | tokens = self.tokenizer.encode(text, 54 | max_length=self.max_length, 55 | truncation=True, 56 | padding='max_length', 57 | return_tensors='pt') 58 | all_tokens.append(tokens) 59 | entity_id = entity_to_id[entity] 60 | entity_ids.append(entity_id) 61 | rows.append(row_count) 62 | row_count += 1 63 | 64 | if len(all_tokens) > 0: 65 | all_tokens = torch.cat(all_tokens, dim=0) 66 | else: 67 | all_tokens = torch.tensor([], dtype=torch.long) 68 | 69 | return (torch.tensor(entity_ids, dtype=torch.long), 70 | torch.tensor(rows, dtype=torch.long), 71 | all_tokens) 72 | 73 | 74 | class MolecularFingerprintPreprocessor(EntityPropertyPreprocessor): 75 | """Preprocessor for molecules with known molecular fingerprints""" 76 | def preprocess_file(self, file_path: str, 77 | entity_to_id: Mapping[str, int] 78 | ) -> Tuple[Tensor, Tensor, Tensor]: 79 | all_fprints = [] 80 | entity_ids = [] 81 | rows = [] 82 | row_count = 0 83 | with open(file_path) as file: 84 | for i, line in enumerate(tqdm(file, desc=f'Encoding {file_path}')): 85 | tab_idx = line.find('\t') 86 | entity, fprint = line[:tab_idx], line[tab_idx:].strip() 87 | 88 | if entity in entity_to_id: 89 | fprint = 
torch.tensor(np.array(list(fprint), dtype=float), dtype=torch.float) 90 | all_fprints.append(fprint) 91 | entity_id = entity_to_id[entity] 92 | entity_ids.append(entity_id) 93 | rows.append(row_count) 94 | row_count += 1 95 | 96 | return (torch.tensor(entity_ids, dtype=torch.long), 97 | torch.tensor(rows, dtype=torch.long), 98 | torch.stack(all_fprints, dim=0)) 99 | 100 | 101 | class PretrainedEmbeddingPreprocessor(EntityPropertyPreprocessor): 102 | def preprocess_file(self, file_path: str, 103 | entity_to_id: Mapping[str, int] 104 | ) -> Tuple[Tensor, Tensor, Tensor]: 105 | data_dict = torch.load(file_path) 106 | entity_to_row = data_dict['identifiers'] 107 | 108 | entity_ids = [] 109 | data = [] 110 | for entity, row in entity_to_row.items(): 111 | if entity in entity_to_id: 112 | entity_ids.append(entity_to_id[entity]) 113 | data.append(entity_to_row[entity]) 114 | 115 | entity_ids = torch.tensor(entity_ids, dtype=torch.long) 116 | data_idx = torch.arange(len(entity_ids)) 117 | data = torch.tensor(data, dtype=torch.long) 118 | 119 | return entity_ids, data_idx, data 120 | 121 | 122 | class MoleculeEmbeddingPreprocessor(EntityPropertyPreprocessor): 123 | def preprocess_file(self, file_path: str, 124 | entity_to_id: Mapping[str, int] 125 | ) -> Tuple[Tensor, Tensor, Tensor]: 126 | """Load embeddings for all the molecules we need, putting them 127 | in a single tensor that can be used to retrieve embeddings during 128 | training. Since molecules have variable length we use padding with 129 | a value of -1000 before placing them all inside a single 3D tensor 130 | of shape (N, L, D) where N is the number of molecules, 131 | L the maximum molecule length, and D the embedding dimension""" 132 | data_dict = torch.load(file_path) 133 | 134 | entity_ids = [] 135 | data = [] 136 | for molecule, embeddings in data_dict.items(): 137 | if molecule in entity_to_id: 138 | entity_ids.append(entity_to_id[molecule]) 139 | data.append(embeddings) 140 | 141 | entity_ids = torch.tensor(entity_ids, dtype=torch.long) 142 | data = pad_sequence(data, batch_first=True, padding_value=-10_000) 143 | data_idx = torch.arange(len(entity_ids)) 144 | 145 | return entity_ids, data_idx, data 146 | -------------------------------------------------------------------------------- /bioblp/logger.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | 3 | 4 | def get_logger(logger_name=''): 5 | """Get a default logger that includes a timestamp.""" 6 | logger = lg.getLogger(logger_name) 7 | logger.handlers = [] 8 | ch = lg.StreamHandler() 9 | str_fmt = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | formatter = lg.Formatter(str_fmt, datefmt='%H:%M:%S') 11 | ch.setFormatter(formatter) 12 | logger.addHandler(ch) 13 | logger.setLevel('INFO') 14 | 15 | return logger 16 | -------------------------------------------------------------------------------- /bioblp/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .bioblp import * 2 | -------------------------------------------------------------------------------- /bioblp/models/bioblp.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from typing import Optional 3 | 4 | import pykeen.models 5 | from pykeen.nn.representation import Embedding as PyKEmbedding 6 | from pykeen.typing import InductiveMode 7 | import torch 8 | 9 | from bioblp.models.encoders import PropertyEncoderRepresentation 10 | 11 
| 12 | class BioBLP: 13 | def __init__(self, *, 14 | entity_representations: PropertyEncoderRepresentation, 15 | from_checkpoint: str = None, 16 | **kwargs): 17 | self.from_checkpoint = from_checkpoint 18 | 19 | super().__init__(**kwargs) 20 | 21 | entity_embedding_lut = self.entity_representations[0] 22 | entity_embedding_lut: PyKEmbedding 23 | 24 | entity_representations.wrap_lookup_table(entity_embedding_lut) 25 | self.property_encoder = entity_representations 26 | 27 | def reset_parameters_(self): 28 | super().reset_parameters_() 29 | if self.from_checkpoint: 30 | checkpoint = torch.load(osp.join(self.from_checkpoint, 31 | 'trained_model.pkl'), 32 | map_location='cpu') 33 | self.load_state_dict(checkpoint.state_dict(), strict=False) 34 | 35 | def score_hrt_and_negatives(self, 36 | hrt_batch: torch.LongTensor, 37 | num_negatives: int, 38 | *, mode: Optional[InductiveMode] = None 39 | ) -> tuple[torch.FloatTensor, torch.FloatTensor]: 40 | batch_size = hrt_batch.shape[0] 41 | 42 | h, r, t = self._get_representations(h=hrt_batch[:, 0], 43 | r=hrt_batch[:, 1], 44 | t=hrt_batch[:, 2], mode=mode) 45 | positive_scores = self.interaction.score_hrt(h=h, r=r, t=t) 46 | 47 | num_ents = batch_size * 2 48 | idx = torch.arange(num_ents).reshape(batch_size, 2) 49 | 50 | # For each row, sample entities, assigning 0 probability to entities 51 | # of the same row 52 | zeros = torch.zeros(batch_size, 2) 53 | head_weights = torch.ones(batch_size, num_ents, dtype=torch.float) 54 | head_weights.scatter_(1, idx, zeros) 55 | random_idx = head_weights.multinomial(num_negatives, replacement=True) 56 | random_idx = random_idx.t().flatten() 57 | 58 | # Select randomly the first or the second column 59 | row_selector = torch.arange(batch_size * num_negatives) 60 | col_selector = torch.randint(0, 2, [batch_size * num_negatives]) 61 | 62 | # Fill the array of negative samples with the sampled random entities 63 | # at the right positions 64 | neg_idx = idx.repeat((num_negatives, 1)) 65 | neg_idx[row_selector, col_selector] = random_idx 66 | # neg_idx = neg_idx.reshape(-1, batch_size, 2) 67 | # neg_idx.transpose_(0, 1) 68 | 69 | neg_embs = torch.stack([h, r], dim=1).view(batch_size * 2, -1) 70 | neg_embs = neg_embs[neg_idx.to(neg_embs.device)] 71 | h_neg, t_neg = neg_embs[:, 0], neg_embs[:, 1] 72 | 73 | r_neg_idx = torch.arange(batch_size).repeat(num_negatives) 74 | r_neg = r[r_neg_idx.to(r.device)] 75 | 76 | negative_scores = self.interaction.score_hrt(h=h_neg, r=r_neg, t=t_neg) 77 | negative_scores = negative_scores.reshape(batch_size, num_negatives) 78 | 79 | return positive_scores, negative_scores 80 | 81 | 82 | class BioBLPTransE(BioBLP, pykeen.models.TransE): 83 | ... 84 | 85 | 86 | class BioBLPComplEx(BioBLP, pykeen.models.ComplEx): 87 | ... 88 | 89 | 90 | class BioBLPRotatE(BioBLP, pykeen.models.RotatE): 91 | ... 
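# Usage sketch (illustrative, not part of the module): these classes are not
# instantiated directly; bioblp/train.py resolves one of them through
# get_model_class() below and lets pykeen.pipeline.pipeline construct it, roughly:
#
#     model_cls = get_model_class('complex')           # -> BioBLPComplEx
#     pipeline(model=model_cls,
#              model_kwargs={'embedding_dim': 256,
#                            'loss': 'crossentropy',
#                            'entity_representations': encoders},
#              ...)
#
# where `encoders` is the PropertyEncoderRepresentation built by
# bioblp.utils.bioblp_utils.build_encoders.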
92 | 93 | 94 | MODELS_DICT = { 95 | 'transe': BioBLPTransE, 96 | 'complex': BioBLPComplEx, 97 | 'rotate': BioBLPRotatE 98 | } 99 | 100 | 101 | def get_model_class(model_name: str): 102 | if model_name in MODELS_DICT: 103 | return MODELS_DICT[model_name] 104 | else: 105 | raise ValueError(f'Unknown model f{model_name}') 106 | 107 | -------------------------------------------------------------------------------- /bioblp/predict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/predict.py -------------------------------------------------------------------------------- /bioblp/preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | import numpy as np 4 | import bio_embeddings 5 | from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, prottrans_t5_embedder, esm_embedder 6 | 7 | 8 | # Here we can change the Protein Embedder to w/e we want from the above. 9 | # TODO: An experiment with t5 embedding 10 | prot_trans_embedder = ProtTransBertBFDEmbedder() 11 | 12 | 13 | def get_protein_repr(amino_repr): 14 | """ Here we need to go from a collection of amino-acid embeddings to a full protein embedding 15 | 16 | # Example: 17 | # 18 | # M : (1,1024) 19 | # A : (1,1024) 20 | # S : (1,1024) 21 | # 22 | # Output: An aggregated representation for proteins 23 | # 24 | # Type: Dict(protein_id: (embedding)) 25 | # 26 | e.g Dict(: (LENG8_MOUSE, 1024)) """ 27 | 28 | emb_matrix = torch.Tensor(amino_repr) 29 | 30 | # We average over columns 31 | protein_emb = torch.mean(emb_matrix, dim=0) 32 | 33 | return protein_emb 34 | 35 | 36 | def get_protein_embedding(path, embedder="prottrans"): 37 | """ 38 | Wrapper over different protein embedders 39 | Parameters 40 | ---------- 41 | embedder: The model to embed proteins 42 | path: The data path 43 | 44 | Returns 45 | ------- 46 | """ 47 | print('Im in') 48 | 49 | # Load sequences 50 | sequence_data = pd.read_csv(path, sep='\t') 51 | 52 | # Sample : Uncomment for testing 53 | # sequence_data = sequence_data.sample(2) 54 | 55 | # Select correct columns 56 | sequence_data = sequence_data[['From', 'Sequence']] 57 | 58 | # Embed sequences 59 | sequence_data['embedding'] = sequence_data['Sequence'].apply(lambda x: prot_trans_embedder.embed(x)) 60 | 61 | # Aggregate sequences 62 | sequence_data['squashed'] = sequence_data['embedding'].apply(lambda x: get_protein_repr(x)) 63 | 64 | 65 | # Save sequences 66 | sequence_data.to_csv('../data/processed/uniprot_seq_embeddings.tsv') 67 | 68 | 69 | get_protein_embedding('../data/uniprot_sequences.tsv') 70 | -------------------------------------------------------------------------------- /bioblp/train.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from pykeen.pipeline import pipeline 4 | from pykeen.training import TrainingCallback 5 | from pykeen.triples import TriplesFactory 6 | 7 | from tap import Tap 8 | from transformers import get_linear_schedule_with_warmup 9 | import wandb 10 | 11 | from bioblp.logger import get_logger 12 | import bioblp.models as models 13 | from bioblp.utils.bioblp_utils import build_encoders 14 | from bioblp.utils.training import InBatchNegativesTraining 15 | 16 | 17 | class Arguments(Tap): 18 | train_triples: str 19 | valid_triples: str 20 | test_triples: str 21 | 22 | protein_data: str = None 23 
| molecule_data: str = None 24 | text_data: str = None 25 | 26 | model: str = 'complex' 27 | dimension: int = 256 28 | loss_fn: str = 'crossentropy' 29 | loss_margin: float = 1.0 30 | optimizer: str = 'adagrad' 31 | learning_rate: float = 1e-2 32 | freeze_pretrained_embeddings: bool = False 33 | warmup_fraction: float = None 34 | regularizer: float = 1e-6 35 | num_epochs: int = 100 36 | batch_size: int = 1024 37 | eval_batch_size: int = 16 38 | eval_every: int = 10 39 | num_negatives: int = 512 40 | in_batch_negatives: bool = False 41 | add_inverses: bool = False 42 | early_stopper: str = 'both.realistic.inverse_harmonic_mean_rank' 43 | from_checkpoint: str = None 44 | 45 | search_train_batch_size: bool = False 46 | search_eval_batch_size: bool = False 47 | log_wandb: bool = False 48 | notes: str = None 49 | 50 | 51 | class BioBLPCallback(TrainingCallback): 52 | """A callback to get the wandb ID of the run before it gets closed. 53 | We use it to get a file name for the stored model.""" 54 | id = None 55 | scheduler = None 56 | 57 | def __init__(self, num_training_steps, warmup_fraction): 58 | super().__init__() 59 | self.use_scheduler = warmup_fraction is not None 60 | if self.use_scheduler: 61 | self.num_training_steps = num_training_steps 62 | self.num_warmup_steps = int(self.num_training_steps * warmup_fraction) 63 | 64 | def post_epoch(self, *args, **kwargs): 65 | if wandb.run is not None and BioBLPCallback.id is None: 66 | BioBLPCallback.id = wandb.run.id 67 | 68 | def pre_step(self, **kwargs): 69 | if not self.use_scheduler: 70 | return 71 | 72 | if self.scheduler is None: 73 | self.scheduler = get_linear_schedule_with_warmup( 74 | self.optimizer, 75 | self.num_warmup_steps, 76 | self.num_training_steps 77 | ) 78 | else: 79 | self.scheduler.step() 80 | 81 | 82 | def run(args: Arguments): 83 | cli_args_dict = {f'cli_{k}': v for k, v in args.as_dict().items()} 84 | if args.search_train_batch_size: 85 | args.batch_size = None 86 | if args.search_eval_batch_size: 87 | args.eval_batch_size = None 88 | 89 | logger = get_logger() 90 | logger.info('Loading triples...') 91 | 92 | entity_to_id = relation_to_id = None 93 | if args.from_checkpoint: 94 | checkpoint_triples = TriplesFactory.from_path_binary( 95 | osp.join(args.from_checkpoint, 'training_triples') 96 | ) 97 | entity_to_id = checkpoint_triples.entity_to_id 98 | relation_to_id = checkpoint_triples.relation_to_id 99 | 100 | training = TriplesFactory.from_path( 101 | args.train_triples, 102 | create_inverse_triples=args.add_inverses, 103 | entity_to_id=entity_to_id, 104 | relation_to_id=relation_to_id 105 | ) 106 | validation = TriplesFactory.from_path(args.valid_triples, 107 | entity_to_id=training.entity_to_id, 108 | relation_to_id=training.relation_to_id) 109 | testing = TriplesFactory.from_path(args.test_triples, 110 | entity_to_id=training.entity_to_id, 111 | relation_to_id=training.relation_to_id) 112 | 113 | logger.info(f'Loaded graph with {training.num_entities:,} entities') 114 | logger.info(f'{training.num_triples:,} training triples') 115 | logger.info(f'{validation.num_triples:,} validation triples') 116 | logger.info(f'{testing.num_triples:,} test triples') 117 | 118 | loss_kwargs = None 119 | if args.loss_fn in {'nssa', 'marginranking'}: 120 | loss_kwargs = {'margin': args.loss_margin} 121 | model = args.model 122 | model_kwargs = {'embedding_dim': args.dimension, 'loss': args.loss_fn} 123 | 124 | if any((args.protein_data, args.molecule_data, args.text_data)): 125 | model = models.get_model_class(args.model) 126 | 
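        # ComplEx and RotatE are complex-valued, so PyKEEN stores two real numbers
        # per embedding dimension; the encoder output width is doubled below so it
        # matches the width of the wrapped entity embedding table.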
dimension = args.dimension 127 | if args.model in ('complex', 'rotate'): 128 | dimension *= 2 129 | 130 | freeze_pretrained_embeddings = args.freeze_pretrained_embeddings 131 | encoders = build_encoders(dimension, 132 | training.entity_to_id, 133 | args.protein_data, 134 | args.molecule_data, 135 | args.text_data, 136 | freeze_pretrained_embeddings) 137 | model_kwargs['entity_representations'] = encoders 138 | 139 | if args.from_checkpoint: 140 | model_kwargs['from_checkpoint'] = args.from_checkpoint 141 | 142 | if args.warmup_fraction: 143 | if args.batch_size is None: 144 | raise ValueError('Batch size is needed to apply learning rate' 145 | ' warmup.') 146 | num_steps = (training.num_triples // args.batch_size) * args.num_epochs 147 | else: 148 | num_steps = None 149 | 150 | training_loop = InBatchNegativesTraining if args.in_batch_negatives else None 151 | 152 | result = pipeline(training=training, 153 | validation=validation, 154 | testing=testing, 155 | model=model, 156 | model_kwargs=model_kwargs, 157 | loss_kwargs=loss_kwargs, 158 | optimizer=args.optimizer, 159 | optimizer_kwargs={'lr': args.learning_rate}, 160 | regularizer='LpRegularizer', 161 | regularizer_kwargs={'weight': args.regularizer}, 162 | training_kwargs={'num_epochs': args.num_epochs, 163 | 'batch_size': args.batch_size, 164 | 'callbacks': BioBLPCallback, 165 | 'callback_kwargs': { 166 | 'num_training_steps': num_steps, 167 | 'warmup_fraction': args.warmup_fraction 168 | }}, 169 | training_loop=training_loop, 170 | negative_sampler='basic', 171 | negative_sampler_kwargs={ 172 | 'num_negs_per_pos': args.num_negatives 173 | }, 174 | stopper='early', 175 | stopper_kwargs={ 176 | 'evaluation_batch_size': args.eval_batch_size, 177 | 'metric': args.early_stopper, 178 | 'frequency': args.eval_every, 179 | 'patience': 5, 180 | 'relative_delta': 0.0001, 181 | 'larger_is_better': True 182 | }, 183 | evaluator_kwargs={'batch_size': args.eval_batch_size}, 184 | result_tracker='wandb', 185 | result_tracker_kwargs={ 186 | 'entity': 'discoverylab', 187 | 'project': 'bioblp', 188 | 'notes': args.notes, 189 | 'config': cli_args_dict, 190 | 'offline': not args.log_wandb 191 | } 192 | ) 193 | 194 | result.save_to_directory(osp.join('models', BioBLPCallback.id)) 195 | 196 | 197 | if __name__ == '__main__': 198 | run(Arguments(explicit_bool=True).parse_args()) 199 | -------------------------------------------------------------------------------- /bioblp/train_argparse.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from pathlib import Path 3 | from pykeen.pipeline import pipeline 4 | from pykeen.training import TrainingCallback 5 | from pykeen.triples import TriplesFactory 6 | from dataclasses import dataclass, asdict 7 | # from tap import Tap 8 | from argparse import ArgumentParser 9 | import wandb 10 | import toml 11 | 12 | from bioblp.logging import get_logger 13 | 14 | @dataclass 15 | class Arguments: 16 | #data_splits_path: str 17 | #dataset_name: str 18 | train_triples: str 19 | valid_triples: str 20 | test_triples: str 21 | 22 | model: str = 'complex' 23 | dimension: int = 256 24 | loss_fn: str = 'crossentropy' 25 | loss_margin: float = 1.0 26 | optimizer: str = 'adagrad' 27 | learning_rate: float = 1e-2 28 | regularizer: float = 1e-6 29 | num_epochs: int = 100 30 | batch_size: int = 1024 31 | eval_batch_size: int = 16 32 | num_negatives: int = 512 33 | add_inverses: bool = False 34 | early_stopper: str = 'both.realistic.inverse_harmonic_mean_rank' 35 | 36 | 
search_train_batch_size: bool = False 37 | search_eval_batch_size: bool = False 38 | log_wandb: bool = False 39 | notes: str = None 40 | 41 | 42 | class WBIDCallback(TrainingCallback): 43 | """A callback to get the wandb ID of the run before it gets closed. 44 | We use it to get a file name for the stored model.""" 45 | id = None 46 | 47 | def post_train(self, *args, **kwargs): 48 | if wandb.run is not None: 49 | WBIDCallback.id = wandb.run.id 50 | 51 | 52 | def load_toml(toml_path: str) -> dict: 53 | toml_path = Path(toml_path) 54 | config = {} 55 | with open(toml_path, "r") as f: 56 | config = toml.load(f) 57 | 58 | return config 59 | 60 | 61 | def run(args: Arguments): 62 | cli_args_dict = {f'cli_{k}': v for k, v in asdict(args).items()} 63 | if args.search_train_batch_size: 64 | args.batch_size = None 65 | if args.search_eval_batch_size: 66 | args.eval_batch_size = None 67 | 68 | logger = get_logger() 69 | logger.info('Loading triples...') 70 | 71 | training = TriplesFactory.from_path( 72 | args.train_triples, 73 | create_inverse_triples=args.add_inverses 74 | ) 75 | validation = TriplesFactory.from_path(args.valid_triples) 76 | testing = TriplesFactory.from_path(args.test_triples) 77 | 78 | logger.info(f'Loaded graph with {training.num_entities:,} entities') 79 | logger.info(f'{training.num_triples:,} training triples') 80 | logger.info(f'{validation.num_triples:,} validation triples') 81 | logger.info(f'{testing.num_triples:,} test triples') 82 | 83 | loss_kwargs = None 84 | if args.loss_fn in {'nssa', 'marginranking'}: 85 | loss_kwargs = {'margin': args.loss_margin} 86 | 87 | result = pipeline(training=training, 88 | validation=validation, 89 | testing=testing, 90 | model=args.model, 91 | model_kwargs={'embedding_dim': args.dimension, 92 | 'loss': args.loss_fn}, 93 | loss_kwargs=loss_kwargs, 94 | optimizer=args.optimizer, 95 | optimizer_kwargs={'lr': args.learning_rate}, 96 | regularizer='LpRegularizer', 97 | #regularizer_kwargs={'weight': args.regularizer}, 98 | training_kwargs={'num_epochs': args.num_epochs, 99 | 'batch_size': args.batch_size, 100 | 'callbacks': WBIDCallback}, 101 | negative_sampler='basic', 102 | negative_sampler_kwargs={ 103 | 'num_negs_per_pos': args.num_negatives 104 | }, 105 | stopper='early', 106 | stopper_kwargs={ 107 | 'evaluation_batch_size': args.eval_batch_size, 108 | 'metric': args.early_stopper, 109 | 'frequency': 10, 110 | 'patience': 5, 111 | 'relative_delta': 0.0001, 112 | 'larger_is_better': True 113 | }, 114 | evaluator_kwargs={'batch_size': args.eval_batch_size}, 115 | result_tracker='wandb', 116 | result_tracker_kwargs={ 117 | 'entity': 'discoverylab', 118 | 'project': 'bioblp', 119 | 'notes': args.notes, 120 | 'config': cli_args_dict, 121 | 'offline': not args.log_wandb 122 | } 123 | ) 124 | 125 | result.save_to_directory(osp.join('models', WBIDCallback.id)) 126 | 127 | 128 | if __name__ == '__main__': 129 | parser = ArgumentParser(description="Model training routing") 130 | parser.add_argument("--conf", type=str, 131 | help="Path to experiment toml file") 132 | #parser.add_argument('--out_path', type=str, 133 | # help='Path to write models output') 134 | 135 | args = parser.parse_args() 136 | conf = load_toml(args.conf) 137 | args = Arguments(**conf) 138 | run(args) 139 | #run(Arguments(explicit_bool=True).parse_args()) 140 | -------------------------------------------------------------------------------- /bioblp/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/utils/__init__.py -------------------------------------------------------------------------------- /bioblp/utils/bioblp_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping 2 | 3 | import bioblp.models.encoders as encoders 4 | 5 | 6 | def build_encoders(dim: int, 7 | entity_to_id: Mapping[str, int], 8 | protein_data: str = None, 9 | molecule_data: str = None, 10 | text_data: str = None, 11 | freeze_pretrained_embeddings: bool = False 12 | ) -> encoders.PropertyEncoderRepresentation: 13 | if not any((protein_data, molecule_data, text_data)): 14 | raise ValueError("No entity data provided to build encoders.") 15 | 16 | encoders_list = [] 17 | 18 | if protein_data: 19 | protein_encoder = encoders.PretrainedLookupTableEncoder( 20 | file_path=protein_data, 21 | dim=dim, 22 | freeze_pretrained_embeddings=freeze_pretrained_embeddings 23 | ) 24 | encoders_list.append(protein_encoder) 25 | 26 | if molecule_data: 27 | # TODO: We might want to set different learning rates for different 28 | # modules, potentially also with learning rate scheduling 29 | molecule_encoder = encoders.MoleculeEmbeddingEncoder( 30 | file_path=molecule_data, 31 | dim=dim 32 | ) 33 | encoders_list.append(molecule_encoder) 34 | 35 | if text_data: 36 | text_encoder = encoders.TransformerTextEncoder( 37 | file_path=text_data, 38 | dim=dim 39 | ) 40 | encoders_list.append(text_encoder) 41 | 42 | entity_encoders = encoders.PropertyEncoderRepresentation( 43 | dim=dim, 44 | entity_to_id=entity_to_id, 45 | encoders=encoders_list 46 | ) 47 | 48 | return entity_encoders 49 | -------------------------------------------------------------------------------- /bioblp/utils/pipeline.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def create_pipeline(functions: list): 4 | """Sequentially executes a list of functions""" 5 | def pipeline(input): 6 | res = input 7 | for function in functions: 8 | res = function(res) 9 | return res 10 | 11 | return pipeline 12 | -------------------------------------------------------------------------------- /bioblp/utils/training.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from pykeen.training.slcwa import SLCWATrainingLoop 4 | from pykeen.models.base import Model 5 | from pykeen.losses import Loss 6 | from pykeen.typing import InductiveMode 7 | from pykeen.triples.instances import SLCWABatch 8 | import torch 9 | 10 | from bioblp.models import BioBLP 11 | 12 | 13 | class InBatchNegativesTraining(SLCWATrainingLoop): 14 | @staticmethod 15 | def _process_batch_static( 16 | model: Union[BioBLP, Model], 17 | loss: Loss, 18 | mode: Optional[InductiveMode], 19 | batch: SLCWABatch, 20 | start: Optional[int], 21 | stop: Optional[int], 22 | label_smoothing: float = 0.0, 23 | slice_size: Optional[int] = None, 24 | ) -> torch.FloatTensor: 25 | # Slicing is not possible in sLCWA training loops 26 | if slice_size is not None: 27 | raise AttributeError( 28 | "Slicing is not possible for sLCWA training loops.") 29 | 30 | positive_batch, negative_batch, positive_filter = batch 31 | positive_batch = positive_batch[start:stop].to(device=model.device) 32 | 33 | positive_scores, negative_scores = model.score_hrt_and_negatives( 34 | positive_batch, 35 | num_negatives=negative_batch.shape[1], 36 | mode=mode 37 | ) 38 | 39 | 
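        # Combine the positive scores with the scores of the in-batch negatives
        # produced by score_hrt_and_negatives, then add the model's regularization
        # term on top of the loss.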
return ( 40 | loss.process_slcwa_scores( 41 | positive_scores=positive_scores, 42 | negative_scores=negative_scores, 43 | label_smoothing=label_smoothing, 44 | batch_filter=positive_filter, 45 | num_entities=model._get_entity_len(mode=mode), 46 | ) 47 | + model.collect_regularization_term() 48 | ) -------------------------------------------------------------------------------- /bioblp/utils/triples.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import os.path as osp 4 | from collections import Counter 5 | from argparse import ArgumentParser 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from tqdm import tqdm 10 | from pykeen.triples import TriplesFactory 11 | 12 | from bioblp.data import COL_SOURCE 13 | from bioblp.data import COL_EDGE 14 | from bioblp.data import COL_TARGET 15 | from bioblp.data import COL_PUBYEAR 16 | 17 | DIR_PROCESSED = 'processed' 18 | 19 | logger = logging.getLogger(__name__) 20 | handler = logging.StreamHandler() 21 | logger.addHandler(handler) 22 | logger.setLevel(logging.INFO) 23 | 24 | 25 | def get_entity_relation_counts(triples: pd.DataFrame): 26 | """Count frequency of entities and relations across triples. 27 | Entities are not counted twice if there is a self-loop.""" 28 | relation_counts = triples[COL_EDGE].value_counts() 29 | 30 | no_loops = triples[COL_SOURCE] != triples[COL_TARGET] 31 | tails_no_loops = triples[COL_TARGET].where(no_loops).dropna() 32 | entities = pd.concat([triples[COL_SOURCE], tails_no_loops]) 33 | entity_counts = entities.value_counts() 34 | 35 | return entity_counts, relation_counts 36 | 37 | 38 | def split_train_test_triples(triples: pd.DataFrame, ratio: float): 39 | """Split a dataset of triples into training and test sets, so that all 40 | entities in the test set are in the training set. 41 | Triples are removed in order starting from index 0. 
Edges are deleted so 42 | that the initial proportion of relation types is preserved in the training 43 | set.""" 44 | entity_counts, relation_counts = get_entity_relation_counts(triples) 45 | new_relation_counts = np.floor(relation_counts * ratio).astype(int) 46 | 47 | train_triples = [] 48 | test_triples = [] 49 | removed_relation_counts = Counter() 50 | done = {r: count == 0 for r, count in new_relation_counts.items()} 51 | 52 | with tqdm(total=new_relation_counts.sum(), desc='Removing triples') as bar: 53 | for i in range(len(triples)): 54 | row = triples.iloc[i] 55 | head = row[COL_SOURCE] 56 | rel = row[COL_EDGE] 57 | tail = row[COL_TARGET] 58 | 59 | # Check that removing the entity does not remove it from the 60 | # training set a count larger than two is required if head == tail 61 | if entity_counts[head] > 2 and entity_counts[tail] > 2 and not done[rel]: 62 | entity_counts[head] -= 1 63 | entity_counts[tail] -= 1 64 | test_triples.append(row) 65 | 66 | removed_relation_counts[rel] += 1 67 | bar.update(1) 68 | if removed_relation_counts[rel] == new_relation_counts[rel]: 69 | done[rel] = True 70 | if all(done.values()): 71 | break 72 | else: 73 | train_triples.append(row) 74 | 75 | test_triples = pd.DataFrame(test_triples, columns=triples.columns) 76 | train_triples = pd.DataFrame(train_triples, columns=triples.columns) 77 | # Add the rest of the triples that were not removed 78 | train_triples = pd.concat([train_triples, triples.iloc[i + 1:]]) 79 | 80 | print('Done!') 81 | 82 | return train_triples, test_triples 83 | 84 | 85 | def create_splits(triples_path: str, random: bool = False): 86 | """Create train/valid/test splits based on timestamps.""" 87 | print('Reading triples...') 88 | triples = pd.read_csv(triples_path, sep='\t') 89 | initial_length = len(triples) 90 | 91 | triples = triples.dropna(subset=[COL_SOURCE, COL_EDGE, COL_TARGET, 92 | COL_PUBYEAR]) 93 | triples[COL_PUBYEAR] = triples[COL_PUBYEAR].astype(int) 94 | 95 | # Sort whole dataframe first to ensure repeatability 96 | triples = triples.sort_values(by=list(triples.columns), kind='mergesort') 97 | 98 | if not random: 99 | # Sort by pubyear before deduplicating and removing triples! 
100 | triples = triples.sort_values(by=COL_PUBYEAR, ascending=False, 101 | ignore_index=True, kind='mergesort') 102 | else: 103 | triples = triples.sample(frac=1, random_state=0) 104 | 105 | # In case of duplicates, keep most recent edge 106 | triples = triples.drop_duplicates(subset=[COL_SOURCE, COL_EDGE, 107 | COL_TARGET], 108 | keep='first') 109 | 110 | print(f'Read {initial_length:,} lines, got {len(triples):,} ' 111 | 'after keeping triples with dates and deduplicating.') 112 | 113 | train_triples, test_triples = split_train_test_triples(triples, ratio=0.1) 114 | 115 | num_test_triples = len(test_triples) 116 | split_idx = num_test_triples // 2 117 | valid_triples = test_triples.iloc[split_idx:] 118 | test_triples = test_triples.iloc[:split_idx] 119 | 120 | filename = osp.basename(triples_path) 121 | name, ext = osp.splitext(filename) 122 | data_path = osp.join(osp.dirname(osp.dirname(triples_path)), DIR_PROCESSED) 123 | 124 | if not osp.exists(data_path): 125 | os.mkdir(data_path) 126 | 127 | splits = {'train': train_triples, 128 | 'valid': valid_triples, 129 | 'test': test_triples} 130 | for s, dataframe in splits.items(): 131 | out_path = osp.join(data_path, f'{name}-{s}{ext}') 132 | dataframe.to_csv(out_path, sep='\t', index=False) 133 | print(f'Saved {len(dataframe):,} triples at {out_path}') 134 | 135 | 136 | def load_triples_array(path: str): 137 | """Given a path to a dataset file, extract only the colums containing 138 | (head, relation, tail) - i.e. the triples.""" 139 | triples = pd.read_csv(path, sep='\t', dtype=str) 140 | triples = triples[[COL_SOURCE, COL_EDGE, COL_TARGET]].to_numpy() 141 | 142 | return triples 143 | 144 | 145 | def load_triples_factories(data_path: str, dataset: str): 146 | """Load a pykeen.triples.TriplesFactory tuple for training, validation, 147 | and testing triples.""" 148 | processed_path = osp.join(data_path, DIR_PROCESSED) 149 | 150 | train_triples = load_triples_array(osp.join(processed_path, 151 | f'{dataset}-train.tsv')) 152 | valid_triples = load_triples_array(osp.join(processed_path, 153 | f'{dataset}-valid.tsv')) 154 | test_triples = load_triples_array(osp.join(processed_path, 155 | f'{dataset}-test.tsv')) 156 | 157 | training = TriplesFactory.from_labeled_triples(train_triples) 158 | validation = TriplesFactory.from_labeled_triples( 159 | valid_triples, 160 | entity_to_id=training.entity_to_id, 161 | relation_to_id=training.relation_to_id 162 | ) 163 | testing = TriplesFactory.from_labeled_triples( 164 | test_triples, 165 | entity_to_id=training.entity_to_id, 166 | relation_to_id=training.relation_to_id 167 | ) 168 | 169 | return training, validation, testing 170 | 171 | 172 | def reuse_existing_splits(triples_path, dataset_existing_splits): 173 | """""" 174 | 175 | triples = pd.read_csv(triples_path, sep='\t', dtype=str) 176 | initial_length = len(triples) 177 | logger.info(f"{initial_length} triples in input") 178 | 179 | triples = triples.dropna(subset=[COL_SOURCE, COL_EDGE, COL_TARGET, 180 | COL_PUBYEAR]) 181 | cols = [COL_SOURCE, COL_EDGE, COL_TARGET] 182 | triples = triples[cols] 183 | 184 | filename = osp.basename(triples_path) 185 | name, ext = osp.splitext(filename) 186 | data_path = osp.join(osp.dirname(osp.dirname(triples_path)), DIR_PROCESSED) 187 | 188 | existing_train_path = osp.join(data_path, f'{dataset_existing_splits}-train{ext}') 189 | existing_val_path = osp.join(data_path, f'{dataset_existing_splits}-valid{ext}') 190 | existing_test_path = osp.join(data_path, f'{dataset_existing_splits}-test{ext}') 191 | 192 | 
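    # Load the existing train/valid/test splits, keep only (head, relation, tail),
    # and later add any triple from the new file that is not already covered to the
    # training split; the existing validation and test splits are reused unchanged.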
existing_train = pd.read_csv(existing_train_path, sep='\t', dtype=str)[cols] 193 | existing_valid = pd.read_csv(existing_val_path, sep='\t', dtype=str)[cols] 194 | existing_test = pd.read_csv(existing_test_path, sep='\t', dtype=str)[cols] 195 | 196 | all_existing_triples = existing_train.append(existing_valid.append( 197 | existing_test)).sort_values(by=cols, kind='mergesort') 198 | 199 | logger.info(f"{len(all_existing_triples)} triples in existing {dataset_existing_splits}") 200 | 201 | all_existing_triples_records = set([tuple(x) for x in all_existing_triples.values]) 202 | triple_records = [tuple(x) for x in triples.sort_values(by=cols, kind='mergesort').values] 203 | 204 | new_records = [] 205 | with tqdm(total=len(triple_records), desc='Checking triple overlap') as bar: 206 | for i in range(len(triple_records)): 207 | row = triple_records[i] 208 | 209 | try: 210 | all_existing_triples_records.remove(row) 211 | except KeyError: 212 | new_records.append(row) 213 | 214 | bar.update(1) 215 | bar.set_description( 216 | f"Checking triple overlap. Remaining set: {len(all_existing_triples_records)}", refresh=True) 217 | 218 | # merge new triples plus existing train for new train 219 | new_triples = pd.DataFrame.from_records(new_records, columns=cols) 220 | train_triples = new_triples.append(existing_train) 221 | 222 | splits = {'train': train_triples, 223 | 'valid': existing_valid, 224 | 'test': existing_test} 225 | 226 | for s, dataframe in splits.items(): 227 | out_path = osp.join(data_path, f'{name}-{s}{ext}') 228 | dataframe.to_csv(out_path, sep='\t', index=False) 229 | print(f'Saved {len(dataframe):,} triples at {out_path}') 230 | 231 | 232 | if __name__ == '__main__': 233 | parser = ArgumentParser(description='Split a file of triples into ' 234 | 'train/valid/test sets based on time.') 235 | parser.add_argument('file', type=str) 236 | parser.add_argument('--random', action='store_true', 237 | help='Split randomly instead.') 238 | parser.add_argument('--existing_dataset_splits', type=str, 239 | help='Name of existing splits (assumed to be in processed)') 240 | 241 | args = parser.parse_args() 242 | 243 | if args.existing_dataset_splits is not None: 244 | reuse_existing_splits(args.file, args.existing_dataset_splits) 245 | else: 246 | create_splits(args.file, args.random) 247 | -------------------------------------------------------------------------------- /bioblp/utils/util.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import pickle 4 | import torch 5 | 6 | def save_object(obj, filename): 7 | with open(filename, 'wb') as output: # Overwrites any existing file. 8 | torch.save(obj, output, pickle_module=dill) 9 | 10 | 11 | def load_object(filename): 12 | with open(filename, 'wb') as object: 13 | obj = torch.load(object, pickle_module=dill, encoding='utf-8') 14 | 15 | 16 | def read_query(query_filename): 17 | """ 18 | Read a query from file and return as a string 19 | Parameters 20 | ---------- 21 | query_filename: str name of the query. 
It will be looked for in the queries folder of this project 22 | Returns 23 | ------- 24 | query: str the query with placeholders for the query parameters, as a string to be formatted 25 | """ 26 | # query_filepath = Path(RAW_DIR / QUERY_DIR / query_filename) 27 | 28 | with open(query_filename) as fr: 29 | query = fr.read() 30 | return query 31 | 32 | 33 | def loading_animation(process, message="Loading") : 34 | while process.isAlive() : 35 | chars = "/—\|" 36 | for char in chars: 37 | sys.stdout.write('\r' + f'{message} {char} ') 38 | time.sleep(.1) 39 | sys.stdout.flush() 40 | 41 | 42 | def write_dict_as_pkl(dict_object, filename): 43 | """ 44 | filename: path to pickle file, should include appropiate .pkl extension 45 | """ 46 | with open(filename, "wb") as pkl_handle: 47 | pickle.dump(dict_object, pkl_handle) 48 | 49 | 50 | def load_dict_from_pkl(filename): 51 | """ 52 | filename: path to pickle file, should include appropiate .pkl extension 53 | """ 54 | with open(filename, "rb") as pkl_handle: 55 | dict_object = pickle.load(pkl_handle) 56 | 57 | return dict_object 58 | 59 | -------------------------------------------------------------------------------- /conf/complex-biokg-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 20 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /conf/complex-biokg-full-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 2 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /conf/complex-hetionet-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-train.tsv' 2 | valid_triples = 
'/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 200 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 128 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = true 22 | notes = 'attempt to reproduce hetionet reported results' -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-lr.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_lr] 50 | feature = "noise" 51 | model = "LR" 52 | 53 | [models.structural_lr] 54 | feature = "structural" 55 | model = "LR" 56 | 57 | [models.transe_lr] 58 | feature = "transe" 59 | model = "LR" 60 | 61 | [models.complex_lr] 62 | feature = "complex" 63 | model = "LR" 64 | 65 | [models.rotate_lr] 66 | feature = "rotate" 67 | model = "LR" 68 | 69 | [models.bioblpd_lr] 70 | feature = "bioblpd" 71 | model = "LR" 72 | 73 | [models.bioblpm_lr] 74 | feature = "bioblpm" 75 | model = "LR" 76 | 77 | [models.bioblpp_lr] 78 | feature = "bioblpp" 79 | model = "LR" 80 | 81 | 82 | [train] 83 | n_iter = 10 84 | splits_file = "cv-splits.pt" 85 | refit_params = ["AUCPR", "AUCROC"] 86 | outdir = "models" 87 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-mlp-1.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 
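# The [features] block below lists every entity representation benchmarked for the
# DPI task; each [features.encoder_args.*] table points at the directory holding
# that encoder's trained artifacts.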
9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_mlp] 50 | feature = "noise" 51 | model = "MLP" 52 | 53 | [models.structural_mlp] 54 | feature = "structural" 55 | model = "MLP" 56 | 57 | [models.transe_mlp] 58 | feature = "transe" 59 | model = "MLP" 60 | 61 | [models.complex_mlp] 62 | feature = "complex" 63 | model = "MLP" 64 | 65 | 66 | [train] 67 | n_iter = 10 68 | splits_file = "cv-splits.pt" 69 | refit_params = ["AUCPR", "AUCROC"] 70 | outdir = "models" 71 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-mlp-2.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | 50 | [models.rotate_mlp] 51 | feature = "rotate" 52 | model = "MLP" 53 | 54 | [models.bioblpd_mlp] 55 | feature = "bioblpd" 56 | model = "MLP" 57 | 58 | [models.bioblpm_mlp] 59 | feature = "bioblpm" 60 | model = "MLP" 61 | 62 | [models.bioblpp_mlp] 63 | feature = "bioblpp" 64 | 
model = "MLP" 65 | 66 | 67 | [train] 68 | n_iter = 10 69 | splits_file = "cv-splits.pt" 70 | refit_params = ["AUCPR", "AUCROC"] 71 | outdir = "models" 72 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-rf.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_rf] 50 | feature = "noise" 51 | model = "RF" 52 | 53 | [models.structural_rf] 54 | feature = "structural" 55 | model = "RF" 56 | 57 | [models.transe_rf] 58 | feature = "transe" 59 | model = "RF" 60 | 61 | [models.complex_rf] 62 | feature = "complex" 63 | model = "RF" 64 | 65 | [models.rotate_rf] 66 | feature = "rotate" 67 | model = "RF" 68 | 69 | [models.bioblpd_rf] 70 | feature = "bioblpd" 71 | model = "RF" 72 | 73 | [models.bioblpm_rf] 74 | feature = "bioblpm" 75 | model = "RF" 76 | 77 | [models.bioblpp_rf] 78 | feature = "bioblpp" 79 | model = "RF" 80 | 81 | 82 | [train] 83 | n_iter = 10 84 | splits_file = "cv-splits.pt" 85 | refit_params = ["AUCPR", "AUCROC"] 86 | outdir = "models" 87 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-r1-20230424-mlp.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 1 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = 
"data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_mlp] 50 | feature = "noise" 51 | model = "MLP" 52 | 53 | [models.structural_mlp] 54 | feature = "structural" 55 | model = "MLP" 56 | 57 | [models.transe_mlp] 58 | feature = "transe" 59 | model = "MLP" 60 | 61 | [models.complex_mlp] 62 | feature = "complex" 63 | model = "MLP" 64 | 65 | [models.rotate_mlp] 66 | feature = "rotate" 67 | model = "MLP" 68 | 69 | [models.bioblpd_mlp] 70 | feature = "bioblpd" 71 | model = "MLP" 72 | 73 | [models.bioblpm_mlp] 74 | feature = "bioblpm" 75 | model = "MLP" 76 | 77 | [models.bioblpp_mlp] 78 | feature = "bioblpp" 79 | model = "MLP" 80 | 81 | 82 | [train] 83 | n_iter = 10 84 | splits_file = "cv-splits.pt" 85 | refit_params = ["AUCPR", "AUCROC"] 86 | outdir = "models" 87 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-r1-20230424-rflr.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 1 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_lr] 50 | feature = "noise" 51 | model = "LR" 52 | 53 | [models.structural_lr] 54 | feature = "structural" 55 | model = "LR" 56 | 57 | [models.transe_lr] 58 | feature = "transe" 59 | model = "LR" 60 | 61 | [models.complex_lr] 62 | feature = "complex" 63 | model = "LR" 64 | 65 | [models.rotate_lr] 66 | feature = "rotate" 67 | model = "LR" 68 | 69 | [models.bioblpd_lr] 70 | feature = "bioblpd" 71 | model = "LR" 72 | 73 | [models.bioblpm_lr] 74 | feature = "bioblpm" 75 | 
model = "LR" 76 | 77 | [models.bioblpp_lr] 78 | feature = "bioblpp" 79 | model = "LR" 80 | 81 | 82 | [models.noise_rf] 83 | feature = "noise" 84 | model = "RF" 85 | 86 | [models.structural_rf] 87 | feature = "structural" 88 | model = "RF" 89 | 90 | [models.transe_rf] 91 | feature = "transe" 92 | model = "RF" 93 | 94 | [models.complex_rf] 95 | feature = "complex" 96 | model = "RF" 97 | 98 | [models.rotate_rf] 99 | feature = "rotate" 100 | model = "RF" 101 | 102 | [models.bioblpd_rf] 103 | feature = "bioblpd" 104 | model = "RF" 105 | 106 | [models.bioblpm_rf] 107 | feature = "bioblpm" 108 | model = "RF" 109 | 110 | [models.bioblpp_rf] 111 | feature = "bioblpp" 112 | model = "RF" 113 | 114 | 115 | [train] 116 | n_iter = 10 117 | splits_file = "cv-splits.pt" 118 | refit_params = ["AUCPR", "AUCROC"] 119 | outdir = "models" 120 | -------------------------------------------------------------------------------- /data/conf/complex-biokg-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 20 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /data/conf/complex-biokg-full-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 2 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /data/conf/complex-hetionet-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 
'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 200 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 128 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = true 22 | notes = 'attempt to reproduce hetionet reported results' -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: bioblp 2 | channels: 3 | - huggingface 4 | - pytorch 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=5.1=1_gnu 10 | - anyio=3.5.0=py39h06a4308_0 11 | - appdirs=1.4.4=pyh9f0ad1d_0 12 | - argon2-cffi=21.3.0=pyhd3eb1b0_0 13 | - argon2-cffi-bindings=21.2.0=py39h7f8727e_0 14 | - asttokens=2.0.5=pyhd3eb1b0_0 15 | - babel=2.9.1=pyhd3eb1b0_0 16 | - backcall=0.2.0=pyhd3eb1b0_0 17 | - beautifulsoup4=4.11.1=py39h06a4308_0 18 | - blas=1.0=mkl 19 | - bleach=4.1.0=pyhd3eb1b0_0 20 | - bottleneck=1.3.5=py39h7deecbd_0 21 | - brotli=1.0.9=h166bdaf_7 22 | - brotli-bin=1.0.9=h166bdaf_7 23 | - brotlipy=0.7.0=py39h27cfd23_1003 24 | - bzip2=1.0.8=h7b6447c_0 25 | - ca-certificates=2022.12.7=ha878542_0 26 | - certifi=2022.12.7=pyhd8ed1ab_0 27 | - cffi=1.15.1=py39h74dc2b5_0 28 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 29 | - click=8.0.4=py39h06a4308_0 30 | - contourpy=1.0.5=py39hdb19cb5_0 31 | - cryptography=37.0.1=py39h9ce1e76_0 32 | - cudatoolkit=11.3.1=h2bc3f7f_2 33 | - cycler=0.11.0=pyhd8ed1ab_0 34 | - dataclasses=0.8=pyh6d0b6a4_7 35 | - dbus=1.13.18=hb2f20db_0 36 | - debugpy=1.5.1=py39h295c915_0 37 | - decorator=5.1.1=pyhd3eb1b0_0 38 | - defusedxml=0.7.1=pyhd3eb1b0_0 39 | - entrypoints=0.4=py39h06a4308_0 40 | - executing=0.8.3=pyhd3eb1b0_0 41 | - expat=2.4.9=h6a678d5_0 42 | - ffmpeg=4.3=hf484d3e_0 43 | - filelock=3.6.0=pyhd3eb1b0_0 44 | - fontconfig=2.13.1=h6c09931_0 45 | - fonttools=4.25.0=pyhd3eb1b0_0 46 | - freetype=2.11.0=h70c0345_0 47 | - giflib=5.2.1=h7b6447c_0 48 | - glib=2.69.1=h4ff587b_1 49 | - gmp=6.2.1=h295c915_3 50 | - gnutls=3.6.15=he1e5248_0 51 | - gst-plugins-base=1.14.0=h8213a91_2 52 | - gstreamer=1.14.0=h28cd5cc_2 53 | - huggingface_hub=0.10.1=py_0 54 | - icu=58.2=he6710b0_3 55 | - idna=3.4=py39h06a4308_0 56 | - importlib-metadata=4.11.3=py39h06a4308_0 57 | - importlib_metadata=4.11.3=hd3eb1b0_0 58 | - intel-openmp=2021.4.0=h06a4308_3561 59 | - ipykernel=6.15.2=py39h06a4308_0 60 | - ipython=8.4.0=py39h06a4308_0 61 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 62 | - ipywidgets=7.6.5=pyhd3eb1b0_1 63 | - jedi=0.18.1=py39h06a4308_1 64 | - jinja2=3.0.3=pyhd3eb1b0_0 65 | - joblib=1.1.0=pyhd3eb1b0_0 66 | - jpeg=9e=h7f8727e_0 67 | - json5=0.9.6=pyhd3eb1b0_0 68 | - jsonschema=4.16.0=py39h06a4308_0 69 | - jupyter=1.0.0=py39h06a4308_8 70 | - jupyter_client=7.3.5=py39h06a4308_0 71 | - jupyter_console=6.4.3=pyhd3eb1b0_0 72 | - jupyter_core=4.11.1=py39h06a4308_0 73 | - jupyter_server=1.18.1=py39h06a4308_0 74 | - jupyterlab=3.4.4=py39h06a4308_0 75 | - jupyterlab_pygments=0.1.2=py_0 76 | - jupyterlab_server=2.15.2=py39h06a4308_0 77 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 78 | - kiwisolver=1.4.2=py39h295c915_0 79 | - krb5=1.19.2=hac12032_0 80 | - lame=3.100=h7b6447c_0 81 | - lcms2=2.12=h3be6417_0 82 | - ld_impl_linux-64=2.38=h1181459_1 83 | - lerc=3.0=h295c915_0 84 | - libbrotlicommon=1.0.9=h166bdaf_7 85 | - 
libbrotlidec=1.0.9=h166bdaf_7 86 | - libbrotlienc=1.0.9=h166bdaf_7 87 | - libclang=10.0.1=default_hb85057a_2 88 | - libdeflate=1.8=h7f8727e_5 89 | - libedit=3.1.20210910=h7f8727e_0 90 | - libevent=2.1.12=h8f2d780_0 91 | - libffi=3.3=he6710b0_2 92 | - libgcc-ng=11.2.0=h1234567_1 93 | - libgfortran-ng=12.2.0=h69a702a_19 94 | - libgfortran5=12.2.0=h337968e_19 95 | - libgomp=11.2.0=h1234567_1 96 | - libiconv=1.16=h7f8727e_2 97 | - libidn2=2.3.2=h7f8727e_0 98 | - libllvm10=10.0.1=hbcb73fb_5 99 | - libpng=1.6.37=hbc83047_0 100 | - libpq=12.9=h16c4e8d_3 101 | - libprotobuf=3.20.1=h4ff587b_0 102 | - libsodium=1.0.18=h7b6447c_0 103 | - libstdcxx-ng=11.2.0=h1234567_1 104 | - libtasn1=4.16.0=h27cfd23_0 105 | - libtiff=4.4.0=hecacb30_0 106 | - libunistring=0.9.10=h27cfd23_0 107 | - libuuid=1.0.3=h7f8727e_2 108 | - libwebp=1.2.4=h11a3e52_0 109 | - libwebp-base=1.2.4=h5eee18b_0 110 | - libxcb=1.15=h7f8727e_0 111 | - libxkbcommon=1.0.1=hfa300c1_0 112 | - libxml2=2.9.14=h74e7548_0 113 | - libxslt=1.1.35=h4e12654_0 114 | - lz4-c=1.9.3=h295c915_1 115 | - markupsafe=2.1.1=py39h7f8727e_0 116 | - matplotlib=3.6.2=py39hf3d152e_0 117 | - matplotlib-base=3.6.2=py39h945d387_0 118 | - matplotlib-inline=0.1.6=py39h06a4308_0 119 | - mistune=0.8.4=py39h27cfd23_1000 120 | - mkl=2021.4.0=h06a4308_640 121 | - mkl-service=2.4.0=py39h7f8727e_0 122 | - mkl_fft=1.3.1=py39hd3c417c_0 123 | - mkl_random=1.2.2=py39h51133e4_0 124 | - munkres=1.1.4=pyh9f0ad1d_0 125 | - nbclassic=0.3.5=pyhd3eb1b0_0 126 | - nbclient=0.5.13=py39h06a4308_0 127 | - nbconvert=6.4.4=py39h06a4308_0 128 | - nbformat=5.5.0=py39h06a4308_0 129 | - ncurses=6.3=h5eee18b_3 130 | - nest-asyncio=1.5.5=py39h06a4308_0 131 | - nettle=3.7.3=hbbd107a_1 132 | - notebook=6.4.12=py39h06a4308_0 133 | - nspr=4.33=h295c915_0 134 | - nss=3.74=h0370c37_0 135 | - numexpr=2.8.4=py39he184ba9_0 136 | - numpy=1.23.3=py39h14f4228_0 137 | - numpy-base=1.23.3=py39h31eccc5_0 138 | - openh264=2.1.1=h4ff587b_0 139 | - openssl=1.1.1t=h7f8727e_0 140 | - packaging=21.3=pyhd3eb1b0_0 141 | - pandocfilters=1.5.0=pyhd3eb1b0_0 142 | - parso=0.8.3=pyhd3eb1b0_0 143 | - patsy=0.5.3=pyhd8ed1ab_0 144 | - pcre=8.45=h295c915_0 145 | - pexpect=4.8.0=pyhd3eb1b0_3 146 | - pickleshare=0.7.5=pyhd3eb1b0_1003 147 | - pillow=9.2.0=py39hace64e9_1 148 | - pip=22.2.2=py39h06a4308_0 149 | - ply=3.11=py39h06a4308_0 150 | - pooch=1.6.0=pyhd8ed1ab_0 151 | - prometheus_client=0.14.1=py39h06a4308_0 152 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0 153 | - prompt_toolkit=3.0.20=hd3eb1b0_0 154 | - protobuf=3.20.1=py39h295c915_0 155 | - ptyprocess=0.7.0=pyhd3eb1b0_2 156 | - pure_eval=0.2.2=pyhd3eb1b0_0 157 | - pycparser=2.21=pyhd3eb1b0_0 158 | - pygments=2.11.2=pyhd3eb1b0_0 159 | - pyopenssl=22.0.0=pyhd3eb1b0_0 160 | - pyparsing=3.0.9=py39h06a4308_0 161 | - pyqt=5.15.7=py39h6a678d5_1 162 | - pyqt5-sip=12.11.0=py39h6a678d5_1 163 | - pyrsistent=0.18.0=py39heee7806_0 164 | - pysocks=1.7.1=py39h06a4308_0 165 | - python=3.9.13=haa1d7c7_2 166 | - python-dateutil=2.8.2=pyhd3eb1b0_0 167 | - python-fastjsonschema=2.16.2=py39h06a4308_0 168 | - python_abi=3.9=2_cp39 169 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0 170 | - pytorch-mutex=1.0=cuda 171 | - pyyaml=6.0=py39h7f8727e_1 172 | - pyzmq=23.2.0=py39h6a678d5_0 173 | - qt-main=5.15.2=h327a75a_7 174 | - qt-webengine=5.15.9=hd2b0992_4 175 | - qtconsole=5.3.2=py39h06a4308_0 176 | - qtpy=2.2.0=py39h06a4308_0 177 | - qtwebkit=5.212=h4eab89a_4 178 | - readline=8.1.2=h7f8727e_1 179 | - regex=2022.7.9=py39h5eee18b_0 180 | - requests=2.28.1=py39h06a4308_0 181 | - sacremoses=master=py_0 182 | - 
seaborn=0.12.2=hd8ed1ab_0 183 | - seaborn-base=0.12.2=pyhd8ed1ab_0 184 | - send2trash=1.8.0=pyhd3eb1b0_1 185 | - setuptools=63.4.1=py39h06a4308_0 186 | - sip=6.6.2=py39h6a678d5_0 187 | - six=1.16.0=pyhd3eb1b0_1 188 | - sniffio=1.2.0=py39h06a4308_1 189 | - soupsieve=2.3.1=pyhd3eb1b0_0 190 | - sqlite=3.39.3=h5082296_0 191 | - stack_data=0.2.0=pyhd3eb1b0_0 192 | - statsmodels=0.13.5=py39h7deecbd_1 193 | - terminado=0.13.1=py39h06a4308_0 194 | - testpath=0.6.0=py39h06a4308_0 195 | - tk=8.6.12=h1ccaba5_0 196 | - toml=0.10.2=pyhd3eb1b0_0 197 | - torchaudio=0.12.1=py39_cu113 198 | - torchvision=0.13.1=py39_cu113 199 | - tornado=6.2=py39h5eee18b_0 200 | - tqdm=4.64.1=py39h06a4308_0 201 | - traitlets=5.1.1=pyhd3eb1b0_0 202 | - typing-extensions=4.3.0=py39h06a4308_0 203 | - typing_extensions=4.3.0=py39h06a4308_0 204 | - tzdata=2022e=h04d1e81_0 205 | - urllib3=1.26.11=py39h06a4308_0 206 | - wcwidth=0.2.5=pyhd3eb1b0_0 207 | - webencodings=0.5.1=py39h06a4308_1 208 | - websocket-client=0.58.0=py39h06a4308_4 209 | - wheel=0.37.1=pyhd3eb1b0_0 210 | - widgetsnbextension=3.5.2=py39h06a4308_0 211 | - xz=5.2.6=h5eee18b_0 212 | - yaml=0.2.5=h7b6447c_0 213 | - zeromq=4.3.4=h2531618_0 214 | - zipp=3.8.0=py39h06a4308_0 215 | - zlib=1.2.12=h5eee18b_3 216 | - zstd=1.5.2=ha4553b6_0 217 | - pip: 218 | - alembic==1.8.1 219 | - attrs==22.1.0 220 | - autopage==0.5.1 221 | - class-resolver==0.3.10 222 | - click-default-group==1.2.2 223 | - cliff==4.0.0 224 | - cmaes==0.8.2 225 | - cmd2==2.4.2 226 | - colorlog==6.7.0 227 | - dataclasses-json==0.5.7 228 | - dill==0.3.6 229 | - docdata==0.0.3 230 | - docker-pycreds==0.4.0 231 | - gitdb==4.0.9 232 | - gitpython==3.1.29 233 | - greenlet==1.1.3.post0 234 | - mako==1.2.3 235 | - marshmallow==3.18.0 236 | - marshmallow-enum==1.5.1 237 | - more-click==0.1.1 238 | - more-itertools==9.0.0 239 | - mypy-extensions==0.4.3 240 | - networkx==3.0 241 | - optuna==3.0.3 242 | - pandas==1.5.1 243 | - pathtools==0.1.2 244 | - pbr==5.10.0 245 | - prettytable==3.4.1 246 | - promise==2.3 247 | - psutil==5.9.3 248 | - pykeen==1.9.0 249 | - pyperclip==1.8.2 250 | - pystow==0.4.6 251 | - pytz==2022.5 252 | - rexmex==0.1.2 253 | - scikit-learn==1.1.2 254 | - scipy==1.8.1 255 | - sentry-sdk==1.9.10 256 | - setproctitle==1.3.2 257 | - shortuuid==1.0.9 258 | - sklearn==0.0 259 | - smmap==5.0.0 260 | - sqlalchemy==1.4.42 261 | - stevedore==4.0.1 262 | - tabulate==0.9.0 263 | - threadpoolctl==3.1.0 264 | - tokenizers==0.10.3 265 | - torch-max-mem==0.0.4 266 | - torch-ppr==0.0.8 267 | - transformers==4.11.3 268 | - typed-argument-parser==1.7.2 269 | - typing-inspect==0.8.0 270 | - wandb==0.13.4 271 | -------------------------------------------------------------------------------- /fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/fig.png -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-complex-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | 
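# The bioblp.train call below fits BioBLP-D with a ComplEx scoring function on the BioKG benchmark splits,
# encodes disease MeSH descriptions supplied via --text_data, and is initialized from the pretrained
# checkpoint models/1e9b4f4o passed through --from_checkpoint, as the --notes string records.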
python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=complex \ 27 | --dimension=256 \ 28 | --loss_fn=bcewithlogits \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=True \ 37 | --from_checkpoint=models/1e9b4f4o \ 38 | --log_wandb=True \ 39 | --notes="ComplEx BioBLP-D initialized with 1e9b4f4o" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d-complex 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout develop 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=complex \ 27 | --dimension=256 \ 28 | --loss_fn=bcewithlogits \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=True \ 37 | --log_wandb=True \ 38 | --notes="ComplEx BioBLP-D" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-rotate-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder-checkpoint 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 
| --in_batch_negatives=True \ 37 | --from_checkpoint=models/36viovqn \ 38 | --log_wandb=True \ 39 | --notes="RotatE BioBLP-D initialized with 36viovqn, higher patience" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-rotate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder-dummy 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=True \ 37 | --log_wandb=True \ 38 | --notes="BioBLP-D" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-transe-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d-transe 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=transe \ 27 | --dimension=512 \ 28 | --loss_fn=marginranking \ 29 | --loss_margin=8.155451890616455 \ 30 | --optimizer=adam \ 31 | --learning_rate=2e-5 \ 32 | --warmup_fraction=0.05 \ 33 | --num_epochs=100 \ 34 | --batch_size=1024 \ 35 | --eval_batch_size=64 \ 36 | --num_negatives=512 \ 37 | --in_batch_negatives=True \ 38 | --from_checkpoint=models/394htt2x \ 39 | --log_wandb=True \ 40 | --notes="TransE BioBLP-D initialized with 394htt2x" 41 | 42 | # Keep files generated during job 43 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 44 | mkdir -p $RESULTS_FOLDER 45 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 46 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-transe.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d-transe 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=transe \ 27 | --dimension=512 \ 28 | --loss_fn=marginranking \ 29 | --loss_margin=8.155451890616455 \ 30 | --optimizer=adam \ 31 | --learning_rate=2e-5 \ 32 | --warmup_fraction=0.05 \ 33 | --num_epochs=100 \ 34 | --batch_size=1024 \ 35 | --eval_batch_size=64 \ 36 | --num_negatives=512 \ 37 | --in_batch_negatives=True \ 38 | --log_wandb=True \ 39 | --notes="TransE BioBLP-D, margin from sage-shadow-1047" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-complex-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout develop 20 | wandb agent --count 1 discoverylab/bioblp/70t4kuu5 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: complex 11 | loss_fn: 12 | value: bcewithlogits 13 | optimizer: 14 | value: adam 15 | learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-3 18 | max: 1.0 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | value: 1024 25 | eval_batch_size: 26 | value: 64 27 | in_batch_negatives: 28 | value: true 29 | command: 30 | - ${env} 31 | - python 32 | - "-m" 33 | - ${program} 34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 37 | - '--search_eval_batch_size=True' 38 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 39 | - '--log_wandb=True' 40 | - '--notes="BioBLP-P ComplEx sweep"' 41 | - 
${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-adagrad-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-bioblp-m-rotate-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/oouxbq6p 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-adagrad-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | optimizer: 14 | value: adagrad 15 | learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-3 18 | max: 1e-1 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | value: 1024 25 | eval_batch_size: 26 | value: 64 27 | in_batch_negatives: 28 | value: true 29 | command: 30 | - ${env} 31 | - python 32 | - "-m" 33 | - ${program} 34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 37 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 38 | - '--log_wandb=True' 39 | - '--notes=BioBLP-M RotatE sweep' 40 | - ${args} 41 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-bioblp-m-rotate-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/liqycjns 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | optimizer: 14 | value: adam 15 | 
learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-4 18 | max: 1e-1 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | value: 1024 25 | eval_batch_size: 26 | value: 64 27 | in_batch_negatives: 28 | value: true 29 | command: 30 | - ${env} 31 | - python 32 | - "-m" 33 | - ${program} 34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 37 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 38 | - '--log_wandb=True' 39 | - '--notes=BioBLP-M RotatE sweep' 40 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-transe-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout develop 20 | wandb agent --count 1 discoverylab/bioblp/pgx00fqa 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | optimizer: 16 | value: adam 17 | loss_margin: 18 | distribution: uniform 19 | min: 0.5 20 | max: 10.0 21 | learning_rate: 22 | distribution: log_uniform_values 23 | min: 1e-4 24 | max: 1e-1 25 | regularizer: 26 | distribution: log_uniform_values 27 | min: 1e-6 28 | max: 1e-3 29 | batch_size: 30 | value: 1024 31 | eval_batch_size: 32 | value: 64 33 | in_batch_negatives: 34 | value: true 35 | command: 36 | - ${env} 37 | - python 38 | - "-m" 39 | - ${program} 40 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 41 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 42 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 43 | - '--search_eval_batch_size=True' 44 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 45 | - '--log_wandb=True' 46 | - '--notes=BioBLP-M TransE sweep' 47 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-complex-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p-rotate 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git 
checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/6d2bwmy4 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | loss_fn: 10 | value: bcewithlogits 11 | freeze_pretrained_embeddings: 12 | value: true 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | command: 27 | - ${env} 28 | - python 29 | - "-m" 30 | - ${program} 31 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 32 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 33 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 34 | - '--search_eval_batch_size=True' 35 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt' 36 | - '--log_wandb=True' 37 | - '--notes="BioBLP-P ComplEx sweep"' 38 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-complex-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \ 26 | --model=complex \ 27 | --dimension=256 \ 28 | --loss_fn=bcewithlogits \ 29 | --regularizer=7.54616261352196e-05 \ 30 | --freeze_pretrained_embeddings=True \ 31 | --learning_rate=0.344274380857535 \ 32 | --num_epochs=100 \ 33 | --batch_size=512 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --from_checkpoint=models/1e9b4f4o \ 37 | --log_wandb=True \ 38 | --notes="ComplEx BioBLP-P initialized with 1e9b4f4o" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-rotate-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r 
$HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --regularizer=0.0003536270470551425 \ 30 | --freeze_pretrained_embeddings=True \ 31 | --learning_rate=0.04972680094809032 \ 32 | --num_epochs=100 \ 33 | --batch_size=512 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --from_checkpoint=models/36viovqn \ 37 | --log_wandb=True \ 38 | --notes="RotatE BioBLP-P initialized with 36viovqn" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p-rotate 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/u02tzec7 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | freeze_pretrained_embeddings: 14 | value: true 15 | learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-3 18 | max: 1.0 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | values: 25 | - 128 26 | - 256 27 | - 512 28 | command: 29 | - ${env} 30 | - python 31 | - "-m" 32 | - ${program} 33 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 34 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 35 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 36 | - '--search_eval_batch_size=True' 37 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt' 38 | - '--log_wandb=True' 39 | - '--notes=BioBLP-P RotatE sweep' 40 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-transe-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | 
#SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \ 26 | --model=transe \ 27 | --dimension=512 \ 28 | --loss_fn=marginranking \ 29 | --loss_margin=7.234906889602847 \ 30 | --regularizer=0.0006031667561379036 \ 31 | --freeze_pretrained_embeddings=True \ 32 | --learning_rate=0.03569964236328523 \ 33 | --num_epochs=100 \ 34 | --batch_size=256 \ 35 | --eval_batch_size=64 \ 36 | --num_negatives=512 \ 37 | --from_checkpoint=models/394htt2x \ 38 | --log_wandb=True \ 39 | --notes="TransE BioBLP-P initialized with 394htt2x" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p-transe 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/rw6nzzyx 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | freeze_pretrained_embeddings: 16 | value: true 17 | loss_margin: 18 | distribution: uniform 19 | min: 0.5 20 | max: 10.0 21 | learning_rate: 22 | distribution: log_uniform_values 23 | min: 1e-3 24 | max: 1.0 25 | regularizer: 26 | distribution: log_uniform_values 27 | min: 1e-6 28 | max: 1e-3 29 | batch_size: 30 | values: 31 | - 128 32 | - 256 33 | - 512 34 | command: 35 | - ${env} 36 | - python 37 | - "-m" 38 | - ${program} 39 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 40 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 41 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 42 | - '--search_eval_batch_size=True' 43 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt' 44 | - '--log_wandb=True' 45 | - '--notes=BioBLP-P TransE sweep' 46 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-complex-bce-sweep.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/21oekub7 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | loss_fn: 10 | value: bcewithlogits 11 | learning_rate: 12 | distribution: log_uniform_values 13 | min: 1e-3 14 | max: 1.0 15 | regularizer: 16 | distribution: log_uniform_values 17 | min: 1e-6 18 | max: 1e-3 19 | batch_size: 20 | values: 21 | - 128 22 | - 256 23 | - 512 24 | - 1024 25 | command: 26 | - ${env} 27 | - python 28 | - "-m" 29 | - ${program} 30 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 31 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 32 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 33 | - '--log_wandb=True' 34 | - '--notes="ComplEx sweep"' 35 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-complex-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/9m2x48u3 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-complex-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | learning_rate: 10 | distribution: log_uniform_values 11 | min: 1e-3 12 | max: 1.0 13 | regularizer: 14 | distribution: log_uniform_values 15 | min: 1e-6 16 | max: 1e-3 17 | batch_size: 18 | values: 19 | - 128 20 | - 256 21 | - 512 22 | - 1024 23 | command: 24 | - ${env} 25 | - python 26 | - "-m" 27 | - ${program} 28 | - 
'--train_triples=data/biokgb/graph/biokg.links-train.csv' 29 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 30 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 31 | - '--log_wandb=True' 32 | - '--notes="ComplEx sweep"' 33 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-rotate-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/7q2851co 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-rotate-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: bcewithlogits 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 33 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 34 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep, bcewithlogits' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/u75h00fl 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: 
validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 33 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 34 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-transe-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/n4zgfrhb 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | loss_margin: 16 | distribution: uniform 17 | min: 0.5 18 | max: 10.0 19 | learning_rate: 20 | distribution: log_uniform_values 21 | min: 1e-3 22 | max: 1.0 23 | regularizer: 24 | distribution: log_uniform_values 25 | min: 1e-6 26 | max: 1e-3 27 | batch_size: 28 | values: 29 | - 128 30 | - 256 31 | - 512 32 | - 1024 33 | command: 34 | - ${env} 35 | - python 36 | - "-m" 37 | - ${program} 38 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 39 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 40 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 41 | - '--log_wandb=True' 42 | - '--notes=TransE sweep' 43 | - ${args} -------------------------------------------------------------------------------- /jobs/complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=complex 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=10:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | 
--valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --model=complex \ 26 | --dimension=256 \ 27 | --loss_fn=bcewithlogits \ 28 | --learning_rate=0.3595182058943781 \ 29 | --regularizer=3.7579365087382533e-05 \ 30 | --num_epochs=100 \ 31 | --batch_size=256 \ 32 | --eval_batch_size=64 \ 33 | --num_negatives=512 \ 34 | --log_wandb=True \ 35 | --notes="ComplEx best hparams, rep" 36 | 37 | # Keep files generated during job 38 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 39 | mkdir -p $RESULTS_FOLDER 40 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 41 | -------------------------------------------------------------------------------- /jobs/hetionet-complex-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/ydoydkmt 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | loss_fn: 10 | value: bcewithlogits 11 | learning_rate: 12 | distribution: log_uniform_values 13 | min: 1e-3 14 | max: 1.0 15 | regularizer: 16 | distribution: log_uniform_values 17 | min: 1e-6 18 | max: 1e-3 19 | batch_size: 20 | values: 21 | - 128 22 | - 256 23 | - 512 24 | - 1024 25 | command: 26 | - ${env} 27 | - python 28 | - "-m" 29 | - ${program} 30 | - '--train_triples=data/hetionet/hetionet.train.csv' 31 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 32 | - '--test_triples=data/hetionet/hetionet.test.csv' 33 | - '--log_wandb=True' 34 | - '--notes="ComplEx sweep"' 35 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-complex-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/uvgnrmka 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* 
$RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-complex-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | learning_rate: 10 | distribution: log_uniform_values 11 | min: 1e-3 12 | max: 1.0 13 | regularizer: 14 | distribution: log_uniform_values 15 | min: 1e-6 16 | max: 1e-3 17 | batch_size: 18 | values: 19 | - 128 20 | - 256 21 | - 512 22 | - 1024 23 | command: 24 | - ${env} 25 | - python 26 | - "-m" 27 | - ${program} 28 | - '--train_triples=data/hetionet/hetionet.train.csv' 29 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 30 | - '--test_triples=data/hetionet/hetionet.test.csv' 31 | - '--log_wandb=True' 32 | - '--notes="ComplEx sweep"' 33 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-rotate-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/ge1smc54 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-rotate-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: bcewithlogits 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/hetionet/hetionet.train.csv' 33 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 34 | - '--test_triples=data/hetionet/hetionet.test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep, bcewithlogits' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | 
OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/2iderrf0 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/hetionet/hetionet.train.csv' 33 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 34 | - '--test_triples=data/hetionet/hetionet.test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-transe-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/jfb6wo19 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | loss_margin: 16 | distribution: uniform 17 | min: 0.5 18 | max: 10.0 19 | learning_rate: 20 | distribution: log_uniform_values 21 | min: 1e-3 22 | max: 1.0 23 | regularizer: 24 | distribution: log_uniform_values 25 | min: 1e-6 26 | max: 1e-3 27 | batch_size: 28 | values: 29 | - 128 30 | - 256 31 | - 512 32 | - 1024 33 | command: 34 | - ${env} 35 | - python 36 | - "-m" 37 | - ${program} 38 | - '--train_triples=data/hetionet/hetionet.train.csv' 39 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 40 | - '--test_triples=data/hetionet/hetionet.test.csv' 41 | - '--log_wandb=True' 42 | - '--notes=TransE sweep' 43 | - ${args} 
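The *-sweep.yml files above are Weights & Biases sweep configurations (random or Bayesian search over learning rate, regularizer, batch size and, for TransE, loss margin), and the matching *-sweep.sh files are SLURM wrappers that each execute one agent step. A minimal sketch of how the two fit together, assuming the standard wandb CLI workflow; the sweep IDs hard-coded in the shell scripts (e.g. discoverylab/bioblp/jfb6wo19) would have been obtained from the first step:

# Register the sweep once; wandb prints an identifier of the form entity/project/<id>.
wandb sweep jobs/hetionet-transe-sweep.yml

# Submit as many SLURM jobs as trials are desired; each job runs
# `wandb agent --count 1 <sweep-id>` (one sampled configuration) and then
# copies the models/ output folder back to $HOME.
sbatch jobs/hetionet-transe-sweep.sh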
-------------------------------------------------------------------------------- /jobs/rotate-dummy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-rotate-dummy 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=08:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder-dummy 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/dummy_biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adagrad \ 30 | --regularizer=0.0002757262741946316 \ 31 | --learning_rate=0.07300713133641318 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=False \ 37 | --log_wandb=True \ 38 | --notes="BioBLP-D RotatE, no descriptions, fixed eval batch size" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/rotate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-complex 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=01:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=10 \ 33 | --batch_size=1024 \ 34 | --search_eval_batch_size=True \ 35 | --eval_every=1 \ 36 | --num_negatives=512 \ 37 | --in_batch_negatives=True \ 38 | --log_wandb=True \ 39 | --notes="BioBLP-D 10 epoch test" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /loaders/placeholder.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/loaders/placeholder.txt -------------------------------------------------------------------------------- /logs/placeholder.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/logs/placeholder.txt -------------------------------------------------------------------------------- /notebooks/01_01_biokg-data-prep-for-kge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "dd58a8cf", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "%load_ext autoreload\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 11, 17 | "id": "b05d473c", 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd \n", 24 | "from pathlib import Path\n", 25 | "import toml\n", 26 | "\n", 27 | "from bioblp.data import COL_SOURCE, COL_TARGET,COL_EDGE\n", 28 | "from bioblp.data import create_random_splits" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "f36dd753", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = Path(\"../data\")\n", 39 | "SHARED_DATA_DIR = Path(\"/home/jovyan/workbench-shared-folder/bioblp/data\")\n", 40 | "config_path = DATA_DIR.joinpath(\"conf/complex-biokg-20220826.toml\")\n", 41 | "biokg_mini_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links_sample.tsv\")\n", 42 | "biokg_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links.tsv\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "f4732983-308b-44d7-8fd9-43a3b1506819", 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "source": [ 52 | "### BIOKG Data Prep" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 7, 58 | "id": "918f0203", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
srcedgtgt
0C566487DISEASE_PATHWAY_ASSOCIATIONhsa00071
1C567839DISEASE_PATHWAY_ASSOCIATIONmap04810
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " src edg tgt\n", 106 | "0 C566487 DISEASE_PATHWAY_ASSOCIATION hsa00071\n", 107 | "1 C567839 DISEASE_PATHWAY_ASSOCIATION map04810" 108 | ] 109 | }, 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "#df = pd.read_csv(biokg_mini_path, delimiter=\"\\t\", names=[\"idx\", COL_SOURCE, COL_EDGE, COL_TARGET], header=0)\n", 117 | "df = pd.read_csv(biokg_path, delimiter=\"\\t\", names=[COL_SOURCE, COL_EDGE, COL_TARGET], header=None)\n", 118 | "df.head(2)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "37dac0a0-108d-4f4c-a1f3-95e985ca9db7", 124 | "metadata": {}, 125 | "source": [ 126 | "Create data splits" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "id": "cb5e4b6d", 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 140 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 141 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "train, test, valid = create_random_splits(df, 0.9, 0.05, 0.05)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 11, 152 | "id": "d06a6c1e", 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "saved to ../data/raw/biokg_full_splits\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "SAVE_SPLITS_TO_DISK = False\n", 165 | "dataset_name = 'biokg_random_900505'\n", 166 | "datasplits_dir = DATA_DIR.joinpath(\"raw/biokg_full_splits\")\n", 167 | "\n", 168 | "if SAVE_SPLITS_TO_DISK:\n", 169 | " save_splits(train_df=train,\n", 170 | " test_df=test, \n", 171 | " valid_df=valid,\n", 172 | " dataset_name=dataset_name,\n", 173 | " out_dir=datasplits_dir)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "ed5633c4-cf9f-477f-a468-582bbf91146d", 179 | "metadata": {}, 180 | "source": [ 181 | "### Training" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "388a8210-89f0-435f-8405-81b8c38caa12", 187 | "metadata": {}, 188 | "source": [ 189 | "```bash\n", 190 | "$ python -m bioblp.train_argparse --conf /home/jovyan/BioBLP/data/conf/complex-biokg-full-20220826.toml\n", 191 | "```" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "773a6c74-333b-49e8-b2df-022574889217", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": ".conda-bioblp-env [Python]", 206 | "language": "python", 207 | "name": "conda-env-.conda-bioblp-env-py" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.8.13" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 5 224 | } 225 | -------------------------------------------------------------------------------- /notebooks/03-00-nested-cv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": 
"f8467842-5b37-4dc9-83f0-a684ed4a5fdd", 6 | "metadata": {}, 7 | "source": [ 8 | "# Run nested CV routine" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "259edda9-e110-4e05-b1de-2965c45ef58b", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import random\n", 19 | "\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "\n", 23 | "from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET\n", 24 | "from bioblp.logging import get_logger\n", 25 | "import torch\n", 26 | "\n", 27 | "\n", 28 | "logger = get_logger(__name__)\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "134fd3c5", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = Path(\"../data/\")\n", 39 | "DATA_SHARED = Path(\"/home/jovyan/workbench-shared-folder/bioblp\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "eee761be", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from time import time\n", 50 | "from pathlib import Path\n", 51 | "from collections import defaultdict\n", 52 | "\n", 53 | "from bioblp.benchmarking.train import run_nested_cv\n", 54 | "from bioblp.benchmarking.train import get_scorers\n", 55 | "\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "326edf30", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "\"\"\"Perform train run\"\"\"\n", 66 | "\n", 67 | "# reproducibility\n", 68 | "# SEED is set as global\n", 69 | "shuffle = True\n", 70 | "refit_params = [\"AUCPR\", \"AUCROC\"]\n", 71 | "\n", 72 | "data_dir = Path(\"../data/features/kge-1baon0eg/\")\n", 73 | "out_dir = Path(\"../data/runs/\")\n", 74 | "\n", 75 | "n_proc = 1\n", 76 | "n_iter = 2\n", 77 | "inner_n_folds = 3\n", 78 | "outer_n_folds = 5\n", 79 | "\n", 80 | "exp_output = defaultdict(dict)\n", 81 | "exp_output[\"config\"] = {\n", 82 | " \"n_proc\": n_proc,\n", 83 | " \"n_iter\": n_iter,\n", 84 | " \"inner_n_folds\": inner_n_folds,\n", 85 | " \"outer_n_folds\": outer_n_folds,\n", 86 | " \"data_dir\": data_dir,\n", 87 | " \"seed\": SEED,\n", 88 | " \"shuffle\": shuffle\n", 89 | "}\n", 90 | "\n", 91 | "start = time()\n", 92 | "run_timestamp = int(start)\n", 93 | "\n", 94 | "logger.info(\"Starting model building script at {}.\".format(start))\n", 95 | "\n", 96 | "############\n", 97 | "# Load data\n", 98 | "############\n", 99 | "logger.info(\"Loading training data...\")\n", 100 | "\n", 101 | "X_train = np.load(data_dir.joinpath(\"X.npy\"))\n", 102 | "y_train = np.load(data_dir.joinpath(\"y.npy\"))\n", 103 | "\n", 104 | "logger.info(\n", 105 | " \"Resulting shapes X_train: {}, y_train: {}\".format(\n", 106 | " X_train.shape, y_train.shape)\n", 107 | ")\n", 108 | "logger.info(\"Counts in y_train: {}\".format(\n", 109 | " np.unique(y_train, return_counts=True)))\n", 110 | "\n", 111 | "############\n", 112 | "# Setup classifiers & pipelines\n", 113 | "############\n", 114 | "\n", 115 | "lr_label = \"LR\"\n", 116 | "rf_label = \"RF\"\n", 117 | "MLP_label = \"MLP\"\n", 118 | "\n", 119 | "############\n", 120 | "# Compare models\n", 121 | "############\n", 122 | "\n", 123 | "candidates = [\n", 124 | " lr_label,\n", 125 | " # rf_label,\n", 126 | " # MLP_label\n", 127 | "\n", 128 | "]\n", 129 | "\n", 130 | "scorer = get_scorers()\n", 131 | "\n", 132 | "nested_cv_scores = run_nested_cv(\n", 133 | " candidates=candidates,\n", 134 | " X=X_train,\n", 135 | " y=y_train,\n", 136 | " scoring=scorer,\n", 137 | " 
inner_n_folds=inner_n_folds,\n", 138 | " inner_n_iter=n_iter,\n", 139 | " outer_n_folds=outer_n_folds,\n", 140 | " shuffle=shuffle,\n", 141 | " n_jobs=n_proc,\n", 142 | " refit_params=refit_params,\n", 143 | " random_state=SEED,\n", 144 | " outdir=out_dir,\n", 145 | " timestamp=run_timestamp\n", 146 | ")\n", 147 | "\n", 148 | "for algo, scores in nested_cv_scores.items():\n", 149 | " logger.info(\"Scores {}: {}\".format(algo, scores))\n", 150 | "\n", 151 | "exp_output[\"results\"] = nested_cv_scores\n", 152 | "\n", 153 | "logger.info(exp_output)\n", 154 | "\n", 155 | "file_out = out_dir.joinpath(\n", 156 | " \"nested_cv_scores_{}.npy\".format(run_timestamp))\n", 157 | "logger.info(\"Saving to {}\".format(file_out))\n", 158 | "np.save(file_out, exp_output)\n", 159 | "\n", 160 | "end = time()\n", 161 | "\n", 162 | "logger.info(\"Ran script in {} seconds\".format(str(end - start)))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "703ff89a-dd11-4fb0-bdcb-87e9fa41e20a", 168 | "metadata": {}, 169 | "source": [ 170 | "_____" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "a6594c30-e73d-4214-989c-54512bef0e5b", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "df67346c-124a-49ec-9cfe-913d273f66c2", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "58d97f92-0a46-4bd0-92be-7124e6c91768", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": ".conda-bioblp-env [Python]", 201 | "language": "python", 202 | "name": "conda-env-.conda-bioblp-env-py" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.9.13" 215 | }, 216 | "vscode": { 217 | "interpreter": { 218 | "hash": "c313b0b0929f94c03130caa81adcdac46c3c408d7f1caca6c1104b192c16f937" 219 | } 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 5 224 | } 225 | -------------------------------------------------------------------------------- /notebooks/03-frequency-baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Evaluating frequency-based baselines for link prediction\n", 7 | "\n", 8 | "Some knowledege graphs come with particularly frequent instances (either relations, or entities), that a model can use to learn spurious correlations that lead to high ranking metrics, due to the calculation of micro-averages.\n", 9 | "A sanity check thus consists of running a baseline that simply uses counts, which can be compared with models that are supposed to generalize much better." 
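As a worked illustration of that effect (with made-up numbers, not taken from BioKG): if a single tail entity appeared in 60 out of 100 test queries, a counts-only baseline that always ranked it first would already reach a micro-averaged Hits@1 of at least 0.60, without learning anything that generalizes.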
10 | ], 11 | "metadata": { 12 | "collapsed": false, 13 | "pycharm": { 14 | "name": "#%% md\n" 15 | } 16 | } 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": { 22 | "collapsed": true, 23 | "pycharm": { 24 | "name": "#%%\n" 25 | } 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import os.path as osp\n", 30 | "\n", 31 | "from pykeen.models.baseline import MarginalDistributionBaseline\n", 32 | "from pykeen.triples import TriplesFactory\n", 33 | "from pykeen.evaluation import RankBasedEvaluator, evaluate\n", 34 | "import torch" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "## Data loading" 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "pycharm": { 45 | "name": "#%% md\n" 46 | } 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "outputs": [], 53 | "source": [ 54 | "graph_path = osp.join('..', 'data', 'biokgb', 'graph')\n", 55 | "train_triples = 'biokg.links-train.csv'\n", 56 | "valid_triples = 'biokg.links-valid.csv'\n", 57 | "test_triples = 'biokg.links-test.csv'\n", 58 | "\n", 59 | "train, valid, test = [TriplesFactory.from_path(osp.join(graph_path, f)) for f in (train_triples, valid_triples, test_triples)]" 60 | ], 61 | "metadata": { 62 | "collapsed": false, 63 | "pycharm": { 64 | "name": "#%%\n" 65 | } 66 | } 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "source": [ 71 | "## Instantiating a frequency-based baseline\n", 72 | "\n", 73 | "PyKEEN comes with a set of interesting baselines that, ideally, any machine learning model should outperform. Here we will use the [`MarginalDistributionBaseline`](https://pykeen.readthedocs.io/en/stable/api/pykeen.models.MarginalDistributionBaseline.html).\n", 74 | "\n", 75 | "When predicting the tail for a triple (h, r, t), the model scores each possible tail t as the probability that t co-occurs with r times the probability that t co-occurs with h:\n", 76 | "\n", 77 | "$$\n", 78 | "P(t\\vert h, r) = P(t\\vert r) P(t\\vert h)\n", 79 | "$$" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "pycharm": { 84 | "name": "#%% md\n" 85 | } 86 | } 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "outputs": [], 92 | "source": [ 93 | "model = MarginalDistributionBaseline(train)\n", 94 | "# An ugly hack to add a dummy parameter to this non-parametric baseline\n", 95 | "# so that evaluation works as for models with learnable parameters\n", 96 | "model.foo = torch.nn.Embedding(1, 2)" 97 | ], 98 | "metadata": { 99 | "collapsed": false, 100 | "pycharm": { 101 | "name": "#%%\n" 102 | } 103 | } 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "source": [ 108 | "## Evaluation\n", 109 | "\n", 110 | "We now get the ranking metrics on the test set, using triples in the training, validation, and test sets for filtering.\n", 111 | "\n", 112 | "**Warning:** the next cell can take around half an hour to run." 
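A minimal sketch of the marginal-distribution scoring rule described above, written with plain Python counters rather than PyKEEN's MarginalDistributionBaseline; the toy triples and entity names are illustrative, not taken from BioKG.

```python
# Hedged sketch: estimate P(t | r) and P(t | h) from training co-occurrence
# counts and score candidate tails by their product, as in the formula above.
from collections import Counter, defaultdict

train_triples = [  # illustrative toy data
    ("d1", "DISEASE_PATHWAY_ASSOCIATION", "p1"),
    ("d1", "DISEASE_PATHWAY_ASSOCIATION", "p2"),
    ("d2", "DISEASE_PATHWAY_ASSOCIATION", "p1"),
]

tails_per_relation = defaultdict(Counter)  # counts of t given r
tails_per_head = defaultdict(Counter)      # counts of t given h

for h, r, t in train_triples:
    tails_per_relation[r][t] += 1
    tails_per_head[h][t] += 1

def score_tail(h: str, r: str, t: str) -> float:
    """P(t | r) * P(t | h), both estimated from training counts."""
    p_t_given_r = tails_per_relation[r][t] / sum(tails_per_relation[r].values())
    p_t_given_h = tails_per_head[h][t] / sum(tails_per_head[h].values())
    return p_t_given_r * p_t_given_h

# Rank candidate tails for the query (d2, DISEASE_PATHWAY_ASSOCIATION, ?).
candidates = ["p1", "p2"]
ranked = sorted(candidates,
                key=lambda t: score_tail("d2", "DISEASE_PATHWAY_ASSOCIATION", t),
                reverse=True)
print(ranked)  # ['p1', 'p2']
```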
113 | ], 114 | "metadata": { 115 | "collapsed": false, 116 | "pycharm": { 117 | "name": "#%% md\n" 118 | } 119 | } 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": "Evaluating on cpu: 0%| | 0.00/185k [00:0010}')\n", 73 | "print('-' * 20)\n", 74 | "for name, split in splits_dict.items():\n", 75 | " print(f'{name:^10}{split.num_triples:>10,}')" 76 | ], 77 | "metadata": { 78 | "collapsed": false 79 | } 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 90 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 91 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "out_path = osp.join('..', 'data', 'hetionet')\n", 97 | "if not osp.exists(out_path):\n", 98 | " os.mkdir(out_path)\n", 99 | "\n", 100 | "for name, split in splits_dict.items():\n", 101 | " pd.DataFrame(split.triples).to_csv(osp.join(out_path, f'hetionet.{name}.csv'), sep='\\t', index=False, header=False)" 102 | ], 103 | "metadata": { 104 | "collapsed": false 105 | } 106 | } 107 | ], 108 | "metadata": { 109 | "kernelspec": { 110 | "display_name": "Python 3", 111 | "language": "python", 112 | "name": "python3" 113 | }, 114 | "language_info": { 115 | "codemirror_mode": { 116 | "name": "ipython", 117 | "version": 2 118 | }, 119 | "file_extension": ".py", 120 | "mimetype": "text/x-python", 121 | "name": "python", 122 | "nbconvert_exporter": "python", 123 | "pygments_lexer": "ipython2", 124 | "version": "2.7.6" 125 | } 126 | }, 127 | "nbformat": 4, 128 | "nbformat_minor": 0 129 | } 130 | -------------------------------------------------------------------------------- /notebooks/99-train_hetionet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "dd58a8cf", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "%load_ext autoreload\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 11, 17 | "id": "b05d473c", 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd \n", 24 | "from pathlib import Path\n", 25 | "import toml\n", 26 | "\n", 27 | "from bioblp.data import COL_SOURCE, COL_TARGET,COL_EDGE\n", 28 | "from bioblp.data import create_random_splits" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "f36dd753", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = Path(\"../data\")\n", 39 | "SHARED_DATA_DIR = Path(\"/home/jovyan/workbench-shared-folder/bioblp/data\")\n", 40 | "config_path = DATA_DIR.joinpath(\"conf/complex-biokg-20220826.toml\")\n", 41 | "biokg_mini_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links_sample.tsv\")\n", 42 | "biokg_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links.tsv\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "56b4e0a0", 48 | "metadata": {}, 49 | "source": [ 50 | "## Hetionet" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 21, 56 | "id": "cbbb5a42", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "EagerDataset 
(create_inverse_triples=False)\n", 64 | "Name Entities Relations Triples\n", 65 | "---------- ---------- ----------- ---------\n", 66 | "Training 45158 24 1800157\n", 67 | "Testing 45158 24 225020\n", 68 | "Validation 45158 24 225020\n", 69 | "Total - - 2250197\n", 70 | "Head Relation tail\n", 71 | "----------------------- ---------- ------------\n", 72 | "Anatomy::UBERON:0000002 AdG Gene::10005\n", 73 | "Anatomy::UBERON:0000002 AdG Gene::114804\n", 74 | "Anatomy::UBERON:0000002 AdG Gene::118670\n", 75 | "Anatomy::UBERON:0000002 AdG Gene::128989\n", 76 | "Anatomy::UBERON:0000002 AdG Gene::132851\n", 77 | "\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "from pykeen.datasets import Hetionet\n", 83 | "from pykeen.datasets import get_dataset\n", 84 | "\n", 85 | "ds = get_dataset(dataset=Hetionet)\n", 86 | "ds.summarize()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 39, 92 | "id": "35ad86ee", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stderr", 97 | "output_type": "stream", 98 | "text": [ 99 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 100 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 101 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "triples = Hetionet().factory_dict\n", 107 | "test = pd.DataFrame(triples['testing'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])\n", 108 | "train = pd.DataFrame(triples['training'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])\n", 109 | "valid = pd.DataFrame(triples['validation'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 41, 115 | "id": "978049a9", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "0.10000013332166029" 122 | ] 123 | }, 124 | "execution_count": 41, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "len(test)/(len(train)+ len(test) +len(valid))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 42, 136 | "id": "d6068102", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "saved to ../data/raw/hetionet_splits\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "SAVE_SPLITS_TO_DISK = False\n", 149 | "hetio_dataset_name = 'hetionet_random_801010'\n", 150 | "hetio_datasplits_dir = DATA_DIR.joinpath(\"raw/hetionet_splits\")\n", 151 | "\n", 152 | "if SAVE_SPLITS_TO_DISK:\n", 153 | " save_splits(train_df=train,\n", 154 | " test_df=test, \n", 155 | " valid_df=valid,\n", 156 | " dataset_name=hetio_dataset_name\",\n", 157 | " out_dir=hetio_datasplits_dir)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "3459292c", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 15, 171 | "id": "527f6a4d", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "{'train_triples': 'data',\n", 178 | " 'valid_triples': 'data',\n", 179 | " 'test_triples': 'data',\n", 180 | " 'model': 'complex',\n", 181 | " 'dimension': 256,\n", 182 | " 'loss_fn': 'crossentropy',\n", 183 | " 'loss_margin': 1.0,\n", 184 | " 'optimizer': 'adagrad',\n", 185 | " 'learning_rate': 0.01,\n", 186 | " 'regularizer': 1e-06,\n", 187 | " 
'num_epochs': 100,\n", 188 | " 'batch_size': 1024,\n", 189 | " 'eval_batch_size': 16,\n", 190 | " 'num_negatives': 512,\n", 191 | " 'add_inverses': False,\n", 192 | " 'early_stopper': 'both.realistic.inverse_harmonic_mean_rank',\n", 193 | " 'search_train_batch_size': False,\n", 194 | " 'search_eval_batch_size': False,\n", 195 | " 'log_wandb': False}" 196 | ] 197 | }, 198 | "execution_count": 15, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "def load_toml(toml_path: str) -> dict:\n", 205 | " toml_path = Path(toml_path)\n", 206 | "\n", 207 | " config = {}\n", 208 | "\n", 209 | " with open(toml_path, \"r\") as f:\n", 210 | " config = toml.load(f)\n", 211 | "\n", 212 | " return config\n", 213 | "\n", 214 | "config = load_toml(config_path)\n", 215 | "config" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "ed5633c4-cf9f-477f-a468-582bbf91146d", 221 | "metadata": {}, 222 | "source": [ 223 | "### Training" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "388a8210-89f0-435f-8405-81b8c38caa12", 229 | "metadata": {}, 230 | "source": [ 231 | "```bash\n", 232 | "$ python -m bioblp.train_argparse --conf /home/jovyan/BioBLP/data/conf/complex-hetionet-20220826.toml\n", 233 | "```" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "773a6c74-333b-49e8-b2df-022574889217", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": ".conda-bioblp-env [Python]", 248 | "language": "python", 249 | "name": "conda-env-.conda-bioblp-env-py" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 3 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython3", 261 | "version": "3.8.13" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 5 266 | } 267 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "bioblp" 3 | version = "0.1.0" 4 | description = "Link Prediction for biomedical data using KGE" 5 | authors = [] 6 | packages = [{include = "bioblp"}] 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.9,<3.11" 10 | tqdm = "^4.60.0" 11 | pykeen = "^1.4.0" 12 | toml = "^0.10.2" 13 | pandas = "^1.4.2" 14 | torch = "^1.11.0" 15 | scikit-learn = "^1.1.0" 16 | skorch = "^0.11.0" 17 | optuna = "3.0.1" 18 | dill = "^0.3.6" 19 | 20 | [tool.poetry.dev-dependencies] 21 | 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | pytest = "^7.2.1" 25 | pycodestyle = "^2.10.0" 26 | autopep8 = "^2.0.1" 27 | 28 | [build-system] 29 | requires = ["poetry-core>=1.0.0"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.8.1 ; python_version >= "3.8" and python_version < "3.11" 2 | attrs==22.1.0 ; python_version >= "3.8" and python_version < "3.11" 3 | autopage==0.5.1 ; python_version >= "3.8" and python_version < "3.11" 4 | certifi==2022.6.15.1 ; python_version >= "3.8" and python_version < "3.11" 5 | charset-normalizer==2.1.1 ; python_version >= "3.8" and python_version < "3.11" 6 | class-resolver==0.3.10 ; 
python_version >= "3.8" and python_version < "3.11" 7 | click-default-group==1.2.2 ; python_version >= "3.8" and python_version < "3.11" 8 | click==8.1.3 ; python_version >= "3.8" and python_version < "3.11" 9 | cliff==4.0.0 ; python_version >= "3.8" and python_version < "3.11" 10 | cmaes==0.8.2 ; python_version >= "3.8" and python_version < "3.11" 11 | cmd2==2.4.2 ; python_version >= "3.8" and python_version < "3.11" 12 | colorama==0.4.5 ; python_version >= "3.8" and python_version < "3.11" and platform_system == "Windows" or python_version >= "3.8" and python_version < "3.11" and sys_platform == "win32" 13 | colorlog==6.7.0 ; python_version >= "3.8" and python_version < "3.11" 14 | dataclasses-json==0.5.7 ; python_version >= "3.8" and python_version < "3.11" 15 | docdata==0.0.3 ; python_version >= "3.8" and python_version < "3.11" 16 | greenlet==1.1.3 ; python_version >= "3.8" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "3.11" 17 | idna==3.3 ; python_version >= "3.8" and python_version < "3.11" 18 | importlib-metadata==4.12.0 ; python_version >= "3.8" and python_version < "3.11" 19 | importlib-resources==5.9.0 ; python_version >= "3.8" and python_version < "3.9" 20 | joblib==1.1.0 ; python_version >= "3.8" and python_version < "3.11" 21 | mako==1.2.2 ; python_version >= "3.8" and python_version < "3.11" 22 | markupsafe==2.1.1 ; python_version >= "3.8" and python_version < "3.11" 23 | marshmallow-enum==1.5.1 ; python_version >= "3.8" and python_version < "3.11" 24 | marshmallow==3.17.1 ; python_version >= "3.8" and python_version < "3.11" 25 | more-click==0.1.1 ; python_version >= "3.8" and python_version < "3.11" 26 | more-itertools==8.14.0 ; python_version >= "3.8" and python_version < "3.11" 27 | mypy-extensions==0.4.3 ; python_version >= "3.8" and python_version < "3.11" 28 | numpy==1.23.3 ; python_version < "3.11" and python_version >= "3.8" 29 | optuna==3.0.1 ; python_version >= "3.8" and python_version < "3.11" 30 | packaging==21.3 ; python_version >= "3.8" and python_version < "3.11" 31 | pandas==1.4.4 ; python_version >= "3.8" and python_version < "3.11" 32 | pbr==5.10.0 ; python_version >= "3.8" and python_version < "3.11" 33 | prettytable==3.4.1 ; python_version >= "3.8" and python_version < "3.11" 34 | protobuf==3.20.1 ; python_version >= "3.8" and python_version < "3.11" 35 | pykeen==1.9.0 ; python_version >= "3.8" and python_version < "3.11" 36 | pyparsing==3.0.9 ; python_version >= "3.8" and python_version < "3.11" 37 | pyperclip==1.8.2 ; python_version >= "3.8" and python_version < "3.11" 38 | pyreadline3==3.4.1 ; python_version >= "3.8" and python_version < "3.11" and sys_platform == "win32" 39 | pystow==0.4.6 ; python_version >= "3.8" and python_version < "3.11" 40 | python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "3.11" 41 | pytz==2022.2.1 ; python_version >= "3.8" and python_version < "3.11" 42 | pyyaml==6.0 ; python_version >= "3.8" and python_version < "3.11" 43 | requests==2.28.1 ; python_version >= "3.8" and python_version < "3.11" 44 | rexmex==0.0.15 ; python_version >= "3.8" and python_version < "3.11" 45 | scikit-learn==1.1.2 ; python_version >= "3.8" and python_version < "3.11" 46 | scipy==1.8.1 ; python_version >= "3.8" and python_version < "3.11" 47 | six==1.16.0 ; python_version >= "3.8" and python_version < "3.11" 48 | scikit-learn==0.0 
; python_version >= "3.8" and python_version < "3.11" 49 | skorch==0.11.0 ; python_version >= "3.8" and python_version < "3.11" 50 | sqlalchemy==1.4.41 ; python_version >= "3.8" and python_version < "3.11" 51 | stevedore==4.0.0 ; python_version >= "3.8" and python_version < "3.11" 52 | tabulate==0.8.10 ; python_version >= "3.8" and python_version < "3.11" 53 | threadpoolctl==3.1.0 ; python_version >= "3.8" and python_version < "3.11" 54 | toml==0.10.2 ; python_version >= "3.8" and python_version < "3.11" 55 | torch-max-mem==0.0.4 ; python_version >= "3.8" and python_version < "3.11" 56 | torch-ppr==0.0.8 ; python_version >= "3.8" and python_version < "3.11" 57 | torch==1.12.1 ; python_version >= "3.8" and python_version < "3.11" 58 | tqdm==4.64.1 ; python_version >= "3.8" and python_version < "3.11" 59 | typing-extensions==4.3.0 ; python_version >= "3.8" and python_version < "3.11" 60 | typing-inspect==0.8.0 ; python_version >= "3.8" and python_version < "3.11" 61 | urllib3==1.26.12 ; python_version >= "3.8" and python_version < "3.11" 62 | wcwidth==0.2.5 ; python_version >= "3.8" and python_version < "3.11" 63 | zipp==3.8.1 ; python_version >= "3.8" and python_version < "3.11" 64 | 65 | bioblp~=0.1.0 66 | torch~=1.13.1 67 | transformers~=4.26.1 68 | pandas~=1.5.3 69 | numpy~=1.24.2 70 | tqdm~=4.64.1 71 | pykeen~=1.10.0 72 | wandb~=0.13.10 73 | optuna~=3.0.1 74 | scikit-learn~=1.2.1 75 | skorch~=0.11.0 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/tests/__init__.py -------------------------------------------------------------------------------- /tests/benchmarking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/tests/benchmarking/__init__.py -------------------------------------------------------------------------------- /tests/benchmarking/bm_test_conf.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/skywalker/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/dpi_fda/20230224/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "random" 14 | encoders = ["structural", "complex", "rotate", "noise"] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [split] 33 | n_splits = 5 34 | outdir = "splits" 35 | 36 | [models] 37 | 38 | [models.noise_lr] 39 | feature = "noise" 40 | model = "LR" 41 | 42 | [models.noise_rf] 43 | feature = "noise" 44 | model = "RF" 45 | 46 | [models.noise_mlp] 47 | feature = "noise" 48 | model = "MLP" 49 | 50 | [models.structural_lr] 
51 | feature = "structural" 52 | model = "LR" 53 | 54 | [models.complex_lr] 55 | feature = "complex" 56 | model = "LR" 57 | 58 | [models.rotate_lr] 59 | feature = "rotate" 60 | model = "LR" 61 | 62 | [train] 63 | n_iter = 2 64 | splits_file = "cv-splits.pt" 65 | refit_params = ["AUCPR", "AUCROC"] 66 | outdir = "models" 67 | -------------------------------------------------------------------------------- /tests/benchmarking/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dataclasses import fields 4 | 5 | from pathlib import Path 6 | from bioblp.benchmarking.config import BenchmarkStepBaseConfig 7 | from bioblp.benchmarking.config import BenchmarkPreprocessConfig 8 | from bioblp.benchmarking.config import BenchmarkFeatureConfig 9 | from bioblp.benchmarking.config import BenchmarkTrainConfig 10 | 11 | 12 | from bioblp.logger import get_logger 13 | 14 | 15 | logger = get_logger(__name__) 16 | 17 | test_toml_file = Path(__file__).parent.joinpath("bm_test_conf.toml") 18 | 19 | 20 | class TestBenchmarkStepBaseConfig(): 21 | 22 | dr = "/home/skywalker/bioblp/data/" 23 | exp = "benchmark/experiments" 24 | step_out = "step_out" 25 | run_id = "123" 26 | 27 | def test_resolve_outdir(self): 28 | 29 | cfg = BenchmarkStepBaseConfig( 30 | data_root=self.dr, 31 | experiment_root=self.exp, 32 | run_id=self.run_id, 33 | outdir=self.step_out 34 | ) 35 | 36 | full_outdir = cfg.resolve_outdir() 37 | 38 | assert str(full_outdir) == self.dr + self.exp + \ 39 | "/" + self.run_id + "/" + self.step_out 40 | 41 | def test_test_resolve_outdir_mutated(self): 42 | cfg = BenchmarkStepBaseConfig( 43 | data_root=self.dr, 44 | experiment_root=self.exp, 45 | run_id=self.run_id, 46 | outdir=self.step_out 47 | ) 48 | 49 | override_data_root = "/home/vader/bioblp/data/" 50 | 51 | cfg.data_root = override_data_root 52 | 53 | full_outdir = cfg.resolve_outdir() 54 | 55 | assert str(full_outdir) == override_data_root + self.exp + \ 56 | "/" + self.run_id + "/" + self.step_out 57 | 58 | 59 | class TestBenchmarkPreprocessConfig(): 60 | 61 | def test_from_toml(self): 62 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir", 63 | "num_negs_per_pos", "kg_triples_dir"] 64 | 65 | run_id = "123" 66 | cfg = BenchmarkPreprocessConfig.from_toml( 67 | test_toml_file, run_id=run_id) 68 | 69 | cfg_fields = [field.name for field in fields(cfg)] 70 | 71 | assert cfg.num_negs_per_pos == 10 72 | assert cfg.data_root == "/home/skywalker/bioblp/" 73 | assert len(set(cfg_fields).difference(set(expected_fields)) 74 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}" 75 | 76 | def test_resolve_outdir(self): 77 | 78 | run_id = "123" 79 | cfg = BenchmarkPreprocessConfig.from_toml( 80 | test_toml_file, run_id=run_id) 81 | 82 | outdir = cfg.resolve_outdir() 83 | 84 | assert str( 85 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/sampled" 86 | 87 | 88 | class TestBenchmarkFeatureConfig(): 89 | 90 | def test_from_toml(self): 91 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir", 92 | "transform", "missing_values", "encoders", "encoder_args"] 93 | 94 | run_id = "123" 95 | cfg = BenchmarkFeatureConfig.from_toml(test_toml_file, run_id=run_id) 96 | 97 | cfg_fields = [field.name for field in fields(cfg)] 98 | 99 | assert len(set(cfg_fields).difference(set(expected_fields)) 100 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}" 101 | 102 | 
def test_resolve_outdir(self): 103 | 104 | run_id = "123" 105 | cfg = BenchmarkFeatureConfig.from_toml(test_toml_file, run_id=run_id) 106 | 107 | outdir = cfg.resolve_outdir() 108 | 109 | assert str( 110 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/features" 111 | 112 | 113 | class TestBenchmarkTrainConfig(): 114 | 115 | def test_from_toml(self): 116 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir", 117 | "feature_dir", "models", "refit_params", "n_iter", "splits_dir", "splits_file"] 118 | 119 | run_id = "123" 120 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id) 121 | 122 | cfg_fields = [field.name for field in fields(cfg)] 123 | 124 | assert len(set(cfg_fields).difference(set(expected_fields)) 125 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}" 126 | 127 | def test_resolve_outdir(self): 128 | 129 | run_id = "123" 130 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id) 131 | 132 | outdir = cfg.resolve_outdir() 133 | 134 | assert str( 135 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/models" 136 | 137 | def test_resolve_feature_outdir(self): 138 | 139 | run_id = "123" 140 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id) 141 | 142 | outdir = cfg.resolve_feature_dir() 143 | 144 | assert str( 145 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/features" 146 | -------------------------------------------------------------------------------- /tests/benchmarking/test_featurise.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from bioblp.benchmarking.featurise import apply_common_mask 5 | 6 | 7 | class TestApplyCommonMask: 8 | 9 | data_A = torch.arange(0., 9.).resize(3, 3) 10 | data_B = torch.arange(9., 21.).resize(3, 4) 11 | 12 | labels = torch.ones(3) 13 | 14 | def test_mask_consistency(self): 15 | mask_A = torch.tensor([0, 1]) 16 | mask_B = torch.tensor([0, 1, 2]) 17 | 18 | inputs = [("A", self.data_A, mask_A), ("B", self.data_B, mask_B)] 19 | 20 | masked_inputs, _ = apply_common_mask(inputs, labels=self.labels) 21 | 22 | assert masked_inputs[0][1].size(0) == len(mask_A) 23 | assert masked_inputs[0][1].size(0) == masked_inputs[1][1].size(0) 24 | 25 | def test_mask_consistency_labels(self): 26 | mask_A = torch.tensor([0, 2]) 27 | mask_B = torch.tensor([0, 1, 2]) 28 | 29 | labels = torch.tensor([1, 1, 0]) 30 | expected_labels = torch.tensor([1, 0]) 31 | 32 | inputs = [("A", self.data_A, mask_A), ("B", self.data_B, mask_B)] 33 | 34 | _, masked_labels = apply_common_mask(inputs, labels=labels) 35 | 36 | assert len(masked_labels) == len(mask_A) 37 | assert torch.sum((masked_labels - expected_labels)) == 0 38 | -------------------------------------------------------------------------------- /tests/benchmarking/test_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bioblp.benchmarking.train_utils import validate_features_exist 3 | from bioblp.benchmarking.config import BenchmarkTrainConfig 4 | 5 | from bioblp.logger import get_logger 6 | 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | CONFIG_PATH = "conf/dpi-benchmark-cv-20230423-lr.toml" 12 | 13 | 14 | def test_parse_train_config(): 15 | cfg = BenchmarkTrainConfig.from_toml(CONFIG_PATH, run_id="abc") 16 | 17 | logger.info(cfg) 18 | 19 | 20 | class 
TestValidateFeatures(): 21 | 22 | models_conf = { 23 | "noise_lr": { 24 | "feature": "noise", 25 | "model": "LR" 26 | }, 27 | "complex_lr": { 28 | "feature": "complex", 29 | "model": "LR" 30 | } 31 | } 32 | 33 | existing_feats = ["noise", "complex"] 34 | 35 | def setup_feats(self, dir): 36 | data = torch.arange(0., 12.).resize(3, 4) 37 | 38 | for feat in self.existing_feats: 39 | torch.save(data, dir.joinpath(f"{feat}.pt")) 40 | 41 | def test_validate_features_exist(self, tmp_path): 42 | dir = tmp_path.joinpath("features") 43 | dir.mkdir() 44 | self.setup_feats(dir) 45 | 46 | exists = validate_features_exist(dir, self.models_conf) 47 | 48 | assert exists is True 49 | 50 | def test_validate_features_exist_missing(self, tmp_path): 51 | dir = tmp_path.joinpath("features") 52 | dir.mkdir() 53 | self.setup_feats(dir) 54 | 55 | missing_feat = { 56 | "feature": "rotate", 57 | "model": "LR" 58 | } 59 | conf = self.models_conf 60 | conf.update({"rotate_LR": missing_feat}) 61 | 62 | exists = validate_features_exist(dir, conf) 63 | 64 | assert exists is False 65 | -------------------------------------------------------------------------------- /tests/test_encoders.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import unittest 3 | import tempfile 4 | import os 5 | import os.path as osp 6 | import pytest 7 | import torch 8 | from transformers import BertTokenizer 9 | 10 | from bioblp.models.encoders import TransformerTextEncoder 11 | import bioblp.loaders.preprocessors as preprocessors 12 | 13 | 14 | class TestPropertyEncoders(unittest.TestCase): 15 | DISEASES = ['Irreversible FIBROSIS of the submucosal tissue of the MOUTH.', 16 | 'The co-occurrence of pregnancy and parasitic diseases.', 17 | 'Benign epidermal proliferations or tumors of viral in origin.', 18 | 'Infections with bacteria of the genus PASTEURELLA.'] 19 | 20 | MOLECULES = ['101010101010101010101010101010101010'] 21 | 22 | def setUp(self): 23 | self.temp_file = None 24 | 25 | def tearDown(self): 26 | if self.temp_file is not None: 27 | if osp.exists(self.temp_file): 28 | os.remove(self.temp_file) 29 | 30 | def make_test_file(self, entities: List[int], choices: List[str]): 31 | if self.temp_file is None: 32 | file_name = tempfile.NamedTemporaryFile().name 33 | self.temp_file = file_name 34 | else: 35 | file_name = self.temp_file 36 | 37 | with open(file_name, 'w') as file: 38 | for i, entity in enumerate(entities): 39 | sample = choices[i % len(choices)] 40 | file.write(f'{entity}\t{sample}\n') 41 | 42 | return file_name 43 | 44 | def make_protein_test_file(self, emb_dim: int, entities: List[str]): 45 | if self.temp_file is None: 46 | file_name = tempfile.NamedTemporaryFile().name 47 | self.temp_file = file_name 48 | else: 49 | file_name = self.temp_file 50 | 51 | embeddings = torch.rand([len(entities), emb_dim]) 52 | 53 | with open(file_name, 'w') as file: 54 | torch.save({'identifiers': entities, 'embeddings': embeddings}, 55 | file_name) 56 | 57 | return file_name 58 | 59 | @pytest.mark.skip(reason="no way of currently testing this") 60 | def test_text_preprocessor(self): 61 | entity_to_id = {str(i): i for i in range(10)} 62 | entities = list(entity_to_id.keys()) 63 | file = self.make_test_file(entities, choices=self.DISEASES) 64 | 65 | max_length = 32 66 | tokenizer = BertTokenizer.from_pretrained( 67 | TransformerTextEncoder.BASE_MODEL) 68 | preprocessor = preprocessors.TextEntityPropertyPreprocessor(tokenizer, 69 | max_length) 70 | 71 | entities_tensor, data_idx, 
data = preprocessor.preprocess_file(file, 72 | entity_to_id) 73 | self.assertEqual(len(entities_tensor), len(entities)) 74 | self.assertEqual(len(data_idx), len(entities)) 75 | self.assertTupleEqual(data.shape, (len(entities), max_length)) 76 | 77 | def test_molecule_preprocessor(self): 78 | entity_to_id = {str(i): i for i in range(10)} 79 | entities = list(entity_to_id.keys()) 80 | file = self.make_test_file(entities, choices=self.MOLECULES) 81 | 82 | preprocessor = preprocessors.MolecularFingerprintPreprocessor() 83 | entities_tensor, data_idx, data = preprocessor.preprocess_file(file, 84 | entity_to_id) 85 | 86 | self.assertEqual(len(entities_tensor), len(entities)) 87 | self.assertEqual(len(data_idx), len(entities)) 88 | self.assertTupleEqual( 89 | data.shape, (len(entities), len(self.MOLECULES[0]))) 90 | 91 | @pytest.mark.skip(reason="faulty test") 92 | def test_pretrained_protein_preprocessor(self): 93 | emb_dim = 32 94 | entity_to_id = {str(i): i for i in range(10)} 95 | entities = list(entity_to_id.keys()) 96 | file = self.make_protein_test_file(emb_dim, entities) 97 | 98 | preprocessor = preprocessors.PretrainedEmbeddingPreprocessor() 99 | entities_tensor, data_idx, data = preprocessor.preprocess_file(file, 100 | entity_to_id) 101 | 102 | self.assertEqual(len(entities_tensor), len(entities)) 103 | self.assertEqual(len(data_idx), len(entities)) 104 | self.assertTupleEqual(data.shape, (len(entities), emb_dim)) 105 | -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | from bioblp import __version__ 2 | 3 | 4 | def test_version(): 5 | assert __version__ == "0.1.0" 6 | --------------------------------------------------------------------------------
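A hedged usage sketch of the output-directory convention that the config tests above assert: each benchmark step resolves its working directory as data_root / experiment_root / run_id / step outdir. The values below are copied from tests/benchmarking/bm_test_conf.toml and the tests' example run_id; pathlib is used only for illustration and is not necessarily how the config classes compose paths internally.

```python
from pathlib import Path

# Values from tests/benchmarking/bm_test_conf.toml; run_id as used in the tests.
data_root = Path("/home/skywalker/bioblp/")
experiment_root = "data/benchmarks/experiments/dpi_fda/20230224/"
run_id = "123"

# Step-specific outdir values defined in the TOML ([sampling], [features], [split], [train]).
for step_outdir in ("sampled", "features", "splits", "models"):
    print(data_root / experiment_root / run_id / step_outdir)

# The last printed path matches the assertion in TestBenchmarkTrainConfig.test_resolve_outdir:
# /home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/123/models
```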