├── .gitignore ├── Makefile ├── README.md ├── bioblp ├── __init__.py ├── benchmarking │ ├── README.md │ ├── __init__.py │ ├── config.py │ ├── encoders.py │ ├── experiment.py │ ├── featurise.py │ ├── hpo.py │ ├── preprocess.py │ ├── split.py │ ├── train.py │ ├── train_runner.py │ └── train_utils.py ├── data.py ├── evaluate.py ├── loaders │ └── preprocessors.py ├── logger.py ├── models │ ├── __init__.py │ ├── bioblp.py │ └── encoders.py ├── predict.py ├── preprocess.py ├── train.py ├── train_argparse.py └── utils │ ├── __init__.py │ ├── bioblp_utils.py │ ├── pipeline.py │ ├── training.py │ ├── triples.py │ └── util.py ├── conf ├── complex-biokg-20220826.toml ├── complex-biokg-full-20220826.toml ├── complex-hetionet-20220826.toml ├── dpi-benchmark-cv-20230423-lr.toml ├── dpi-benchmark-cv-20230423-mlp-1.toml ├── dpi-benchmark-cv-20230423-mlp-2.toml ├── dpi-benchmark-cv-20230423-rf.toml ├── dpi-benchmark-cv-r1-20230424-mlp.toml └── dpi-benchmark-cv-r1-20230424-rflr.toml ├── data └── conf │ ├── complex-biokg-20220826.toml │ ├── complex-biokg-full-20220826.toml │ └── complex-hetionet-20220826.toml ├── environment.yml ├── fig.png ├── jobs ├── biokg-bioblp-d-complex-initialized.sh ├── biokg-bioblp-d-complex.sh ├── biokg-bioblp-d-rotate-initialized.sh ├── biokg-bioblp-d-rotate.sh ├── biokg-bioblp-d-transe-initialized.sh ├── biokg-bioblp-d-transe.sh ├── biokg-bioblp-m-complex-bce-sweep.sh ├── biokg-bioblp-m-complex-bce-sweep.yml ├── biokg-bioblp-m-rotate-adagrad-sweep.sh ├── biokg-bioblp-m-rotate-adagrad-sweep.yml ├── biokg-bioblp-m-rotate-sweep.sh ├── biokg-bioblp-m-rotate-sweep.yml ├── biokg-bioblp-m-transe-sweep.sh ├── biokg-bioblp-m-transe-sweep.yml ├── biokg-bioblp-p-complex-bce-sweep.sh ├── biokg-bioblp-p-complex-bce-sweep.yml ├── biokg-bioblp-p-complex-initialized.sh ├── biokg-bioblp-p-rotate-initialized.sh ├── biokg-bioblp-p-rotate-sweep.sh ├── biokg-bioblp-p-rotate-sweep.yml ├── biokg-bioblp-p-transe-initialized.sh ├── biokg-bioblp-p-transe-sweep.sh ├── biokg-bioblp-p-transe-sweep.yml ├── biokg-complex-bce-sweep.sh ├── biokg-complex-bce-sweep.yml ├── biokg-complex-sweep.sh ├── biokg-complex-sweep.yml ├── biokg-rotate-bce-sweep.sh ├── biokg-rotate-bce-sweep.yml ├── biokg-rotate-sweep.sh ├── biokg-rotate-sweep.yml ├── biokg-transe-sweep.sh ├── biokg-transe-sweep.yml ├── complex.sh ├── hetionet-complex-bce-sweep.sh ├── hetionet-complex-bce-sweep.yml ├── hetionet-complex-sweep.sh ├── hetionet-complex-sweep.yml ├── hetionet-rotate-bce-sweep.sh ├── hetionet-rotate-bce-sweep.yml ├── hetionet-rotate-sweep.sh ├── hetionet-rotate-sweep.yml ├── hetionet-transe-sweep.sh ├── hetionet-transe-sweep.yml ├── rotate-dummy.sh └── rotate.sh ├── loaders └── placeholder.txt ├── logs └── placeholder.txt ├── notebooks ├── 00-clean-biokg-benchmarks.ipynb ├── 01-generate-biokg-splits.ipynb ├── 01_01_biokg-data-prep-for-kge.ipynb ├── 01_01_disease_mesh_notes_retrieval.ipynb ├── 01_02_disease_bert_encodings.ipynb ├── 02-01-biokg_benchmarks_eda.ipynb ├── 02-02-biokg_benchmarks_data_prep.ipynb ├── 02-03-benchmark-results.ipynb ├── 02-03-biokg_benchmarks_data_embedders.ipynb ├── 02_01_01-biokg_benchmarks_eda.ipynb ├── 02_01_02-biokg_benchmark-reconciliation.ipynb ├── 02_03_01-biokg_bm_dpi_clf-mlp.ipynb ├── 02_04_01-sanity-check-benchmark-ppi.ipynb ├── 02_04_01_biokg_bm_dpi_clf_nestedcv.ipynb ├── 02_99-benchmark-prep-yamanashi-dpi.ipynb ├── 03-00-nested-cv.ipynb ├── 03-frequency-baseline.ipynb ├── 04_00_ProtTrans_embeddings_biokG.ipynb ├── 04_01_Load & merge protein embeddings_BioKG.ipynb ├── 05-00-Load HetioNet - Get Gene 
to Protein mappings.ipynb ├── 05-01-Load HetioNet - Protein Embedding Generation.ipynb ├── 06-01 - Molecular Embeddings - BioKG.ipynb ├── 06-hetionet.ipynb ├── 07_00_evaluate-link-prediction_archived.ipynb ├── 07_01_eval_lp_deepdive.ipynb ├── 07_02_eval_lp_node_degree_effect.ipynb ├── 08-00-evaluate-link-prediction.ipynb ├── 08-01-inductive-evaluation.ipynb ├── 08-02-per-triple-evaluation.ipynb ├── 09-wandb-hparam-figures.ipynb ├── 10-pretraining-significance.ipynb ├── 11-pretraining-curves.ipynb ├── 12-per-relation-figures.ipynb ├── 13-node-degree-analysis-v2.ipynb ├── 99-train_hetionet.ipynb └── nb_utils │ └── eval_utils.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tests ├── __init__.py ├── benchmarking ├── __init__.py ├── bm_test_conf.toml ├── test_config.py ├── test_encoders.py ├── test_featurise.py └── test_train.py ├── test_encoders.py └── test_version.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv* 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Data paths 132 | data/ 133 | 134 | # Generated artifacts 135 | wandb/ 136 | /models 137 | 138 | # Editor 139 | .vscode 140 | 141 | # PyCharm 142 | .idea 143 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Self-Documented Makefile https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html 2 | .PHONY: clean setup install 3 | 4 | ################################################################################# 5 | # GLOBALS # 6 | ################################################################################# 7 | 8 | SHELL=/bin/bash 9 | PYTHON = python 10 | PROJECT_NAME = bioblp 11 | PACKAGE_NAME = bioblp 12 | PYTHON_INTERPRETER = python3 13 | KERNEL_NAME=Python (${PROJECT_NAME}) 14 | PYTHON_FULL_V = $(shell python -V) 15 | PYTHON_V := $(PYTHON_FULL_V:Python%=%) 16 | CONDA_ENV=${PROJECT_NAME}-env 17 | PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 18 | CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate 19 | #PYTHON_V=3.8.6 20 | 21 | ################################################################################# 22 | # COMMANDS # 23 | ################################################################################# 24 | 25 | default: help 26 | 27 | print-%: ## Prints a variable value. Usage: make print-VARIABLE, eg: make print-TAG, result: TAG = 0.0.0 28 | @echo $* = $($*) 29 | 30 | setup: 31 | make install_poetry 32 | @echo $(shell poetry --version) || "Install Poetry" 33 | 34 | install_poetry: ## installs poetry. Remember to `source /home/jovyan/.poetry/env` from a terminal after running this recipe. 
Need only be run once 35 | curl -sSL https://install.python-poetry.org | python3 - 36 | 37 | install: 38 | poetry install 39 | poetry export -f requirements.txt --without-hashes --with dev --output requirements.txt 40 | 41 | 42 | update: 43 | poetry update 44 | poetry export -f requirements.txt --without-hashes --with dev --output requirements.txt 45 | 46 | test: 47 | poetry run pytest tests -s -vv 48 | 49 | create_ipython_kernel: 50 | poetry run ipython kernel install --user --display-name="${KERNEL_NAME}" 51 | 52 | freeze_requirements: ## Writes python project dependencies as a requirements.txt 53 | poetry export -f requirements.txt --output requirements.txt --without-hashes 54 | 55 | freeze_dev_requirements: ## Writes python project dependencies (including dev) as a requirements-dev.txt 56 | poetry export -f requirements.txt --output requirements-dev.txt --without-hashes --dev 57 | 58 | dist: ## Builds a distribution package with version ${PACKAGE_NAME}.__version__, eg: dist/test_me-0.0.0.tar.gz 59 | make clean 60 | poetry build 61 | 62 | 63 | ### JH setup 64 | 65 | setup_jh_env: 66 | make conda_setup 67 | make create_conda_env 68 | make create_conda_kernel 69 | 70 | conda_setup: # ensures conda env is persistent, need run only once 71 | mkdir -p /home/jovyan/.conda/pkgs/ 72 | touch /home/jovyan/.conda/pkgs/urls.txt 73 | 74 | create_conda_env: 75 | conda create --yes --prefix /home/jovyan/.conda/envs/${CONDA_ENV} ipykernel 76 | #conda create --yes --prefix /home/jovyan/.conda/envs/${CONDA_ENV} python==${PYTHON_V} ipykernel 77 | ($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make setup | source /home/jovyan/.poetry/env) 78 | # to install the project module as a dependency 79 | ($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make install) 80 | conda env export -n ${CONDA_ENV} -f ${PROJECT_DIR}/environment.yml 81 | 82 | create_conda_kernel: 83 | python -m ipykernel install --user --name=${CONDA_ENV} --display-name="${KERNEL_NAME}" 84 | 85 | update_conda_env: 86 | #($(CONDA_ACTIVATE) /home/jovyan/.conda/envs/${CONDA_ENV} | make update) 87 | conda env update --name ${CONDA_ENV} -f ${PROJECT_DIR}/environment.yml --prune 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BioBLP: A Modular Framework for Learning on Multimodal Biomedical Knowledge Graphs 2 |
3 | 4 | 5 | [DOI badge] 6 | 7 | 8 | 9 | 10 | 11 | [overview figure: fig.png] 12 | 13 |
14 | 15 | This is the official repository implementing BioBLP, presented in "BioBLP: A Modular Framework for Learning on Multimodal Biomedical Knowledge Graphs", published in the Journal of Biomedical Semantics ([link](https://doi.org/10.1186/s13326-023-00301-y)). 16 | 17 | BioBLP is a framework for encoding the diverse, multimodal data that can appear in biomedical knowledge graphs. It is based on the idea of learning embeddings for each modality separately, and then combining them into a single multimodal embedding space. The framework is modular and allows for easy integration of new modalities. 18 | 19 | To cite our work, please use the following: 20 | 21 | ```bibtex 22 | @article{bioblp, 23 | author = {Daniel Daza and 24 | Dimitrios Alivanistos and 25 | Payal Mitra and 26 | Thom Pijnenburg and 27 | Michael Cochez and 28 | Paul Groth}, 29 | title = {BioBLP: a modular framework for learning on multimodal biomedical 30 | knowledge graphs}, 31 | journal = {J. Biomed. Semant.}, 32 | volume = {14}, 33 | number = {1}, 34 | pages = {20}, 35 | year = {2023}, 36 | url = {https://doi.org/10.1186/s13326-023-00301-y}, 37 | doi = {10.1186/S13326-023-00301-Y}, 38 | } 39 | ``` 40 | 41 | ## Usage 42 | 43 | ### 1. Install the requirements 44 | 45 | We recommend using [Anaconda](https://www.anaconda.com/) to manage the dependencies. The following command will create and activate a new conda environment with all the required dependencies. 46 | 47 | ```bash 48 | conda env create -f environment.yml && conda activate bioblp 49 | ``` 50 | 51 | ### 2. Download the data 52 | 53 | The data can be downloaded from [here](https://doi.org/10.5281/zenodo.8005711) as a tar.gz file. This corresponds to our version of BioKG that has been decoupled from the benchmarks (see the paper for more details), and it also includes the necessary attribute data for proteins, molecules, and diseases. 54 | The file should be placed inside the `data` folder and decompressed: 55 | 56 | ```bash 57 | tar xzf biokgb.tar.gz 58 | ``` 59 | 60 | ### 3. Training link prediction models 61 | 62 | Use the `bioblp.train` module to train a link prediction model. For example, to train a BioBLP-D model (which encodes disease descriptions) using the RotatE scoring function, use: 63 | 64 | ```sh 65 | python -m bioblp.train \ 66 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 67 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 68 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 69 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 70 | --model=rotate --dimension=256 --loss_fn=crossentropy --optimizer=adam \ 71 | --learning_rate=2e-5 --warmup_fraction=0.05 --num_epochs=100 \ 72 | --batch_size=1024 --eval_batch_size=64 --num_negatives=512 --in_batch_negatives=True 73 | ``` 74 | 75 | On an NVIDIA A100 40GB GPU, the above command takes about 9 hours to train. 76 | 77 | We use Weights & Biases to log the experiments; logging is disabled by default. To enable it, add `--log_wandb=True` to the command above. 78 | 79 | More examples will be added soon. 80 | 81 | ### 4. Benchmark tasks 82 | * Pre-generate the input dataset with flags indicating whether the links are known or novel. 83 | * Run `bioblp.benchmarking.preprocess.py` to prepare the benchmark dataset for ML by shuffling, splitting, etc. 84 | * `bioblp.benchmarking.featurize.py` can be used to turn a list of entity pairs into feature vectors composed from the individual entity vectors (a minimal sketch of this composition is shown below).
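The sketch below is an illustration only of this pairwise composition step, not the project's featurisation code: it assumes entity embeddings from a trained KGEM are available as 1-D tensors and concatenates them into a single feature vector for the downstream classifier. The embedding dictionary and the entity identifiers are hypothetical placeholders.

```python
# Minimal sketch of pairwise feature composition (illustrative only; the
# embedding table and identifiers below are placeholders, not project data).
import torch

entity_embeddings = {
    "drug_X": torch.randn(256),     # e.g. a drug embedding from a trained KGEM
    "protein_Y": torch.randn(256),  # e.g. a protein target embedding
}

def concatenate_pair(src: str, tgt: str) -> torch.Tensor:
    """Compose a (src, tgt) entity pair into one feature vector by concatenation."""
    return torch.cat([entity_embeddings[src], entity_embeddings[tgt]], dim=-1)

pair_feature = concatenate_pair("drug_X", "protein_Y")
print(pair_feature.shape)  # torch.Size([512])
```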
85 | 86 | Custom usage: 87 | ```bash 88 | $ python -m bioblp.benchmarking.featurize -i data/benchmarks/processed/dpi_benchmark_p2n-1-10.tsv -o data/features -t kgem -f models/1baon0eg/ -j concatenate 89 | ``` 90 | -------------------------------------------------------------------------------- /bioblp/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /bioblp/benchmarking/README.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | ## Experiment preparation 4 | Command to prepare experimental data, given config file. This script will load the raw benchmark dataset, perform negative sampling, generate features and splits: 5 | 6 | ```bash 7 | python bioblp/benchmarking/experiment.py \ 8 | --conf=conf/dpi-benchmark-cv-20230413.toml \ 9 | --override_data_root=./ \ 10 | --bm_file=data/benchmarks/transductive/dpi_fda.tsv \ 11 | --n_proc=1 12 | ``` 13 | 14 | You can execute the steps in `experiment.py` individually with the below. 15 | 16 | 1. Negative sampling. 17 | ```bash 18 | python bioblp/benchmarking/preprocess.py \ 19 | --bm_data_path=data/benchmarks/experiments/DPI/1681398697/features/raw.pt \ 20 | --kg_triples_dir=data/benchmarks/experiments/encoders/rotate/training_triples/ \ 21 | --num_negs_per_pos=10 \ 22 | --outdir=data/benchmarks/experiments/DPI/1681398697/sampled/ \ 23 | --override_run_id=1681398697 24 | ``` 25 | 26 | 2. Generate features. 27 | 28 | ```bash 29 | python bioblp/benchmarking/featurise.py \ 30 | --conf=conf/dpi-benchmark-cv-20230413.toml \ 31 | --bm_file=data/benchmarks/experiments/DPI/1681398697/sampled/dpi_fda_p2n-1-10.tsv \ 32 | --override_data_root=./ \ 33 | --override_run_id=1681398697 34 | 35 | ``` 36 | 37 | 3. Preparing data splits for cross validation. 38 | 39 | ```bash 40 | python bioblp/benchmarking/split.py \ 41 | --conf=conf/dpi-benchmark-cv-20230413.toml \ 42 | --data=data/benchmarks/experiments/DPI/1681398697/features/raw.pt \ 43 | --outdir=data/benchmarks/experiments/DPI/1681398697/splits/ \ 44 | --n_folds=5 \ 45 | --override_data_root=./ \ 46 | --override_run_id=1681398697 47 | ``` 48 | 49 | ## Model training 50 | 51 | Sample command for `train.py`. This script performs the training procedure for one model configuration, on one particular data split. 52 | ```bash 53 | python bioblp/benchmarking/train.py \ 54 | --model_clf=RF \ 55 | --model_feature=complex \ 56 | --feature_dir=data/benchmarks/experiments/dpi_fda/1681301749/features/ \ 57 | --splits_path=data/benchmarks/experiments/dpi_fda/1681301749/splits/train-test-split.pt \ 58 | --split_idx=0 \ 59 | --n_iter=3 \ 60 | --refit_params=AUCPR,AUCROC \ 61 | --outdir=data/benchmarks/experiments/dpi_fda/1681301749/models/ \ 62 | --model_label=complex__RF \ 63 | --timestamp=1681301749 \ 64 | --wandb_tag=dev 65 | ``` 66 | 67 | The `train_runner` script contains the procedure to run a full experiment, given a configuration file. This will perform the complete CV routine for all model configurations contained in the config file. Also supports multiprocessing through the `--n_proc` flag. 
For example, 68 | ```bash 69 | python bioblp/benchmarking/train_runner.py \ 70 | --conf conf/dpi-benchmark-cv-20230413.toml \ 71 | --override_data_root=./ \ 72 | --override_run_id=1681398697 \ 73 | --tag=dpi-20230413 \ 74 | --n_proc=5 75 | ``` 76 | 77 | In its current implementations here, the multiprocessing capability conflicts with PyTorch on GPU. For MLP models using GPU, we recommend setting `--n_proc=1`: 78 | ```bash 79 | python bioblp/benchmarking/train_runner.py \ 80 | --conf conf/dpi-benchmark-cv-20230413-mlp.toml \ 81 | --override_data_root=./ \ 82 | --override_run_id=1681398697 \ 83 | --tag=dpi-20230413 \ 84 | --n_proc=1 85 | ``` 86 | 87 | ## WandB logging 88 | 89 | By default logging to WandB is turned off. Change the assignments to `LOG_WANDB = True` in `train.py` for logging. -------------------------------------------------------------------------------- /bioblp/benchmarking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/benchmarking/__init__.py -------------------------------------------------------------------------------- /bioblp/benchmarking/config.py: -------------------------------------------------------------------------------- 1 | 2 | import abc 3 | import toml 4 | import json 5 | 6 | from dataclasses import dataclass, field 7 | from typing import List 8 | from pathlib import Path 9 | 10 | 11 | def load_toml(toml_path: str) -> dict: 12 | toml_path = Path(toml_path) 13 | config = {} 14 | with open(toml_path, "r") as f: 15 | config = toml.load(f) 16 | 17 | return config 18 | 19 | 20 | class ConfigJSONEncoder(json.JSONEncoder): 21 | def default(self, obj): 22 | # add conditional logic for any data structures that require special care 23 | # handling serialisation of Enum objects 24 | if isinstance(obj, Path): 25 | return str(obj.resolve()) 26 | return json.JSONEncoder.default(self, obj) 27 | 28 | 29 | @dataclass 30 | class BenchmarkStepBaseConfig(abc.ABC): 31 | data_root: str 32 | experiment_root: str 33 | run_id: str 34 | outdir: str 35 | 36 | @classmethod 37 | def from_toml(cls, toml_path, run_id): 38 | raise NotImplementedError 39 | 40 | def resolve_outdir(self) -> Path: 41 | outdir = Path(self.data_root)\ 42 | .joinpath(self.experiment_root)\ 43 | .joinpath(self.run_id)\ 44 | .joinpath(self.outdir) 45 | 46 | return outdir 47 | 48 | 49 | @dataclass 50 | class BenchmarkPreprocessConfig(BenchmarkStepBaseConfig): 51 | num_negs_per_pos: int 52 | kg_triples_dir: str 53 | 54 | @classmethod 55 | def from_toml(cls, toml_path: str, run_id: str): 56 | config_toml = load_toml(toml_path) 57 | 58 | cfg = config_toml.get("sampling") 59 | 60 | data_root = config_toml.get("data_root") 61 | experiment_root = config_toml.get("experiment_root") 62 | 63 | cfg.update({"data_root": data_root}) 64 | cfg.update({"experiment_root": experiment_root}) 65 | cfg.update({"run_id": run_id}) 66 | 67 | return cls(**cfg) 68 | 69 | 70 | @dataclass 71 | class BenchmarkFeatureConfig(BenchmarkStepBaseConfig): 72 | transform: str 73 | missing_values: str 74 | encoders: list 75 | encoder_args: dict 76 | 77 | @classmethod 78 | def from_toml(cls, toml_path: str, run_id: str): 79 | conf_path = Path(toml_path) 80 | config_toml = load_toml(conf_path) 81 | 82 | data_root = config_toml.get("data_root") 83 | experiment_root = config_toml.get("experiment_root") 84 | 85 | cfg = config_toml.get("features") 86 | 87 | cfg.update({"data_root": data_root}) 88 | 
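        # The experiment-level settings below (experiment_root, run_id) are attached so that resolve_outdir() can later build this step's output path as data_root/experiment_root/run_id/outdir.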
cfg.update({"experiment_root": experiment_root}) 89 | cfg.update({"run_id": run_id}) 90 | 91 | return cls(**cfg) 92 | 93 | 94 | @dataclass 95 | class BenchmarkSplitConfig(BenchmarkStepBaseConfig): 96 | n_splits: int 97 | 98 | @classmethod 99 | def from_toml(cls, toml_path: str, run_id: str): 100 | conf_path = Path(toml_path) 101 | config_toml = load_toml(conf_path) 102 | 103 | data_root = config_toml.get("data_root") 104 | experiment_root = config_toml.get("experiment_root") 105 | 106 | cfg = config_toml.get("split") 107 | 108 | cfg.update({"data_root": data_root}) 109 | cfg.update({"experiment_root": experiment_root}) 110 | cfg.update({"run_id": run_id}) 111 | 112 | return cls(**cfg) 113 | 114 | 115 | @dataclass 116 | class BenchmarkTrainConfig(BenchmarkStepBaseConfig): 117 | feature_dir: str 118 | splits_dir: str 119 | splits_file: str 120 | models: dict 121 | refit_params: List[str] 122 | n_iter: int = field(default=10, metadata={"help": "Number of HPO trials"}) 123 | 124 | @classmethod 125 | def from_toml(cls, toml_path, run_id): 126 | conf = load_toml(toml_path=toml_path) 127 | cfg = {} 128 | 129 | cfg["models"] = conf.get("models") 130 | 131 | cfg.update(conf.get("train")) 132 | 133 | cfg["data_root"] = conf.get("data_root") 134 | cfg["experiment_root"] = conf.get("experiment_root") 135 | cfg["feature_dir"] = conf.get("features").get("outdir") 136 | cfg["splits_dir"] = conf.get("split").get("outdir") 137 | 138 | cfg.update({"run_id": run_id}) 139 | 140 | return cls(**cfg) 141 | 142 | def resolve_feature_dir(self) -> Path: 143 | feature_dir = Path(self.data_root)\ 144 | .joinpath(self.experiment_root)\ 145 | .joinpath(self.run_id)\ 146 | .joinpath(self.feature_dir) 147 | 148 | return feature_dir 149 | 150 | def resolve_splits_file(self) -> Path: 151 | splits_path = Path(self.data_root)\ 152 | .joinpath(self.experiment_root)\ 153 | .joinpath(self.run_id)\ 154 | .joinpath(self.splits_dir)\ 155 | .joinpath(self.splits_file) 156 | 157 | return splits_path 158 | -------------------------------------------------------------------------------- /bioblp/benchmarking/experiment.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from time import time 3 | from pathlib import Path 4 | from bioblp.benchmarking.preprocess import main as sampling_main 5 | from bioblp.benchmarking.config import BenchmarkPreprocessConfig 6 | 7 | from bioblp.benchmarking.featurise import main as featurise_main 8 | from bioblp.benchmarking.split import main as split_main 9 | 10 | 11 | def run_experiment(args): 12 | 13 | experiment_id = str(int(time())) 14 | 15 | override_data_root = Path( 16 | args.override_data_root) if args.override_data_root is not None else None 17 | 18 | # 19 | # Negative sampling 20 | # 21 | preprocess_config = BenchmarkPreprocessConfig.from_toml( 22 | args.conf, run_id=experiment_id) 23 | 24 | if override_data_root: 25 | preprocess_config.data_root = override_data_root 26 | 27 | sampled_bm_filepath = sampling_main(bm_data_path=args.bm_file, 28 | kg_triples_dir=preprocess_config.kg_triples_dir, 29 | num_negs_per_pos=preprocess_config.num_negs_per_pos, 30 | outdir=preprocess_config.resolve_outdir(), 31 | override_run_id=experiment_id) 32 | # 33 | # Prepare features 34 | # 35 | featurise_main(bm_file=sampled_bm_filepath, 36 | conf=args.conf, 37 | override_data_root=override_data_root, 38 | override_run_id=experiment_id) 39 | # 40 | # Prepare splits 41 | # 42 | split_main(data=sampled_bm_filepath, 43 | conf=args.conf, 44 | 
override_data_root=override_data_root, 45 | override_run_id=experiment_id) 46 | 47 | 48 | def get_parser() -> ArgumentParser: 49 | parser = ArgumentParser( 50 | description="Run full benchmark experiment procedure") 51 | parser.add_argument("--conf", type=str, 52 | help="Path to experiment configuration") 53 | parser.add_argument("--bm_file", type=str, help="Path to benchmark data") 54 | parser.add_argument("--override_data_root", type=str, default=None, 55 | help="Path to root of data tree") 56 | parser.add_argument( 57 | "--n_proc", type=int, default=-1, help="Number of cores to use in process." 58 | ) 59 | parser.add_argument("--tag", type=str, 60 | help="Optional tag to add to wandb runs") 61 | parser.add_argument("--dev_run", action='store_true', 62 | help="Quick dev run") 63 | return parser 64 | 65 | 66 | if __name__ == "__main__": 67 | 68 | args = get_parser().parse_args() 69 | run_experiment(args) 70 | -------------------------------------------------------------------------------- /bioblp/benchmarking/featurise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from argparse import ArgumentParser 8 | from dataclasses import asdict 9 | from functools import reduce 10 | 11 | from torch import Tensor 12 | 13 | from pathlib import Path 14 | from time import time 15 | from tqdm import tqdm 16 | 17 | from typing import Tuple, List, Dict 18 | 19 | from bioblp.logger import get_logger 20 | from bioblp.benchmarking.encoders import get_encoder 21 | from bioblp.benchmarking.encoders import MissingValueMethod 22 | from bioblp.benchmarking.encoders import EntityPairEncoder 23 | from bioblp.benchmarking.encoders import EntityEncoder 24 | from bioblp.benchmarking.encoders import NoiseEncoder 25 | from bioblp.benchmarking.encoders import StructuralPairEncoder 26 | from bioblp.benchmarking.encoders import RandomNoisePairEncoder 27 | from bioblp.benchmarking.encoders import KGEMPairEncoder 28 | from bioblp.benchmarking.config import BenchmarkFeatureConfig, ConfigJSONEncoder 29 | 30 | from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET 31 | from bioblp.benchmarking.encoders import ROTATE, TRANSE, COMPLEX, STRUCTURAL, NOISE, LABEL 32 | 33 | 34 | logger = get_logger(__name__) 35 | 36 | 37 | # 38 | # Building script 39 | # 40 | 41 | 42 | def save_features(outdir: Path, label: str, feature: Tensor, labels: Tensor): 43 | outfile = outdir.joinpath(f"{label}.pt") 44 | 45 | torch_obj = {"X": feature, "y": labels} 46 | torch.save(torch_obj, outfile) 47 | 48 | 49 | def build_encodings(config: BenchmarkFeatureConfig, pairs: np.array, encoders: List[str], 50 | encoder_args: Dict[str, dict], entities_filter: List[str]) -> Tuple[str, Tensor, Tensor]: 51 | encoded_bm = [] 52 | 53 | for encoder_i_label in tqdm(encoders, desc=f"Encoding benchmarks..."): 54 | logger.info(f"Encoding with {encoder_i_label}") 55 | encoder_i_args = encoder_args.get(encoder_i_label) 56 | 57 | pair_encoder = get_encoder(encoder_i_label, 58 | encoder_i_args, 59 | entities=entities_filter) 60 | 61 | missing_value_method = MissingValueMethod(config.missing_values) 62 | 63 | encoded_pairs, encoded_mask = pair_encoder.encode(pairs, 64 | missing_value=missing_value_method, 65 | transform=config.transform) 66 | 67 | encoded_bm.append((encoder_i_label, encoded_pairs, encoded_mask)) 68 | return encoded_bm 69 | 70 | 71 | def apply_common_mask(encoded_bm: List[Tuple[str, Tensor, Tensor]], labels: Tensor) -> 
Tuple[List[Tuple[str, Tensor]], Tensor]: 72 | logger.info("Masking features...") 73 | 74 | all_masks = [x[2] for x in encoded_bm] 75 | common_mask = torch.from_numpy(reduce(np.intersect1d, all_masks)) 76 | 77 | logger.info(f"size after common mask {len(common_mask)}") 78 | 79 | masked_encoded_bm = [] 80 | for enc_label, enc_pairs, _ in encoded_bm: 81 | masked_enc_pairs = enc_pairs[common_mask] 82 | masked_encoded_bm.append((enc_label, masked_enc_pairs)) 83 | 84 | masked_labels = labels[common_mask] 85 | 86 | return masked_encoded_bm, masked_labels 87 | 88 | 89 | def main(bm_file: str, conf: str, override_data_root=None, override_run_id=None): 90 | 91 | run_id = override_run_id or str(int(time())) 92 | 93 | config = BenchmarkFeatureConfig.from_toml(conf, run_id=run_id) 94 | 95 | if override_data_root is not None: 96 | config.data_root = override_data_root 97 | 98 | logger.info( 99 | f"Running process with config: {config} at time {run_id}...") 100 | 101 | # load benchmark data 102 | # here entities are strings 103 | 104 | bm_df = pd.read_csv(bm_file, sep='\t', names=[ 105 | COL_SOURCE, COL_EDGE, COL_TARGET, LABEL], header=0) 106 | 107 | pairs = bm_df[[COL_SOURCE, COL_TARGET]].values 108 | all_entities = np.unique(np.ravel(pairs)).tolist() 109 | 110 | labels = torch.from_numpy(bm_df[LABEL].values) 111 | 112 | # perform encodings 113 | encoded_bm = build_encodings(config=config, 114 | pairs=pairs, 115 | encoders=config.encoders, 116 | encoder_args=config.encoder_args, 117 | entities_filter=all_entities) 118 | 119 | # add plain benchmark data too 120 | encoded_bm.append(("raw", pairs, np.arange(len(pairs)))) 121 | 122 | # common mask only when dropping missing embeddings 123 | if config.missing_values == MissingValueMethod.DROP.value: 124 | masked_encoded_bm, masked_labels = apply_common_mask( 125 | encoded_bm, labels) 126 | else: 127 | masked_encoded_bm = [(x[0], x[1]) for x in encoded_bm] 128 | masked_labels = labels 129 | 130 | feature_outdir = config.resolve_outdir() 131 | 132 | feature_outdir.mkdir(parents=True, exist_ok=True) 133 | 134 | logger.info(f"Saving features to {feature_outdir}...") 135 | 136 | for enc_label, enc_pairs in masked_encoded_bm: 137 | logger.info( 138 | f"Saving {enc_label} features with shape: {enc_pairs.shape}") 139 | save_features(outdir=feature_outdir, 140 | label=enc_label, 141 | feature=enc_pairs, 142 | labels=masked_labels) 143 | 144 | with open(feature_outdir.joinpath("config.json"), "w") as f: 145 | cfg_dict = asdict(config) 146 | json.dump(cfg_dict, f, cls=ConfigJSONEncoder) 147 | 148 | 149 | def get_parser() -> ArgumentParser: 150 | parser = ArgumentParser( 151 | description="Generate features for benchmark datasets") 152 | parser.add_argument("--conf", type=str, 153 | help="Path to experiment configuration") 154 | parser.add_argument("--bm_file", type=str, help="Path to benchmark data") 155 | parser.add_argument("--override_data_root", type=str, 156 | help="Path to root of data tree") 157 | parser.add_argument("--override_run_id", type=str, 158 | help="Override run_id") 159 | 160 | return parser 161 | 162 | 163 | if __name__ == "__main__": 164 | 165 | args = get_parser().parse_args() 166 | 167 | main(**vars(args)) 168 | -------------------------------------------------------------------------------- /bioblp/benchmarking/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from argparse import ArgumentParser 4 | from pathlib import Path 5 | from pykeen.sampling import 
PseudoTypedNegativeSampler 6 | from pykeen.triples import TriplesFactory 7 | 8 | from time import time 9 | from typing import Union 10 | 11 | from bioblp.logger import get_logger 12 | from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET 13 | 14 | logger = get_logger(__name__) 15 | COL_LABEL = 'label' 16 | 17 | 18 | def generate_negative_triples(pos_triples: TriplesFactory, 19 | filtered=True, 20 | num_negs_per_pos=1): 21 | 22 | neg_sampler = PseudoTypedNegativeSampler(mapped_triples=pos_triples.mapped_triples, 23 | filtered=filtered, 24 | num_negs_per_pos=num_negs_per_pos) 25 | pos_batch = pos_triples.mapped_triples 26 | neg_triples = neg_sampler.sample(pos_batch)[0] 27 | 28 | return neg_triples 29 | 30 | 31 | def prepare_dpi_samples(pos_df, 32 | num_negs_per_pos: Union[None, int, str] = 1, 33 | entity_to_id_map: Union[None, dict] = None, 34 | relation_to_id_map: Union[None, dict] = None, 35 | # map_to_kgem_ids=False, 36 | filtered=True): 37 | """ 38 | pos_df -> Expects dataframe with true positives in format ['src', edge', 'tgt'], 39 | where the entities and relations of the triple are in their string ids. 40 | These will be converted to KGEM integer ids at a later state 41 | """ 42 | pos_neg_df = pos_df.copy() 43 | pos_triples = TriplesFactory.from_labeled_triples(pos_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, 44 | entity_to_id=entity_to_id_map, 45 | relation_to_id=relation_to_id_map) 46 | 47 | # returns a tensor object 48 | neg_triples = generate_negative_triples(pos_triples, 49 | num_negs_per_pos=num_negs_per_pos, 50 | filtered=filtered) 51 | 52 | # convert to mapped triples 53 | neg_triples_ = pos_triples.clone_and_exchange_triples( 54 | neg_triples.view(-1, 3)) 55 | neg_df = pd.DataFrame(neg_triples_.triples, columns=[ 56 | COL_SOURCE, COL_EDGE, COL_TARGET]) 57 | 58 | # add labels 59 | pos_neg_df[COL_LABEL] = 1 60 | neg_df[COL_LABEL] = 0 61 | 62 | pos_neg_df = pd.concat([pos_neg_df, neg_df], axis=0, ignore_index=True) 63 | return pos_neg_df 64 | 65 | 66 | def main(bm_data_path: str, kg_triples_dir: str, outdir: str, num_negs_per_pos: int = 1, override_run_id=None): 67 | 68 | start = time() 69 | run_id = override_run_id or int(start) 70 | 71 | bm_data_path = Path(bm_data_path) 72 | kg_triples_dir = Path(kg_triples_dir) 73 | outdir = Path(outdir) 74 | outdir.mkdir(parents=True, exist_ok=True) 75 | 76 | num_negs_per_pos = num_negs_per_pos 77 | bm_dataset_name = bm_data_path.name.split('.tsv')[0] 78 | 79 | training_triples = TriplesFactory.from_path_binary(kg_triples_dir) 80 | entity_to_id_map = training_triples.entity_to_id 81 | relation_to_id_map = training_triples.relation_to_id 82 | 83 | # load the benchmark data 84 | bm_df = pd.read_csv(bm_data_path, sep='\t', names=[ 85 | COL_SOURCE, COL_EDGE, COL_TARGET]) 86 | 87 | # generate neg samples and prepare pos-neg pairs 88 | logger.info( 89 | f'Generating negative samples corresponding to benchmark triples') 90 | pos_neg_df = prepare_dpi_samples(bm_df, 91 | entity_to_id_map=entity_to_id_map, 92 | relation_to_id_map=relation_to_id_map, 93 | num_negs_per_pos=num_negs_per_pos) 94 | 95 | # save to disk 96 | bm_postprocessed_path = outdir.joinpath( 97 | f"{bm_dataset_name}_p2n-1-{num_negs_per_pos}.tsv") 98 | logger.info(f'Writing preprocessed data to {bm_postprocessed_path}') 99 | pos_neg_df.to_csv(bm_postprocessed_path, sep='\t') 100 | logger.info('Done!') 101 | 102 | return str(bm_postprocessed_path.resolve()) 103 | 104 | 105 | if __name__ == "__main__": 106 | 107 | parser = ArgumentParser( 108 | description="Preprocess benchmark 
triples (E.g. DPI data) for downstream prediction task") 109 | parser.add_argument("--bm_data_path", type=str, 110 | help="Path to pick up benchmark data") 111 | parser.add_argument("--kg_triples_dir", type=str, 112 | help="Directory housing kg positive triples. Needed to generate negative samples") 113 | parser.add_argument("--num_negs_per_pos", type=int, 114 | help="Number of negative samples to generate per positive instance") 115 | parser.add_argument("--outdir", type=str, 116 | help="Path to data dir to write output") 117 | parser.add_argument("--override_run_id", type=str, 118 | help="Run id of experiment") 119 | args = parser.parse_args() 120 | main(**vars(args)) 121 | -------------------------------------------------------------------------------- /bioblp/benchmarking/split.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from argparse import ArgumentParser 5 | from pathlib import Path 6 | 7 | from sklearn.model_selection import StratifiedKFold 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | from bioblp.benchmarking.train_utils import load_feature_data 12 | from bioblp.logger import get_logger 13 | from bioblp.benchmarking.config import BenchmarkSplitConfig 14 | 15 | from typing import Union, Tuple, Dict, List 16 | 17 | RANDOM_STATE = 12 18 | 19 | logger = get_logger(__name__) 20 | 21 | 22 | def get_splits_iter(splits_path): 23 | def splits_iterable(): 24 | splits_data = torch.load(splits_path) 25 | n = len(splits_data) 26 | 27 | num = 0 28 | while num < n: 29 | fold_data = splits_data[num] 30 | yield (fold_data["split_idx"], fold_data["train_idx"], fold_data["test_idx"]) 31 | num += 1 32 | 33 | return splits_iterable 34 | 35 | 36 | def get_split_struct(train, test, idx) -> dict: 37 | return { 38 | "train_idx": train, 39 | "test_idx": test, 40 | "split_idx": str(idx) 41 | } 42 | 43 | 44 | def load_split(splits_file: Path, split_idx: int) -> Tuple[np.array, np.array]: 45 | 46 | splits_data = torch.load(splits_file) 47 | 48 | fold_splits = splits_data[split_idx] 49 | train_idx = fold_splits["train_idx"] 50 | test_idx = fold_splits["test_idx"] 51 | fold_idx = fold_splits["split_idx"] 52 | 53 | return (fold_idx, train_idx, test_idx) 54 | 55 | 56 | def main(data, n_folds=None, outdir=None, conf=None, override_data_root=None, override_run_id=None): 57 | 58 | if conf is not None: 59 | config = BenchmarkSplitConfig.from_toml(conf, run_id=override_run_id) 60 | if override_data_root is not None: 61 | config.data_root = override_data_root 62 | 63 | n_folds = config.n_splits 64 | data_path = Path(data) 65 | outdir = config.resolve_outdir() 66 | else: 67 | data_path = Path(data) 68 | outdir = Path(outdir) 69 | 70 | outdir.mkdir(parents=True, exist_ok=True) 71 | 72 | # load raw benchmark data 73 | X_bm, y_bm = load_feature_data(data_path) 74 | 75 | # generate train-test split 76 | logger.info("Generating train test split.") 77 | 78 | X_indices = torch.arange(len(X_bm)) 79 | 80 | train_idx, test_idx, _, _ = train_test_split( 81 | X_indices, y_bm, test_size=0.1, stratify=y_bm, random_state=RANDOM_STATE) 82 | 83 | split_data = {0: get_split_struct(train_idx, test_idx, idx=0)} 84 | train_test_split_file = outdir.joinpath("train-test-split.pt") 85 | torch.save(split_data, train_test_split_file) 86 | 87 | # generate cv splits 88 | logger.info("Generating cv splits.") 89 | 90 | cv = StratifiedKFold( 91 | n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE 92 | ) 93 | splits = [(train, test, 
idx) 94 | for idx, (train, test) in enumerate(cv.split(X_bm, y_bm))] 95 | 96 | cv_data = {x[2]: get_split_struct(x[0], x[1], x[2]) for x in splits} 97 | 98 | cv_split_file = outdir.joinpath("cv-splits.pt") 99 | torch.save(cv_data, cv_split_file) 100 | 101 | logger.info("Done.") 102 | 103 | 104 | if __name__ == "__main__": 105 | 106 | parser = ArgumentParser( 107 | description="Preprocess benchmark triples (E.g. DPI data) for downstream prediction task") 108 | 109 | parser.add_argument("--conf", type=str, default=None, 110 | help="Path to config file") 111 | parser.add_argument("--data", type=str, 112 | help="Path to pick up benchmark data") 113 | parser.add_argument("--n_folds", type=int, default=None, 114 | help="Number of cv folds to produce") 115 | parser.add_argument("--outdir", type=str, default=None, 116 | help="Path to data dir to write output") 117 | parser.add_argument("--override_data_root", type=str, 118 | help="Path to root of data tree") 119 | parser.add_argument("--override_run_id", type=str, 120 | help="Override run_id") 121 | args = parser.parse_args() 122 | main(**vars(args)) 123 | -------------------------------------------------------------------------------- /bioblp/benchmarking/train_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import string 3 | import numpy as np 4 | import random as rn 5 | 6 | from pathlib import Path 7 | 8 | 9 | from sklearn.metrics import roc_auc_score 10 | from sklearn.metrics import precision_score 11 | from sklearn.metrics import recall_score 12 | from sklearn.metrics import fbeta_score 13 | from sklearn.metrics import make_scorer 14 | from sklearn.metrics import accuracy_score 15 | from sklearn.metrics import precision_recall_curve 16 | from sklearn.metrics import roc_curve 17 | from sklearn.metrics import auc 18 | from sklearn.metrics import confusion_matrix 19 | 20 | 21 | from sklearn.model_selection import train_test_split 22 | 23 | from typing import Union, Tuple 24 | 25 | from bioblp.logger import get_logger 26 | 27 | 28 | logger = get_logger(__name__) 29 | 30 | 31 | def get_random_string(length): 32 | # choose from all lowercase letter 33 | characters = string.ascii_lowercase + string.digits 34 | result_str = "".join(rn.choice(characters) for i in range(length)) 35 | 36 | return result_str 37 | 38 | 39 | def unique_study_prefix(): 40 | unique_string = get_random_string(8) 41 | return unique_string 42 | 43 | 44 | def generate_study_name(prefix, model, fold): 45 | return f"{prefix}-{model}-{fold}" 46 | 47 | 48 | def aupr_score(y_true, y_pred): 49 | """Use AUC function to calculate the area under the curve of precision recall curve""" 50 | precision, recall, thresholds = precision_recall_curve(y_true, y_pred) 51 | return auc(recall, precision) 52 | 53 | 54 | def get_auc_scorers(): 55 | scorers = { 56 | "PRCURVE": make_scorer(precision_recall_curve, needs_proba=True), 57 | "ROCCURVE": make_scorer(roc_curve, needs_proba=True), 58 | "CM": make_scorer(confusion_matrix, needs_proba=False) 59 | } 60 | return scorers 61 | 62 | 63 | def get_scorers(): 64 | scorers = { 65 | "AUCROC": make_scorer(roc_auc_score, needs_proba=True), 66 | "f1": make_scorer(fbeta_score, beta=1, average="micro"), 67 | "precision": make_scorer(precision_score), 68 | "recall": make_scorer(recall_score), 69 | "accuracy": make_scorer(accuracy_score), 70 | "AUCPR": make_scorer(aupr_score, needs_proba=True), 71 | } 72 | return scorers 73 | 74 | 75 | def get_model_label(feature: str, model: str): 76 | return 
f"{feature}__{model}" 77 | 78 | 79 | def load_feature_data(feat_path: Union[str, Path], dev_run: bool = False) -> Tuple[np.array, np.array]: 80 | """ Load feature data into numpy arrays 81 | 82 | Parameters 83 | ---------- 84 | feat_path : Union[str, Path] 85 | Filepath to feature, eg 'features/rotate.pt' 86 | dev_run : bool, optional 87 | Flag to subsample data for development only, by default False 88 | 89 | Returns 90 | ------- 91 | Tuple[np.array, np.array] 92 | Return (features, labels) 93 | """ 94 | logger.info("Loading training data...") 95 | 96 | data = torch.load(feat_path) 97 | 98 | X = data.get("X") 99 | y = data.get("y") 100 | 101 | if torch.is_tensor(X): 102 | X = X.detach().numpy() 103 | y = y.detach().numpy() 104 | 105 | if dev_run: 106 | X, _, y, _ = train_test_split( 107 | X, y, stratify=y, train_size=0.1, random_state=12) 108 | 109 | logger.info( 110 | "Resulting shapes X: {}, y: {}".format( 111 | X.shape, y.shape) 112 | ) 113 | logger.info("Counts in y: {}".format( 114 | np.unique(y, return_counts=True))) 115 | 116 | return X, y 117 | 118 | 119 | def validate_features_exist(feature_dir: Path, models_conf: dict) -> bool: 120 | """ Check if all feature files exist in directory 121 | 122 | Parameters 123 | ---------- 124 | feature_dir : Path 125 | Path to feature location 126 | models_conf : dict 127 | Definition of model and feature. 128 | 129 | Returns 130 | ------- 131 | bool 132 | True if features are present. 133 | """ 134 | exists = {} 135 | 136 | all_features = list(set([v.get("feature") 137 | for _, v in models_conf.items()])) 138 | 139 | for feat in all_features: 140 | exists[feat] = feature_dir.joinpath(f"{feat}.pt").is_file() 141 | 142 | logger.info(f"Validated that features exist: {exists}..") 143 | 144 | missing = [k for k, v in exists.items() if v is False] 145 | if len(missing) > 0: 146 | logger.warning(f"Missing features {missing}!!") 147 | 148 | return all([v for _, v in exists.items()]) 149 | -------------------------------------------------------------------------------- /bioblp/data.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pandas as pd 3 | from bioblp.logger import get_logger 4 | from pykeen.triples import TriplesFactory 5 | 6 | #logger = get_logger(__name__) 7 | 8 | COL_SOURCE = 'src' 9 | COL_EDGE = 'edg' 10 | COL_TARGET = 'tgt' 11 | 12 | 13 | def create_random_splits(triples: pd.DataFrame, train_ratio: float, valid_ratio: float, test_ratio: float): 14 | """Create train/valid/test based on random strategy 15 | """ 16 | triples_array = triples[[COL_SOURCE, COL_EDGE, COL_TARGET]].values 17 | 18 | triples_factory = TriplesFactory.from_labeled_triples(triples_array) 19 | 20 | train, valid, test = triples_factory.split([train_ratio, valid_ratio, test_ratio], random_state=2021) 21 | 22 | train_triples = pd.DataFrame(train.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET]) 23 | valid_triples = pd.DataFrame(valid.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET]) 24 | test_triples = pd.DataFrame(test.triples, columns=[COL_SOURCE, COL_EDGE, COL_TARGET]) 25 | 26 | return train_triples, valid_triples, test_triples 27 | 28 | 29 | def save_splits(train, test, valid, dataset_name, out_dir): 30 | out_dir = Path(out_dir) 31 | out_dir.mkdir(exist_ok=True, parents=True) 32 | 33 | train.to_csv(out_dir.joinpath(f"{dataset_name}-train.tsv"), sep='\t', index=None) 34 | test.to_csv(out_dir.joinpath(f"{dataset_name}-test.tsv"), sep='\t', index=None) 35 |
valid.to_csv(out_dir.joinpath(f"{dataset_name}-valid.tsv"), sep='\t', index=None) 36 | print(f"saved to {out_dir}") 37 | 38 | 39 | def load_splits(dataset: str, data_path: str, dev_sample=False) -> (TriplesFactory, TriplesFactory, TriplesFactory): 40 | data_path = Path(data_path) 41 | 42 | training_path = data_path.joinpath(f"{dataset}-train.tsv") 43 | valid_path = data_path.joinpath(f"{dataset}-valid.tsv") 44 | test_path = data_path.joinpath(f"{dataset}-test.tsv") 45 | 46 | train_df = pd.read_csv(training_path, index_col=None, sep="\t", dtype=str) 47 | valid_df = pd.read_csv(valid_path, index_col=None, sep="\t", dtype=str) 48 | test_df = pd.read_csv(test_path, index_col=None, sep="\t", dtype=str) 49 | 50 | if dev_sample: 51 | dev_frac = 0.01 52 | train_df = train_df.sample(frac=dev_frac, random_state=2021) 53 | valid_df = valid_df.sample(frac=dev_frac, random_state=2021) 54 | test_df = test_df.sample(frac=dev_frac, random_state=2021) 55 | 56 | training = TriplesFactory.from_labeled_triples(train_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values) 57 | valid = TriplesFactory.from_labeled_triples( 58 | valid_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, entity_to_id=training.entity_to_id, 59 | relation_to_id=training.relation_to_id) 60 | test = TriplesFactory.from_labeled_triples( 61 | test_df[[COL_SOURCE, COL_EDGE, COL_TARGET]].values, entity_to_id=training.entity_to_id, 62 | relation_to_id=training.relation_to_id) 63 | 64 | return training, valid, test -------------------------------------------------------------------------------- /bioblp/evaluate.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | import numpy as np 4 | from pykeen.evaluation import RankBasedEvaluator, RankBasedMetricResults 5 | from pykeen.evaluation.rank_based_evaluator import _iter_ranks 6 | from pykeen.triples import TriplesFactory 7 | from tap import Tap 8 | import torch 9 | 10 | 11 | class Arguments(Tap): 12 | model_path: str 13 | 14 | 15 | class SavedRanksEvaluator(RankBasedEvaluator): 16 | def __init__(self, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | self.saved_ranks = None 19 | 20 | def finalize(self) -> RankBasedMetricResults: 21 | if self.num_entities is None: 22 | raise ValueError 23 | 24 | result = RankBasedMetricResults.from_ranks( 25 | metrics=self.metrics, 26 | rank_and_candidates=_iter_ranks(ranks=self.ranks, num_candidates=self.num_candidates), 27 | ) 28 | 29 | self.saved_ranks = self.ranks.copy() 30 | self.ranks.clear() 31 | self.num_candidates.clear() 32 | 33 | return result 34 | 35 | 36 | def get_triple_ranks(args: Arguments): 37 | model_file = osp.join(args.model_path, 'trained_model.pkl') 38 | 39 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 40 | 41 | model = torch.load(model_file).to(device) 42 | train = TriplesFactory.from_path_binary(osp.join(args.model_path, 43 | 'training_triples')) 44 | 45 | graph_path = osp.join('data', 'biokgb', 'graph') 46 | valid_triples = 'biokg.links-valid.csv' 47 | test_triples = 'biokg.links-test.csv' 48 | 49 | valid, test = [TriplesFactory.from_path(osp.join(graph_path, f), 50 | entity_to_id=train.entity_to_id, 51 | relation_to_id=train.relation_to_id) 52 | for f in (valid_triples, test_triples)] 53 | 54 | evaluator = SavedRanksEvaluator(filtered=True) 55 | evaluator.evaluate(model, 56 | test.mapped_triples, 57 | additional_filter_triples=[train.mapped_triples, 58 | valid.mapped_triples]) 59 | 60 | head_ranks = evaluator.saved_ranks[('head', 'realistic')] 61 | 
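    # 'realistic' is PyKEEN's tie-aware rank (the mean of the optimistic and pessimistic ranks); head and tail ranks are concatenated below before being written to ranks.csv.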
tail_ranks = evaluator.saved_ranks[('tail', 'realistic')] 62 | ranks = np.concatenate(head_ranks + tail_ranks) 63 | # Save ranks to a csv file, specifying the integer format 64 | np.savetxt(osp.join(args.model_path, 'ranks.csv'), ranks, fmt='%d') 65 | 66 | 67 | if __name__ == '__main__': 68 | get_triple_ranks(Arguments().parse_args()) 69 | -------------------------------------------------------------------------------- /bioblp/loaders/preprocessors.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, Mapping 2 | 3 | from transformers import BertTokenizer 4 | import torch 5 | from torch import Tensor 6 | from torch.nn.utils.rnn import pad_sequence 7 | from tqdm import tqdm 8 | import numpy as np 9 | 10 | 11 | class EntityPropertyPreprocessor: 12 | """Abstract class for preprocessing entity properties of different types 13 | into tensors suitable for machine learning wizardry.""" 14 | def preprocess_file(self, file_path: str, 15 | entity_to_id: Mapping[str, int] 16 | ) -> Tuple[Tensor, Tensor, Tensor]: 17 | """Read a file of entity properties, with one entity per line. 18 | Expects at each line an entity name, a tab, and a property to be 19 | encoded. 20 | 21 | Args: 22 | file_path: file mapping entities to properties 23 | entity_to_id: maps an entity name to an integer ID 24 | 25 | Returns: 26 | entity_ids: torch.Tensor containing entity IDs read by the method 27 | rows: torch.Tensor mapping each entity in entity_ids to a row in 28 | data 29 | data: torch.Tensor containing data for each entity in entity_ids 30 | """ 31 | raise NotImplementedError 32 | 33 | 34 | class TextEntityPropertyPreprocessor(EntityPropertyPreprocessor): 35 | """Preprocessor for entities with textual descriptions""" 36 | def __init__(self, tokenizer: BertTokenizer, max_length: int): 37 | self.tokenizer = tokenizer 38 | self.max_length = max_length 39 | 40 | def preprocess_file(self, file_path: str, 41 | entity_to_id: Mapping[str, int] 42 | ) -> Tuple[Tensor, Tensor, Tensor]: 43 | all_tokens = [] 44 | entity_ids = [] 45 | rows = [] 46 | row_count = 0 47 | with open(file_path) as file: 48 | for i, line in enumerate(tqdm(file, desc=f'Encoding {file_path}')): 49 | tab_idx = line.find('\t') 50 | entity, text = line[:tab_idx], line[tab_idx:].strip() 51 | 52 | if entity in entity_to_id: 53 | tokens = self.tokenizer.encode(text, 54 | max_length=self.max_length, 55 | truncation=True, 56 | padding='max_length', 57 | return_tensors='pt') 58 | all_tokens.append(tokens) 59 | entity_id = entity_to_id[entity] 60 | entity_ids.append(entity_id) 61 | rows.append(row_count) 62 | row_count += 1 63 | 64 | if len(all_tokens) > 0: 65 | all_tokens = torch.cat(all_tokens, dim=0) 66 | else: 67 | all_tokens = torch.tensor([], dtype=torch.long) 68 | 69 | return (torch.tensor(entity_ids, dtype=torch.long), 70 | torch.tensor(rows, dtype=torch.long), 71 | all_tokens) 72 | 73 | 74 | class MolecularFingerprintPreprocessor(EntityPropertyPreprocessor): 75 | """Preprocessor for molecules with known molecular fingerprints""" 76 | def preprocess_file(self, file_path: str, 77 | entity_to_id: Mapping[str, int] 78 | ) -> Tuple[Tensor, Tensor, Tensor]: 79 | all_fprints = [] 80 | entity_ids = [] 81 | rows = [] 82 | row_count = 0 83 | with open(file_path) as file: 84 | for i, line in enumerate(tqdm(file, desc=f'Encoding {file_path}')): 85 | tab_idx = line.find('\t') 86 | entity, fprint = line[:tab_idx], line[tab_idx:].strip() 87 | 88 | if entity in entity_to_id: 89 | fprint = 
torch.tensor(np.array(list(fprint), dtype=float), dtype=torch.float) 90 | all_fprints.append(fprint) 91 | entity_id = entity_to_id[entity] 92 | entity_ids.append(entity_id) 93 | rows.append(row_count) 94 | row_count += 1 95 | 96 | return (torch.tensor(entity_ids, dtype=torch.long), 97 | torch.tensor(rows, dtype=torch.long), 98 | torch.stack(all_fprints, dim=0)) 99 | 100 | 101 | class PretrainedEmbeddingPreprocessor(EntityPropertyPreprocessor): 102 | def preprocess_file(self, file_path: str, 103 | entity_to_id: Mapping[str, int] 104 | ) -> Tuple[Tensor, Tensor, Tensor]: 105 | data_dict = torch.load(file_path) 106 | entity_to_row = data_dict['identifiers'] 107 | 108 | entity_ids = [] 109 | data = [] 110 | for entity, row in entity_to_row.items(): 111 | if entity in entity_to_id: 112 | entity_ids.append(entity_to_id[entity]) 113 | data.append(entity_to_row[entity]) 114 | 115 | entity_ids = torch.tensor(entity_ids, dtype=torch.long) 116 | data_idx = torch.arange(len(entity_ids)) 117 | data = torch.tensor(data, dtype=torch.long) 118 | 119 | return entity_ids, data_idx, data 120 | 121 | 122 | class MoleculeEmbeddingPreprocessor(EntityPropertyPreprocessor): 123 | def preprocess_file(self, file_path: str, 124 | entity_to_id: Mapping[str, int] 125 | ) -> Tuple[Tensor, Tensor, Tensor]: 126 | """Load embeddings for all the molecules we need, putting them 127 | in a single tensor that can be used to retrieve embeddings during 128 | training. Since molecules have variable length we use padding with 129 | a value of -1000 before placing them all inside a single 3D tensor 130 | of shape (N, L, D) where N is the number of molecules, 131 | L the maximum molecule length, and D the embedding dimension""" 132 | data_dict = torch.load(file_path) 133 | 134 | entity_ids = [] 135 | data = [] 136 | for molecule, embeddings in data_dict.items(): 137 | if molecule in entity_to_id: 138 | entity_ids.append(entity_to_id[molecule]) 139 | data.append(embeddings) 140 | 141 | entity_ids = torch.tensor(entity_ids, dtype=torch.long) 142 | data = pad_sequence(data, batch_first=True, padding_value=-10_000) 143 | data_idx = torch.arange(len(entity_ids)) 144 | 145 | return entity_ids, data_idx, data 146 | -------------------------------------------------------------------------------- /bioblp/logger.py: -------------------------------------------------------------------------------- 1 | import logging as lg 2 | 3 | 4 | def get_logger(logger_name=''): 5 | """Get a default logger that includes a timestamp.""" 6 | logger = lg.getLogger(logger_name) 7 | logger.handlers = [] 8 | ch = lg.StreamHandler() 9 | str_fmt = '%(asctime)s - %(levelname)s - %(name)s - %(message)s' 10 | formatter = lg.Formatter(str_fmt, datefmt='%H:%M:%S') 11 | ch.setFormatter(formatter) 12 | logger.addHandler(ch) 13 | logger.setLevel('INFO') 14 | 15 | return logger 16 | -------------------------------------------------------------------------------- /bioblp/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .bioblp import * 2 | -------------------------------------------------------------------------------- /bioblp/models/bioblp.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from typing import Optional 3 | 4 | import pykeen.models 5 | from pykeen.nn.representation import Embedding as PyKEmbedding 6 | from pykeen.typing import InductiveMode 7 | import torch 8 | 9 | from bioblp.models.encoders import PropertyEncoderRepresentation 10 | 11 
| 12 | class BioBLP: 13 | def __init__(self, *, 14 | entity_representations: PropertyEncoderRepresentation, 15 | from_checkpoint: str = None, 16 | **kwargs): 17 | self.from_checkpoint = from_checkpoint 18 | 19 | super().__init__(**kwargs) 20 | 21 | entity_embedding_lut = self.entity_representations[0] 22 | entity_embedding_lut: PyKEmbedding 23 | 24 | entity_representations.wrap_lookup_table(entity_embedding_lut) 25 | self.property_encoder = entity_representations 26 | 27 | def reset_parameters_(self): 28 | super().reset_parameters_() 29 | if self.from_checkpoint: 30 | checkpoint = torch.load(osp.join(self.from_checkpoint, 31 | 'trained_model.pkl'), 32 | map_location='cpu') 33 | self.load_state_dict(checkpoint.state_dict(), strict=False) 34 | 35 | def score_hrt_and_negatives(self, 36 | hrt_batch: torch.LongTensor, 37 | num_negatives: int, 38 | *, mode: Optional[InductiveMode] = None 39 | ) -> tuple[torch.FloatTensor, torch.FloatTensor]: 40 | batch_size = hrt_batch.shape[0] 41 | 42 | h, r, t = self._get_representations(h=hrt_batch[:, 0], 43 | r=hrt_batch[:, 1], 44 | t=hrt_batch[:, 2], mode=mode) 45 | positive_scores = self.interaction.score_hrt(h=h, r=r, t=t) 46 | 47 | num_ents = batch_size * 2 48 | idx = torch.arange(num_ents).reshape(batch_size, 2) 49 | 50 | # For each row, sample entities, assigning 0 probability to entities 51 | # of the same row 52 | zeros = torch.zeros(batch_size, 2) 53 | head_weights = torch.ones(batch_size, num_ents, dtype=torch.float) 54 | head_weights.scatter_(1, idx, zeros) 55 | random_idx = head_weights.multinomial(num_negatives, replacement=True) 56 | random_idx = random_idx.t().flatten() 57 | 58 | # Select randomly the first or the second column 59 | row_selector = torch.arange(batch_size * num_negatives) 60 | col_selector = torch.randint(0, 2, [batch_size * num_negatives]) 61 | 62 | # Fill the array of negative samples with the sampled random entities 63 | # at the right positions 64 | neg_idx = idx.repeat((num_negatives, 1)) 65 | neg_idx[row_selector, col_selector] = random_idx 66 | # neg_idx = neg_idx.reshape(-1, batch_size, 2) 67 | # neg_idx.transpose_(0, 1) 68 | 69 | neg_embs = torch.stack([h, r], dim=1).view(batch_size * 2, -1) 70 | neg_embs = neg_embs[neg_idx.to(neg_embs.device)] 71 | h_neg, t_neg = neg_embs[:, 0], neg_embs[:, 1] 72 | 73 | r_neg_idx = torch.arange(batch_size).repeat(num_negatives) 74 | r_neg = r[r_neg_idx.to(r.device)] 75 | 76 | negative_scores = self.interaction.score_hrt(h=h_neg, r=r_neg, t=t_neg) 77 | negative_scores = negative_scores.reshape(batch_size, num_negatives) 78 | 79 | return positive_scores, negative_scores 80 | 81 | 82 | class BioBLPTransE(BioBLP, pykeen.models.TransE): 83 | ... 84 | 85 | 86 | class BioBLPComplEx(BioBLP, pykeen.models.ComplEx): 87 | ... 88 | 89 | 90 | class BioBLPRotatE(BioBLP, pykeen.models.RotatE): 91 | ... 
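# Usage sketch (illustrative, not part of the module): these classes are not
# instantiated directly; bioblp/train.py resolves one of them through
# get_model_class() below and lets pykeen.pipeline.pipeline construct it, roughly:
#
#     model_cls = get_model_class('complex')           # -> BioBLPComplEx
#     pipeline(model=model_cls,
#              model_kwargs={'embedding_dim': 256,
#                            'loss': 'crossentropy',
#                            'entity_representations': encoders},
#              ...)
#
# where `encoders` is the PropertyEncoderRepresentation built by
# bioblp.utils.bioblp_utils.build_encoders.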
92 | 93 | 94 | MODELS_DICT = { 95 | 'transe': BioBLPTransE, 96 | 'complex': BioBLPComplEx, 97 | 'rotate': BioBLPRotatE 98 | } 99 | 100 | 101 | def get_model_class(model_name: str): 102 | if model_name in MODELS_DICT: 103 | return MODELS_DICT[model_name] 104 | else: 105 | raise ValueError(f'Unknown model f{model_name}') 106 | 107 | -------------------------------------------------------------------------------- /bioblp/predict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/predict.py -------------------------------------------------------------------------------- /bioblp/preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pandas as pd 3 | import numpy as np 4 | import bio_embeddings 5 | from bio_embeddings.embed import SeqVecEmbedder, ProtTransBertBFDEmbedder, prottrans_t5_embedder, esm_embedder 6 | 7 | 8 | # Here we can change the Protein Embedder to w/e we want from the above. 9 | # TODO: An experiment with t5 embedding 10 | prot_trans_embedder = ProtTransBertBFDEmbedder() 11 | 12 | 13 | def get_protein_repr(amino_repr): 14 | """ Here we need to go from a collection of amino-acid embeddings to a full protein embedding 15 | 16 | # Example: 17 | # 18 | # M : (1,1024) 19 | # A : (1,1024) 20 | # S : (1,1024) 21 | # 22 | # Output: An aggregated representation for proteins 23 | # 24 | # Type: Dict(protein_id: (embedding)) 25 | # 26 | e.g Dict(: (LENG8_MOUSE, 1024)) """ 27 | 28 | emb_matrix = torch.Tensor(amino_repr) 29 | 30 | # We average over columns 31 | protein_emb = torch.mean(emb_matrix, dim=0) 32 | 33 | return protein_emb 34 | 35 | 36 | def get_protein_embedding(path, embedder="prottrans"): 37 | """ 38 | Wrapper over different protein embedders 39 | Parameters 40 | ---------- 41 | embedder: The model to embed proteins 42 | path: The data path 43 | 44 | Returns 45 | ------- 46 | """ 47 | print('Im in') 48 | 49 | # Load sequences 50 | sequence_data = pd.read_csv(path, sep='\t') 51 | 52 | # Sample : Uncomment for testing 53 | # sequence_data = sequence_data.sample(2) 54 | 55 | # Select correct columns 56 | sequence_data = sequence_data[['From', 'Sequence']] 57 | 58 | # Embed sequences 59 | sequence_data['embedding'] = sequence_data['Sequence'].apply(lambda x: prot_trans_embedder.embed(x)) 60 | 61 | # Aggregate sequences 62 | sequence_data['squashed'] = sequence_data['embedding'].apply(lambda x: get_protein_repr(x)) 63 | 64 | 65 | # Save sequences 66 | sequence_data.to_csv('../data/processed/uniprot_seq_embeddings.tsv') 67 | 68 | 69 | get_protein_embedding('../data/uniprot_sequences.tsv') 70 | -------------------------------------------------------------------------------- /bioblp/train.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from pykeen.pipeline import pipeline 4 | from pykeen.training import TrainingCallback 5 | from pykeen.triples import TriplesFactory 6 | 7 | from tap import Tap 8 | from transformers import get_linear_schedule_with_warmup 9 | import wandb 10 | 11 | from bioblp.logger import get_logger 12 | import bioblp.models as models 13 | from bioblp.utils.bioblp_utils import build_encoders 14 | from bioblp.utils.training import InBatchNegativesTraining 15 | 16 | 17 | class Arguments(Tap): 18 | train_triples: str 19 | valid_triples: str 20 | test_triples: str 21 | 22 | protein_data: str = None 23 
| molecule_data: str = None 24 | text_data: str = None 25 | 26 | model: str = 'complex' 27 | dimension: int = 256 28 | loss_fn: str = 'crossentropy' 29 | loss_margin: float = 1.0 30 | optimizer: str = 'adagrad' 31 | learning_rate: float = 1e-2 32 | freeze_pretrained_embeddings: bool = False 33 | warmup_fraction: float = None 34 | regularizer: float = 1e-6 35 | num_epochs: int = 100 36 | batch_size: int = 1024 37 | eval_batch_size: int = 16 38 | eval_every: int = 10 39 | num_negatives: int = 512 40 | in_batch_negatives: bool = False 41 | add_inverses: bool = False 42 | early_stopper: str = 'both.realistic.inverse_harmonic_mean_rank' 43 | from_checkpoint: str = None 44 | 45 | search_train_batch_size: bool = False 46 | search_eval_batch_size: bool = False 47 | log_wandb: bool = False 48 | notes: str = None 49 | 50 | 51 | class BioBLPCallback(TrainingCallback): 52 | """A callback to get the wandb ID of the run before it gets closed. 53 | We use it to get a file name for the stored model.""" 54 | id = None 55 | scheduler = None 56 | 57 | def __init__(self, num_training_steps, warmup_fraction): 58 | super().__init__() 59 | self.use_scheduler = warmup_fraction is not None 60 | if self.use_scheduler: 61 | self.num_training_steps = num_training_steps 62 | self.num_warmup_steps = int(self.num_training_steps * warmup_fraction) 63 | 64 | def post_epoch(self, *args, **kwargs): 65 | if wandb.run is not None and BioBLPCallback.id is None: 66 | BioBLPCallback.id = wandb.run.id 67 | 68 | def pre_step(self, **kwargs): 69 | if not self.use_scheduler: 70 | return 71 | 72 | if self.scheduler is None: 73 | self.scheduler = get_linear_schedule_with_warmup( 74 | self.optimizer, 75 | self.num_warmup_steps, 76 | self.num_training_steps 77 | ) 78 | else: 79 | self.scheduler.step() 80 | 81 | 82 | def run(args: Arguments): 83 | cli_args_dict = {f'cli_{k}': v for k, v in args.as_dict().items()} 84 | if args.search_train_batch_size: 85 | args.batch_size = None 86 | if args.search_eval_batch_size: 87 | args.eval_batch_size = None 88 | 89 | logger = get_logger() 90 | logger.info('Loading triples...') 91 | 92 | entity_to_id = relation_to_id = None 93 | if args.from_checkpoint: 94 | checkpoint_triples = TriplesFactory.from_path_binary( 95 | osp.join(args.from_checkpoint, 'training_triples') 96 | ) 97 | entity_to_id = checkpoint_triples.entity_to_id 98 | relation_to_id = checkpoint_triples.relation_to_id 99 | 100 | training = TriplesFactory.from_path( 101 | args.train_triples, 102 | create_inverse_triples=args.add_inverses, 103 | entity_to_id=entity_to_id, 104 | relation_to_id=relation_to_id 105 | ) 106 | validation = TriplesFactory.from_path(args.valid_triples, 107 | entity_to_id=training.entity_to_id, 108 | relation_to_id=training.relation_to_id) 109 | testing = TriplesFactory.from_path(args.test_triples, 110 | entity_to_id=training.entity_to_id, 111 | relation_to_id=training.relation_to_id) 112 | 113 | logger.info(f'Loaded graph with {training.num_entities:,} entities') 114 | logger.info(f'{training.num_triples:,} training triples') 115 | logger.info(f'{validation.num_triples:,} validation triples') 116 | logger.info(f'{testing.num_triples:,} test triples') 117 | 118 | loss_kwargs = None 119 | if args.loss_fn in {'nssa', 'marginranking'}: 120 | loss_kwargs = {'margin': args.loss_margin} 121 | model = args.model 122 | model_kwargs = {'embedding_dim': args.dimension, 'loss': args.loss_fn} 123 | 124 | if any((args.protein_data, args.molecule_data, args.text_data)): 125 | model = models.get_model_class(args.model) 126 | 
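        # ComplEx and RotatE are complex-valued, so PyKEEN stores two real numbers
        # per embedding dimension; the encoder output width is doubled below so it
        # matches the width of the wrapped entity embedding table.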
dimension = args.dimension 127 | if args.model in ('complex', 'rotate'): 128 | dimension *= 2 129 | 130 | freeze_pretrained_embeddings = args.freeze_pretrained_embeddings 131 | encoders = build_encoders(dimension, 132 | training.entity_to_id, 133 | args.protein_data, 134 | args.molecule_data, 135 | args.text_data, 136 | freeze_pretrained_embeddings) 137 | model_kwargs['entity_representations'] = encoders 138 | 139 | if args.from_checkpoint: 140 | model_kwargs['from_checkpoint'] = args.from_checkpoint 141 | 142 | if args.warmup_fraction: 143 | if args.batch_size is None: 144 | raise ValueError('Batch size is needed to apply learning rate' 145 | ' warmup.') 146 | num_steps = (training.num_triples // args.batch_size) * args.num_epochs 147 | else: 148 | num_steps = None 149 | 150 | training_loop = InBatchNegativesTraining if args.in_batch_negatives else None 151 | 152 | result = pipeline(training=training, 153 | validation=validation, 154 | testing=testing, 155 | model=model, 156 | model_kwargs=model_kwargs, 157 | loss_kwargs=loss_kwargs, 158 | optimizer=args.optimizer, 159 | optimizer_kwargs={'lr': args.learning_rate}, 160 | regularizer='LpRegularizer', 161 | regularizer_kwargs={'weight': args.regularizer}, 162 | training_kwargs={'num_epochs': args.num_epochs, 163 | 'batch_size': args.batch_size, 164 | 'callbacks': BioBLPCallback, 165 | 'callback_kwargs': { 166 | 'num_training_steps': num_steps, 167 | 'warmup_fraction': args.warmup_fraction 168 | }}, 169 | training_loop=training_loop, 170 | negative_sampler='basic', 171 | negative_sampler_kwargs={ 172 | 'num_negs_per_pos': args.num_negatives 173 | }, 174 | stopper='early', 175 | stopper_kwargs={ 176 | 'evaluation_batch_size': args.eval_batch_size, 177 | 'metric': args.early_stopper, 178 | 'frequency': args.eval_every, 179 | 'patience': 5, 180 | 'relative_delta': 0.0001, 181 | 'larger_is_better': True 182 | }, 183 | evaluator_kwargs={'batch_size': args.eval_batch_size}, 184 | result_tracker='wandb', 185 | result_tracker_kwargs={ 186 | 'entity': 'discoverylab', 187 | 'project': 'bioblp', 188 | 'notes': args.notes, 189 | 'config': cli_args_dict, 190 | 'offline': not args.log_wandb 191 | } 192 | ) 193 | 194 | result.save_to_directory(osp.join('models', BioBLPCallback.id)) 195 | 196 | 197 | if __name__ == '__main__': 198 | run(Arguments(explicit_bool=True).parse_args()) 199 | -------------------------------------------------------------------------------- /bioblp/train_argparse.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from pathlib import Path 3 | from pykeen.pipeline import pipeline 4 | from pykeen.training import TrainingCallback 5 | from pykeen.triples import TriplesFactory 6 | from dataclasses import dataclass, asdict 7 | # from tap import Tap 8 | from argparse import ArgumentParser 9 | import wandb 10 | import toml 11 | 12 | from bioblp.logging import get_logger 13 | 14 | @dataclass 15 | class Arguments: 16 | #data_splits_path: str 17 | #dataset_name: str 18 | train_triples: str 19 | valid_triples: str 20 | test_triples: str 21 | 22 | model: str = 'complex' 23 | dimension: int = 256 24 | loss_fn: str = 'crossentropy' 25 | loss_margin: float = 1.0 26 | optimizer: str = 'adagrad' 27 | learning_rate: float = 1e-2 28 | regularizer: float = 1e-6 29 | num_epochs: int = 100 30 | batch_size: int = 1024 31 | eval_batch_size: int = 16 32 | num_negatives: int = 512 33 | add_inverses: bool = False 34 | early_stopper: str = 'both.realistic.inverse_harmonic_mean_rank' 35 | 36 | 
search_train_batch_size: bool = False 37 | search_eval_batch_size: bool = False 38 | log_wandb: bool = False 39 | notes: str = None 40 | 41 | 42 | class WBIDCallback(TrainingCallback): 43 | """A callback to get the wandb ID of the run before it gets closed. 44 | We use it to get a file name for the stored model.""" 45 | id = None 46 | 47 | def post_train(self, *args, **kwargs): 48 | if wandb.run is not None: 49 | WBIDCallback.id = wandb.run.id 50 | 51 | 52 | def load_toml(toml_path: str) -> dict: 53 | toml_path = Path(toml_path) 54 | config = {} 55 | with open(toml_path, "r") as f: 56 | config = toml.load(f) 57 | 58 | return config 59 | 60 | 61 | def run(args: Arguments): 62 | cli_args_dict = {f'cli_{k}': v for k, v in asdict(args).items()} 63 | if args.search_train_batch_size: 64 | args.batch_size = None 65 | if args.search_eval_batch_size: 66 | args.eval_batch_size = None 67 | 68 | logger = get_logger() 69 | logger.info('Loading triples...') 70 | 71 | training = TriplesFactory.from_path( 72 | args.train_triples, 73 | create_inverse_triples=args.add_inverses 74 | ) 75 | validation = TriplesFactory.from_path(args.valid_triples) 76 | testing = TriplesFactory.from_path(args.test_triples) 77 | 78 | logger.info(f'Loaded graph with {training.num_entities:,} entities') 79 | logger.info(f'{training.num_triples:,} training triples') 80 | logger.info(f'{validation.num_triples:,} validation triples') 81 | logger.info(f'{testing.num_triples:,} test triples') 82 | 83 | loss_kwargs = None 84 | if args.loss_fn in {'nssa', 'marginranking'}: 85 | loss_kwargs = {'margin': args.loss_margin} 86 | 87 | result = pipeline(training=training, 88 | validation=validation, 89 | testing=testing, 90 | model=args.model, 91 | model_kwargs={'embedding_dim': args.dimension, 92 | 'loss': args.loss_fn}, 93 | loss_kwargs=loss_kwargs, 94 | optimizer=args.optimizer, 95 | optimizer_kwargs={'lr': args.learning_rate}, 96 | regularizer='LpRegularizer', 97 | #regularizer_kwargs={'weight': args.regularizer}, 98 | training_kwargs={'num_epochs': args.num_epochs, 99 | 'batch_size': args.batch_size, 100 | 'callbacks': WBIDCallback}, 101 | negative_sampler='basic', 102 | negative_sampler_kwargs={ 103 | 'num_negs_per_pos': args.num_negatives 104 | }, 105 | stopper='early', 106 | stopper_kwargs={ 107 | 'evaluation_batch_size': args.eval_batch_size, 108 | 'metric': args.early_stopper, 109 | 'frequency': 10, 110 | 'patience': 5, 111 | 'relative_delta': 0.0001, 112 | 'larger_is_better': True 113 | }, 114 | evaluator_kwargs={'batch_size': args.eval_batch_size}, 115 | result_tracker='wandb', 116 | result_tracker_kwargs={ 117 | 'entity': 'discoverylab', 118 | 'project': 'bioblp', 119 | 'notes': args.notes, 120 | 'config': cli_args_dict, 121 | 'offline': not args.log_wandb 122 | } 123 | ) 124 | 125 | result.save_to_directory(osp.join('models', WBIDCallback.id)) 126 | 127 | 128 | if __name__ == '__main__': 129 | parser = ArgumentParser(description="Model training routing") 130 | parser.add_argument("--conf", type=str, 131 | help="Path to experiment toml file") 132 | #parser.add_argument('--out_path', type=str, 133 | # help='Path to write models output') 134 | 135 | args = parser.parse_args() 136 | conf = load_toml(args.conf) 137 | args = Arguments(**conf) 138 | run(args) 139 | #run(Arguments(explicit_bool=True).parse_args()) 140 | -------------------------------------------------------------------------------- /bioblp/utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/bioblp/utils/__init__.py -------------------------------------------------------------------------------- /bioblp/utils/bioblp_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Mapping 2 | 3 | import bioblp.models.encoders as encoders 4 | 5 | 6 | def build_encoders(dim: int, 7 | entity_to_id: Mapping[str, int], 8 | protein_data: str = None, 9 | molecule_data: str = None, 10 | text_data: str = None, 11 | freeze_pretrained_embeddings: bool = False 12 | ) -> encoders.PropertyEncoderRepresentation: 13 | if not any((protein_data, molecule_data, text_data)): 14 | raise ValueError("No entity data provided to build encoders.") 15 | 16 | encoders_list = [] 17 | 18 | if protein_data: 19 | protein_encoder = encoders.PretrainedLookupTableEncoder( 20 | file_path=protein_data, 21 | dim=dim, 22 | freeze_pretrained_embeddings=freeze_pretrained_embeddings 23 | ) 24 | encoders_list.append(protein_encoder) 25 | 26 | if molecule_data: 27 | # TODO: We might want to set different learning rates for different 28 | # modules, potentially also with learning rate scheduling 29 | molecule_encoder = encoders.MoleculeEmbeddingEncoder( 30 | file_path=molecule_data, 31 | dim=dim 32 | ) 33 | encoders_list.append(molecule_encoder) 34 | 35 | if text_data: 36 | text_encoder = encoders.TransformerTextEncoder( 37 | file_path=text_data, 38 | dim=dim 39 | ) 40 | encoders_list.append(text_encoder) 41 | 42 | entity_encoders = encoders.PropertyEncoderRepresentation( 43 | dim=dim, 44 | entity_to_id=entity_to_id, 45 | encoders=encoders_list 46 | ) 47 | 48 | return entity_encoders 49 | -------------------------------------------------------------------------------- /bioblp/utils/pipeline.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def create_pipeline(functions: list): 4 | """Sequentially executes a list of functions""" 5 | def pipeline(input): 6 | res = input 7 | for function in functions: 8 | res = function(res) 9 | return res 10 | 11 | return pipeline 12 | -------------------------------------------------------------------------------- /bioblp/utils/training.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Union 2 | 3 | from pykeen.training.slcwa import SLCWATrainingLoop 4 | from pykeen.models.base import Model 5 | from pykeen.losses import Loss 6 | from pykeen.typing import InductiveMode 7 | from pykeen.triples.instances import SLCWABatch 8 | import torch 9 | 10 | from bioblp.models import BioBLP 11 | 12 | 13 | class InBatchNegativesTraining(SLCWATrainingLoop): 14 | @staticmethod 15 | def _process_batch_static( 16 | model: Union[BioBLP, Model], 17 | loss: Loss, 18 | mode: Optional[InductiveMode], 19 | batch: SLCWABatch, 20 | start: Optional[int], 21 | stop: Optional[int], 22 | label_smoothing: float = 0.0, 23 | slice_size: Optional[int] = None, 24 | ) -> torch.FloatTensor: 25 | # Slicing is not possible in sLCWA training loops 26 | if slice_size is not None: 27 | raise AttributeError( 28 | "Slicing is not possible for sLCWA training loops.") 29 | 30 | positive_batch, negative_batch, positive_filter = batch 31 | positive_batch = positive_batch[start:stop].to(device=model.device) 32 | 33 | positive_scores, negative_scores = model.score_hrt_and_negatives( 34 | positive_batch, 35 | num_negatives=negative_batch.shape[1], 36 | mode=mode 37 | ) 38 | 39 | 
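        # Combine the positive scores with the scores of the in-batch negatives
        # produced by score_hrt_and_negatives, then add the model's regularization
        # term on top of the loss.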
return ( 40 | loss.process_slcwa_scores( 41 | positive_scores=positive_scores, 42 | negative_scores=negative_scores, 43 | label_smoothing=label_smoothing, 44 | batch_filter=positive_filter, 45 | num_entities=model._get_entity_len(mode=mode), 46 | ) 47 | + model.collect_regularization_term() 48 | ) -------------------------------------------------------------------------------- /bioblp/utils/triples.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import os.path as osp 4 | from collections import Counter 5 | from argparse import ArgumentParser 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from tqdm import tqdm 10 | from pykeen.triples import TriplesFactory 11 | 12 | from bioblp.data import COL_SOURCE 13 | from bioblp.data import COL_EDGE 14 | from bioblp.data import COL_TARGET 15 | from bioblp.data import COL_PUBYEAR 16 | 17 | DIR_PROCESSED = 'processed' 18 | 19 | logger = logging.getLogger(__name__) 20 | handler = logging.StreamHandler() 21 | logger.addHandler(handler) 22 | logger.setLevel(logging.INFO) 23 | 24 | 25 | def get_entity_relation_counts(triples: pd.DataFrame): 26 | """Count frequency of entities and relations across triples. 27 | Entities are not counted twice if there is a self-loop.""" 28 | relation_counts = triples[COL_EDGE].value_counts() 29 | 30 | no_loops = triples[COL_SOURCE] != triples[COL_TARGET] 31 | tails_no_loops = triples[COL_TARGET].where(no_loops).dropna() 32 | entities = pd.concat([triples[COL_SOURCE], tails_no_loops]) 33 | entity_counts = entities.value_counts() 34 | 35 | return entity_counts, relation_counts 36 | 37 | 38 | def split_train_test_triples(triples: pd.DataFrame, ratio: float): 39 | """Split a dataset of triples into training and test sets, so that all 40 | entities in the test set are in the training set. 41 | Triples are removed in order starting from index 0. 
Edges are deleted so 42 | that the initial proportion of relation types is preserved in the training 43 | set.""" 44 | entity_counts, relation_counts = get_entity_relation_counts(triples) 45 | new_relation_counts = np.floor(relation_counts * ratio).astype(int) 46 | 47 | train_triples = [] 48 | test_triples = [] 49 | removed_relation_counts = Counter() 50 | done = {r: count == 0 for r, count in new_relation_counts.items()} 51 | 52 | with tqdm(total=new_relation_counts.sum(), desc='Removing triples') as bar: 53 | for i in range(len(triples)): 54 | row = triples.iloc[i] 55 | head = row[COL_SOURCE] 56 | rel = row[COL_EDGE] 57 | tail = row[COL_TARGET] 58 | 59 | # Check that removing the entity does not remove it from the 60 | # training set a count larger than two is required if head == tail 61 | if entity_counts[head] > 2 and entity_counts[tail] > 2 and not done[rel]: 62 | entity_counts[head] -= 1 63 | entity_counts[tail] -= 1 64 | test_triples.append(row) 65 | 66 | removed_relation_counts[rel] += 1 67 | bar.update(1) 68 | if removed_relation_counts[rel] == new_relation_counts[rel]: 69 | done[rel] = True 70 | if all(done.values()): 71 | break 72 | else: 73 | train_triples.append(row) 74 | 75 | test_triples = pd.DataFrame(test_triples, columns=triples.columns) 76 | train_triples = pd.DataFrame(train_triples, columns=triples.columns) 77 | # Add the rest of the triples that were not removed 78 | train_triples = pd.concat([train_triples, triples.iloc[i + 1:]]) 79 | 80 | print('Done!') 81 | 82 | return train_triples, test_triples 83 | 84 | 85 | def create_splits(triples_path: str, random: bool = False): 86 | """Create train/valid/test splits based on timestamps.""" 87 | print('Reading triples...') 88 | triples = pd.read_csv(triples_path, sep='\t') 89 | initial_length = len(triples) 90 | 91 | triples = triples.dropna(subset=[COL_SOURCE, COL_EDGE, COL_TARGET, 92 | COL_PUBYEAR]) 93 | triples[COL_PUBYEAR] = triples[COL_PUBYEAR].astype(int) 94 | 95 | # Sort whole dataframe first to ensure repeatability 96 | triples = triples.sort_values(by=list(triples.columns), kind='mergesort') 97 | 98 | if not random: 99 | # Sort by pubyear before deduplicating and removing triples! 
100 | triples = triples.sort_values(by=COL_PUBYEAR, ascending=False, 101 | ignore_index=True, kind='mergesort') 102 | else: 103 | triples = triples.sample(frac=1, random_state=0) 104 | 105 | # In case of duplicates, keep most recent edge 106 | triples = triples.drop_duplicates(subset=[COL_SOURCE, COL_EDGE, 107 | COL_TARGET], 108 | keep='first') 109 | 110 | print(f'Read {initial_length:,} lines, got {len(triples):,} ' 111 | 'after keeping triples with dates and deduplicating.') 112 | 113 | train_triples, test_triples = split_train_test_triples(triples, ratio=0.1) 114 | 115 | num_test_triples = len(test_triples) 116 | split_idx = num_test_triples // 2 117 | valid_triples = test_triples.iloc[split_idx:] 118 | test_triples = test_triples.iloc[:split_idx] 119 | 120 | filename = osp.basename(triples_path) 121 | name, ext = osp.splitext(filename) 122 | data_path = osp.join(osp.dirname(osp.dirname(triples_path)), DIR_PROCESSED) 123 | 124 | if not osp.exists(data_path): 125 | os.mkdir(data_path) 126 | 127 | splits = {'train': train_triples, 128 | 'valid': valid_triples, 129 | 'test': test_triples} 130 | for s, dataframe in splits.items(): 131 | out_path = osp.join(data_path, f'{name}-{s}{ext}') 132 | dataframe.to_csv(out_path, sep='\t', index=False) 133 | print(f'Saved {len(dataframe):,} triples at {out_path}') 134 | 135 | 136 | def load_triples_array(path: str): 137 | """Given a path to a dataset file, extract only the colums containing 138 | (head, relation, tail) - i.e. the triples.""" 139 | triples = pd.read_csv(path, sep='\t', dtype=str) 140 | triples = triples[[COL_SOURCE, COL_EDGE, COL_TARGET]].to_numpy() 141 | 142 | return triples 143 | 144 | 145 | def load_triples_factories(data_path: str, dataset: str): 146 | """Load a pykeen.triples.TriplesFactory tuple for training, validation, 147 | and testing triples.""" 148 | processed_path = osp.join(data_path, DIR_PROCESSED) 149 | 150 | train_triples = load_triples_array(osp.join(processed_path, 151 | f'{dataset}-train.tsv')) 152 | valid_triples = load_triples_array(osp.join(processed_path, 153 | f'{dataset}-valid.tsv')) 154 | test_triples = load_triples_array(osp.join(processed_path, 155 | f'{dataset}-test.tsv')) 156 | 157 | training = TriplesFactory.from_labeled_triples(train_triples) 158 | validation = TriplesFactory.from_labeled_triples( 159 | valid_triples, 160 | entity_to_id=training.entity_to_id, 161 | relation_to_id=training.relation_to_id 162 | ) 163 | testing = TriplesFactory.from_labeled_triples( 164 | test_triples, 165 | entity_to_id=training.entity_to_id, 166 | relation_to_id=training.relation_to_id 167 | ) 168 | 169 | return training, validation, testing 170 | 171 | 172 | def reuse_existing_splits(triples_path, dataset_existing_splits): 173 | """""" 174 | 175 | triples = pd.read_csv(triples_path, sep='\t', dtype=str) 176 | initial_length = len(triples) 177 | logger.info(f"{initial_length} triples in input") 178 | 179 | triples = triples.dropna(subset=[COL_SOURCE, COL_EDGE, COL_TARGET, 180 | COL_PUBYEAR]) 181 | cols = [COL_SOURCE, COL_EDGE, COL_TARGET] 182 | triples = triples[cols] 183 | 184 | filename = osp.basename(triples_path) 185 | name, ext = osp.splitext(filename) 186 | data_path = osp.join(osp.dirname(osp.dirname(triples_path)), DIR_PROCESSED) 187 | 188 | existing_train_path = osp.join(data_path, f'{dataset_existing_splits}-train{ext}') 189 | existing_val_path = osp.join(data_path, f'{dataset_existing_splits}-valid{ext}') 190 | existing_test_path = osp.join(data_path, f'{dataset_existing_splits}-test{ext}') 191 | 192 | 
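    # Load the existing train/valid/test splits, keep only (head, relation, tail),
    # and later add any triple from the new file that is not already covered to the
    # training split; the existing validation and test splits are reused unchanged.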
existing_train = pd.read_csv(existing_train_path, sep='\t', dtype=str)[cols] 193 | existing_valid = pd.read_csv(existing_val_path, sep='\t', dtype=str)[cols] 194 | existing_test = pd.read_csv(existing_test_path, sep='\t', dtype=str)[cols] 195 | 196 | all_existing_triples = existing_train.append(existing_valid.append( 197 | existing_test)).sort_values(by=cols, kind='mergesort') 198 | 199 | logger.info(f"{len(all_existing_triples)} triples in existing {dataset_existing_splits}") 200 | 201 | all_existing_triples_records = set([tuple(x) for x in all_existing_triples.values]) 202 | triple_records = [tuple(x) for x in triples.sort_values(by=cols, kind='mergesort').values] 203 | 204 | new_records = [] 205 | with tqdm(total=len(triple_records), desc='Checking triple overlap') as bar: 206 | for i in range(len(triple_records)): 207 | row = triple_records[i] 208 | 209 | try: 210 | all_existing_triples_records.remove(row) 211 | except KeyError: 212 | new_records.append(row) 213 | 214 | bar.update(1) 215 | bar.set_description( 216 | f"Checking triple overlap. Remaining set: {len(all_existing_triples_records)}", refresh=True) 217 | 218 | # merge new triples plus existing train for new train 219 | new_triples = pd.DataFrame.from_records(new_records, columns=cols) 220 | train_triples = new_triples.append(existing_train) 221 | 222 | splits = {'train': train_triples, 223 | 'valid': existing_valid, 224 | 'test': existing_test} 225 | 226 | for s, dataframe in splits.items(): 227 | out_path = osp.join(data_path, f'{name}-{s}{ext}') 228 | dataframe.to_csv(out_path, sep='\t', index=False) 229 | print(f'Saved {len(dataframe):,} triples at {out_path}') 230 | 231 | 232 | if __name__ == '__main__': 233 | parser = ArgumentParser(description='Split a file of triples into ' 234 | 'train/valid/test sets based on time.') 235 | parser.add_argument('file', type=str) 236 | parser.add_argument('--random', action='store_true', 237 | help='Split randomly instead.') 238 | parser.add_argument('--existing_dataset_splits', type=str, 239 | help='Name of existing splits (assumed to be in processed)') 240 | 241 | args = parser.parse_args() 242 | 243 | if args.existing_dataset_splits is not None: 244 | reuse_existing_splits(args.file, args.existing_dataset_splits) 245 | else: 246 | create_splits(args.file, args.random) 247 | -------------------------------------------------------------------------------- /bioblp/utils/util.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import pickle 4 | import torch 5 | 6 | def save_object(obj, filename): 7 | with open(filename, 'wb') as output: # Overwrites any existing file. 8 | torch.save(obj, output, pickle_module=dill) 9 | 10 | 11 | def load_object(filename): 12 | with open(filename, 'wb') as object: 13 | obj = torch.load(object, pickle_module=dill, encoding='utf-8') 14 | 15 | 16 | def read_query(query_filename): 17 | """ 18 | Read a query from file and return as a string 19 | Parameters 20 | ---------- 21 | query_filename: str name of the query. 
It will be looked for in the queries folder of this project 22 | Returns 23 | ------- 24 | query: str the query with placeholders for the query parameters, as a string to be formatted 25 | """ 26 | # query_filepath = Path(RAW_DIR / QUERY_DIR / query_filename) 27 | 28 | with open(query_filename) as fr: 29 | query = fr.read() 30 | return query 31 | 32 | 33 | def loading_animation(process, message="Loading") : 34 | while process.isAlive() : 35 | chars = "/—\|" 36 | for char in chars: 37 | sys.stdout.write('\r' + f'{message} {char} ') 38 | time.sleep(.1) 39 | sys.stdout.flush() 40 | 41 | 42 | def write_dict_as_pkl(dict_object, filename): 43 | """ 44 | filename: path to pickle file, should include appropiate .pkl extension 45 | """ 46 | with open(filename, "wb") as pkl_handle: 47 | pickle.dump(dict_object, pkl_handle) 48 | 49 | 50 | def load_dict_from_pkl(filename): 51 | """ 52 | filename: path to pickle file, should include appropiate .pkl extension 53 | """ 54 | with open(filename, "rb") as pkl_handle: 55 | dict_object = pickle.load(pkl_handle) 56 | 57 | return dict_object 58 | 59 | -------------------------------------------------------------------------------- /conf/complex-biokg-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 20 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /conf/complex-biokg-full-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 2 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /conf/complex-hetionet-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-train.tsv' 2 | valid_triples = 
'/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 200 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 128 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = true 22 | notes = 'attempt to reproduce hetionet reported results' -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-lr.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_lr] 50 | feature = "noise" 51 | model = "LR" 52 | 53 | [models.structural_lr] 54 | feature = "structural" 55 | model = "LR" 56 | 57 | [models.transe_lr] 58 | feature = "transe" 59 | model = "LR" 60 | 61 | [models.complex_lr] 62 | feature = "complex" 63 | model = "LR" 64 | 65 | [models.rotate_lr] 66 | feature = "rotate" 67 | model = "LR" 68 | 69 | [models.bioblpd_lr] 70 | feature = "bioblpd" 71 | model = "LR" 72 | 73 | [models.bioblpm_lr] 74 | feature = "bioblpm" 75 | model = "LR" 76 | 77 | [models.bioblpp_lr] 78 | feature = "bioblpp" 79 | model = "LR" 80 | 81 | 82 | [train] 83 | n_iter = 10 84 | splits_file = "cv-splits.pt" 85 | refit_params = ["AUCPR", "AUCROC"] 86 | outdir = "models" 87 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-mlp-1.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 
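# The [features] block below lists every entity representation benchmarked for the
# DPI task; each [features.encoder_args.*] table points at the directory holding
# that encoder's trained artifacts.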
9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_mlp] 50 | feature = "noise" 51 | model = "MLP" 52 | 53 | [models.structural_mlp] 54 | feature = "structural" 55 | model = "MLP" 56 | 57 | [models.transe_mlp] 58 | feature = "transe" 59 | model = "MLP" 60 | 61 | [models.complex_mlp] 62 | feature = "complex" 63 | model = "MLP" 64 | 65 | 66 | [train] 67 | n_iter = 10 68 | splits_file = "cv-splits.pt" 69 | refit_params = ["AUCPR", "AUCROC"] 70 | outdir = "models" 71 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-mlp-2.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | 50 | [models.rotate_mlp] 51 | feature = "rotate" 52 | model = "MLP" 53 | 54 | [models.bioblpd_mlp] 55 | feature = "bioblpd" 56 | model = "MLP" 57 | 58 | [models.bioblpm_mlp] 59 | feature = "bioblpm" 60 | model = "MLP" 61 | 62 | [models.bioblpp_mlp] 63 | feature = "bioblpp" 64 | 
model = "MLP" 65 | 66 | 67 | [train] 68 | n_iter = 10 69 | splits_file = "cv-splits.pt" 70 | refit_params = ["AUCPR", "AUCROC"] 71 | outdir = "models" 72 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-20230423-rf.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_rf] 50 | feature = "noise" 51 | model = "RF" 52 | 53 | [models.structural_rf] 54 | feature = "structural" 55 | model = "RF" 56 | 57 | [models.transe_rf] 58 | feature = "transe" 59 | model = "RF" 60 | 61 | [models.complex_rf] 62 | feature = "complex" 63 | model = "RF" 64 | 65 | [models.rotate_rf] 66 | feature = "rotate" 67 | model = "RF" 68 | 69 | [models.bioblpd_rf] 70 | feature = "bioblpd" 71 | model = "RF" 72 | 73 | [models.bioblpm_rf] 74 | feature = "bioblpm" 75 | model = "RF" 76 | 77 | [models.bioblpp_rf] 78 | feature = "bioblpp" 79 | model = "RF" 80 | 81 | 82 | [train] 83 | n_iter = 10 84 | splits_file = "cv-splits.pt" 85 | refit_params = ["AUCPR", "AUCROC"] 86 | outdir = "models" 87 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-r1-20230424-mlp.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 1 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = 
"data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_mlp] 50 | feature = "noise" 51 | model = "MLP" 52 | 53 | [models.structural_mlp] 54 | feature = "structural" 55 | model = "MLP" 56 | 57 | [models.transe_mlp] 58 | feature = "transe" 59 | model = "MLP" 60 | 61 | [models.complex_mlp] 62 | feature = "complex" 63 | model = "MLP" 64 | 65 | [models.rotate_mlp] 66 | feature = "rotate" 67 | model = "MLP" 68 | 69 | [models.bioblpd_mlp] 70 | feature = "bioblpd" 71 | model = "MLP" 72 | 73 | [models.bioblpm_mlp] 74 | feature = "bioblpm" 75 | model = "MLP" 76 | 77 | [models.bioblpp_mlp] 78 | feature = "bioblpp" 79 | model = "MLP" 80 | 81 | 82 | [train] 83 | n_iter = 10 84 | splits_file = "cv-splits.pt" 85 | refit_params = ["AUCPR", "AUCROC"] 86 | outdir = "models" 87 | -------------------------------------------------------------------------------- /conf/dpi-benchmark-cv-r1-20230424-rflr.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/jovyan/workbench-shared-folder/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/DPI/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 1 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "mean" 14 | encoders = ["noise", "structural", "complex", "rotate", "transe", "bioblpd", "bioblpm", "bioblpp" ] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [features.encoder_args.bioblpd] 33 | model_dir = "data/benchmarks/experiments/encoders/bioblpd/" 34 | 35 | [features.encoder_args.bioblpm] 36 | model_dir = "data/benchmarks/experiments/encoders/bioblpm/" 37 | 38 | [features.encoder_args.bioblpp] 39 | model_dir = "data/benchmarks/experiments/encoders/bioblpp/" 40 | 41 | 42 | [split] 43 | n_splits = 5 44 | outdir = "splits" 45 | 46 | 47 | [models] 48 | 49 | [models.noise_lr] 50 | feature = "noise" 51 | model = "LR" 52 | 53 | [models.structural_lr] 54 | feature = "structural" 55 | model = "LR" 56 | 57 | [models.transe_lr] 58 | feature = "transe" 59 | model = "LR" 60 | 61 | [models.complex_lr] 62 | feature = "complex" 63 | model = "LR" 64 | 65 | [models.rotate_lr] 66 | feature = "rotate" 67 | model = "LR" 68 | 69 | [models.bioblpd_lr] 70 | feature = "bioblpd" 71 | model = "LR" 72 | 73 | [models.bioblpm_lr] 74 | feature = "bioblpm" 75 | 
model = "LR" 76 | 77 | [models.bioblpp_lr] 78 | feature = "bioblpp" 79 | model = "LR" 80 | 81 | 82 | [models.noise_rf] 83 | feature = "noise" 84 | model = "RF" 85 | 86 | [models.structural_rf] 87 | feature = "structural" 88 | model = "RF" 89 | 90 | [models.transe_rf] 91 | feature = "transe" 92 | model = "RF" 93 | 94 | [models.complex_rf] 95 | feature = "complex" 96 | model = "RF" 97 | 98 | [models.rotate_rf] 99 | feature = "rotate" 100 | model = "RF" 101 | 102 | [models.bioblpd_rf] 103 | feature = "bioblpd" 104 | model = "RF" 105 | 106 | [models.bioblpm_rf] 107 | feature = "bioblpm" 108 | model = "RF" 109 | 110 | [models.bioblpp_rf] 111 | feature = "bioblpp" 112 | model = "RF" 113 | 114 | 115 | [train] 116 | n_iter = 10 117 | splits_file = "cv-splits.pt" 118 | refit_params = ["AUCPR", "AUCROC"] 119 | outdir = "models" 120 | -------------------------------------------------------------------------------- /data/conf/complex-biokg-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_splits/biokg_mini_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 20 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /data/conf/complex-biokg-full-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/biokg_full_splits/biokg_random_900505-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 2 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 512 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = false 22 | notes = 'training a play model on biokg data to setup downstream DPI ML clf for benchmarking pipeline' 23 | -------------------------------------------------------------------------------- /data/conf/complex-hetionet-20220826.toml: -------------------------------------------------------------------------------- 1 | train_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-train.tsv' 2 | valid_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-valid.tsv' 3 | test_triples = '/home/jovyan/BioBLP/data/raw/hetionet_splits/hetionet_random_801010-test.tsv' 4 | 5 | model = 'complex' 6 | dimension = 256 7 | loss_fn = 
'crossentropy' 8 | loss_margin = 1.0 9 | optimizer = 'adagrad' 10 | learning_rate = 1e-2 11 | regularizer = 1e-6 12 | num_epochs = 200 13 | batch_size = 128 14 | eval_batch_size = 16 15 | num_negatives = 128 16 | add_inverses = false 17 | early_stopper = 'both.realistic.inverse_harmonic_mean_rank' 18 | 19 | search_train_batch_size = false 20 | search_eval_batch_size = false 21 | log_wandb = true 22 | notes = 'attempt to reproduce hetionet reported results' -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: bioblp 2 | channels: 3 | - huggingface 4 | - pytorch 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - _libgcc_mutex=0.1=main 9 | - _openmp_mutex=5.1=1_gnu 10 | - anyio=3.5.0=py39h06a4308_0 11 | - appdirs=1.4.4=pyh9f0ad1d_0 12 | - argon2-cffi=21.3.0=pyhd3eb1b0_0 13 | - argon2-cffi-bindings=21.2.0=py39h7f8727e_0 14 | - asttokens=2.0.5=pyhd3eb1b0_0 15 | - babel=2.9.1=pyhd3eb1b0_0 16 | - backcall=0.2.0=pyhd3eb1b0_0 17 | - beautifulsoup4=4.11.1=py39h06a4308_0 18 | - blas=1.0=mkl 19 | - bleach=4.1.0=pyhd3eb1b0_0 20 | - bottleneck=1.3.5=py39h7deecbd_0 21 | - brotli=1.0.9=h166bdaf_7 22 | - brotli-bin=1.0.9=h166bdaf_7 23 | - brotlipy=0.7.0=py39h27cfd23_1003 24 | - bzip2=1.0.8=h7b6447c_0 25 | - ca-certificates=2022.12.7=ha878542_0 26 | - certifi=2022.12.7=pyhd8ed1ab_0 27 | - cffi=1.15.1=py39h74dc2b5_0 28 | - charset-normalizer=2.0.4=pyhd3eb1b0_0 29 | - click=8.0.4=py39h06a4308_0 30 | - contourpy=1.0.5=py39hdb19cb5_0 31 | - cryptography=37.0.1=py39h9ce1e76_0 32 | - cudatoolkit=11.3.1=h2bc3f7f_2 33 | - cycler=0.11.0=pyhd8ed1ab_0 34 | - dataclasses=0.8=pyh6d0b6a4_7 35 | - dbus=1.13.18=hb2f20db_0 36 | - debugpy=1.5.1=py39h295c915_0 37 | - decorator=5.1.1=pyhd3eb1b0_0 38 | - defusedxml=0.7.1=pyhd3eb1b0_0 39 | - entrypoints=0.4=py39h06a4308_0 40 | - executing=0.8.3=pyhd3eb1b0_0 41 | - expat=2.4.9=h6a678d5_0 42 | - ffmpeg=4.3=hf484d3e_0 43 | - filelock=3.6.0=pyhd3eb1b0_0 44 | - fontconfig=2.13.1=h6c09931_0 45 | - fonttools=4.25.0=pyhd3eb1b0_0 46 | - freetype=2.11.0=h70c0345_0 47 | - giflib=5.2.1=h7b6447c_0 48 | - glib=2.69.1=h4ff587b_1 49 | - gmp=6.2.1=h295c915_3 50 | - gnutls=3.6.15=he1e5248_0 51 | - gst-plugins-base=1.14.0=h8213a91_2 52 | - gstreamer=1.14.0=h28cd5cc_2 53 | - huggingface_hub=0.10.1=py_0 54 | - icu=58.2=he6710b0_3 55 | - idna=3.4=py39h06a4308_0 56 | - importlib-metadata=4.11.3=py39h06a4308_0 57 | - importlib_metadata=4.11.3=hd3eb1b0_0 58 | - intel-openmp=2021.4.0=h06a4308_3561 59 | - ipykernel=6.15.2=py39h06a4308_0 60 | - ipython=8.4.0=py39h06a4308_0 61 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 62 | - ipywidgets=7.6.5=pyhd3eb1b0_1 63 | - jedi=0.18.1=py39h06a4308_1 64 | - jinja2=3.0.3=pyhd3eb1b0_0 65 | - joblib=1.1.0=pyhd3eb1b0_0 66 | - jpeg=9e=h7f8727e_0 67 | - json5=0.9.6=pyhd3eb1b0_0 68 | - jsonschema=4.16.0=py39h06a4308_0 69 | - jupyter=1.0.0=py39h06a4308_8 70 | - jupyter_client=7.3.5=py39h06a4308_0 71 | - jupyter_console=6.4.3=pyhd3eb1b0_0 72 | - jupyter_core=4.11.1=py39h06a4308_0 73 | - jupyter_server=1.18.1=py39h06a4308_0 74 | - jupyterlab=3.4.4=py39h06a4308_0 75 | - jupyterlab_pygments=0.1.2=py_0 76 | - jupyterlab_server=2.15.2=py39h06a4308_0 77 | - jupyterlab_widgets=1.0.0=pyhd3eb1b0_1 78 | - kiwisolver=1.4.2=py39h295c915_0 79 | - krb5=1.19.2=hac12032_0 80 | - lame=3.100=h7b6447c_0 81 | - lcms2=2.12=h3be6417_0 82 | - ld_impl_linux-64=2.38=h1181459_1 83 | - lerc=3.0=h295c915_0 84 | - libbrotlicommon=1.0.9=h166bdaf_7 85 | - 
libbrotlidec=1.0.9=h166bdaf_7 86 | - libbrotlienc=1.0.9=h166bdaf_7 87 | - libclang=10.0.1=default_hb85057a_2 88 | - libdeflate=1.8=h7f8727e_5 89 | - libedit=3.1.20210910=h7f8727e_0 90 | - libevent=2.1.12=h8f2d780_0 91 | - libffi=3.3=he6710b0_2 92 | - libgcc-ng=11.2.0=h1234567_1 93 | - libgfortran-ng=12.2.0=h69a702a_19 94 | - libgfortran5=12.2.0=h337968e_19 95 | - libgomp=11.2.0=h1234567_1 96 | - libiconv=1.16=h7f8727e_2 97 | - libidn2=2.3.2=h7f8727e_0 98 | - libllvm10=10.0.1=hbcb73fb_5 99 | - libpng=1.6.37=hbc83047_0 100 | - libpq=12.9=h16c4e8d_3 101 | - libprotobuf=3.20.1=h4ff587b_0 102 | - libsodium=1.0.18=h7b6447c_0 103 | - libstdcxx-ng=11.2.0=h1234567_1 104 | - libtasn1=4.16.0=h27cfd23_0 105 | - libtiff=4.4.0=hecacb30_0 106 | - libunistring=0.9.10=h27cfd23_0 107 | - libuuid=1.0.3=h7f8727e_2 108 | - libwebp=1.2.4=h11a3e52_0 109 | - libwebp-base=1.2.4=h5eee18b_0 110 | - libxcb=1.15=h7f8727e_0 111 | - libxkbcommon=1.0.1=hfa300c1_0 112 | - libxml2=2.9.14=h74e7548_0 113 | - libxslt=1.1.35=h4e12654_0 114 | - lz4-c=1.9.3=h295c915_1 115 | - markupsafe=2.1.1=py39h7f8727e_0 116 | - matplotlib=3.6.2=py39hf3d152e_0 117 | - matplotlib-base=3.6.2=py39h945d387_0 118 | - matplotlib-inline=0.1.6=py39h06a4308_0 119 | - mistune=0.8.4=py39h27cfd23_1000 120 | - mkl=2021.4.0=h06a4308_640 121 | - mkl-service=2.4.0=py39h7f8727e_0 122 | - mkl_fft=1.3.1=py39hd3c417c_0 123 | - mkl_random=1.2.2=py39h51133e4_0 124 | - munkres=1.1.4=pyh9f0ad1d_0 125 | - nbclassic=0.3.5=pyhd3eb1b0_0 126 | - nbclient=0.5.13=py39h06a4308_0 127 | - nbconvert=6.4.4=py39h06a4308_0 128 | - nbformat=5.5.0=py39h06a4308_0 129 | - ncurses=6.3=h5eee18b_3 130 | - nest-asyncio=1.5.5=py39h06a4308_0 131 | - nettle=3.7.3=hbbd107a_1 132 | - notebook=6.4.12=py39h06a4308_0 133 | - nspr=4.33=h295c915_0 134 | - nss=3.74=h0370c37_0 135 | - numexpr=2.8.4=py39he184ba9_0 136 | - numpy=1.23.3=py39h14f4228_0 137 | - numpy-base=1.23.3=py39h31eccc5_0 138 | - openh264=2.1.1=h4ff587b_0 139 | - openssl=1.1.1t=h7f8727e_0 140 | - packaging=21.3=pyhd3eb1b0_0 141 | - pandocfilters=1.5.0=pyhd3eb1b0_0 142 | - parso=0.8.3=pyhd3eb1b0_0 143 | - patsy=0.5.3=pyhd8ed1ab_0 144 | - pcre=8.45=h295c915_0 145 | - pexpect=4.8.0=pyhd3eb1b0_3 146 | - pickleshare=0.7.5=pyhd3eb1b0_1003 147 | - pillow=9.2.0=py39hace64e9_1 148 | - pip=22.2.2=py39h06a4308_0 149 | - ply=3.11=py39h06a4308_0 150 | - pooch=1.6.0=pyhd8ed1ab_0 151 | - prometheus_client=0.14.1=py39h06a4308_0 152 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0 153 | - prompt_toolkit=3.0.20=hd3eb1b0_0 154 | - protobuf=3.20.1=py39h295c915_0 155 | - ptyprocess=0.7.0=pyhd3eb1b0_2 156 | - pure_eval=0.2.2=pyhd3eb1b0_0 157 | - pycparser=2.21=pyhd3eb1b0_0 158 | - pygments=2.11.2=pyhd3eb1b0_0 159 | - pyopenssl=22.0.0=pyhd3eb1b0_0 160 | - pyparsing=3.0.9=py39h06a4308_0 161 | - pyqt=5.15.7=py39h6a678d5_1 162 | - pyqt5-sip=12.11.0=py39h6a678d5_1 163 | - pyrsistent=0.18.0=py39heee7806_0 164 | - pysocks=1.7.1=py39h06a4308_0 165 | - python=3.9.13=haa1d7c7_2 166 | - python-dateutil=2.8.2=pyhd3eb1b0_0 167 | - python-fastjsonschema=2.16.2=py39h06a4308_0 168 | - python_abi=3.9=2_cp39 169 | - pytorch=1.12.1=py3.9_cuda11.3_cudnn8.3.2_0 170 | - pytorch-mutex=1.0=cuda 171 | - pyyaml=6.0=py39h7f8727e_1 172 | - pyzmq=23.2.0=py39h6a678d5_0 173 | - qt-main=5.15.2=h327a75a_7 174 | - qt-webengine=5.15.9=hd2b0992_4 175 | - qtconsole=5.3.2=py39h06a4308_0 176 | - qtpy=2.2.0=py39h06a4308_0 177 | - qtwebkit=5.212=h4eab89a_4 178 | - readline=8.1.2=h7f8727e_1 179 | - regex=2022.7.9=py39h5eee18b_0 180 | - requests=2.28.1=py39h06a4308_0 181 | - sacremoses=master=py_0 182 | - 
seaborn=0.12.2=hd8ed1ab_0 183 | - seaborn-base=0.12.2=pyhd8ed1ab_0 184 | - send2trash=1.8.0=pyhd3eb1b0_1 185 | - setuptools=63.4.1=py39h06a4308_0 186 | - sip=6.6.2=py39h6a678d5_0 187 | - six=1.16.0=pyhd3eb1b0_1 188 | - sniffio=1.2.0=py39h06a4308_1 189 | - soupsieve=2.3.1=pyhd3eb1b0_0 190 | - sqlite=3.39.3=h5082296_0 191 | - stack_data=0.2.0=pyhd3eb1b0_0 192 | - statsmodels=0.13.5=py39h7deecbd_1 193 | - terminado=0.13.1=py39h06a4308_0 194 | - testpath=0.6.0=py39h06a4308_0 195 | - tk=8.6.12=h1ccaba5_0 196 | - toml=0.10.2=pyhd3eb1b0_0 197 | - torchaudio=0.12.1=py39_cu113 198 | - torchvision=0.13.1=py39_cu113 199 | - tornado=6.2=py39h5eee18b_0 200 | - tqdm=4.64.1=py39h06a4308_0 201 | - traitlets=5.1.1=pyhd3eb1b0_0 202 | - typing-extensions=4.3.0=py39h06a4308_0 203 | - typing_extensions=4.3.0=py39h06a4308_0 204 | - tzdata=2022e=h04d1e81_0 205 | - urllib3=1.26.11=py39h06a4308_0 206 | - wcwidth=0.2.5=pyhd3eb1b0_0 207 | - webencodings=0.5.1=py39h06a4308_1 208 | - websocket-client=0.58.0=py39h06a4308_4 209 | - wheel=0.37.1=pyhd3eb1b0_0 210 | - widgetsnbextension=3.5.2=py39h06a4308_0 211 | - xz=5.2.6=h5eee18b_0 212 | - yaml=0.2.5=h7b6447c_0 213 | - zeromq=4.3.4=h2531618_0 214 | - zipp=3.8.0=py39h06a4308_0 215 | - zlib=1.2.12=h5eee18b_3 216 | - zstd=1.5.2=ha4553b6_0 217 | - pip: 218 | - alembic==1.8.1 219 | - attrs==22.1.0 220 | - autopage==0.5.1 221 | - class-resolver==0.3.10 222 | - click-default-group==1.2.2 223 | - cliff==4.0.0 224 | - cmaes==0.8.2 225 | - cmd2==2.4.2 226 | - colorlog==6.7.0 227 | - dataclasses-json==0.5.7 228 | - dill==0.3.6 229 | - docdata==0.0.3 230 | - docker-pycreds==0.4.0 231 | - gitdb==4.0.9 232 | - gitpython==3.1.29 233 | - greenlet==1.1.3.post0 234 | - mako==1.2.3 235 | - marshmallow==3.18.0 236 | - marshmallow-enum==1.5.1 237 | - more-click==0.1.1 238 | - more-itertools==9.0.0 239 | - mypy-extensions==0.4.3 240 | - networkx==3.0 241 | - optuna==3.0.3 242 | - pandas==1.5.1 243 | - pathtools==0.1.2 244 | - pbr==5.10.0 245 | - prettytable==3.4.1 246 | - promise==2.3 247 | - psutil==5.9.3 248 | - pykeen==1.9.0 249 | - pyperclip==1.8.2 250 | - pystow==0.4.6 251 | - pytz==2022.5 252 | - rexmex==0.1.2 253 | - scikit-learn==1.1.2 254 | - scipy==1.8.1 255 | - sentry-sdk==1.9.10 256 | - setproctitle==1.3.2 257 | - shortuuid==1.0.9 258 | - sklearn==0.0 259 | - smmap==5.0.0 260 | - sqlalchemy==1.4.42 261 | - stevedore==4.0.1 262 | - tabulate==0.9.0 263 | - threadpoolctl==3.1.0 264 | - tokenizers==0.10.3 265 | - torch-max-mem==0.0.4 266 | - torch-ppr==0.0.8 267 | - transformers==4.11.3 268 | - typed-argument-parser==1.7.2 269 | - typing-inspect==0.8.0 270 | - wandb==0.13.4 271 | -------------------------------------------------------------------------------- /fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/fig.png -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-complex-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | 
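# The bioblp.train call below fits BioBLP-D with a ComplEx scoring function on the BioKG benchmark splits,
# encodes disease MeSH descriptions supplied via --text_data, and is initialized from the pretrained
# checkpoint models/1e9b4f4o passed through --from_checkpoint, as the --notes string records.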
python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=complex \ 27 | --dimension=256 \ 28 | --loss_fn=bcewithlogits \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=True \ 37 | --from_checkpoint=models/1e9b4f4o \ 38 | --log_wandb=True \ 39 | --notes="ComplEx BioBLP-D initialized with 1e9b4f4o" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d-complex 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout develop 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=complex \ 27 | --dimension=256 \ 28 | --loss_fn=bcewithlogits \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=True \ 37 | --log_wandb=True \ 38 | --notes="ComplEx BioBLP-D" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-rotate-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder-checkpoint 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 
| --in_batch_negatives=True \ 37 | --from_checkpoint=models/36viovqn \ 38 | --log_wandb=True \ 39 | --notes="RotatE BioBLP-D initialized with 36viovqn, higher patience" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-rotate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder-dummy 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=True \ 37 | --log_wandb=True \ 38 | --notes="BioBLP-D" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-transe-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d-transe 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=transe \ 27 | --dimension=512 \ 28 | --loss_fn=marginranking \ 29 | --loss_margin=8.155451890616455 \ 30 | --optimizer=adam \ 31 | --learning_rate=2e-5 \ 32 | --warmup_fraction=0.05 \ 33 | --num_epochs=100 \ 34 | --batch_size=1024 \ 35 | --eval_batch_size=64 \ 36 | --num_negatives=512 \ 37 | --in_batch_negatives=True \ 38 | --from_checkpoint=models/394htt2x \ 39 | --log_wandb=True \ 40 | --notes="TransE BioBLP-D initialized with 394htt2x" 41 | 42 | # Keep files generated during job 43 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 44 | mkdir -p $RESULTS_FOLDER 45 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 46 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-d-transe.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-d-transe 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=transe \ 27 | --dimension=512 \ 28 | --loss_fn=marginranking \ 29 | --loss_margin=8.155451890616455 \ 30 | --optimizer=adam \ 31 | --learning_rate=2e-5 \ 32 | --warmup_fraction=0.05 \ 33 | --num_epochs=100 \ 34 | --batch_size=1024 \ 35 | --eval_batch_size=64 \ 36 | --num_negatives=512 \ 37 | --in_batch_negatives=True \ 38 | --log_wandb=True \ 39 | --notes="TransE BioBLP-D, margin from sage-shadow-1047" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-complex-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout develop 20 | wandb agent --count 1 discoverylab/bioblp/70t4kuu5 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: complex 11 | loss_fn: 12 | value: bcewithlogits 13 | optimizer: 14 | value: adam 15 | learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-3 18 | max: 1.0 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | value: 1024 25 | eval_batch_size: 26 | value: 64 27 | in_batch_negatives: 28 | value: true 29 | command: 30 | - ${env} 31 | - python 32 | - "-m" 33 | - ${program} 34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 37 | - '--search_eval_batch_size=True' 38 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 39 | - '--log_wandb=True' 40 | - '--notes="BioBLP-P ComplEx sweep"' 41 | - 
${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-adagrad-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-bioblp-m-rotate-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/oouxbq6p 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-adagrad-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | optimizer: 14 | value: adagrad 15 | learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-3 18 | max: 1e-1 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | value: 1024 25 | eval_batch_size: 26 | value: 64 27 | in_batch_negatives: 28 | value: true 29 | command: 30 | - ${env} 31 | - python 32 | - "-m" 33 | - ${program} 34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 37 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 38 | - '--log_wandb=True' 39 | - '--notes=BioBLP-M RotatE sweep' 40 | - ${args} 41 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-bioblp-m-rotate-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/liqycjns 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | optimizer: 14 | value: adam 15 | 
learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-4 18 | max: 1e-1 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | value: 1024 25 | eval_batch_size: 26 | value: 64 27 | in_batch_negatives: 28 | value: true 29 | command: 30 | - ${env} 31 | - python 32 | - "-m" 33 | - ${program} 34 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 35 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 36 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 37 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 38 | - '--log_wandb=True' 39 | - '--notes=BioBLP-M RotatE sweep' 40 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-transe-sweep 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout develop 20 | wandb agent --count 1 discoverylab/bioblp/pgx00fqa 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-m-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | optimizer: 16 | value: adam 17 | loss_margin: 18 | distribution: uniform 19 | min: 0.5 20 | max: 10.0 21 | learning_rate: 22 | distribution: log_uniform_values 23 | min: 1e-4 24 | max: 1e-1 25 | regularizer: 26 | distribution: log_uniform_values 27 | min: 1e-6 28 | max: 1e-3 29 | batch_size: 30 | value: 1024 31 | eval_batch_size: 32 | value: 64 33 | in_batch_negatives: 34 | value: true 35 | command: 36 | - ${env} 37 | - python 38 | - "-m" 39 | - ${program} 40 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 41 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 42 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 43 | - '--search_eval_batch_size=True' 44 | - '--molecule_data=data/biokgb/properties/molecule_moltrans_embeddings.pt' 45 | - '--log_wandb=True' 46 | - '--notes=BioBLP-M TransE sweep' 47 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-complex-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p-rotate 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git 
checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/6d2bwmy4 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | loss_fn: 10 | value: bcewithlogits 11 | freeze_pretrained_embeddings: 12 | value: true 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | command: 27 | - ${env} 28 | - python 29 | - "-m" 30 | - ${program} 31 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 32 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 33 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 34 | - '--search_eval_batch_size=True' 35 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt' 36 | - '--log_wandb=True' 37 | - '--notes="BioBLP-P ComplEx sweep"' 38 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-complex-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \ 26 | --model=complex \ 27 | --dimension=256 \ 28 | --loss_fn=bcewithlogits \ 29 | --regularizer=7.54616261352196e-05 \ 30 | --freeze_pretrained_embeddings=True \ 31 | --learning_rate=0.344274380857535 \ 32 | --num_epochs=100 \ 33 | --batch_size=512 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --from_checkpoint=models/1e9b4f4o \ 37 | --log_wandb=True \ 38 | --notes="ComplEx BioBLP-P initialized with 1e9b4f4o" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-rotate-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r 
$HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --regularizer=0.0003536270470551425 \ 30 | --freeze_pretrained_embeddings=True \ 31 | --learning_rate=0.04972680094809032 \ 32 | --num_epochs=100 \ 33 | --batch_size=512 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --from_checkpoint=models/36viovqn \ 37 | --log_wandb=True \ 38 | --notes="RotatE BioBLP-P initialized with 36viovqn" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p-rotate 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/u02tzec7 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | freeze_pretrained_embeddings: 14 | value: true 15 | learning_rate: 16 | distribution: log_uniform_values 17 | min: 1e-3 18 | max: 1.0 19 | regularizer: 20 | distribution: log_uniform_values 21 | min: 1e-6 22 | max: 1e-3 23 | batch_size: 24 | values: 25 | - 128 26 | - 256 27 | - 512 28 | command: 29 | - ${env} 30 | - python 31 | - "-m" 32 | - ${program} 33 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 34 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 35 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 36 | - '--search_eval_batch_size=True' 37 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt' 38 | - '--log_wandb=True' 39 | - '--notes=BioBLP-P RotatE sweep' 40 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-transe-initialized.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=24:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | 
#SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt \ 26 | --model=transe \ 27 | --dimension=512 \ 28 | --loss_fn=marginranking \ 29 | --loss_margin=7.234906889602847 \ 30 | --regularizer=0.0006031667561379036 \ 31 | --freeze_pretrained_embeddings=True \ 32 | --learning_rate=0.03569964236328523 \ 33 | --num_epochs=100 \ 34 | --batch_size=256 \ 35 | --eval_batch_size=64 \ 36 | --num_negatives=512 \ 37 | --from_checkpoint=models/394htt2x \ 38 | --log_wandb=True \ 39 | --notes="TransE BioBLP-P initialized with 394htt2x" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-p-transe 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=72:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout freeze-embeddings 20 | wandb agent --count 1 discoverylab/bioblp/rw6nzzyx 21 | 22 | # Keep files generated during job 23 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 24 | mkdir -p $RESULTS_FOLDER 25 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 26 | -------------------------------------------------------------------------------- /jobs/biokg-bioblp-p-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: random 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | freeze_pretrained_embeddings: 16 | value: true 17 | loss_margin: 18 | distribution: uniform 19 | min: 0.5 20 | max: 10.0 21 | learning_rate: 22 | distribution: log_uniform_values 23 | min: 1e-3 24 | max: 1.0 25 | regularizer: 26 | distribution: log_uniform_values 27 | min: 1e-6 28 | max: 1e-3 29 | batch_size: 30 | values: 31 | - 128 32 | - 256 33 | - 512 34 | command: 35 | - ${env} 36 | - python 37 | - "-m" 38 | - ${program} 39 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 40 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 41 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 42 | - '--search_eval_batch_size=True' 43 | - '--protein_data=data/biokgb/properties/protein_prottrans_embeddings_24_12.pt' 44 | - '--log_wandb=True' 45 | - '--notes=BioBLP-P TransE sweep' 46 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-complex-bce-sweep.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/21oekub7 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | loss_fn: 10 | value: bcewithlogits 11 | learning_rate: 12 | distribution: log_uniform_values 13 | min: 1e-3 14 | max: 1.0 15 | regularizer: 16 | distribution: log_uniform_values 17 | min: 1e-6 18 | max: 1e-3 19 | batch_size: 20 | values: 21 | - 128 22 | - 256 23 | - 512 24 | - 1024 25 | command: 26 | - ${env} 27 | - python 28 | - "-m" 29 | - ${program} 30 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 31 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 32 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 33 | - '--log_wandb=True' 34 | - '--notes="ComplEx sweep"' 35 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-complex-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/9m2x48u3 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-complex-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | learning_rate: 10 | distribution: log_uniform_values 11 | min: 1e-3 12 | max: 1.0 13 | regularizer: 14 | distribution: log_uniform_values 15 | min: 1e-6 16 | max: 1e-3 17 | batch_size: 18 | values: 19 | - 128 20 | - 256 21 | - 512 22 | - 1024 23 | command: 24 | - ${env} 25 | - python 26 | - "-m" 27 | - ${program} 28 | - 
'--train_triples=data/biokgb/graph/biokg.links-train.csv' 29 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 30 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 31 | - '--log_wandb=True' 32 | - '--notes="ComplEx sweep"' 33 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-rotate-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/7q2851co 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-rotate-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: bcewithlogits 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 33 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 34 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep, bcewithlogits' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/u75h00fl 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: 
validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 33 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 34 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/biokg-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-transe-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/n4zgfrhb 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/biokg-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | loss_margin: 16 | distribution: uniform 17 | min: 0.5 18 | max: 10.0 19 | learning_rate: 20 | distribution: log_uniform_values 21 | min: 1e-3 22 | max: 1.0 23 | regularizer: 24 | distribution: log_uniform_values 25 | min: 1e-6 26 | max: 1e-3 27 | batch_size: 28 | values: 29 | - 128 30 | - 256 31 | - 512 32 | - 1024 33 | command: 34 | - ${env} 35 | - python 36 | - "-m" 37 | - ${program} 38 | - '--train_triples=data/biokgb/graph/biokg.links-train.csv' 39 | - '--valid_triples=data/biokgb/graph/biokg.links-valid.csv' 40 | - '--test_triples=data/biokgb/graph/biokg.links-test.csv' 41 | - '--log_wandb=True' 42 | - '--notes=TransE sweep' 43 | - ${args} -------------------------------------------------------------------------------- /jobs/complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=complex 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=10:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout fix_bioblp_init 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | 
--valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --model=complex \ 26 | --dimension=256 \ 27 | --loss_fn=bcewithlogits \ 28 | --learning_rate=0.3595182058943781 \ 29 | --regularizer=3.7579365087382533e-05 \ 30 | --num_epochs=100 \ 31 | --batch_size=256 \ 32 | --eval_batch_size=64 \ 33 | --num_negatives=512 \ 34 | --log_wandb=True \ 35 | --notes="ComplEx best hparams, rep" 36 | 37 | # Keep files generated during job 38 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 39 | mkdir -p $RESULTS_FOLDER 40 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 41 | -------------------------------------------------------------------------------- /jobs/hetionet-complex-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/ydoydkmt 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-complex-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | loss_fn: 10 | value: bcewithlogits 11 | learning_rate: 12 | distribution: log_uniform_values 13 | min: 1e-3 14 | max: 1.0 15 | regularizer: 16 | distribution: log_uniform_values 17 | min: 1e-6 18 | max: 1e-3 19 | batch_size: 20 | values: 21 | - 128 22 | - 256 23 | - 512 24 | - 1024 25 | command: 26 | - ${env} 27 | - python 28 | - "-m" 29 | - ${program} 30 | - '--train_triples=data/hetionet/hetionet.train.csv' 31 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 32 | - '--test_triples=data/hetionet/hetionet.test.csv' 33 | - '--log_wandb=True' 34 | - '--notes="ComplEx sweep"' 35 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-complex-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-complex-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/uvgnrmka 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* 
$RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-complex-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | learning_rate: 10 | distribution: log_uniform_values 11 | min: 1e-3 12 | max: 1.0 13 | regularizer: 14 | distribution: log_uniform_values 15 | min: 1e-6 16 | max: 1e-3 17 | batch_size: 18 | values: 19 | - 128 20 | - 256 21 | - 512 22 | - 1024 23 | command: 24 | - ${env} 25 | - python 26 | - "-m" 27 | - ${program} 28 | - '--train_triples=data/hetionet/hetionet.train.csv' 29 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 30 | - '--test_triples=data/hetionet/hetionet.test.csv' 31 | - '--log_wandb=True' 32 | - '--notes="ComplEx sweep"' 33 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-rotate-bce-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/ge1smc54 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-rotate-bce-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: bcewithlogits 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/hetionet/hetionet.train.csv' 33 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 34 | - '--test_triples=data/hetionet/hetionet.test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep, bcewithlogits' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-rotate-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-rotate-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | 
OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/2iderrf0 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-rotate-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: rotate 11 | loss_fn: 12 | value: crossentropy 13 | learning_rate: 14 | distribution: log_uniform_values 15 | min: 1e-3 16 | max: 1.0 17 | regularizer: 18 | distribution: log_uniform_values 19 | min: 1e-6 20 | max: 1e-3 21 | batch_size: 22 | values: 23 | - 128 24 | - 256 25 | - 512 26 | - 1024 27 | command: 28 | - ${env} 29 | - python 30 | - "-m" 31 | - ${program} 32 | - '--train_triples=data/hetionet/hetionet.train.csv' 33 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 34 | - '--test_triples=data/hetionet/hetionet.test.csv' 35 | - '--log_wandb=True' 36 | - '--notes=RotatE sweep' 37 | - ${args} -------------------------------------------------------------------------------- /jobs/hetionet-transe-sweep.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=biokg-transe-sweep 3 | #SBATCH --output=array_%A_%a.out 4 | #SBATCH --error=array_%A_%a.err 5 | #SBATCH --ntasks=1 6 | #SBATCH --cpus-per-task=6 7 | #SBATCH --ntasks-per-node=1 8 | #SBATCH --time=40:00:00 9 | #SBATCH --mem=10G 10 | #SBATCH --partition=gpu_shared 11 | #SBATCH --gres=gpu:1 12 | 13 | PROJ_FOLDER=bioblp 14 | OUT_FOLDER=models 15 | 16 | # Copy data to scratch 17 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 18 | cd $TMPDIR/$PROJ_FOLDER 19 | 20 | source activate bioblp 21 | 22 | wandb agent --count 1 discoverylab/bioblp/jfb6wo19 23 | 24 | # Keep files generated during job 25 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 26 | mkdir -p $RESULTS_FOLDER 27 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 28 | -------------------------------------------------------------------------------- /jobs/hetionet-transe-sweep.yml: -------------------------------------------------------------------------------- 1 | entity: discoverylab 2 | project: bioblp 3 | program: bioblp.train 4 | method: bayes 5 | metric: 6 | name: validation.both.realistic.inverse_harmonic_mean_rank 7 | goal: maximize 8 | parameters: 9 | model: 10 | value: transe 11 | dimension: 12 | value: 512 13 | loss_fn: 14 | value: marginranking 15 | loss_margin: 16 | distribution: uniform 17 | min: 0.5 18 | max: 10.0 19 | learning_rate: 20 | distribution: log_uniform_values 21 | min: 1e-3 22 | max: 1.0 23 | regularizer: 24 | distribution: log_uniform_values 25 | min: 1e-6 26 | max: 1e-3 27 | batch_size: 28 | values: 29 | - 128 30 | - 256 31 | - 512 32 | - 1024 33 | command: 34 | - ${env} 35 | - python 36 | - "-m" 37 | - ${program} 38 | - '--train_triples=data/hetionet/hetionet.train.csv' 39 | - '--valid_triples=data/hetionet/hetionet.valid.csv' 40 | - '--test_triples=data/hetionet/hetionet.test.csv' 41 | - '--log_wandb=True' 42 | - '--notes=TransE sweep' 43 | - ${args} 
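The *-sweep.yml files above are Weights & Biases sweep configurations (random or Bayesian search over learning rate, regularizer, batch size and, for TransE, loss margin), and the matching *-sweep.sh files are SLURM wrappers that each execute one agent step. A minimal sketch of how the two fit together, assuming the standard wandb CLI workflow; the sweep IDs hard-coded in the shell scripts (e.g. discoverylab/bioblp/jfb6wo19) would have been obtained from the first step:

# Register the sweep once; wandb prints an identifier of the form entity/project/<id>.
wandb sweep jobs/hetionet-transe-sweep.yml

# Submit as many SLURM jobs as trials are desired; each job runs
# `wandb agent --count 1 <sweep-id>` (one sampled configuration) and then
# copies the models/ output folder back to $HOME.
sbatch jobs/hetionet-transe-sweep.sh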
-------------------------------------------------------------------------------- /jobs/rotate-dummy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-rotate-dummy 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=08:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder-dummy 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/dummy_biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adagrad \ 30 | --regularizer=0.0002757262741946316 \ 31 | --learning_rate=0.07300713133641318 \ 32 | --num_epochs=100 \ 33 | --batch_size=1024 \ 34 | --eval_batch_size=64 \ 35 | --num_negatives=512 \ 36 | --in_batch_negatives=False \ 37 | --log_wandb=True \ 38 | --notes="BioBLP-D RotatE, no descriptions, fixed eval batch size" 39 | 40 | # Keep files generated during job 41 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 42 | mkdir -p $RESULTS_FOLDER 43 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 44 | -------------------------------------------------------------------------------- /jobs/rotate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=bioblp-complex 3 | #SBATCH --ntasks=1 4 | #SBATCH --cpus-per-task=18 5 | #SBATCH --time=01:00:00 6 | #SBATCH --mem=16G 7 | #SBATCH --partition=gpu 8 | #SBATCH --gpus=1 9 | 10 | PROJ_FOLDER=bioblp 11 | OUT_FOLDER=models 12 | 13 | # Copy data to scratch 14 | cp -r $HOME/$PROJ_FOLDER $TMPDIR 15 | cd $TMPDIR/$PROJ_FOLDER 16 | 17 | source activate bioblp 18 | 19 | git checkout disease-encoder 20 | 21 | python -m bioblp.train \ 22 | --train_triples=data/biokgb/graph/biokg.links-train.csv \ 23 | --valid_triples=data/biokgb/graph/biokg.links-valid.csv \ 24 | --test_triples=data/biokgb/graph/biokg.links-test.csv \ 25 | --text_data=data/biokgb/properties/biokg_meshid_to_descr_name.tsv \ 26 | --model=rotate \ 27 | --dimension=256 \ 28 | --loss_fn=crossentropy \ 29 | --optimizer=adam \ 30 | --learning_rate=2e-5 \ 31 | --warmup_fraction=0.05 \ 32 | --num_epochs=10 \ 33 | --batch_size=1024 \ 34 | --search_eval_batch_size=True \ 35 | --eval_every=1 \ 36 | --num_negatives=512 \ 37 | --in_batch_negatives=True \ 38 | --log_wandb=True \ 39 | --notes="BioBLP-D 10 epoch test" 40 | 41 | # Keep files generated during job 42 | RESULTS_FOLDER=$HOME/$PROJ_FOLDER-$OUT_FOLDER 43 | mkdir -p $RESULTS_FOLDER 44 | cp -r $TMPDIR/$PROJ_FOLDER/$OUT_FOLDER/* $RESULTS_FOLDER 45 | -------------------------------------------------------------------------------- /loaders/placeholder.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/loaders/placeholder.txt -------------------------------------------------------------------------------- /logs/placeholder.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/logs/placeholder.txt -------------------------------------------------------------------------------- /notebooks/01_01_biokg-data-prep-for-kge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "dd58a8cf", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "%load_ext autoreload\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 11, 17 | "id": "b05d473c", 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd \n", 24 | "from pathlib import Path\n", 25 | "import toml\n", 26 | "\n", 27 | "from bioblp.data import COL_SOURCE, COL_TARGET,COL_EDGE\n", 28 | "from bioblp.data import create_random_splits" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "f36dd753", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = Path(\"../data\")\n", 39 | "SHARED_DATA_DIR = Path(\"/home/jovyan/workbench-shared-folder/bioblp/data\")\n", 40 | "config_path = DATA_DIR.joinpath(\"conf/complex-biokg-20220826.toml\")\n", 41 | "biokg_mini_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links_sample.tsv\")\n", 42 | "biokg_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links.tsv\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "f4732983-308b-44d7-8fd9-43a3b1506819", 48 | "metadata": { 49 | "tags": [] 50 | }, 51 | "source": [ 52 | "### BIOKG Data Prep" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 7, 58 | "id": "918f0203", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
srcedgtgt
0C566487DISEASE_PATHWAY_ASSOCIATIONhsa00071
1C567839DISEASE_PATHWAY_ASSOCIATIONmap04810
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " src edg tgt\n", 106 | "0 C566487 DISEASE_PATHWAY_ASSOCIATION hsa00071\n", 107 | "1 C567839 DISEASE_PATHWAY_ASSOCIATION map04810" 108 | ] 109 | }, 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "#df = pd.read_csv(biokg_mini_path, delimiter=\"\\t\", names=[\"idx\", COL_SOURCE, COL_EDGE, COL_TARGET], header=0)\n", 117 | "df = pd.read_csv(biokg_path, delimiter=\"\\t\", names=[COL_SOURCE, COL_EDGE, COL_TARGET], header=None)\n", 118 | "df.head(2)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "37dac0a0-108d-4f4c-a1f3-95e985ca9db7", 124 | "metadata": {}, 125 | "source": [ 126 | "Create data splits" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "id": "cb5e4b6d", 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 140 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 141 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "train, test, valid = create_random_splits(df, 0.9, 0.05, 0.05)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 11, 152 | "id": "d06a6c1e", 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "saved to ../data/raw/biokg_full_splits\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "SAVE_SPLITS_TO_DISK = False\n", 165 | "dataset_name = 'biokg_random_900505'\n", 166 | "datasplits_dir = DATA_DIR.joinpath(\"raw/biokg_full_splits\")\n", 167 | "\n", 168 | "if SAVE_SPLITS_TO_DISK:\n", 169 | " save_splits(train_df=train,\n", 170 | " test_df=test, \n", 171 | " valid_df=valid,\n", 172 | " dataset_name=dataset_name,\n", 173 | " out_dir=datasplits_dir)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "ed5633c4-cf9f-477f-a468-582bbf91146d", 179 | "metadata": {}, 180 | "source": [ 181 | "### Training" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "388a8210-89f0-435f-8405-81b8c38caa12", 187 | "metadata": {}, 188 | "source": [ 189 | "```bash\n", 190 | "$ python -m bioblp.train_argparse --conf /home/jovyan/BioBLP/data/conf/complex-biokg-full-20220826.toml\n", 191 | "```" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "773a6c74-333b-49e8-b2df-022574889217", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": ".conda-bioblp-env [Python]", 206 | "language": "python", 207 | "name": "conda-env-.conda-bioblp-env-py" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.8.13" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 5 224 | } 225 | -------------------------------------------------------------------------------- /notebooks/03-00-nested-cv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": 
"f8467842-5b37-4dc9-83f0-a684ed4a5fdd", 6 | "metadata": {}, 7 | "source": [ 8 | "# Run nested CV routine" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "259edda9-e110-4e05-b1de-2965c45ef58b", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import random\n", 19 | "\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "\n", 23 | "from bioblp.data import COL_EDGE, COL_SOURCE, COL_TARGET\n", 24 | "from bioblp.logging import get_logger\n", 25 | "import torch\n", 26 | "\n", 27 | "\n", 28 | "logger = get_logger(__name__)\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "134fd3c5", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = Path(\"../data/\")\n", 39 | "DATA_SHARED = Path(\"/home/jovyan/workbench-shared-folder/bioblp\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "eee761be", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "from time import time\n", 50 | "from pathlib import Path\n", 51 | "from collections import defaultdict\n", 52 | "\n", 53 | "from bioblp.benchmarking.train import run_nested_cv\n", 54 | "from bioblp.benchmarking.train import get_scorers\n", 55 | "\n" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "326edf30", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "\"\"\"Perform train run\"\"\"\n", 66 | "\n", 67 | "# reproducibility\n", 68 | "# SEED is set as global\n", 69 | "shuffle = True\n", 70 | "refit_params = [\"AUCPR\", \"AUCROC\"]\n", 71 | "\n", 72 | "data_dir = Path(\"../data/features/kge-1baon0eg/\")\n", 73 | "out_dir = Path(\"../data/runs/\")\n", 74 | "\n", 75 | "n_proc = 1\n", 76 | "n_iter = 2\n", 77 | "inner_n_folds = 3\n", 78 | "outer_n_folds = 5\n", 79 | "\n", 80 | "exp_output = defaultdict(dict)\n", 81 | "exp_output[\"config\"] = {\n", 82 | " \"n_proc\": n_proc,\n", 83 | " \"n_iter\": n_iter,\n", 84 | " \"inner_n_folds\": inner_n_folds,\n", 85 | " \"outer_n_folds\": outer_n_folds,\n", 86 | " \"data_dir\": data_dir,\n", 87 | " \"seed\": SEED,\n", 88 | " \"shuffle\": shuffle\n", 89 | "}\n", 90 | "\n", 91 | "start = time()\n", 92 | "run_timestamp = int(start)\n", 93 | "\n", 94 | "logger.info(\"Starting model building script at {}.\".format(start))\n", 95 | "\n", 96 | "############\n", 97 | "# Load data\n", 98 | "############\n", 99 | "logger.info(\"Loading training data...\")\n", 100 | "\n", 101 | "X_train = np.load(data_dir.joinpath(\"X.npy\"))\n", 102 | "y_train = np.load(data_dir.joinpath(\"y.npy\"))\n", 103 | "\n", 104 | "logger.info(\n", 105 | " \"Resulting shapes X_train: {}, y_train: {}\".format(\n", 106 | " X_train.shape, y_train.shape)\n", 107 | ")\n", 108 | "logger.info(\"Counts in y_train: {}\".format(\n", 109 | " np.unique(y_train, return_counts=True)))\n", 110 | "\n", 111 | "############\n", 112 | "# Setup classifiers & pipelines\n", 113 | "############\n", 114 | "\n", 115 | "lr_label = \"LR\"\n", 116 | "rf_label = \"RF\"\n", 117 | "MLP_label = \"MLP\"\n", 118 | "\n", 119 | "############\n", 120 | "# Compare models\n", 121 | "############\n", 122 | "\n", 123 | "candidates = [\n", 124 | " lr_label,\n", 125 | " # rf_label,\n", 126 | " # MLP_label\n", 127 | "\n", 128 | "]\n", 129 | "\n", 130 | "scorer = get_scorers()\n", 131 | "\n", 132 | "nested_cv_scores = run_nested_cv(\n", 133 | " candidates=candidates,\n", 134 | " X=X_train,\n", 135 | " y=y_train,\n", 136 | " scoring=scorer,\n", 137 | " 
inner_n_folds=inner_n_folds,\n", 138 | " inner_n_iter=n_iter,\n", 139 | " outer_n_folds=outer_n_folds,\n", 140 | " shuffle=shuffle,\n", 141 | " n_jobs=n_proc,\n", 142 | " refit_params=refit_params,\n", 143 | " random_state=SEED,\n", 144 | " outdir=out_dir,\n", 145 | " timestamp=run_timestamp\n", 146 | ")\n", 147 | "\n", 148 | "for algo, scores in nested_cv_scores.items():\n", 149 | " logger.info(\"Scores {}: {}\".format(algo, scores))\n", 150 | "\n", 151 | "exp_output[\"results\"] = nested_cv_scores\n", 152 | "\n", 153 | "logger.info(exp_output)\n", 154 | "\n", 155 | "file_out = out_dir.joinpath(\n", 156 | " \"nested_cv_scores_{}.npy\".format(run_timestamp))\n", 157 | "logger.info(\"Saving to {}\".format(file_out))\n", 158 | "np.save(file_out, exp_output)\n", 159 | "\n", 160 | "end = time()\n", 161 | "\n", 162 | "logger.info(\"Ran script in {} seconds\".format(str(end - start)))" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "id": "703ff89a-dd11-4fb0-bdcb-87e9fa41e20a", 168 | "metadata": {}, 169 | "source": [ 170 | "_____" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "a6594c30-e73d-4214-989c-54512bef0e5b", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "df67346c-124a-49ec-9cfe-913d273f66c2", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "58d97f92-0a46-4bd0-92be-7124e6c91768", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [] 196 | } 197 | ], 198 | "metadata": { 199 | "kernelspec": { 200 | "display_name": ".conda-bioblp-env [Python]", 201 | "language": "python", 202 | "name": "conda-env-.conda-bioblp-env-py" 203 | }, 204 | "language_info": { 205 | "codemirror_mode": { 206 | "name": "ipython", 207 | "version": 3 208 | }, 209 | "file_extension": ".py", 210 | "mimetype": "text/x-python", 211 | "name": "python", 212 | "nbconvert_exporter": "python", 213 | "pygments_lexer": "ipython3", 214 | "version": "3.9.13" 215 | }, 216 | "vscode": { 217 | "interpreter": { 218 | "hash": "c313b0b0929f94c03130caa81adcdac46c3c408d7f1caca6c1104b192c16f937" 219 | } 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 5 224 | } 225 | -------------------------------------------------------------------------------- /notebooks/03-frequency-baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Evaluating frequency-based baselines for link prediction\n", 7 | "\n", 8 | "Some knowledege graphs come with particularly frequent instances (either relations, or entities), that a model can use to learn spurious correlations that lead to high ranking metrics, due to the calculation of micro-averages.\n", 9 | "A sanity check thus consists of running a baseline that simply uses counts, which can be compared with models that are supposed to generalize much better." 
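As a worked illustration of that effect (with made-up numbers, not taken from BioKG): if a single tail entity appeared in 60 out of 100 test queries, a counts-only baseline that always ranked it first would already reach a micro-averaged Hits@1 of at least 0.60, without learning anything that generalizes.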
10 | ], 11 | "metadata": { 12 | "collapsed": false, 13 | "pycharm": { 14 | "name": "#%% md\n" 15 | } 16 | } 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": { 22 | "collapsed": true, 23 | "pycharm": { 24 | "name": "#%%\n" 25 | } 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import os.path as osp\n", 30 | "\n", 31 | "from pykeen.models.baseline import MarginalDistributionBaseline\n", 32 | "from pykeen.triples import TriplesFactory\n", 33 | "from pykeen.evaluation import RankBasedEvaluator, evaluate\n", 34 | "import torch" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "## Data loading" 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "pycharm": { 45 | "name": "#%% md\n" 46 | } 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "outputs": [], 53 | "source": [ 54 | "graph_path = osp.join('..', 'data', 'biokgb', 'graph')\n", 55 | "train_triples = 'biokg.links-train.csv'\n", 56 | "valid_triples = 'biokg.links-valid.csv'\n", 57 | "test_triples = 'biokg.links-test.csv'\n", 58 | "\n", 59 | "train, valid, test = [TriplesFactory.from_path(osp.join(graph_path, f)) for f in (train_triples, valid_triples, test_triples)]" 60 | ], 61 | "metadata": { 62 | "collapsed": false, 63 | "pycharm": { 64 | "name": "#%%\n" 65 | } 66 | } 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "source": [ 71 | "## Instantiating a frequency-based baseline\n", 72 | "\n", 73 | "PyKEEN comes with a set of interesting baselines that, ideally, any machine learning model should outperform. Here we will use the [`MarginalDistributionBaseline`](https://pykeen.readthedocs.io/en/stable/api/pykeen.models.MarginalDistributionBaseline.html).\n", 74 | "\n", 75 | "When predicting the tail for a triple (h, r, t), the model scores each possible tail t as the probability that t co-occurs with r times the probability that t co-occurs with h:\n", 76 | "\n", 77 | "$$\n", 78 | "P(t\\vert h, r) = P(t\\vert r) P(t\\vert h)\n", 79 | "$$" 80 | ], 81 | "metadata": { 82 | "collapsed": false, 83 | "pycharm": { 84 | "name": "#%% md\n" 85 | } 86 | } 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 7, 91 | "outputs": [], 92 | "source": [ 93 | "model = MarginalDistributionBaseline(train)\n", 94 | "# An ugly hack to add a dummy parameter to this non-parametric baseline\n", 95 | "# so that evaluation works as for models with learnable parameters\n", 96 | "model.foo = torch.nn.Embedding(1, 2)" 97 | ], 98 | "metadata": { 99 | "collapsed": false, 100 | "pycharm": { 101 | "name": "#%%\n" 102 | } 103 | } 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "source": [ 108 | "## Evaluation\n", 109 | "\n", 110 | "We now get the ranking metrics on the test set, using triples in the training, validation, and test sets for filtering.\n", 111 | "\n", 112 | "**Warning:** the next cell can take around half an hour to run." 
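A minimal sketch of the marginal-distribution scoring rule described above, written with plain Python counters rather than PyKEEN's MarginalDistributionBaseline; the toy triples and entity names are illustrative, not taken from BioKG.

```python
# Hedged sketch: estimate P(t | r) and P(t | h) from training co-occurrence
# counts and score candidate tails by their product, as in the formula above.
from collections import Counter, defaultdict

train_triples = [  # illustrative toy data
    ("d1", "DISEASE_PATHWAY_ASSOCIATION", "p1"),
    ("d1", "DISEASE_PATHWAY_ASSOCIATION", "p2"),
    ("d2", "DISEASE_PATHWAY_ASSOCIATION", "p1"),
]

tails_per_relation = defaultdict(Counter)  # counts of t given r
tails_per_head = defaultdict(Counter)      # counts of t given h

for h, r, t in train_triples:
    tails_per_relation[r][t] += 1
    tails_per_head[h][t] += 1

def score_tail(h: str, r: str, t: str) -> float:
    """P(t | r) * P(t | h), both estimated from training counts."""
    p_t_given_r = tails_per_relation[r][t] / sum(tails_per_relation[r].values())
    p_t_given_h = tails_per_head[h][t] / sum(tails_per_head[h].values())
    return p_t_given_r * p_t_given_h

# Rank candidate tails for the query (d2, DISEASE_PATHWAY_ASSOCIATION, ?).
candidates = ["p1", "p2"]
ranked = sorted(candidates,
                key=lambda t: score_tail("d2", "DISEASE_PATHWAY_ASSOCIATION", t),
                reverse=True)
print(ranked)  # ['p1', 'p2']
```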
113 | ], 114 | "metadata": { 115 | "collapsed": false, 116 | "pycharm": { 117 | "name": "#%% md\n" 118 | } 119 | } 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 10, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": "Evaluating on cpu: 0%| | 0.00/185k [00:0010}')\n", 73 | "print('-' * 20)\n", 74 | "for name, split in splits_dict.items():\n", 75 | " print(f'{name:^10}{split.num_triples:>10,}')" 76 | ], 77 | "metadata": { 78 | "collapsed": false 79 | } 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 4, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 90 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 91 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "out_path = osp.join('..', 'data', 'hetionet')\n", 97 | "if not osp.exists(out_path):\n", 98 | " os.mkdir(out_path)\n", 99 | "\n", 100 | "for name, split in splits_dict.items():\n", 101 | " pd.DataFrame(split.triples).to_csv(osp.join(out_path, f'hetionet.{name}.csv'), sep='\\t', index=False, header=False)" 102 | ], 103 | "metadata": { 104 | "collapsed": false 105 | } 106 | } 107 | ], 108 | "metadata": { 109 | "kernelspec": { 110 | "display_name": "Python 3", 111 | "language": "python", 112 | "name": "python3" 113 | }, 114 | "language_info": { 115 | "codemirror_mode": { 116 | "name": "ipython", 117 | "version": 2 118 | }, 119 | "file_extension": ".py", 120 | "mimetype": "text/x-python", 121 | "name": "python", 122 | "nbconvert_exporter": "python", 123 | "pygments_lexer": "ipython2", 124 | "version": "2.7.6" 125 | } 126 | }, 127 | "nbformat": 4, 128 | "nbformat_minor": 0 129 | } 130 | -------------------------------------------------------------------------------- /notebooks/99-train_hetionet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "dd58a8cf", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "%load_ext autoreload\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 11, 17 | "id": "b05d473c", 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd \n", 24 | "from pathlib import Path\n", 25 | "import toml\n", 26 | "\n", 27 | "from bioblp.data import COL_SOURCE, COL_TARGET,COL_EDGE\n", 28 | "from bioblp.data import create_random_splits" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "f36dd753", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "DATA_DIR = Path(\"../data\")\n", 39 | "SHARED_DATA_DIR = Path(\"/home/jovyan/workbench-shared-folder/bioblp/data\")\n", 40 | "config_path = DATA_DIR.joinpath(\"conf/complex-biokg-20220826.toml\")\n", 41 | "biokg_mini_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links_sample.tsv\")\n", 42 | "biokg_path = SHARED_DATA_DIR.joinpath(\"raw/biokg.links.tsv\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "56b4e0a0", 48 | "metadata": {}, 49 | "source": [ 50 | "## Hetionet" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 21, 56 | "id": "cbbb5a42", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "EagerDataset 
(create_inverse_triples=False)\n", 64 | "Name Entities Relations Triples\n", 65 | "---------- ---------- ----------- ---------\n", 66 | "Training 45158 24 1800157\n", 67 | "Testing 45158 24 225020\n", 68 | "Validation 45158 24 225020\n", 69 | "Total - - 2250197\n", 70 | "Head Relation tail\n", 71 | "----------------------- ---------- ------------\n", 72 | "Anatomy::UBERON:0000002 AdG Gene::10005\n", 73 | "Anatomy::UBERON:0000002 AdG Gene::114804\n", 74 | "Anatomy::UBERON:0000002 AdG Gene::118670\n", 75 | "Anatomy::UBERON:0000002 AdG Gene::128989\n", 76 | "Anatomy::UBERON:0000002 AdG Gene::132851\n", 77 | "\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "from pykeen.datasets import Hetionet\n", 83 | "from pykeen.datasets import get_dataset\n", 84 | "\n", 85 | "ds = get_dataset(dataset=Hetionet)\n", 86 | "ds.summarize()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 39, 92 | "id": "35ad86ee", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stderr", 97 | "output_type": "stream", 98 | "text": [ 99 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 100 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n", 101 | "Reconstructing all label-based triples. This is expensive and rarely needed.\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "triples = Hetionet().factory_dict\n", 107 | "test = pd.DataFrame(triples['testing'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])\n", 108 | "train = pd.DataFrame(triples['training'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])\n", 109 | "valid = pd.DataFrame(triples['validation'].triples, columns=[[COL_SOURCE, COL_EDGE, COL_TARGET]])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 41, 115 | "id": "978049a9", 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "data": { 120 | "text/plain": [ 121 | "0.10000013332166029" 122 | ] 123 | }, 124 | "execution_count": 41, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "len(test)/(len(train)+ len(test) +len(valid))" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 42, 136 | "id": "d6068102", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "saved to ../data/raw/hetionet_splits\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "SAVE_SPLITS_TO_DISK = False\n", 149 | "hetio_dataset_name = 'hetionet_random_801010'\n", 150 | "hetio_datasplits_dir = DATA_DIR.joinpath(\"raw/hetionet_splits\")\n", 151 | "\n", 152 | "if SAVE_SPLITS_TO_DISK:\n", 153 | " save_splits(train_df=train,\n", 154 | " test_df=test, \n", 155 | " valid_df=valid,\n", 156 | " dataset_name=hetio_dataset_name\",\n", 157 | " out_dir=hetio_datasplits_dir)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "id": "3459292c", 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 15, 171 | "id": "527f6a4d", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "{'train_triples': 'data',\n", 178 | " 'valid_triples': 'data',\n", 179 | " 'test_triples': 'data',\n", 180 | " 'model': 'complex',\n", 181 | " 'dimension': 256,\n", 182 | " 'loss_fn': 'crossentropy',\n", 183 | " 'loss_margin': 1.0,\n", 184 | " 'optimizer': 'adagrad',\n", 185 | " 'learning_rate': 0.01,\n", 186 | " 'regularizer': 1e-06,\n", 187 | " 
'num_epochs': 100,\n", 188 | " 'batch_size': 1024,\n", 189 | " 'eval_batch_size': 16,\n", 190 | " 'num_negatives': 512,\n", 191 | " 'add_inverses': False,\n", 192 | " 'early_stopper': 'both.realistic.inverse_harmonic_mean_rank',\n", 193 | " 'search_train_batch_size': False,\n", 194 | " 'search_eval_batch_size': False,\n", 195 | " 'log_wandb': False}" 196 | ] 197 | }, 198 | "execution_count": 15, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "def load_toml(toml_path: str) -> dict:\n", 205 | " toml_path = Path(toml_path)\n", 206 | "\n", 207 | " config = {}\n", 208 | "\n", 209 | " with open(toml_path, \"r\") as f:\n", 210 | " config = toml.load(f)\n", 211 | "\n", 212 | " return config\n", 213 | "\n", 214 | "config = load_toml(config_path)\n", 215 | "config" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "ed5633c4-cf9f-477f-a468-582bbf91146d", 221 | "metadata": {}, 222 | "source": [ 223 | "### Training" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "388a8210-89f0-435f-8405-81b8c38caa12", 229 | "metadata": {}, 230 | "source": [ 231 | "```bash\n", 232 | "$ python -m bioblp.train_argparse --conf /home/jovyan/BioBLP/data/conf/complex-hetionet-20220826.toml\n", 233 | "```" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "id": "773a6c74-333b-49e8-b2df-022574889217", 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": ".conda-bioblp-env [Python]", 248 | "language": "python", 249 | "name": "conda-env-.conda-bioblp-env-py" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 3 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython3", 261 | "version": "3.8.13" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 5 266 | } 267 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "bioblp" 3 | version = "0.1.0" 4 | description = "Link Prediction for biomedical data using KGE" 5 | authors = [] 6 | packages = [{include = "bioblp"}] 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.9,<3.11" 10 | tqdm = "^4.60.0" 11 | pykeen = "^1.4.0" 12 | toml = "^0.10.2" 13 | pandas = "^1.4.2" 14 | torch = "^1.11.0" 15 | scikit-learn = "^1.1.0" 16 | skorch = "^0.11.0" 17 | optuna = "3.0.1" 18 | dill = "^0.3.6" 19 | 20 | [tool.poetry.dev-dependencies] 21 | 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | pytest = "^7.2.1" 25 | pycodestyle = "^2.10.0" 26 | autopep8 = "^2.0.1" 27 | 28 | [build-system] 29 | requires = ["poetry-core>=1.0.0"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.8.1 ; python_version >= "3.8" and python_version < "3.11" 2 | attrs==22.1.0 ; python_version >= "3.8" and python_version < "3.11" 3 | autopage==0.5.1 ; python_version >= "3.8" and python_version < "3.11" 4 | certifi==2022.6.15.1 ; python_version >= "3.8" and python_version < "3.11" 5 | charset-normalizer==2.1.1 ; python_version >= "3.8" and python_version < "3.11" 6 | class-resolver==0.3.10 ; 
python_version >= "3.8" and python_version < "3.11" 7 | click-default-group==1.2.2 ; python_version >= "3.8" and python_version < "3.11" 8 | click==8.1.3 ; python_version >= "3.8" and python_version < "3.11" 9 | cliff==4.0.0 ; python_version >= "3.8" and python_version < "3.11" 10 | cmaes==0.8.2 ; python_version >= "3.8" and python_version < "3.11" 11 | cmd2==2.4.2 ; python_version >= "3.8" and python_version < "3.11" 12 | colorama==0.4.5 ; python_version >= "3.8" and python_version < "3.11" and platform_system == "Windows" or python_version >= "3.8" and python_version < "3.11" and sys_platform == "win32" 13 | colorlog==6.7.0 ; python_version >= "3.8" and python_version < "3.11" 14 | dataclasses-json==0.5.7 ; python_version >= "3.8" and python_version < "3.11" 15 | docdata==0.0.3 ; python_version >= "3.8" and python_version < "3.11" 16 | greenlet==1.1.3 ; python_version >= "3.8" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version < "3.11" 17 | idna==3.3 ; python_version >= "3.8" and python_version < "3.11" 18 | importlib-metadata==4.12.0 ; python_version >= "3.8" and python_version < "3.11" 19 | importlib-resources==5.9.0 ; python_version >= "3.8" and python_version < "3.9" 20 | joblib==1.1.0 ; python_version >= "3.8" and python_version < "3.11" 21 | mako==1.2.2 ; python_version >= "3.8" and python_version < "3.11" 22 | markupsafe==2.1.1 ; python_version >= "3.8" and python_version < "3.11" 23 | marshmallow-enum==1.5.1 ; python_version >= "3.8" and python_version < "3.11" 24 | marshmallow==3.17.1 ; python_version >= "3.8" and python_version < "3.11" 25 | more-click==0.1.1 ; python_version >= "3.8" and python_version < "3.11" 26 | more-itertools==8.14.0 ; python_version >= "3.8" and python_version < "3.11" 27 | mypy-extensions==0.4.3 ; python_version >= "3.8" and python_version < "3.11" 28 | numpy==1.23.3 ; python_version < "3.11" and python_version >= "3.8" 29 | optuna==3.0.1 ; python_version >= "3.8" and python_version < "3.11" 30 | packaging==21.3 ; python_version >= "3.8" and python_version < "3.11" 31 | pandas==1.4.4 ; python_version >= "3.8" and python_version < "3.11" 32 | pbr==5.10.0 ; python_version >= "3.8" and python_version < "3.11" 33 | prettytable==3.4.1 ; python_version >= "3.8" and python_version < "3.11" 34 | protobuf==3.20.1 ; python_version >= "3.8" and python_version < "3.11" 35 | pykeen==1.9.0 ; python_version >= "3.8" and python_version < "3.11" 36 | pyparsing==3.0.9 ; python_version >= "3.8" and python_version < "3.11" 37 | pyperclip==1.8.2 ; python_version >= "3.8" and python_version < "3.11" 38 | pyreadline3==3.4.1 ; python_version >= "3.8" and python_version < "3.11" and sys_platform == "win32" 39 | pystow==0.4.6 ; python_version >= "3.8" and python_version < "3.11" 40 | python-dateutil==2.8.2 ; python_version >= "3.8" and python_version < "3.11" 41 | pytz==2022.2.1 ; python_version >= "3.8" and python_version < "3.11" 42 | pyyaml==6.0 ; python_version >= "3.8" and python_version < "3.11" 43 | requests==2.28.1 ; python_version >= "3.8" and python_version < "3.11" 44 | rexmex==0.0.15 ; python_version >= "3.8" and python_version < "3.11" 45 | scikit-learn==1.1.2 ; python_version >= "3.8" and python_version < "3.11" 46 | scipy==1.8.1 ; python_version >= "3.8" and python_version < "3.11" 47 | six==1.16.0 ; python_version >= "3.8" and python_version < "3.11" 48 | scikit-learn==0.0 
; python_version >= "3.8" and python_version < "3.11" 49 | skorch==0.11.0 ; python_version >= "3.8" and python_version < "3.11" 50 | sqlalchemy==1.4.41 ; python_version >= "3.8" and python_version < "3.11" 51 | stevedore==4.0.0 ; python_version >= "3.8" and python_version < "3.11" 52 | tabulate==0.8.10 ; python_version >= "3.8" and python_version < "3.11" 53 | threadpoolctl==3.1.0 ; python_version >= "3.8" and python_version < "3.11" 54 | toml==0.10.2 ; python_version >= "3.8" and python_version < "3.11" 55 | torch-max-mem==0.0.4 ; python_version >= "3.8" and python_version < "3.11" 56 | torch-ppr==0.0.8 ; python_version >= "3.8" and python_version < "3.11" 57 | torch==1.12.1 ; python_version >= "3.8" and python_version < "3.11" 58 | tqdm==4.64.1 ; python_version >= "3.8" and python_version < "3.11" 59 | typing-extensions==4.3.0 ; python_version >= "3.8" and python_version < "3.11" 60 | typing-inspect==0.8.0 ; python_version >= "3.8" and python_version < "3.11" 61 | urllib3==1.26.12 ; python_version >= "3.8" and python_version < "3.11" 62 | wcwidth==0.2.5 ; python_version >= "3.8" and python_version < "3.11" 63 | zipp==3.8.1 ; python_version >= "3.8" and python_version < "3.11" 64 | 65 | bioblp~=0.1.0 66 | torch~=1.13.1 67 | transformers~=4.26.1 68 | pandas~=1.5.3 69 | numpy~=1.24.2 70 | tqdm~=4.64.1 71 | pykeen~=1.10.0 72 | wandb~=0.13.10 73 | optuna~=3.0.1 74 | scikit-learn~=1.2.1 75 | skorch~=0.11.0 -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/tests/__init__.py -------------------------------------------------------------------------------- /tests/benchmarking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elsevier-AI-Lab/BioBLP/c752f0e7269ff59dda4de8cf12843c09d94a30cf/tests/benchmarking/__init__.py -------------------------------------------------------------------------------- /tests/benchmarking/bm_test_conf.toml: -------------------------------------------------------------------------------- 1 | 2 | data_root = "/home/skywalker/bioblp/" 3 | experiment_root = "data/benchmarks/experiments/dpi_fda/20230224/" 4 | 5 | [sampling] 6 | outdir = "sampled" 7 | num_negs_per_pos = 10 8 | kg_triples_dir = "data/benchmarks/experiments/encoders/rotate/training_triples/" 9 | 10 | [features] 11 | outdir = "features" 12 | transform = "concat" 13 | missing_values = "random" 14 | encoders = ["structural", "complex", "rotate", "noise"] 15 | 16 | [features.encoder_args.noise] 17 | random_seed = 24 18 | 19 | [features.encoder_args.structural] 20 | proteins = "data/benchmarks/experiments/encoders/proteins" 21 | molecules = "data/benchmarks/experiments/encoders/molecules" 22 | 23 | [features.encoder_args.complex] 24 | model_dir = "data/benchmarks/experiments/encoders/complex/" 25 | 26 | [features.encoder_args.rotate] 27 | model_dir = "data/benchmarks/experiments/encoders/rotate/" 28 | 29 | [features.encoder_args.transe] 30 | model_dir = "data/benchmarks/experiments/encoders/transe/" 31 | 32 | [split] 33 | n_splits = 5 34 | outdir = "splits" 35 | 36 | [models] 37 | 38 | [models.noise_lr] 39 | feature = "noise" 40 | model = "LR" 41 | 42 | [models.noise_rf] 43 | feature = "noise" 44 | model = "RF" 45 | 46 | [models.noise_mlp] 47 | feature = "noise" 48 | model = "MLP" 49 | 50 | [models.structural_lr] 
51 | feature = "structural" 52 | model = "LR" 53 | 54 | [models.complex_lr] 55 | feature = "complex" 56 | model = "LR" 57 | 58 | [models.rotate_lr] 59 | feature = "rotate" 60 | model = "LR" 61 | 62 | [train] 63 | n_iter = 2 64 | splits_file = "cv-splits.pt" 65 | refit_params = ["AUCPR", "AUCROC"] 66 | outdir = "models" 67 | -------------------------------------------------------------------------------- /tests/benchmarking/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from dataclasses import fields 4 | 5 | from pathlib import Path 6 | from bioblp.benchmarking.config import BenchmarkStepBaseConfig 7 | from bioblp.benchmarking.config import BenchmarkPreprocessConfig 8 | from bioblp.benchmarking.config import BenchmarkFeatureConfig 9 | from bioblp.benchmarking.config import BenchmarkTrainConfig 10 | 11 | 12 | from bioblp.logger import get_logger 13 | 14 | 15 | logger = get_logger(__name__) 16 | 17 | test_toml_file = Path(__file__).parent.joinpath("bm_test_conf.toml") 18 | 19 | 20 | class TestBenchmarkStepBaseConfig(): 21 | 22 | dr = "/home/skywalker/bioblp/data/" 23 | exp = "benchmark/experiments" 24 | step_out = "step_out" 25 | run_id = "123" 26 | 27 | def test_resolve_outdir(self): 28 | 29 | cfg = BenchmarkStepBaseConfig( 30 | data_root=self.dr, 31 | experiment_root=self.exp, 32 | run_id=self.run_id, 33 | outdir=self.step_out 34 | ) 35 | 36 | full_outdir = cfg.resolve_outdir() 37 | 38 | assert str(full_outdir) == self.dr + self.exp + \ 39 | "/" + self.run_id + "/" + self.step_out 40 | 41 | def test_test_resolve_outdir_mutated(self): 42 | cfg = BenchmarkStepBaseConfig( 43 | data_root=self.dr, 44 | experiment_root=self.exp, 45 | run_id=self.run_id, 46 | outdir=self.step_out 47 | ) 48 | 49 | override_data_root = "/home/vader/bioblp/data/" 50 | 51 | cfg.data_root = override_data_root 52 | 53 | full_outdir = cfg.resolve_outdir() 54 | 55 | assert str(full_outdir) == override_data_root + self.exp + \ 56 | "/" + self.run_id + "/" + self.step_out 57 | 58 | 59 | class TestBenchmarkPreprocessConfig(): 60 | 61 | def test_from_toml(self): 62 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir", 63 | "num_negs_per_pos", "kg_triples_dir"] 64 | 65 | run_id = "123" 66 | cfg = BenchmarkPreprocessConfig.from_toml( 67 | test_toml_file, run_id=run_id) 68 | 69 | cfg_fields = [field.name for field in fields(cfg)] 70 | 71 | assert cfg.num_negs_per_pos == 10 72 | assert cfg.data_root == "/home/skywalker/bioblp/" 73 | assert len(set(cfg_fields).difference(set(expected_fields)) 74 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}" 75 | 76 | def test_resolve_outdir(self): 77 | 78 | run_id = "123" 79 | cfg = BenchmarkPreprocessConfig.from_toml( 80 | test_toml_file, run_id=run_id) 81 | 82 | outdir = cfg.resolve_outdir() 83 | 84 | assert str( 85 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/sampled" 86 | 87 | 88 | class TestBenchmarkFeatureConfig(): 89 | 90 | def test_from_toml(self): 91 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir", 92 | "transform", "missing_values", "encoders", "encoder_args"] 93 | 94 | run_id = "123" 95 | cfg = BenchmarkFeatureConfig.from_toml(test_toml_file, run_id=run_id) 96 | 97 | cfg_fields = [field.name for field in fields(cfg)] 98 | 99 | assert len(set(cfg_fields).difference(set(expected_fields)) 100 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}" 101 | 102 | 
def test_resolve_outdir(self): 103 | 104 | run_id = "123" 105 | cfg = BenchmarkFeatureConfig.from_toml(test_toml_file, run_id=run_id) 106 | 107 | outdir = cfg.resolve_outdir() 108 | 109 | assert str( 110 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/features" 111 | 112 | 113 | class TestBenchmarkTrainConfig(): 114 | 115 | def test_from_toml(self): 116 | expected_fields = ["data_root", "experiment_root", "run_id", "outdir", 117 | "feature_dir", "models", "refit_params", "n_iter", "splits_dir", "splits_file"] 118 | 119 | run_id = "123" 120 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id) 121 | 122 | cfg_fields = [field.name for field in fields(cfg)] 123 | 124 | assert len(set(cfg_fields).difference(set(expected_fields)) 125 | ) == 0, f"Mismatch in fields: {set(cfg_fields).difference(set(expected_fields))}" 126 | 127 | def test_resolve_outdir(self): 128 | 129 | run_id = "123" 130 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id) 131 | 132 | outdir = cfg.resolve_outdir() 133 | 134 | assert str( 135 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/models" 136 | 137 | def test_resolve_feature_outdir(self): 138 | 139 | run_id = "123" 140 | cfg = BenchmarkTrainConfig.from_toml(test_toml_file, run_id=run_id) 141 | 142 | outdir = cfg.resolve_feature_dir() 143 | 144 | assert str( 145 | outdir) == f"/home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/{run_id}/features" 146 | -------------------------------------------------------------------------------- /tests/benchmarking/test_featurise.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from bioblp.benchmarking.featurise import apply_common_mask 5 | 6 | 7 | class TestApplyCommonMask: 8 | 9 | data_A = torch.arange(0., 9.).resize(3, 3) 10 | data_B = torch.arange(9., 21.).resize(3, 4) 11 | 12 | labels = torch.ones(3) 13 | 14 | def test_mask_consistency(self): 15 | mask_A = torch.tensor([0, 1]) 16 | mask_B = torch.tensor([0, 1, 2]) 17 | 18 | inputs = [("A", self.data_A, mask_A), ("B", self.data_B, mask_B)] 19 | 20 | masked_inputs, _ = apply_common_mask(inputs, labels=self.labels) 21 | 22 | assert masked_inputs[0][1].size(0) == len(mask_A) 23 | assert masked_inputs[0][1].size(0) == masked_inputs[1][1].size(0) 24 | 25 | def test_mask_consistency_labels(self): 26 | mask_A = torch.tensor([0, 2]) 27 | mask_B = torch.tensor([0, 1, 2]) 28 | 29 | labels = torch.tensor([1, 1, 0]) 30 | expected_labels = torch.tensor([1, 0]) 31 | 32 | inputs = [("A", self.data_A, mask_A), ("B", self.data_B, mask_B)] 33 | 34 | _, masked_labels = apply_common_mask(inputs, labels=labels) 35 | 36 | assert len(masked_labels) == len(mask_A) 37 | assert torch.sum((masked_labels - expected_labels)) == 0 38 | -------------------------------------------------------------------------------- /tests/benchmarking/test_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from bioblp.benchmarking.train_utils import validate_features_exist 3 | from bioblp.benchmarking.config import BenchmarkTrainConfig 4 | 5 | from bioblp.logger import get_logger 6 | 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | CONFIG_PATH = "conf/dpi-benchmark-cv-20230423-lr.toml" 12 | 13 | 14 | def test_parse_train_config(): 15 | cfg = BenchmarkTrainConfig.from_toml(CONFIG_PATH, run_id="abc") 16 | 17 | logger.info(cfg) 18 | 19 | 20 | class 
TestValidateFeatures(): 21 | 22 | models_conf = { 23 | "noise_lr": { 24 | "feature": "noise", 25 | "model": "LR" 26 | }, 27 | "complex_lr": { 28 | "feature": "complex", 29 | "model": "LR" 30 | } 31 | } 32 | 33 | existing_feats = ["noise", "complex"] 34 | 35 | def setup_feats(self, dir): 36 | data = torch.arange(0., 12.).resize(3, 4) 37 | 38 | for feat in self.existing_feats: 39 | torch.save(data, dir.joinpath(f"{feat}.pt")) 40 | 41 | def test_validate_features_exist(self, tmp_path): 42 | dir = tmp_path.joinpath("features") 43 | dir.mkdir() 44 | self.setup_feats(dir) 45 | 46 | exists = validate_features_exist(dir, self.models_conf) 47 | 48 | assert exists is True 49 | 50 | def test_validate_features_exist_missing(self, tmp_path): 51 | dir = tmp_path.joinpath("features") 52 | dir.mkdir() 53 | self.setup_feats(dir) 54 | 55 | missing_feat = { 56 | "feature": "rotate", 57 | "model": "LR" 58 | } 59 | conf = self.models_conf 60 | conf.update({"rotate_LR": missing_feat}) 61 | 62 | exists = validate_features_exist(dir, conf) 63 | 64 | assert exists is False 65 | -------------------------------------------------------------------------------- /tests/test_encoders.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import unittest 3 | import tempfile 4 | import os 5 | import os.path as osp 6 | import pytest 7 | import torch 8 | from transformers import BertTokenizer 9 | 10 | from bioblp.models.encoders import TransformerTextEncoder 11 | import bioblp.loaders.preprocessors as preprocessors 12 | 13 | 14 | class TestPropertyEncoders(unittest.TestCase): 15 | DISEASES = ['Irreversible FIBROSIS of the submucosal tissue of the MOUTH.', 16 | 'The co-occurrence of pregnancy and parasitic diseases.', 17 | 'Benign epidermal proliferations or tumors of viral in origin.', 18 | 'Infections with bacteria of the genus PASTEURELLA.'] 19 | 20 | MOLECULES = ['101010101010101010101010101010101010'] 21 | 22 | def setUp(self): 23 | self.temp_file = None 24 | 25 | def tearDown(self): 26 | if self.temp_file is not None: 27 | if osp.exists(self.temp_file): 28 | os.remove(self.temp_file) 29 | 30 | def make_test_file(self, entities: List[int], choices: List[str]): 31 | if self.temp_file is None: 32 | file_name = tempfile.NamedTemporaryFile().name 33 | self.temp_file = file_name 34 | else: 35 | file_name = self.temp_file 36 | 37 | with open(file_name, 'w') as file: 38 | for i, entity in enumerate(entities): 39 | sample = choices[i % len(choices)] 40 | file.write(f'{entity}\t{sample}\n') 41 | 42 | return file_name 43 | 44 | def make_protein_test_file(self, emb_dim: int, entities: List[str]): 45 | if self.temp_file is None: 46 | file_name = tempfile.NamedTemporaryFile().name 47 | self.temp_file = file_name 48 | else: 49 | file_name = self.temp_file 50 | 51 | embeddings = torch.rand([len(entities), emb_dim]) 52 | 53 | with open(file_name, 'w') as file: 54 | torch.save({'identifiers': entities, 'embeddings': embeddings}, 55 | file_name) 56 | 57 | return file_name 58 | 59 | @pytest.mark.skip(reason="no way of currently testing this") 60 | def test_text_preprocessor(self): 61 | entity_to_id = {str(i): i for i in range(10)} 62 | entities = list(entity_to_id.keys()) 63 | file = self.make_test_file(entities, choices=self.DISEASES) 64 | 65 | max_length = 32 66 | tokenizer = BertTokenizer.from_pretrained( 67 | TransformerTextEncoder.BASE_MODEL) 68 | preprocessor = preprocessors.TextEntityPropertyPreprocessor(tokenizer, 69 | max_length) 70 | 71 | entities_tensor, data_idx, 
data = preprocessor.preprocess_file(file, 72 | entity_to_id) 73 | self.assertEqual(len(entities_tensor), len(entities)) 74 | self.assertEqual(len(data_idx), len(entities)) 75 | self.assertTupleEqual(data.shape, (len(entities), max_length)) 76 | 77 | def test_molecule_preprocessor(self): 78 | entity_to_id = {str(i): i for i in range(10)} 79 | entities = list(entity_to_id.keys()) 80 | file = self.make_test_file(entities, choices=self.MOLECULES) 81 | 82 | preprocessor = preprocessors.MolecularFingerprintPreprocessor() 83 | entities_tensor, data_idx, data = preprocessor.preprocess_file(file, 84 | entity_to_id) 85 | 86 | self.assertEqual(len(entities_tensor), len(entities)) 87 | self.assertEqual(len(data_idx), len(entities)) 88 | self.assertTupleEqual( 89 | data.shape, (len(entities), len(self.MOLECULES[0]))) 90 | 91 | @pytest.mark.skip(reason="faulty test") 92 | def test_pretrained_protein_preprocessor(self): 93 | emb_dim = 32 94 | entity_to_id = {str(i): i for i in range(10)} 95 | entities = list(entity_to_id.keys()) 96 | file = self.make_protein_test_file(emb_dim, entities) 97 | 98 | preprocessor = preprocessors.PretrainedEmbeddingPreprocessor() 99 | entities_tensor, data_idx, data = preprocessor.preprocess_file(file, 100 | entity_to_id) 101 | 102 | self.assertEqual(len(entities_tensor), len(entities)) 103 | self.assertEqual(len(data_idx), len(entities)) 104 | self.assertTupleEqual(data.shape, (len(entities), emb_dim)) 105 | -------------------------------------------------------------------------------- /tests/test_version.py: -------------------------------------------------------------------------------- 1 | from bioblp import __version__ 2 | 3 | 4 | def test_version(): 5 | assert __version__ == "0.1.0" 6 | --------------------------------------------------------------------------------
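A hedged usage sketch of the output-directory convention that the config tests above assert: each benchmark step resolves its working directory as data_root / experiment_root / run_id / step outdir. The values below are copied from tests/benchmarking/bm_test_conf.toml and the tests' example run_id; pathlib is used only for illustration and is not necessarily how the config classes compose paths internally.

```python
from pathlib import Path

# Values from tests/benchmarking/bm_test_conf.toml; run_id as used in the tests.
data_root = Path("/home/skywalker/bioblp/")
experiment_root = "data/benchmarks/experiments/dpi_fda/20230224/"
run_id = "123"

# Step-specific outdir values defined in the TOML ([sampling], [features], [split], [train]).
for step_outdir in ("sampled", "features", "splits", "models"):
    print(data_root / experiment_root / run_id / step_outdir)

# The last printed path matches the assertion in TestBenchmarkTrainConfig.test_resolve_outdir:
# /home/skywalker/bioblp/data/benchmarks/experiments/dpi_fda/20230224/123/models
```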